Cython
Contents
Cython#
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (10,6)
%config InlineBackend.figure_format = 'retina'
import numpy as np
import warnings
warnings.filterwarnings("ignore")
Cython provides extra syntax allowing for static type declarations (remember: Python is generally dynamically typed)
Python code gets translated into optimised C/C++ code and compiled as Python extension modules
Cython allows you to write fast C code in a Python-like syntax.
Furthermore, linking to existing C libraries is simplified.
Pure Python Function
\(f(x)=-2x^3+5x^2+x\),
def f(x):
return -4*x**3 +3*x**2 +2*x
x = np.linspace(-1,1,100)
ax = plt.subplot(1,1,1)
ax.plot(x, f(x))
ax.fill_between(x, 0, f(x));
we compute integral \(\int_a^b f(x)dx\) numerically with \(N\) points.
def integrate_f_py(a,b,N):
s = 0
dx = (b - a) / (N-1)
for i in range(N-1): # we intentionally use the bad way to do this with a loop
x = a + i*dx
s += (f(x)+f(x+dx))/2
return s*dx
%timeit integrate_f_py(-1,1,10**3)
print(integrate_f_py(-1,1,1000))
741 µs ± 2.8 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
2.0000040080120174
%load_ext heat
%%heat
def f(x):
return -4*x**3 +3*x**2 +2*x
def integrate_f(a, b, N):
s = 0
dx = (b - a) / (N-1)
for i in range(N-1):
x = a + i*dx
s += (f(x)+f(x+dx))/2
return s*dx
integrate_f(0, 10, 1000)
Pure C function
%%file integral_f_c.c
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#define NB_RUNS 1000
double f(double x) {
return -4*x*x*x +3*x*x +2*x;
}
double integrate_f_c(double a, double b, int N) {
double s = 0;
double dx = (b - a) / (N-1);
for(int i=0; i<N-1; ++i){
double x = a + i*dx;
s += (f(x)+f(x+dx))/2.0;
}
return s*dx;
}
int main(int argc, char **argv)
{
double a = atof(argv[1]);
double b = atof(argv[2]);
int N = atoi(argv[3]);
double res = 0;
clock_t begin = clock();
for (int i=0; i<NB_RUNS; ++i)
res += integrate_f_c( a, b, N );
clock_t end = clock();
fprintf( stdout, "integral_f(%3.1f, %3.1f, %d) = %f \n", a, b, N, res / NB_RUNS );
fprintf( stdout, "time = %e ms \n", (double)(end - begin) / CLOCKS_PER_SEC );
return 0;
}
Writing integral_f_c.c
!gcc -O3 integral_f_c.c; ./a.out -1 1 1000
integral_f(-1.0, 1.0, 1000) = 2.000004
time = 4.632000e-03 ms
Cython compilation: Generating C code#
Load Cython in jupyter notebook.
%load_ext Cython
C Variable and Type definitions#
In general, use cdef
to declare C variables.
The command :
$ cython -a mycode.pyx
outputs an html file. It shows what parts of your code are C, which parts are Python, and where C-Python conversion occurs.
%%cython -a
cdef int i, j = 2, k = 3 # assigning values at declaration
i = 1 # assigning values afterwards
# avoid Python-C conversion! It's expensive:
a = 5
i = a
# same with C-Python conversion:
b = j
print("a = %d" % a)
print("i = %d" % i)
print("b = %d" % b)
a = 5
i = 5
b = 2
Generated by Cython 0.29.32
Yellow lines hint at Python interaction.
Click on a line that starts with a "+
" to see the C code that Cython generated for it.
+01: cdef int i, j = 2, k = 3 # assigning values at declaration
__pyx_v_46_cython_magic_132c4357e1dc505a5a4b91c4f7627576_j = 2; __pyx_v_46_cython_magic_132c4357e1dc505a5a4b91c4f7627576_k = 3;
+02: i = 1 # assigning values afterwards
__pyx_v_46_cython_magic_132c4357e1dc505a5a4b91c4f7627576_i = 1;
03: # avoid Python-C conversion! It's expensive:
+04: a = 5
if (PyDict_SetItem(__pyx_d, __pyx_n_s_a, __pyx_int_5) < 0) __PYX_ERR(0, 4, __pyx_L1_error)
+05: i = a
__Pyx_GetModuleGlobalName(__pyx_t_1, __pyx_n_s_a); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 5, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 5, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_v_46_cython_magic_132c4357e1dc505a5a4b91c4f7627576_i = __pyx_t_2;
06: # same with C-Python conversion:
+07: b = j
__pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_46_cython_magic_132c4357e1dc505a5a4b91c4f7627576_j); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 7, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (PyDict_SetItem(__pyx_d, __pyx_n_s_b, __pyx_t_1) < 0) __PYX_ERR(0, 7, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+08: print("a = %d" % a)
__Pyx_GetModuleGlobalName(__pyx_t_1, __pyx_n_s_a); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 8, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_t_3 = __Pyx_PyUnicode_FormatSafe(__pyx_kp_u_a_d, __pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 8, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_builtin_print, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 8, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+09: print("i = %d" % i)
__pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_46_cython_magic_132c4357e1dc505a5a4b91c4f7627576_i); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 9, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_t_3 = PyUnicode_Format(__pyx_kp_u_i_d, __pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 9, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_builtin_print, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 9, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+10: print("b = %d" % b)
__Pyx_GetModuleGlobalName(__pyx_t_1, __pyx_n_s_b); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_t_3 = __Pyx_PyUnicode_FormatSafe(__pyx_kp_u_b_d, __pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_builtin_print, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
Another Python vs. Cython coloring guide#
%%cython -a
cdef int m, n
cdef double cy_total = 0.0
for m in range(10):
n = 2*m
cy_total += n
a, b = 0, 0
py_total = 0.0
for a in range(10):
b = 2*a
py_total += b
print(cy_total, py_total)
90.0 90.0
Generated by Cython 0.29.32
Yellow lines hint at Python interaction.
Click on a line that starts with a "+
" to see the C code that Cython generated for it.
01: cdef int m, n
+02: cdef double cy_total = 0.0
__pyx_v_46_cython_magic_6c1c529595d0157b648d279c74560506_cy_total = 0.0;
+03: for m in range(10):
for (__pyx_t_1 = 0; __pyx_t_1 < 10; __pyx_t_1+=1) { __pyx_v_46_cython_magic_6c1c529595d0157b648d279c74560506_m = __pyx_t_1;
+04: n = 2*m
__pyx_v_46_cython_magic_6c1c529595d0157b648d279c74560506_n = (2 * __pyx_v_46_cython_magic_6c1c529595d0157b648d279c74560506_m);
+05: cy_total += n
__pyx_v_46_cython_magic_6c1c529595d0157b648d279c74560506_cy_total = (__pyx_v_46_cython_magic_6c1c529595d0157b648d279c74560506_cy_total + __pyx_v_46_cython_magic_6c1c529595d0157b648d279c74560506_n); }
+06: a, b = 0, 0
__pyx_t_2 = __pyx_int_0; __Pyx_INCREF(__pyx_t_2); __pyx_t_3 = __pyx_int_0; __Pyx_INCREF(__pyx_t_3); if (PyDict_SetItem(__pyx_d, __pyx_n_s_a, __pyx_t_2) < 0) __PYX_ERR(0, 6, __pyx_L1_error) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; if (PyDict_SetItem(__pyx_d, __pyx_n_s_b, __pyx_t_3) < 0) __PYX_ERR(0, 6, __pyx_L1_error) __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+07: py_total = 0.0
if (PyDict_SetItem(__pyx_d, __pyx_n_s_py_total, __pyx_float_0_0) < 0) __PYX_ERR(0, 7, __pyx_L1_error)
+08: for a in range(10):
for (__pyx_t_4 = 0; __pyx_t_4 < 10; __pyx_t_4+=1) { __pyx_t_3 = __Pyx_PyInt_From_long(__pyx_t_4); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 8, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); if (PyDict_SetItem(__pyx_d, __pyx_n_s_a, __pyx_t_3) < 0) __PYX_ERR(0, 8, __pyx_L1_error) __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+09: b = 2*a
__Pyx_GetModuleGlobalName(__pyx_t_3, __pyx_n_s_a); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 9, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __pyx_t_2 = PyNumber_Multiply(__pyx_int_2, __pyx_t_3); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 9, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; if (PyDict_SetItem(__pyx_d, __pyx_n_s_b, __pyx_t_2) < 0) __PYX_ERR(0, 9, __pyx_L1_error) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+10: py_total += b
__Pyx_GetModuleGlobalName(__pyx_t_2, __pyx_n_s_py_total); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_GetModuleGlobalName(__pyx_t_3, __pyx_n_s_b); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __pyx_t_5 = PyNumber_InPlaceAdd(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; if (PyDict_SetItem(__pyx_d, __pyx_n_s_py_total, __pyx_t_5) < 0) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; }
+11: print(cy_total, py_total)
__pyx_t_5 = PyFloat_FromDouble(__pyx_v_46_cython_magic_6c1c529595d0157b648d279c74560506_cy_total); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 11, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); __Pyx_GetModuleGlobalName(__pyx_t_3, __pyx_n_s_py_total); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 11, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __pyx_t_2 = PyTuple_New(2); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 11, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_GIVEREF(__pyx_t_5); PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_5); __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_2, 1, __pyx_t_3); __pyx_t_5 = 0; __pyx_t_3 = 0; __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_print, __pyx_t_2, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 11, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
%%cython -a
cdef struct Grail:
int age
float volume
cdef union Food:
char *spam
float *eggs
cdef enum CheeseType:
cheddar, edam,
camembert
cdef enum CheeseState:
hard = 1
soft = 2
runny = 3
cdef Grail holy
holy.age = 500
holy.volume = 10.0
print (holy.age, holy.volume)
500 10.0
Generated by Cython 0.29.32
Yellow lines hint at Python interaction.
Click on a line that starts with a "+
" to see the C code that Cython generated for it.
+01: cdef struct Grail:
struct __pyx_t_46_cython_magic_d8a0c5a491b5046e35e0044f8c9c4e89_Grail { int age; float volume; };
02: int age
03: float volume
04: cdef union Food:
05: char *spam
06: float *eggs
+07: cdef enum CheeseType:
enum __pyx_t_46_cython_magic_d8a0c5a491b5046e35e0044f8c9c4e89_CheeseType { __pyx_e_46_cython_magic_d8a0c5a491b5046e35e0044f8c9c4e89_cheddar, __pyx_e_46_cython_magic_d8a0c5a491b5046e35e0044f8c9c4e89_edam, __pyx_e_46_cython_magic_d8a0c5a491b5046e35e0044f8c9c4e89_camembert };
08: cheddar, edam,
09: camembert
+10: cdef enum CheeseState:
enum __pyx_t_46_cython_magic_d8a0c5a491b5046e35e0044f8c9c4e89_CheeseState { __pyx_e_46_cython_magic_d8a0c5a491b5046e35e0044f8c9c4e89_hard = 1, __pyx_e_46_cython_magic_d8a0c5a491b5046e35e0044f8c9c4e89_soft = 2, __pyx_e_46_cython_magic_d8a0c5a491b5046e35e0044f8c9c4e89_runny = 3 };
11: hard = 1
12: soft = 2
13: runny = 3
14: cdef Grail holy
+15: holy.age = 500
__pyx_v_46_cython_magic_d8a0c5a491b5046e35e0044f8c9c4e89_holy.age = 0x1F4;
+16: holy.volume = 10.0
__pyx_v_46_cython_magic_d8a0c5a491b5046e35e0044f8c9c4e89_holy.volume = 10.0;
+17: print (holy.age, holy.volume)
__pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_46_cython_magic_d8a0c5a491b5046e35e0044f8c9c4e89_holy.age); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 17, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_t_2 = PyFloat_FromDouble(__pyx_v_46_cython_magic_d8a0c5a491b5046e35e0044f8c9c4e89_holy.volume); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 17, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __pyx_t_3 = PyTuple_New(2); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 17, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_GIVEREF(__pyx_t_1); PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_1); __Pyx_GIVEREF(__pyx_t_2); PyTuple_SET_ITEM(__pyx_t_3, 1, __pyx_t_2); __pyx_t_1 = 0; __pyx_t_2 = 0; __pyx_t_2 = __Pyx_PyObject_Call(__pyx_builtin_print, __pyx_t_3, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 17, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
Cython Functions#
Use cdef to define a Cython function.
Cython function can accept either (inclusive) Python and C values as well as return either Python or C values,
Within a Cython module Python and Cython functions can call each other freely. However, only Python functions can be called from outside the module by Python code. (i.e. importing/exporting a Cython module into some Python code)
cpdef define a Cython function with a simple Python wrapper. However, when called from Cython the Cython / C code is called directly, bypassing the Python wrapper.
Writing pure code in Cython gives a small speed boost. Note that none of the code below is Cython-specific. Just add .pyx
instead of .py
extension.
%%file cython_f_example.pyx
def f(x):
return -4*x**3 +3*x**2 +2*x
def integrate_f(a, b, N):
s = 0
dx = (b - a) / (N-1)
for i in range(N-1):
x = a + i*dx
s += (f(x)+f(x+dx))/2
return s*dx
Writing cython_f_example.pyx
Cython Compilation#
The .pyx source file is compiled by Cython to a .c file.
The .c source file contains the code of a Python extension module.
The .c file is compiled by a C compiler to a .so (shared object library) file which can be imported directly into a Python session.
Build with CMake#
project(cython_f_example CXX)
include(UseCython) # Load Cython functions
# Set C++ output
set_source_file_properties(cython_f_example.pyx PROPERTIES CYTHON_IS_CXX TRUE )
# Build the extension module
cython_add_module( modname cython_f_example.pyx cython_f_example.cpp )
C/C++ generation with cython application#
cython -3 cython_f_example.pyx # create the C file for Python 3
cython -3 --cplus cython_f_example.pyx # create the C++ file for Python 3
build with a C/C++ compiler#
To build use the Makefile:
CC=gcc
CFLAGS=`python-config --cflags`
LDFLAGS=`python-config --ldflags`
cython_f_example:
${CC} -c $@.c ${CFLAGS}
${CC} $@.o -o $@.so -shared ${LDFLAGS}
Import the module in Python session
import cython_f_example
pyximport#
import Cython .pyx files as if they were .py files:
import pyximport
pyximport.install()
import cython_f_example
%timeit cython_f_example.integrate_f(-1,1,10**3)
print(cython_f_example.integrate_f(-1,1,1000))
553 µs ± 67.6 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
2.0000040080120174
Building a Cython module using distutils#
Create the setup.py script:
%%file setup.py
from distutils.core import setup
from Cython.Build import cythonize
setup(
name = 'Cython Example Integrate f Function',
ext_modules = cythonize("cython_f_example.pyx"),
)
Writing setup.py
%run setup.py build_ext --inplace --quiet
Compiling cython_f_example.pyx because it changed.
[1/1] Cythonizing cython_f_example.pyx
running build_ext
building 'cython_f_example' extension
INFO: C compiler: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC
creating build
creating build/temp.linux-x86_64-cpython-310
INFO: compile options: '-I/usr/share/miniconda3/envs/runenv/include/python3.10 -c'
INFO: x86_64-conda-linux-gnu-cc: cython_f_example.c
creating build/lib.linux-x86_64-cpython-310
INFO: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -shared -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include build/temp.linux-x86_64-cpython-310/cython_f_example.o -o build/lib.linux-x86_64-cpython-310/cython_f_example.cpython-310-x86_64-linux-gnu.so
copying build/lib.linux-x86_64-cpython-310/cython_f_example.cpython-310-x86_64-linux-gnu.so ->
<Figure size 1000x600 with 0 Axes>
from cython_f_example import integrate_f
%timeit integrate_f(-1,1,10**3)
integrate_f(-1,1,10**3)
552 µs ± 202 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
2.0000040080120174
Why is it faster with Cython ?#
Python code is interpreted at every execution to machine code.
Compiled C code is already in machine code.
C is a statically-typed language. It gives to the compiler more information which allows it to optimize both computations and memory access.
To add two variables, Python checks the type before calling the right add function and store it to a value that can be new.
C just add the variables and return the result.
Add Cython types#
We coerce Python types to C types when calling the function. Still a “Python function” so callable from the global namespace.
%%cython
def f(x):
return -4*x**3 +3*x**2 +2*x
def cy_integrate_f(double a, double b, int N):
cdef int i
cdef double s, x, dx
s = 0
dx = (b - a) / (N-1)
for i in range(N-1):
x = a + i*dx
s += (f(x)+f(x+dx))/2
return s*dx
building '_cython_magic_e2206c17cfc7df224a573bc547e7c635' extension
INFO: C compiler: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC
INFO: compile options: '-I/usr/share/miniconda3/envs/runenv/include/python3.10 -c'
INFO: x86_64-conda-linux-gnu-cc: /home/runner/.cache/ipython/cython/_cython_magic_e2206c17cfc7df224a573bc547e7c635.c
INFO: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -shared -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include /home/runner/.cache/ipython/cython/home/runner/.cache/ipython/cython/_cython_magic_e2206c17cfc7df224a573bc547e7c635.o -o /home/runner/.cache/ipython/cython/_cython_magic_e2206c17cfc7df224a573bc547e7c635.cpython-310-x86_64-linux-gnu.so
typing the iterator variable i with C semantics, tells Cython to compile the for-loop to pure C code.
typing a, s and dx is important as they are involved in arithmetic within the for-loop
Cython type declarations can make the source code less readable
Do not use them without good reason, i.e. only in performance critical sections.
%timeit cy_integrate_f(-1,1,10**3)
print(cy_integrate_f(-1,1,1000))
483 µs ± 1.19 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
2.0000040080120174
Finally, we integrate a Cython function instead of a Python function. This eliminates the Python-C conversion at the function call as seen above thus giving a pure Cython/C algorithm.
The primary downside is not being allowed to call
the function cy_f
, from Python unless cpdef
is used.
%%cython
cdef double cy_f(double x):
return -4*x**3 +3*x**2 +2*x
def cycy_integrate_f(double a, double b, int N):
cdef int i
cdef double s, x, dx
s = 0
dx = (b - a) / (N-1)
for i in range(N-1):
x = a + i*dx
s += (cy_f(x)+cy_f(x+dx))/2
return s*dx
building '_cython_magic_771b85cb3308c54a0a536b0ad92f7de6' extension
INFO: C compiler: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC
INFO: compile options: '-I/usr/share/miniconda3/envs/runenv/include/python3.10 -c'
INFO: x86_64-conda-linux-gnu-cc: /home/runner/.cache/ipython/cython/_cython_magic_771b85cb3308c54a0a536b0ad92f7de6.c
INFO: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -shared -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include /home/runner/.cache/ipython/cython/home/runner/.cache/ipython/cython/_cython_magic_771b85cb3308c54a0a536b0ad92f7de6.o -o /home/runner/.cache/ipython/cython/_cython_magic_771b85cb3308c54a0a536b0ad92f7de6.cpython-310-x86_64-linux-gnu.so
%timeit cycy_integrate_f(-1,1,10**3)
print(cycy_integrate_f(-1,1,1000))
50.2 µs ± 10.7 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
2.0000040080120174
Exercise : Cythonize the trivial exponential function.#
%%cython -a
def exp_python(x,terms=50):
sum = 0.
power = 1.
fact = 1.
for i in range(terms):
sum += power/fact
power *= x
fact *= i+1
return sum
building '_cython_magic_ad9ee808f280caa4b8920e8bbc858924' extension
INFO: C compiler: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC
INFO: compile options: '-I/usr/share/miniconda3/envs/runenv/include/python3.10 -c'
INFO: x86_64-conda-linux-gnu-cc: /home/runner/.cache/ipython/cython/_cython_magic_ad9ee808f280caa4b8920e8bbc858924.c
INFO: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -shared -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include /home/runner/.cache/ipython/cython/home/runner/.cache/ipython/cython/_cython_magic_ad9ee808f280caa4b8920e8bbc858924.o -o /home/runner/.cache/ipython/cython/_cython_magic_ad9ee808f280caa4b8920e8bbc858924.cpython-310-x86_64-linux-gnu.so
Generated by Cython 0.29.32
Yellow lines hint at Python interaction.
Click on a line that starts with a "+
" to see the C code that Cython generated for it.
+1: def exp_python(x,terms=50):
/* Python wrapper */ static PyObject *__pyx_pw_46_cython_magic_ad9ee808f280caa4b8920e8bbc858924_1exp_python(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ static PyMethodDef __pyx_mdef_46_cython_magic_ad9ee808f280caa4b8920e8bbc858924_1exp_python = {"exp_python", (PyCFunction)(void*)(PyCFunctionWithKeywords)__pyx_pw_46_cython_magic_ad9ee808f280caa4b8920e8bbc858924_1exp_python, METH_VARARGS|METH_KEYWORDS, 0}; static PyObject *__pyx_pw_46_cython_magic_ad9ee808f280caa4b8920e8bbc858924_1exp_python(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { PyObject *__pyx_v_x = 0; PyObject *__pyx_v_terms = 0; PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("exp_python (wrapper)", 0); { static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_x,&__pyx_n_s_terms,0}; PyObject* values[2] = {0,0}; values[1] = ((PyObject *)__pyx_int_50); if (unlikely(__pyx_kwds)) { Py_ssize_t kw_args; const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); switch (pos_args) { case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); CYTHON_FALLTHROUGH; case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); CYTHON_FALLTHROUGH; case 0: break; default: goto __pyx_L5_argtuple_error; } kw_args = PyDict_Size(__pyx_kwds); switch (pos_args) { case 0: if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_x)) != 0)) kw_args--; else goto __pyx_L5_argtuple_error; CYTHON_FALLTHROUGH; case 1: if (kw_args > 0) { PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_terms); if (value) { values[1] = value; kw_args--; } } } if (unlikely(kw_args > 0)) { if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "exp_python") < 0)) __PYX_ERR(0, 1, __pyx_L3_error) } } else { switch (PyTuple_GET_SIZE(__pyx_args)) { case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); CYTHON_FALLTHROUGH; case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); break; default: goto __pyx_L5_argtuple_error; } } __pyx_v_x = values[0]; __pyx_v_terms = values[1]; } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; __Pyx_RaiseArgtupleInvalid("exp_python", 0, 1, 2, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 1, __pyx_L3_error) __pyx_L3_error:; __Pyx_AddTraceback("_cython_magic_ad9ee808f280caa4b8920e8bbc858924.exp_python", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return NULL; __pyx_L4_argument_unpacking_done:; __pyx_r = __pyx_pf_46_cython_magic_ad9ee808f280caa4b8920e8bbc858924_exp_python(__pyx_self, __pyx_v_x, __pyx_v_terms); int __pyx_lineno = 0; const char *__pyx_filename = NULL; int __pyx_clineno = 0; /* function exit code */ __Pyx_RefNannyFinishContext(); return __pyx_r; } static PyObject *__pyx_pf_46_cython_magic_ad9ee808f280caa4b8920e8bbc858924_exp_python(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_x, PyObject *__pyx_v_terms) { PyObject *__pyx_v_sum = NULL; PyObject *__pyx_v_power = NULL; PyObject *__pyx_v_fact = NULL; PyObject *__pyx_v_i = NULL; PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("exp_python", 0); /* … */ /* function exit code */ __pyx_L1_error:; __Pyx_XDECREF(__pyx_t_1); __Pyx_XDECREF(__pyx_t_2); __Pyx_XDECREF(__pyx_t_5); __Pyx_AddTraceback("_cython_magic_ad9ee808f280caa4b8920e8bbc858924.exp_python", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = NULL; __pyx_L0:; __Pyx_XDECREF(__pyx_v_sum); __Pyx_XDECREF(__pyx_v_power); __Pyx_XDECREF(__pyx_v_fact); __Pyx_XDECREF(__pyx_v_i); __Pyx_XGIVEREF(__pyx_r); __Pyx_RefNannyFinishContext(); return __pyx_r; } /* … */ __pyx_tuple_ = PyTuple_Pack(6, __pyx_n_s_x, __pyx_n_s_terms, __pyx_n_s_sum, __pyx_n_s_power, __pyx_n_s_fact, __pyx_n_s_i); if (unlikely(!__pyx_tuple_)) __PYX_ERR(0, 1, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple_); __Pyx_GIVEREF(__pyx_tuple_); /* … */ __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_46_cython_magic_ad9ee808f280caa4b8920e8bbc858924_1exp_python, NULL, __pyx_n_s_cython_magic_ad9ee808f280caa4b8); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 1, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (PyDict_SetItem(__pyx_d, __pyx_n_s_exp_python, __pyx_t_1) < 0) __PYX_ERR(0, 1, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+2: sum = 0.
__Pyx_INCREF(__pyx_float_0_);
__pyx_v_sum = __pyx_float_0_;
+3: power = 1.
__Pyx_INCREF(__pyx_float_1_);
__pyx_v_power = __pyx_float_1_;
+4: fact = 1.
__Pyx_INCREF(__pyx_float_1_);
__pyx_v_fact = __pyx_float_1_;
+5: for i in range(terms):
__pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_builtin_range, __pyx_v_terms); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 5, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (likely(PyList_CheckExact(__pyx_t_1)) || PyTuple_CheckExact(__pyx_t_1)) { __pyx_t_2 = __pyx_t_1; __Pyx_INCREF(__pyx_t_2); __pyx_t_3 = 0; __pyx_t_4 = NULL; } else { __pyx_t_3 = -1; __pyx_t_2 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 5, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __pyx_t_4 = Py_TYPE(__pyx_t_2)->tp_iternext; if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 5, __pyx_L1_error) } __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; for (;;) { if (likely(!__pyx_t_4)) { if (likely(PyList_CheckExact(__pyx_t_2))) { if (__pyx_t_3 >= PyList_GET_SIZE(__pyx_t_2)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS __pyx_t_1 = PyList_GET_ITEM(__pyx_t_2, __pyx_t_3); __Pyx_INCREF(__pyx_t_1); __pyx_t_3++; if (unlikely(0 < 0)) __PYX_ERR(0, 5, __pyx_L1_error) #else __pyx_t_1 = PySequence_ITEM(__pyx_t_2, __pyx_t_3); __pyx_t_3++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 5, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); #endif } else { if (__pyx_t_3 >= PyTuple_GET_SIZE(__pyx_t_2)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_2, __pyx_t_3); __Pyx_INCREF(__pyx_t_1); __pyx_t_3++; if (unlikely(0 < 0)) __PYX_ERR(0, 5, __pyx_L1_error) #else __pyx_t_1 = PySequence_ITEM(__pyx_t_2, __pyx_t_3); __pyx_t_3++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 5, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); #endif } } else { __pyx_t_1 = __pyx_t_4(__pyx_t_2); if (unlikely(!__pyx_t_1)) { PyObject* exc_type = PyErr_Occurred(); if (exc_type) { if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); else __PYX_ERR(0, 5, __pyx_L1_error) } break; } __Pyx_GOTREF(__pyx_t_1); } __Pyx_XDECREF_SET(__pyx_v_i, __pyx_t_1); __pyx_t_1 = 0; /* … */ } __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+6: sum += power/fact
__pyx_t_1 = __Pyx_PyNumber_Divide(__pyx_v_power, __pyx_v_fact); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 6, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_t_5 = PyNumber_InPlaceAdd(__pyx_v_sum, __pyx_t_1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 6, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF_SET(__pyx_v_sum, __pyx_t_5); __pyx_t_5 = 0;
+7: power *= x
__pyx_t_5 = PyNumber_InPlaceMultiply(__pyx_v_power, __pyx_v_x); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 7, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF_SET(__pyx_v_power, __pyx_t_5); __pyx_t_5 = 0;
+8: fact *= i+1
__pyx_t_5 = __Pyx_PyInt_AddObjC(__pyx_v_i, __pyx_int_1, 1, 0, 0); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 8, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); __pyx_t_1 = PyNumber_InPlaceMultiply(__pyx_v_fact, __pyx_t_5); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 8, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; __Pyx_DECREF_SET(__pyx_v_fact, __pyx_t_1); __pyx_t_1 = 0;
+9: return sum
__Pyx_XDECREF(__pyx_r); __Pyx_INCREF(__pyx_v_sum); __pyx_r = __pyx_v_sum; goto __pyx_L0;
%timeit exp_python(1.,50)
3.65 µs ± 2.44 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
%%cython
# %load solutions/cython/exponential.pyx
#cython: profile=False
#cython: cdivision=True
def exp_cython(double x, int terms = 50):
cdef double sum
cdef double power
cdef double fact
cdef int i
sum = 0.
power = 1.
fact = 1.
for i in range(terms):
sum += power/fact
power *= x
fact *= i+1
return sum
building '_cython_magic_68081c7e8272dfed20aa464238eed323' extension
INFO: C compiler: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC
INFO: compile options: '-I/usr/share/miniconda3/envs/runenv/include/python3.10 -c'
INFO: x86_64-conda-linux-gnu-cc: /home/runner/.cache/ipython/cython/_cython_magic_68081c7e8272dfed20aa464238eed323.c
INFO: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -shared -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include /home/runner/.cache/ipython/cython/home/runner/.cache/ipython/cython/_cython_magic_68081c7e8272dfed20aa464238eed323.o -o /home/runner/.cache/ipython/cython/_cython_magic_68081c7e8272dfed20aa464238eed323.cpython-310-x86_64-linux-gnu.so
%timeit exp_cython(1.,50)
136 ns ± 0.0385 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
Cython and Numpy#
The Numpy library contains many fast numerics routines. Their speed comes from manipulating the low-level C-arrays that the numpy.array object wraps rather than computing over slow Python lists. Using Cython one can access those low-level arrays and implement their own fast algorithms while allowing the easy interaction afforded by Python + Numpy.
The examples below are various implementations of the naive matrix multiplication algorithm. We will start with a pure Python implementation and then incrementally add structures that allow Cython to exploit the low-level speed of the numpy.array object.
Pure Python implementation compiled in Cython without specific optimizations.#
%%cython
def matmul1(A, B, out=None):
assert A.shape[1] == B.shape[0]
for i in range(A.shape[0]):
for j in range(B.shape[1]):
s = 0
for k in range(A.shape[1]):
s += A[i,k] * B[k,j]
out[i,j] = s
return out
building '_cython_magic_dd429e0ab02e8e1848cbe425d676f5ab' extension
INFO: C compiler: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC
INFO: compile options: '-I/usr/share/miniconda3/envs/runenv/include/python3.10 -c'
INFO: x86_64-conda-linux-gnu-cc: /home/runner/.cache/ipython/cython/_cython_magic_dd429e0ab02e8e1848cbe425d676f5ab.c
INFO: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -shared -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include /home/runner/.cache/ipython/cython/home/runner/.cache/ipython/cython/_cython_magic_dd429e0ab02e8e1848cbe425d676f5ab.o -o /home/runner/.cache/ipython/cython/_cython_magic_dd429e0ab02e8e1848cbe425d676f5ab.cpython-310-x86_64-linux-gnu.so
Import numpy as a Cython module#
We now take advantage of the ability to access the underlying C arrays in the numpy.array
object from Cython, thanks to a special numpy.pxd
file included with Cython. (The Cython developers worked closely with Numpy developers to make this optimal.)
To begin with, we have to cimport
numpy: that is, import numpy as a Cython module rather than a Python module. To do so, simply type:
cimport numpy as np
Another important thing to note is the type of Numpy indexers. There is a special Numpy variable type used for numpy.array
indices called Py_ssize_t
. To take full advantage of the speedups that Cython can provide we should make sure to type the variables used for indexing as such.
%%cython
import numpy as np
cimport numpy as np
ctypedef np.float64_t dtype_t # shorthand type. easy to change
def matmul2(np.ndarray[dtype_t, ndim=2] A,
np.ndarray[dtype_t, ndim=2] B,
np.ndarray[dtype_t, ndim=2] out=None):
cdef Py_ssize_t i, j, k
cdef dtype_t s
assert A.shape[1] == B.shape[0]
for i in range(A.shape[0]):
for j in range(B.shape[1]):
s = 0
for k in range(A.shape[1]):
s += A[i,k] * B[k,j]
out[i,j] = s
return out
building '_cython_magic_4e45a0b7bfe01179c777581a00217466' extension
INFO: C compiler: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC
INFO: compile options: '-I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/include/python3.10 -c'
INFO: x86_64-conda-linux-gnu-cc: /home/runner/.cache/ipython/cython/_cython_magic_4e45a0b7bfe01179c777581a00217466.c
In file included from /usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/ndarraytypes.h:1948,
from /usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/ndarrayobject.h:12,
from /usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/arrayobject.h:5,
from /home/runner/.cache/ipython/cython/_cython_magic_4e45a0b7bfe01179c777581a00217466.c:769:
/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h:17:2: warning: #warning "Using deprecated NumPy API, disable it with " "#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION" [-Wcpp]
17 | #warning "Using deprecated NumPy API, disable it with " \
| ^~~~~~~
INFO: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -shared -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include /home/runner/.cache/ipython/cython/home/runner/.cache/ipython/cython/_cython_magic_4e45a0b7bfe01179c777581a00217466.o -o /home/runner/.cache/ipython/cython/_cython_magic_4e45a0b7bfe01179c777581a00217466.cpython-310-x86_64-linux-gnu.so
import numpy as np
from timeit import timeit
A = np.random.random_sample((64,64))
B = np.random.random_sample((64,64))
C = np.zeros((64,64))
%timeit matmul1(A,B,C)
70.5 ms ± 43.5 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit matmul2(A,B,C)
359 µs ± 55.1 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
Tuning indexing#
The array lookups are still slowed down by two factors:
Bounds checking is performed.
Negative indices are checked for and handled correctly.
The code doesn’t use negative indices, and always access to arrays within bounds. We can add a decorator to disable bounds checking:
%%cython
cimport cython # cython tools
import numpy as np
cimport numpy as np
ctypedef np.float64_t dtype_t
@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False) # turn off negative index wrapping for entire function
def matmul3(np.ndarray[dtype_t, ndim=2] A,
np.ndarray[dtype_t, ndim=2] B,
np.ndarray[dtype_t, ndim=2] out=None):
cdef Py_ssize_t i, j, k
cdef dtype_t s
assert A.shape[1] == B.shape[0]
for i in range(A.shape[0]):
for j in range(B.shape[1]):
s = 0
for k in range(A.shape[1]):
s += A[i,k] * B[k,j]
out[i,j] = s
return out
building '_cython_magic_e56092fb4727525b8c555f699543ff80' extension
INFO: C compiler: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC
INFO: compile options: '-I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/include/python3.10 -c'
INFO: x86_64-conda-linux-gnu-cc: /home/runner/.cache/ipython/cython/_cython_magic_e56092fb4727525b8c555f699543ff80.c
In file included from /usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/ndarraytypes.h:1948,
from /usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/ndarrayobject.h:12,
from /usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/arrayobject.h:5,
from /home/runner/.cache/ipython/cython/_cython_magic_e56092fb4727525b8c555f699543ff80.c:770:
/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h:17:2: warning: #warning "Using deprecated NumPy API, disable it with " "#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION" [-Wcpp]
17 | #warning "Using deprecated NumPy API, disable it with " \
| ^~~~~~~
INFO: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -shared -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include /home/runner/.cache/ipython/cython/home/runner/.cache/ipython/cython/_cython_magic_e56092fb4727525b8c555f699543ff80.o -o /home/runner/.cache/ipython/cython/_cython_magic_e56092fb4727525b8c555f699543ff80.cpython-310-x86_64-linux-gnu.so
%timeit matmul3(A,B,C)
252 µs ± 73.6 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
Cython Build Options#
boundcheck(True,False) : array bounds checking
wraparound(True,False) : negative indexing.
initializedcheck(True,False): checks that a memoryview is initialized
nonecheck(True,False) : Check if one argument is None
overflowcheck(True,False) : Check if int are too big
cdivision(True,False) : If False, adjust the remainder and quotient operators C types to match those of Python ints. Could be very effective when it is set to True.
profile (True / False) : Write hooks for Python profilers into the compiled C code. Default is False.
Numpy objects with external C program.#
Note that this can actually be slower because the C function is not the best implementation of matrix multiplication. Call cblas with same technique is an interesting exercise.
%%file mydgemm.c
void my_dgemm( int m, int n, int k,
double a[m][n], double b[n][k], float c[m][k] )
{
double ab = 0;
for( int j = 0 ; j < m ; j++ ) {
for( int i = 0 ; i < k ; i++ ) {
for( int l = 0 ; l < n ; l++ ){
ab += a[j][l] * b[l][i];
}
c[j][i] = ab ;
ab = 0;
}
}
}
Writing mydgemm.c
The
np.ndarray[double, ndim=2, mode="c"]
assures that you get a C-contiguous numpy array of doublesThe
&input[0,0]
passed in the address of the beginning of the data array.
from pyximport import install
import os
here = os.getcwd()
%%cython -I {here}
# do not forget to change the file path
cdef extern from "mydgemm.c":
void my_dgemm (int m, int n, int k,
double *A, double *B, double *C)
cimport cython
import numpy as np
cimport numpy as np
ctypedef np.float64_t dtype_t
@cython.boundscheck(False)
@cython.wraparound(False)
def matmul4(np.ndarray[dtype_t, ndim=2, mode="c"] A,
np.ndarray[dtype_t, ndim=2, mode="c"] B,
np.ndarray[dtype_t, ndim=2, mode="c"] C=None):
cdef int m = A.shape[0]
cdef int n = A.shape[1]
cdef int k = B.shape[1]
cdef dtype_t s
my_dgemm(m, n, k, &A[0,0], &B[0,0], &C[0,0])
return C
building '_cython_magic_4341c8542397f443eab54ed5120d50e2' extension
INFO: C compiler: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC
INFO: compile options: '-I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/home/runner/work/python-notebooks/python-notebooks/notebooks -I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/include/python3.10 -c'
INFO: x86_64-conda-linux-gnu-cc: /home/runner/.cache/ipython/cython/_cython_magic_4341c8542397f443eab54ed5120d50e2.c
In file included from /usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/ndarraytypes.h:1948,
from /usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/ndarrayobject.h:12,
from /usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/arrayobject.h:5,
from /home/runner/.cache/ipython/cython/_cython_magic_4341c8542397f443eab54ed5120d50e2.c:774:
/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h:17:2: warning: #warning "Using deprecated NumPy API, disable it with " "#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION" [-Wcpp]
17 | #warning "Using deprecated NumPy API, disable it with " \
| ^~~~~~~
/home/runner/.cache/ipython/cython/_cython_magic_4341c8542397f443eab54ed5120d50e2.c: In function '__pyx_pf_46_cython_magic_4341c8542397f443eab54ed5120d50e2_matmul4':
/home/runner/.cache/ipython/cython/_cython_magic_4341c8542397f443eab54ed5120d50e2.c:2023:46: warning: passing argument 4 of 'my_dgemm' from incompatible pointer type [-Wincompatible-pointer-types]
2023 | my_dgemm(__pyx_v_m, __pyx_v_n, __pyx_v_k, (&(*__Pyx_BufPtrCContig2d(__pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t *, __pyx_pybuffernd_A.rcbuffer->pybuffer.buf, __pyx_t_1, __pyx_pybuffernd_A.diminfo[0].strides, __pyx_t_2, __pyx_pybuffernd_A.diminfo[1].strides))), (&(*__Pyx_BufPtrCContig2d(__pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t *, __pyx_pybuffernd_B.rcbuffer->pybuffer.buf, __pyx_t_3, __pyx_pybuffernd_B.diminfo[0].strides, __pyx_t_4, __pyx_pybuffernd_B.diminfo[1].strides))), (&(*__Pyx_BufPtrCContig2d(__pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t *, __pyx_pybuffernd_C.rcbuffer->pybuffer.buf, __pyx_t_5, __pyx_pybuffernd_C.diminfo[0].strides, __pyx_t_6, __pyx_pybuffernd_C.diminfo[1].strides))));
| ~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
| |
| __pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t * {aka double *}
In file included from /home/runner/.cache/ipython/cython/_cython_magic_4341c8542397f443eab54ed5120d50e2.c:771:
/home/runner/work/python-notebooks/python-notebooks/notebooks/mydgemm.c:2:22: note: expected 'double (*)[(sizetype)(n)]' but argument is of type '__pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t *' {aka 'double *'}
2 | double a[m][n], double b[n][k], float c[m][k] )
| ~~~~~~~^~~~~~~
/home/runner/.cache/ipython/cython/_cython_magic_4341c8542397f443eab54ed5120d50e2.c:2023:286: warning: passing argument 5 of 'my_dgemm' from incompatible pointer type [-Wincompatible-pointer-types]
2023 | my_dgemm(__pyx_v_m, __pyx_v_n, __pyx_v_k, (&(*__Pyx_BufPtrCContig2d(__pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t *, __pyx_pybuffernd_A.rcbuffer->pybuffer.buf, __pyx_t_1, __pyx_pybuffernd_A.diminfo[0].strides, __pyx_t_2, __pyx_pybuffernd_A.diminfo[1].strides))), (&(*__Pyx_BufPtrCContig2d(__pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t *, __pyx_pybuffernd_B.rcbuffer->pybuffer.buf, __pyx_t_3, __pyx_pybuffernd_B.diminfo[0].strides, __pyx_t_4, __pyx_pybuffernd_B.diminfo[1].strides))), (&(*__Pyx_BufPtrCContig2d(__pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t *, __pyx_pybuffernd_C.rcbuffer->pybuffer.buf, __pyx_t_5, __pyx_pybuffernd_C.diminfo[0].strides, __pyx_t_6, __pyx_pybuffernd_C.diminfo[1].strides))));
| ~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
| |
| __pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t * {aka double *}
In file included from /home/runner/.cache/ipython/cython/_cython_magic_4341c8542397f443eab54ed5120d50e2.c:771:
/home/runner/work/python-notebooks/python-notebooks/notebooks/mydgemm.c:2:38: note: expected 'double (*)[(sizetype)(k)]' but argument is of type '__pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t *' {aka 'double *'}
2 | double a[m][n], double b[n][k], float c[m][k] )
| ~~~~~~~^~~~~~~
/home/runner/.cache/ipython/cython/_cython_magic_4341c8542397f443eab54ed5120d50e2.c:2023:526: warning: passing argument 6 of 'my_dgemm' from incompatible pointer type [-Wincompatible-pointer-types]
2023 | my_dgemm(__pyx_v_m, __pyx_v_n, __pyx_v_k, (&(*__Pyx_BufPtrCContig2d(__pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t *, __pyx_pybuffernd_A.rcbuffer->pybuffer.buf, __pyx_t_1, __pyx_pybuffernd_A.diminfo[0].strides, __pyx_t_2, __pyx_pybuffernd_A.diminfo[1].strides))), (&(*__Pyx_BufPtrCContig2d(__pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t *, __pyx_pybuffernd_B.rcbuffer->pybuffer.buf, __pyx_t_3, __pyx_pybuffernd_B.diminfo[0].strides, __pyx_t_4, __pyx_pybuffernd_B.diminfo[1].strides))), (&(*__Pyx_BufPtrCContig2d(__pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t *, __pyx_pybuffernd_C.rcbuffer->pybuffer.buf, __pyx_t_5, __pyx_pybuffernd_C.diminfo[0].strides, __pyx_t_6, __pyx_pybuffernd_C.diminfo[1].strides))));
| ~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
| |
| __pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t * {aka double *}
In file included from /home/runner/.cache/ipython/cython/_cython_magic_4341c8542397f443eab54ed5120d50e2.c:771:
/home/runner/work/python-notebooks/python-notebooks/notebooks/mydgemm.c:2:53: note: expected 'float (*)[(sizetype)(k)]' but argument is of type '__pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t *' {aka 'double *'}
2 | double a[m][n], double b[n][k], float c[m][k] )
| ~~~~~~^~~~~~~
INFO: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -shared -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include /home/runner/.cache/ipython/cython/home/runner/.cache/ipython/cython/_cython_magic_4341c8542397f443eab54ed5120d50e2.o -o /home/runner/.cache/ipython/cython/_cython_magic_4341c8542397f443eab54ed5120d50e2.cpython-310-x86_64-linux-gnu.so
%timeit matmul4(A,B,C)
239 µs ± 113 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
Exercise : Find prime numbers < 10000#
# %load solutions/cython/is_prime0.py
def is_prime0(n):
if n < 4: return True
if n % 2 == 0 : return False
k = 3
while k*k <= n:
if n % k == 0: return False
k += 2
return True
[ p for p in range(20) if is_prime0(p)]
[0, 1, 2, 3, 5, 7, 11, 13, 17, 19]
L = list(range(10000))
%timeit [ p for p in L if is_prime0(p)]
6.09 ms ± 971 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%cython
def is_prime1(n):
if n < 4: return True
if n % 2 == 0 : return False
k = 3
while k*k <= n:
if n % k == 0: return False
k += 2
return True
building '_cython_magic_aa9811354ee3c4d01a6231f3078a753d' extension
INFO: C compiler: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC
INFO: compile options: '-I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/include/python3.10 -c'
INFO: x86_64-conda-linux-gnu-cc: /home/runner/.cache/ipython/cython/_cython_magic_aa9811354ee3c4d01a6231f3078a753d.c
INFO: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -shared -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include /home/runner/.cache/ipython/cython/home/runner/.cache/ipython/cython/_cython_magic_aa9811354ee3c4d01a6231f3078a753d.o -o /home/runner/.cache/ipython/cython/_cython_magic_aa9811354ee3c4d01a6231f3078a753d.cpython-310-x86_64-linux-gnu.so
[ p for p in range(20) if is_prime1(p)]
[0, 1, 2, 3, 5, 7, 11, 13, 17, 19]
%timeit [p for p in L if is_prime1(p)]
3.11 ms ± 4.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Add Cython types without modifying the Python Code#
%%cython
import cython
@cython.locals(n=int, k=int)
def is_prime2(n):
if n < 4: return True
if n % 2 == 0 : return False
k = 3
while k*k <= n:
if n % k == 0: return False
k += 2
return True
building '_cython_magic_ffcb8d83f7d9187218aa269cf4a20513' extension
INFO: C compiler: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC
INFO: compile options: '-I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/include/python3.10 -c'
INFO: x86_64-conda-linux-gnu-cc: /home/runner/.cache/ipython/cython/_cython_magic_ffcb8d83f7d9187218aa269cf4a20513.c
INFO: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -shared -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include /home/runner/.cache/ipython/cython/home/runner/.cache/ipython/cython/_cython_magic_ffcb8d83f7d9187218aa269cf4a20513.o -o /home/runner/.cache/ipython/cython/_cython_magic_ffcb8d83f7d9187218aa269cf4a20513.cpython-310-x86_64-linux-gnu.so
[ p for p in range(20) if is_prime2(p)]
[0, 1, 2, 3, 5, 7, 11, 13, 17, 19]
%timeit [p for p in L if is_prime2(p) ]
532 µs ± 114 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
Cython function#
%%cython
import cython
cdef bint is_prime3(int n):
if n < 4: return True
if n % 2 == 0: return False
cdef int k = 3
while k*k <= n:
if n % k == 0: return False
k += 2
return True
def prime_list(L):
return [p for p in L if is_prime3(p)]
building '_cython_magic_9d904024e8d0b344e2f9c97a0bea1faf' extension
INFO: C compiler: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC
INFO: compile options: '-I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/include/python3.10 -c'
INFO: x86_64-conda-linux-gnu-cc: /home/runner/.cache/ipython/cython/_cython_magic_9d904024e8d0b344e2f9c97a0bea1faf.c
INFO: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -shared -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include /home/runner/.cache/ipython/cython/home/runner/.cache/ipython/cython/_cython_magic_9d904024e8d0b344e2f9c97a0bea1faf.o -o /home/runner/.cache/ipython/cython/_cython_magic_9d904024e8d0b344e2f9c97a0bea1faf.cpython-310-x86_64-linux-gnu.so
prime_list(list(range(20)))
[0, 1, 2, 3, 5, 7, 11, 13, 17, 19]
%timeit prime_list(L)
289 µs ± 51.1 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
%%cython
import cython
from numpy cimport ndarray
import numpy
cdef bint is_prime3(int n):
if n < 4: return True
if n % 2 == 0: return False
cdef int k = 3
while k*k <= n:
if n % k == 0: return False
k += 2
return True
def prime_array(ndarray[int, ndim=1] L):
cdef ndarray[int, ndim=1] res = ndarray(shape=(L.shape[0]),dtype=numpy.int32)
cdef int i
for i in range(L.shape[0]):
res[i] = is_prime3(L[i])
return L[res==1]
building '_cython_magic_fe1737e5897872041914915a6c8ea617' extension
INFO: C compiler: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC
INFO: compile options: '-I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/include/python3.10 -c'
INFO: x86_64-conda-linux-gnu-cc: /home/runner/.cache/ipython/cython/_cython_magic_fe1737e5897872041914915a6c8ea617.c
In file included from /usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/ndarraytypes.h:1948,
from /usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/ndarrayobject.h:12,
from /usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/arrayobject.h:5,
from /home/runner/.cache/ipython/cython/_cython_magic_fe1737e5897872041914915a6c8ea617.c:771:
/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h:17:2: warning: #warning "Using deprecated NumPy API, disable it with " "#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION" [-Wcpp]
17 | #warning "Using deprecated NumPy API, disable it with " \
| ^~~~~~~
INFO: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -shared -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include /home/runner/.cache/ipython/cython/home/runner/.cache/ipython/cython/_cython_magic_fe1737e5897872041914915a6c8ea617.o -o /home/runner/.cache/ipython/cython/_cython_magic_fe1737e5897872041914915a6c8ea617.cpython-310-x86_64-linux-gnu.so
import numpy as np
prime_array(np.arange(20,dtype=np.int32))
array([ 0, 1, 2, 3, 5, 7, 11, 13, 17, 19], dtype=int32)
npL = numpy.array(L,dtype=np.int32)
%timeit prime_array(npL)
281 µs ± 284 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
Using Parallelism#
Cython supports native parallelism via OpenMP
by default, Python’s Global Interpreter Lock (GIL) prevents that several threads use the Python interpreter simultaneously
to use this kind of parallelism, the GIL must be released
If you have a default compiler with openmp support you can use this magic command in your notebook.
%%cython --compile-args=-fopenmp --link-args=-fopenmp
%%file cython_omp.pyx
import cython
from cython.parallel cimport parallel, prange # import parallel functions
import numpy as np
from numpy cimport ndarray
cdef bint is_prime4(int n) nogil: #release the gil
if n < 4: return True
if n % 2 == 0: return False
cdef int k = 3
while k*k <= n:
if n % k == 0: return False
k += 2
return True
@cython.boundscheck(False)
def prime_array_omp(ndarray[int, ndim=1] L):
cdef ndarray[int, ndim=1] res = ndarray(shape=(L.shape[0]),dtype=np.int32)
cdef Py_ssize_t i
with nogil, parallel(num_threads=4):
for i in prange(L.shape[0]): #Parallel loop
res[i] = is_prime4(L[i])
return L[res==1]
Writing cython_omp.pyx
To use the OpenMP support, you need to enable OpenMP. For gcc this can be done as follows in a setup.py:
%%file setup.py
from distutils.core import setup
from distutils.extension import Extension
from Cython.Build import cythonize
import os, sys
import numpy
if sys.platform == "darwin": # for omp, use gcc installed with brew
os.environ["CC"]="gcc-10"
os.environ["CXX"]="g++-10"
ext_modules = [
Extension(
"cython_omp",
["cython_omp.pyx"],
extra_compile_args=['-fopenmp'],
extra_link_args=['-fopenmp'],
include_dirs=[numpy.get_include()]
)
]
setup(
name='Cython OpenMP Example',
ext_modules=cythonize(ext_modules),
)
# python setup.py build_ext --inplace
Overwriting setup.py
%run setup.py build_ext --inplace --quiet
Compiling cython_omp.pyx because it changed.
[1/1] Cythonizing cython_omp.pyx
running build_ext
building 'cython_omp' extension
INFO: C compiler: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC
INFO: compile options: '-I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/include/python3.10 -c'
extra options: '-fopenmp'
INFO: x86_64-conda-linux-gnu-cc: cython_omp.c
INFO: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -shared -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include build/temp.linux-x86_64-cpython-310/cython_omp.o -o build/lib.linux-x86_64-cpython-310/cython_omp.cpython-310-x86_64-linux-gnu.so -fopenmp
copying build/lib.linux-x86_64-cpython-310/cython_omp.cpython-310-x86_64-linux-gnu.so ->
from cython_omp import prime_array_omp
prime_array_omp(np.arange(20,dtype=np.int32))
array([ 0, 1, 2, 3, 5, 7, 11, 13, 17, 19], dtype=int32)
%timeit prime_array_omp(npL)
202 µs ± 6.19 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
References#
Kurt W. Smith