Cython#

%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (10,6)
%config InlineBackend.figure_format = 'retina'
import numpy as np
import warnings
warnings.filterwarnings("ignore")

Cython logo

  • Cython provides extra syntax allowing for static type declarations (remember: Python is generally dynamically typed)

  • Python code gets translated into optimised C/C++ code and compiled as Python extension modules

  • Cython allows you to write fast C code in a Python-like syntax.

  • Furthermore, linking to existing C libraries is simplified.

  • Pure Python Function

\(f(x)=-2x^3+5x^2+x\),

def f(x):
    return -4*x**3 +3*x**2 +2*x

x = np.linspace(-1,1,100)
ax = plt.subplot(1,1,1)
ax.plot(x, f(x))
ax.fill_between(x, 0, f(x));

we compute integral \(\int_a^b f(x)dx\) numerically with \(N\) points.

def integrate_f_py(a,b,N):
    s  = 0
    dx = (b - a) / (N-1)
    for i in range(N-1): # we intentionally use the bad way to do this with a loop
        x = a + i*dx
        s += (f(x)+f(x+dx))/2
    return s*dx
%timeit integrate_f_py(-1,1,10**3)
print(integrate_f_py(-1,1,1000))
472 μs ± 4.48 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
2.0000040080120174
%load_ext heat
%%heat
def f(x):
    return -4*x**3 +3*x**2 +2*x
def integrate_f(a, b, N):
    s  = 0
    dx = (b - a) / (N-1)
    for i in range(N-1):
        x = a + i*dx
        s += (f(x)+f(x+dx))/2
    return s*dx

integrate_f(0, 10, 1000)
  • Pure C function

%%file integral_f_c.c

#include <stdlib.h>
#include <stdio.h>
#include <time.h>

#define NB_RUNS 1000

double f(double x) {
    return -4*x*x*x +3*x*x +2*x;
}

double integrate_f_c(double a, double b, int N) {
    double s  = 0;
    double dx = (b - a) / (N-1);
    for(int i=0; i<N-1; ++i){
        double x = a + i*dx;
        s += (f(x)+f(x+dx))/2.0;
    }
    return s*dx;
}

int main(int argc, char **argv)
{
  
  double a =  atof(argv[1]);
  double b =  atof(argv[2]);
  int N    =  atoi(argv[3]);
  double  res = 0;

  clock_t begin = clock();

  for (int i=0; i<NB_RUNS; ++i)
      res += integrate_f_c( a, b, N );
    
  clock_t end = clock();
     
  fprintf( stdout, "integral_f(%3.1f, %3.1f, %d) = %f \n", a, b, N, res / NB_RUNS );
  fprintf( stdout, "time = %e ms \n",  (double)(end - begin) / CLOCKS_PER_SEC );

  return 0;
}
Writing integral_f_c.c
!gcc -O3 integral_f_c.c; ./a.out -1 1 1000
integral_f(-1.0, 1.0, 1000) = 2.000004 
time = 1.471000e-03 ms 

Cython compilation: Generating C code#

Load Cython in jupyter notebook.

%load_ext Cython

C Variable and Type definitions#

In general, use cdef to declare C variables. The command :

$ cython -a mycode.pyx

outputs an html file. It shows what parts of your code are C, which parts are Python, and where C-Python conversion occurs.

%%cython -a
cdef int i, j = 2, k = 3      # assigning values at declaration
i = 1                         # assigning values afterwards
# avoid Python-C conversion! It's expensive:
a = 5
i = a
# same with C-Python conversion:
b = j
print("a = %d" % a)
print("i = %d" % i)
print("b = %d" % b)
a = 5
i = 5
b = 2

Generated by Cython 3.1.2

Yellow lines hint at Python interaction.
Click on a line that starts with a "+" to see the C code that Cython generated for it.

+01: cdef int i, j = 2, k = 3      # assigning values at declaration
  __pyx_v_78_cython_magic_febcf32fa9fc027abbd59ad4f703aefe755660ddf88d0c33d4bf3bfeb1353434_j = 2;
  __pyx_v_78_cython_magic_febcf32fa9fc027abbd59ad4f703aefe755660ddf88d0c33d4bf3bfeb1353434_k = 3;
/* … */
  __pyx_t_2 = __Pyx_PyDict_NewPresized(0); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 1, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_2);
  if (PyDict_SetItem(__pyx_mstate_global->__pyx_d, __pyx_mstate_global->__pyx_n_u_test, __pyx_t_2) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+02: i = 1                         # assigning values afterwards
  __pyx_v_78_cython_magic_febcf32fa9fc027abbd59ad4f703aefe755660ddf88d0c33d4bf3bfeb1353434_i = 1;
 03: # avoid Python-C conversion! It's expensive:
+04: a = 5
  if (PyDict_SetItem(__pyx_mstate_global->__pyx_d, __pyx_mstate_global->__pyx_n_u_a, __pyx_mstate_global->__pyx_int_5) < 0) __PYX_ERR(0, 4, __pyx_L1_error)
+05: i = a
  __Pyx_GetModuleGlobalName(__pyx_t_2, __pyx_mstate_global->__pyx_n_u_a); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 5, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_2);
  __pyx_t_3 = __Pyx_PyLong_As_int(__pyx_t_2); if (unlikely((__pyx_t_3 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 5, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
  __pyx_v_78_cython_magic_febcf32fa9fc027abbd59ad4f703aefe755660ddf88d0c33d4bf3bfeb1353434_i = __pyx_t_3;
 06: # same with C-Python conversion:
+07: b = j
  __pyx_t_2 = __Pyx_PyLong_From_int(__pyx_v_78_cython_magic_febcf32fa9fc027abbd59ad4f703aefe755660ddf88d0c33d4bf3bfeb1353434_j); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 7, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_2);
  if (PyDict_SetItem(__pyx_mstate_global->__pyx_d, __pyx_mstate_global->__pyx_n_u_b, __pyx_t_2) < 0) __PYX_ERR(0, 7, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+08: print("a = %d" % a)
  __pyx_t_4 = NULL;
  __Pyx_INCREF(__pyx_builtin_print);
  __pyx_t_5 = __pyx_builtin_print; 
  __Pyx_GetModuleGlobalName(__pyx_t_6, __pyx_mstate_global->__pyx_n_u_a); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 8, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_6);
  __pyx_t_7 = __Pyx_PyUnicode_FormatSafe(__pyx_mstate_global->__pyx_kp_u_a_d, __pyx_t_6); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 8, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_7);
  __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
  __pyx_t_8 = 1;
  {
    PyObject *__pyx_callargs[2] = {__pyx_t_4, __pyx_t_7};
    __pyx_t_2 = __Pyx_PyObject_FastCall(__pyx_t_5, __pyx_callargs+__pyx_t_8, (2-__pyx_t_8) | (__pyx_t_8*__Pyx_PY_VECTORCALL_ARGUMENTS_OFFSET));
    __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
    __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
    __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
    if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 8, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_2);
  }
  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+09: print("i = %d" % i)
  __pyx_t_5 = NULL;
  __Pyx_INCREF(__pyx_builtin_print);
  __pyx_t_7 = __pyx_builtin_print; 
  __pyx_t_4 = __Pyx_PyLong_From_int(__pyx_v_78_cython_magic_febcf32fa9fc027abbd59ad4f703aefe755660ddf88d0c33d4bf3bfeb1353434_i); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 9, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_4);
  __pyx_t_6 = PyUnicode_Format(__pyx_mstate_global->__pyx_kp_u_i_d, __pyx_t_4); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 9, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_6);
  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
  __pyx_t_8 = 1;
  {
    PyObject *__pyx_callargs[2] = {__pyx_t_5, __pyx_t_6};
    __pyx_t_2 = __Pyx_PyObject_FastCall(__pyx_t_7, __pyx_callargs+__pyx_t_8, (2-__pyx_t_8) | (__pyx_t_8*__Pyx_PY_VECTORCALL_ARGUMENTS_OFFSET));
    __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0;
    __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
    __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
    if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 9, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_2);
  }
  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+10: print("b = %d" % b)
  __pyx_t_7 = NULL;
  __Pyx_INCREF(__pyx_builtin_print);
  __pyx_t_6 = __pyx_builtin_print; 
  __Pyx_GetModuleGlobalName(__pyx_t_5, __pyx_mstate_global->__pyx_n_u_b); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 10, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_5);
  __pyx_t_4 = __Pyx_PyUnicode_FormatSafe(__pyx_mstate_global->__pyx_kp_u_b_d, __pyx_t_5); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 10, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_4);
  __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
  __pyx_t_8 = 1;
  {
    PyObject *__pyx_callargs[2] = {__pyx_t_7, __pyx_t_4};
    __pyx_t_2 = __Pyx_PyObject_FastCall(__pyx_t_6, __pyx_callargs+__pyx_t_8, (2-__pyx_t_8) | (__pyx_t_8*__Pyx_PY_VECTORCALL_ARGUMENTS_OFFSET));
    __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0;
    __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
    __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
    if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 10, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_2);
  }
  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;

Another Python vs. Cython coloring guide#

%%cython -a
cdef int m, n
cdef double cy_total = 0.0
for m in range(10):
    n = 2*m
    cy_total += n
a, b = 0, 0
py_total = 0.0
for a in range(10):
    b = 2*a
    py_total += b
print(cy_total, py_total)
90.0 90.0

Generated by Cython 3.1.2

Yellow lines hint at Python interaction.
Click on a line that starts with a "+" to see the C code that Cython generated for it.

+01: cdef int m, n
  __pyx_t_6 = __Pyx_PyDict_NewPresized(0); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 1, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_6);
  if (PyDict_SetItem(__pyx_mstate_global->__pyx_d, __pyx_mstate_global->__pyx_n_u_test, __pyx_t_6) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
+02: cdef double cy_total = 0.0
  __pyx_v_78_cython_magic_917676bb1325e1fcbf49143c68bf82f745579d07f0e9870b6ae71e3794f68524_cy_total = 0.0;
+03: for m in range(10):
  for (__pyx_t_2 = 0; __pyx_t_2 < 10; __pyx_t_2+=1) {
    __pyx_v_78_cython_magic_917676bb1325e1fcbf49143c68bf82f745579d07f0e9870b6ae71e3794f68524_m = __pyx_t_2;
+04:     n = 2*m
    __pyx_v_78_cython_magic_917676bb1325e1fcbf49143c68bf82f745579d07f0e9870b6ae71e3794f68524_n = (2 * __pyx_v_78_cython_magic_917676bb1325e1fcbf49143c68bf82f745579d07f0e9870b6ae71e3794f68524_m);
+05:     cy_total += n
    __pyx_v_78_cython_magic_917676bb1325e1fcbf49143c68bf82f745579d07f0e9870b6ae71e3794f68524_cy_total = (__pyx_v_78_cython_magic_917676bb1325e1fcbf49143c68bf82f745579d07f0e9870b6ae71e3794f68524_cy_total + __pyx_v_78_cython_magic_917676bb1325e1fcbf49143c68bf82f745579d07f0e9870b6ae71e3794f68524_n);
  }
+06: a, b = 0, 0
  __pyx_t_3 = __pyx_mstate_global->__pyx_int_0;
  __Pyx_INCREF(__pyx_t_3);
  __pyx_t_4 = __pyx_mstate_global->__pyx_int_0;
  __Pyx_INCREF(__pyx_t_4);
  if (PyDict_SetItem(__pyx_mstate_global->__pyx_d, __pyx_mstate_global->__pyx_n_u_a, __pyx_t_3) < 0) __PYX_ERR(0, 6, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
  if (PyDict_SetItem(__pyx_mstate_global->__pyx_d, __pyx_mstate_global->__pyx_n_u_b, __pyx_t_4) < 0) __PYX_ERR(0, 6, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+07: py_total = 0.0
  if (PyDict_SetItem(__pyx_mstate_global->__pyx_d, __pyx_mstate_global->__pyx_n_u_py_total, __pyx_mstate_global->__pyx_float_0_0) < 0) __PYX_ERR(0, 7, __pyx_L1_error)
+08: for a in range(10):
  for (__pyx_t_5 = 0; __pyx_t_5 < 10; __pyx_t_5+=1) {
    __pyx_t_4 = PyLong_FromSsize_t(__pyx_t_5); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 8, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_4);
    if (PyDict_SetItem(__pyx_mstate_global->__pyx_d, __pyx_mstate_global->__pyx_n_u_a, __pyx_t_4) < 0) __PYX_ERR(0, 8, __pyx_L1_error)
    __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+09:     b = 2*a
    __Pyx_GetModuleGlobalName(__pyx_t_4, __pyx_mstate_global->__pyx_n_u_a); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 9, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_4);
    __pyx_t_3 = __Pyx_PyLong_MultiplyCObj(__pyx_mstate_global->__pyx_int_2, __pyx_t_4, 2, 0, 0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 9, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_3);
    __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
    if (PyDict_SetItem(__pyx_mstate_global->__pyx_d, __pyx_mstate_global->__pyx_n_u_b, __pyx_t_3) < 0) __PYX_ERR(0, 9, __pyx_L1_error)
    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+10:     py_total += b
    __Pyx_GetModuleGlobalName(__pyx_t_3, __pyx_mstate_global->__pyx_n_u_py_total); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 10, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_3);
    __Pyx_GetModuleGlobalName(__pyx_t_4, __pyx_mstate_global->__pyx_n_u_b); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 10, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_4);
    __pyx_t_6 = PyNumber_InPlaceAdd(__pyx_t_3, __pyx_t_4); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 10, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_6);
    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
    __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
    if (PyDict_SetItem(__pyx_mstate_global->__pyx_d, __pyx_mstate_global->__pyx_n_u_py_total, __pyx_t_6) < 0) __PYX_ERR(0, 10, __pyx_L1_error)
    __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
  }
+11: print(cy_total, py_total)
  __pyx_t_4 = NULL;
  __Pyx_INCREF(__pyx_builtin_print);
  __pyx_t_3 = __pyx_builtin_print; 
  __pyx_t_7 = PyFloat_FromDouble(__pyx_v_78_cython_magic_917676bb1325e1fcbf49143c68bf82f745579d07f0e9870b6ae71e3794f68524_cy_total); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 11, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_7);
  __Pyx_GetModuleGlobalName(__pyx_t_8, __pyx_mstate_global->__pyx_n_u_py_total); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 11, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_8);
  __pyx_t_9 = 1;
  {
    PyObject *__pyx_callargs[3] = {__pyx_t_4, __pyx_t_7, __pyx_t_8};
    __pyx_t_6 = __Pyx_PyObject_FastCall(__pyx_t_3, __pyx_callargs+__pyx_t_9, (3-__pyx_t_9) | (__pyx_t_9*__Pyx_PY_VECTORCALL_ARGUMENTS_OFFSET));
    __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
    __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
    __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
    if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 11, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_6);
  }
  __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
%%cython -a
cdef struct Grail:
    int age
    float volume
cdef union Food:
    char *spam
    float *eggs
cdef enum CheeseType:
    cheddar, edam,
    camembert
cdef enum CheeseState:
    hard = 1
    soft = 2
    runny = 3
cdef Grail holy
holy.age    = 500
holy.volume = 10.0
print (holy.age, holy.volume)
500 10.0

Generated by Cython 3.1.2

Yellow lines hint at Python interaction.
Click on a line that starts with a "+" to see the C code that Cython generated for it.

+01: cdef struct Grail:
  __pyx_t_2 = __Pyx_PyDict_NewPresized(0); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 1, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_2);
  if (PyDict_SetItem(__pyx_mstate_global->__pyx_d, __pyx_mstate_global->__pyx_n_u_test, __pyx_t_2) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
/* … */
struct __pyx_t_78_cython_magic_e9357d3c7cee06888fe48fd8f7d33c631bcac8e311a561ea49c9041ca31db9b2_Grail {
  int age;
  float volume;
};
 02:     int age
 03:     float volume
 04: cdef union Food:
 05:     char *spam
 06:     float *eggs
+07: cdef enum CheeseType:
enum __pyx_t_78_cython_magic_e9357d3c7cee06888fe48fd8f7d33c631bcac8e311a561ea49c9041ca31db9b2_CheeseType {
  __pyx_e_78_cython_magic_e9357d3c7cee06888fe48fd8f7d33c631bcac8e311a561ea49c9041ca31db9b2_cheddar,
  __pyx_e_78_cython_magic_e9357d3c7cee06888fe48fd8f7d33c631bcac8e311a561ea49c9041ca31db9b2_edam,
  __pyx_e_78_cython_magic_e9357d3c7cee06888fe48fd8f7d33c631bcac8e311a561ea49c9041ca31db9b2_camembert
};
 08:     cheddar, edam,
 09:     camembert
+10: cdef enum CheeseState:
enum __pyx_t_78_cython_magic_e9357d3c7cee06888fe48fd8f7d33c631bcac8e311a561ea49c9041ca31db9b2_CheeseState {
  __pyx_e_78_cython_magic_e9357d3c7cee06888fe48fd8f7d33c631bcac8e311a561ea49c9041ca31db9b2_hard = 1,
  __pyx_e_78_cython_magic_e9357d3c7cee06888fe48fd8f7d33c631bcac8e311a561ea49c9041ca31db9b2_soft = 2,
  __pyx_e_78_cython_magic_e9357d3c7cee06888fe48fd8f7d33c631bcac8e311a561ea49c9041ca31db9b2_runny = 3
};
 11:     hard = 1
 12:     soft = 2
 13:     runny = 3
 14: cdef Grail holy
+15: holy.age    = 500
  __pyx_v_78_cython_magic_e9357d3c7cee06888fe48fd8f7d33c631bcac8e311a561ea49c9041ca31db9b2_holy.age = 0x1F4;
+16: holy.volume = 10.0
  __pyx_v_78_cython_magic_e9357d3c7cee06888fe48fd8f7d33c631bcac8e311a561ea49c9041ca31db9b2_holy.volume = 10.0;
+17: print (holy.age, holy.volume)
  __pyx_t_3 = NULL;
  __Pyx_INCREF(__pyx_builtin_print);
  __pyx_t_4 = __pyx_builtin_print; 
  __pyx_t_5 = __Pyx_PyLong_From_int(__pyx_v_78_cython_magic_e9357d3c7cee06888fe48fd8f7d33c631bcac8e311a561ea49c9041ca31db9b2_holy.age); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 17, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_5);
  __pyx_t_6 = PyFloat_FromDouble(__pyx_v_78_cython_magic_e9357d3c7cee06888fe48fd8f7d33c631bcac8e311a561ea49c9041ca31db9b2_holy.volume); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 17, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_6);
  __pyx_t_7 = 1;
  {
    PyObject *__pyx_callargs[3] = {__pyx_t_3, __pyx_t_5, __pyx_t_6};
    __pyx_t_2 = __Pyx_PyObject_FastCall(__pyx_t_4, __pyx_callargs+__pyx_t_7, (3-__pyx_t_7) | (__pyx_t_7*__Pyx_PY_VECTORCALL_ARGUMENTS_OFFSET));
    __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
    __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
    __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
    __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
    if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 17, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_2);
  }
  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;

Cython Functions#

Use cdef to define a Cython function.

  • Cython function can accept either (inclusive) Python and C values as well as return either Python or C values,

  • Within a Cython module Python and Cython functions can call each other freely. However, only Python functions can be called from outside the module by Python code. (i.e. importing/exporting a Cython module into some Python code)

cpdef define a Cython function with a simple Python wrapper. However, when called from Cython the Cython / C code is called directly, bypassing the Python wrapper.

Writing pure code in Cython gives a small speed boost. Note that none of the code below is Cython-specific. Just add .pyx instead of .py extension.

%%file cython_f_example.pyx
def f(x):
    return -4*x**3 +3*x**2 +2*x
def integrate_f(a, b, N):
    s  = 0
    dx = (b - a) / (N-1)
    for i in range(N-1):
        x = a + i*dx
        s += (f(x)+f(x+dx))/2
    return s*dx
Writing cython_f_example.pyx

Cython Compilation#

  • The .pyx source file is compiled by Cython to a .c file.

  • The .c source file contains the code of a Python extension module.

  • The .c file is compiled by a C compiler to a .so (shared object library) file which can be imported directly into a Python session.

Build with CMake#

project(cython_f_example CXX)
include(UseCython)  # Load Cython functions
# Set C++ output
set_source_file_properties(cython_f_example.pyx PROPERTIES CYTHON_IS_CXX TRUE )
# Build the extension module
cython_add_module( modname cython_f_example.pyx cython_f_example.cpp )

C/C++ generation with cython application#

cython -3 cython_f_example.pyx   # create the C file for Python 3
cython -3 --cplus cython_f_example.pyx  # create the C++ file for Python 3

build with a C/C++ compiler#

To build use the Makefile:

CC=gcc
CFLAGS=`python-config --cflags` 
LDFLAGS=`python-config --ldflags`
cython_f_example:
	 ${CC} -c $@.c ${CFLAGS}
	 ${CC} $@.o -o $@.so -shared ${LDFLAGS}

Import the module in Python session

import cython_f_example

pyximport#

import Cython .pyx files as if they were .py files:

import pyximport
pyximport.install()
import cython_f_example
%timeit cython_f_example.integrate_f(-1,1,10**3)
print(cython_f_example.integrate_f(-1,1,1000))
373 μs ± 968 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
2.0000040080120174

Building a Cython module using distutils#

Create the setup.py script:

%%file setup.py
from distutils.core import setup
from Cython.Build import cythonize

setup(
  name = 'Cython Example Integrate f Function',
  ext_modules = cythonize("cython_f_example.pyx"),
)
Writing setup.py
%run setup.py  build_ext --inplace --quiet
Compiling cython_f_example.pyx because it changed.
[1/1] Cythonizing cython_f_example.pyx
<Figure size 1000x600 with 0 Axes>
from cython_f_example import integrate_f
%timeit integrate_f(-1,1,10**3)
integrate_f(-1,1,10**3)
376 μs ± 2.32 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
2.0000040080120174

Why is it faster with Cython ?#

  • Python code is interpreted at every execution to machine code.

  • Compiled C code is already in machine code.

  • C is a statically-typed language. It gives to the compiler more information which allows it to optimize both computations and memory access.

  • To add two variables, Python checks the type before calling the right add function and store it to a value that can be new.

  • C just add the variables and return the result.

Add Cython types#

We coerce Python types to C types when calling the function. Still a “Python function” so callable from the global namespace.

%%cython
def f(x):
    return -4*x**3 +3*x**2 +2*x
def cy_integrate_f(double a, double b, int N):
    cdef int i
    cdef double s, x, dx
    s  = 0
    dx = (b - a) / (N-1)
    for i in range(N-1):
        x = a + i*dx
        s += (f(x)+f(x+dx))/2
    return s*dx
  • typing the iterator variable i with C semantics, tells Cython to compile the for-loop to pure C code.

  • typing a, s and dx is important as they are involved in arithmetic within the for-loop

  • Cython type declarations can make the source code less readable

  • Do not use them without good reason, i.e. only in performance critical sections.

%timeit cy_integrate_f(-1,1,10**3)
print(cy_integrate_f(-1,1,1000))
331 μs ± 724 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
2.0000040080120174

Finally, we integrate a Cython function instead of a Python function. This eliminates the Python-C conversion at the function call as seen above thus giving a pure Cython/C algorithm.

The primary downside is not being allowed to call the function cy_f, from Python unless cpdef is used.

%%cython
cdef double cy_f(double x):
    return -4*x**3 +3*x**2 +2*x
def cycy_integrate_f(double a, double b, int N):
    cdef int i
    cdef double s, x, dx
    s  = 0
    dx = (b - a) / (N-1)
    for i in range(N-1):
        x = a + i*dx
        s += (cy_f(x)+cy_f(x+dx))/2
    return s*dx
%timeit cycy_integrate_f(-1,1,10**3)
print(cycy_integrate_f(-1,1,1000))
34 μs ± 63.6 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
2.0000040080120174

Exercise : Cythonize the trivial exponential function.#

%%cython -a
def exp_python(x,terms=50):
    sum = 0.
    power = 1.
    fact = 1.
    for i in range(terms):
        sum += power/fact
        power *= x
        fact *= i+1
    return sum

Generated by Cython 3.1.2

Yellow lines hint at Python interaction.
Click on a line that starts with a "+" to see the C code that Cython generated for it.

+1: def exp_python(x,terms=50):
/* Python wrapper */
static PyObject *__pyx_pw_78_cython_magic_8b5e6dd4b45808a3f8765cdbcdb69bc0bca86d4ed70a04372ccb964fdfa10275_1exp_python(PyObject *__pyx_self, 
#if CYTHON_METH_FASTCALL
PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
#else
PyObject *__pyx_args, PyObject *__pyx_kwds
#endif
); /*proto*/
static PyMethodDef __pyx_mdef_78_cython_magic_8b5e6dd4b45808a3f8765cdbcdb69bc0bca86d4ed70a04372ccb964fdfa10275_1exp_python = {"exp_python", (PyCFunction)(void(*)(void))(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw_78_cython_magic_8b5e6dd4b45808a3f8765cdbcdb69bc0bca86d4ed70a04372ccb964fdfa10275_1exp_python, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0};
static PyObject *__pyx_pw_78_cython_magic_8b5e6dd4b45808a3f8765cdbcdb69bc0bca86d4ed70a04372ccb964fdfa10275_1exp_python(PyObject *__pyx_self, 
#if CYTHON_METH_FASTCALL
PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
#else
PyObject *__pyx_args, PyObject *__pyx_kwds
#endif
) {
  PyObject *__pyx_v_x = 0;
  PyObject *__pyx_v_terms = 0;
  #if !CYTHON_METH_FASTCALL
  CYTHON_UNUSED Py_ssize_t __pyx_nargs;
  #endif
  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
  PyObject *__pyx_r = 0;
  __Pyx_RefNannyDeclarations
  __Pyx_RefNannySetupContext("exp_python (wrapper)", 0);
  #if !CYTHON_METH_FASTCALL
  #if CYTHON_ASSUME_SAFE_SIZE
  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
  #else
  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
  #endif
  #endif
  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
  {
    PyObject ** const __pyx_pyargnames[] = {&__pyx_mstate_global->__pyx_n_u_x,&__pyx_mstate_global->__pyx_n_u_terms,0};
  PyObject* values[2] = {0,0};
    const Py_ssize_t __pyx_kwds_len = (__pyx_kwds) ? __Pyx_NumKwargs_FASTCALL(__pyx_kwds) : 0;
    if (unlikely(__pyx_kwds_len) < 0) __PYX_ERR(0, 1, __pyx_L3_error)
    if (__pyx_kwds_len > 0) {
      switch (__pyx_nargs) {
        case  2:
        values[1] = __Pyx_ArgRef_FASTCALL(__pyx_args, 1);
        if (!CYTHON_ASSUME_SAFE_MACROS && unlikely(!values[1])) __PYX_ERR(0, 1, __pyx_L3_error)
        CYTHON_FALLTHROUGH;
        case  1:
        values[0] = __Pyx_ArgRef_FASTCALL(__pyx_args, 0);
        if (!CYTHON_ASSUME_SAFE_MACROS && unlikely(!values[0])) __PYX_ERR(0, 1, __pyx_L3_error)
        CYTHON_FALLTHROUGH;
        case  0: break;
        default: goto __pyx_L5_argtuple_error;
      }
      const Py_ssize_t kwd_pos_args = __pyx_nargs;
      if (__Pyx_ParseKeywords(__pyx_kwds, __pyx_kwvalues, __pyx_pyargnames, 0, values, kwd_pos_args, __pyx_kwds_len, "exp_python", 0) < 0) __PYX_ERR(0, 1, __pyx_L3_error)
      if (!values[1]) values[1] = __Pyx_NewRef(((PyObject *)((PyObject*)__pyx_mstate_global->__pyx_int_50)));
      for (Py_ssize_t i = __pyx_nargs; i < 1; i++) {
        if (unlikely(!values[i])) { __Pyx_RaiseArgtupleInvalid("exp_python", 0, 1, 2, i); __PYX_ERR(0, 1, __pyx_L3_error) }
      }
    } else {
      switch (__pyx_nargs) {
        case  2:
        values[1] = __Pyx_ArgRef_FASTCALL(__pyx_args, 1);
        if (!CYTHON_ASSUME_SAFE_MACROS && unlikely(!values[1])) __PYX_ERR(0, 1, __pyx_L3_error)
        CYTHON_FALLTHROUGH;
        case  1:
        values[0] = __Pyx_ArgRef_FASTCALL(__pyx_args, 0);
        if (!CYTHON_ASSUME_SAFE_MACROS && unlikely(!values[0])) __PYX_ERR(0, 1, __pyx_L3_error)
        break;
        default: goto __pyx_L5_argtuple_error;
      }
      if (!values[1]) values[1] = __Pyx_NewRef(((PyObject *)((PyObject*)__pyx_mstate_global->__pyx_int_50)));
    }
    __pyx_v_x = values[0];
    __pyx_v_terms = values[1];
  }
  goto __pyx_L6_skip;
  __pyx_L5_argtuple_error:;
  __Pyx_RaiseArgtupleInvalid("exp_python", 0, 1, 2, __pyx_nargs); __PYX_ERR(0, 1, __pyx_L3_error)
  __pyx_L6_skip:;
  goto __pyx_L4_argument_unpacking_done;
  __pyx_L3_error:;
  for (Py_ssize_t __pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
    Py_XDECREF(values[__pyx_temp]);
  }
  __Pyx_AddTraceback("_cython_magic_8b5e6dd4b45808a3f8765cdbcdb69bc0bca86d4ed70a04372ccb964fdfa10275.exp_python", __pyx_clineno, __pyx_lineno, __pyx_filename);
  __Pyx_RefNannyFinishContext();
  return NULL;
  __pyx_L4_argument_unpacking_done:;
  __pyx_r = __pyx_pf_78_cython_magic_8b5e6dd4b45808a3f8765cdbcdb69bc0bca86d4ed70a04372ccb964fdfa10275_exp_python(__pyx_self, __pyx_v_x, __pyx_v_terms);
  int __pyx_lineno = 0;
  const char *__pyx_filename = NULL;
  int __pyx_clineno = 0;

  /* function exit code */
  for (Py_ssize_t __pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
    Py_XDECREF(values[__pyx_temp]);
  }
  __Pyx_RefNannyFinishContext();
  return __pyx_r;
}

static PyObject *__pyx_pf_78_cython_magic_8b5e6dd4b45808a3f8765cdbcdb69bc0bca86d4ed70a04372ccb964fdfa10275_exp_python(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_x, PyObject *__pyx_v_terms) {
  PyObject *__pyx_v_sum = NULL;
  PyObject *__pyx_v_power = NULL;
  PyObject *__pyx_v_fact = NULL;
  PyObject *__pyx_v_i = NULL;
  PyObject *__pyx_r = NULL;
/* … */
  __pyx_t_2 = __Pyx_CyFunction_New(&__pyx_mdef_78_cython_magic_8b5e6dd4b45808a3f8765cdbcdb69bc0bca86d4ed70a04372ccb964fdfa10275_1exp_python, 0, __pyx_mstate_global->__pyx_n_u_exp_python, NULL, __pyx_mstate_global->__pyx_n_u_cython_magic_8b5e6dd4b45808a3f8, __pyx_mstate_global->__pyx_d, ((PyObject *)__pyx_mstate_global->__pyx_codeobj_tab[0])); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 1, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_2);
  __Pyx_CyFunction_SetDefaultsTuple(__pyx_t_2, __pyx_mstate_global->__pyx_tuple[0]);
  if (PyDict_SetItem(__pyx_mstate_global->__pyx_d, __pyx_mstate_global->__pyx_n_u_exp_python, __pyx_t_2) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
  __pyx_t_2 = __Pyx_PyDict_NewPresized(0); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 1, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_2);
  if (PyDict_SetItem(__pyx_mstate_global->__pyx_d, __pyx_mstate_global->__pyx_n_u_test, __pyx_t_2) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+2:     sum = 0.
  __Pyx_INCREF(__pyx_mstate_global->__pyx_float_0_);
  __pyx_v_sum = __pyx_mstate_global->__pyx_float_0_;
+3:     power = 1.
  __Pyx_INCREF(__pyx_mstate_global->__pyx_float_1_);
  __pyx_v_power = __pyx_mstate_global->__pyx_float_1_;
+4:     fact = 1.
  __Pyx_INCREF(__pyx_mstate_global->__pyx_float_1_);
  __pyx_v_fact = __pyx_mstate_global->__pyx_float_1_;
+5:     for i in range(terms):
  __pyx_t_2 = NULL;
  __Pyx_INCREF(__pyx_builtin_range);
  __pyx_t_3 = __pyx_builtin_range; 
  __pyx_t_4 = 1;
  {
    PyObject *__pyx_callargs[2] = {__pyx_t_2, __pyx_v_terms};
    __pyx_t_1 = __Pyx_PyObject_FastCall(__pyx_t_3, __pyx_callargs+__pyx_t_4, (2-__pyx_t_4) | (__pyx_t_4*__Pyx_PY_VECTORCALL_ARGUMENTS_OFFSET));
    __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0;
    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
    if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 5, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_1);
  }
  if (likely(PyList_CheckExact(__pyx_t_1)) || PyTuple_CheckExact(__pyx_t_1)) {
    __pyx_t_3 = __pyx_t_1; __Pyx_INCREF(__pyx_t_3);
    __pyx_t_5 = 0;
    __pyx_t_6 = NULL;
  } else {
    __pyx_t_5 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 5, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_3);
    __pyx_t_6 = (CYTHON_COMPILING_IN_LIMITED_API) ? PyIter_Next : __Pyx_PyObject_GetIterNextFunc(__pyx_t_3); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 5, __pyx_L1_error)
  }
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  for (;;) {
    if (likely(!__pyx_t_6)) {
      if (likely(PyList_CheckExact(__pyx_t_3))) {
        {
          Py_ssize_t __pyx_temp = __Pyx_PyList_GET_SIZE(__pyx_t_3);
          #if !CYTHON_ASSUME_SAFE_SIZE
          if (unlikely((__pyx_temp < 0))) __PYX_ERR(0, 5, __pyx_L1_error)
          #endif
          if (__pyx_t_5 >= __pyx_temp) break;
        }
        __pyx_t_1 = __Pyx_PyList_GetItemRef(__pyx_t_3, __pyx_t_5);
        ++__pyx_t_5;
      } else {
        {
          Py_ssize_t __pyx_temp = __Pyx_PyTuple_GET_SIZE(__pyx_t_3);
          #if !CYTHON_ASSUME_SAFE_SIZE
          if (unlikely((__pyx_temp < 0))) __PYX_ERR(0, 5, __pyx_L1_error)
          #endif
          if (__pyx_t_5 >= __pyx_temp) break;
        }
        #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
        __pyx_t_1 = __Pyx_NewRef(PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_5));
        #else
        __pyx_t_1 = __Pyx_PySequence_ITEM(__pyx_t_3, __pyx_t_5);
        #endif
        ++__pyx_t_5;
      }
      if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 5, __pyx_L1_error)
    } else {
      __pyx_t_1 = __pyx_t_6(__pyx_t_3);
      if (unlikely(!__pyx_t_1)) {
        PyObject* exc_type = PyErr_Occurred();
        if (exc_type) {
          if (unlikely(!__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) __PYX_ERR(0, 5, __pyx_L1_error)
          PyErr_Clear();
        }
        break;
      }
    }
    __Pyx_GOTREF(__pyx_t_1);
    __Pyx_XDECREF_SET(__pyx_v_i, __pyx_t_1);
    __pyx_t_1 = 0;
/* … */
  }
  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+6:         sum += power/fact
    __pyx_t_1 = __Pyx_PyNumber_Divide(__pyx_v_power, __pyx_v_fact); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 6, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_1);
    __pyx_t_2 = PyNumber_InPlaceAdd(__pyx_v_sum, __pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 6, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_2);
    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
    __Pyx_DECREF_SET(__pyx_v_sum, __pyx_t_2);
    __pyx_t_2 = 0;
+7:         power *= x
    __pyx_t_2 = PyNumber_InPlaceMultiply(__pyx_v_power, __pyx_v_x); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 7, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_2);
    __Pyx_DECREF_SET(__pyx_v_power, __pyx_t_2);
    __pyx_t_2 = 0;
+8:         fact *= i+1
    __pyx_t_2 = __Pyx_PyLong_AddObjC(__pyx_v_i, __pyx_mstate_global->__pyx_int_1, 1, 0, 0); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 8, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_2);
    __pyx_t_1 = PyNumber_InPlaceMultiply(__pyx_v_fact, __pyx_t_2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 8, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_1);
    __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
    __Pyx_DECREF_SET(__pyx_v_fact, __pyx_t_1);
    __pyx_t_1 = 0;
+9:     return sum
  __Pyx_XDECREF(__pyx_r);
  __Pyx_INCREF(__pyx_v_sum);
  __pyx_r = __pyx_v_sum;
  goto __pyx_L0;
%timeit exp_python(1.,50)
3.31 μs ± 15.4 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
%%cython

# %load solutions/cython/exponential.pyx
#cython: profile=False
#cython: cdivision=True
def exp_cython(double x, int terms = 50):
   cdef double sum
   cdef double power
   cdef double fact
   cdef int i
   sum = 0.
   power = 1.
   fact = 1.
   for i in range(terms):
      sum += power/fact
      power *= x
      fact *= i+1
   return sum
%timeit exp_cython(1.,50)
101 ns ± 0.333 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)

Cython and Numpy#

The Numpy library contains many fast numerics routines. Their speed comes from manipulating the low-level C-arrays that the numpy.array object wraps rather than computing over slow Python lists. Using Cython one can access those low-level arrays and implement their own fast algorithms while allowing the easy interaction afforded by Python + Numpy.

The examples below are various implementations of the naive matrix multiplication algorithm. We will start with a pure Python implementation and then incrementally add structures that allow Cython to exploit the low-level speed of the numpy.array object.

Pure Python implementation compiled in Cython without specific optimizations.#

%%cython
def matmul1(A, B, out=None):
    assert A.shape[1] == B.shape[0]
    for i in range(A.shape[0]):
        for j in range(B.shape[1]):
            s = 0
            for k in range(A.shape[1]):
                s += A[i,k] * B[k,j]
            out[i,j] = s
    return out

Import numpy as a Cython module#

We now take advantage of the ability to access the underlying C arrays in the numpy.array object from Cython, thanks to a special numpy.pxd file included with Cython. (The Cython developers worked closely with Numpy developers to make this optimal.)

To begin with, we have to cimport numpy: that is, import numpy as a Cython module rather than a Python module. To do so, simply type:

cimport numpy as np

Another important thing to note is the type of Numpy indexers. There is a special Numpy variable type used for numpy.array indices called Py_ssize_t. To take full advantage of the speedups that Cython can provide we should make sure to type the variables used for indexing as such.

%%cython
import numpy as np
cimport numpy as np
ctypedef np.float64_t dtype_t      # shorthand type. easy to change
def matmul2(np.ndarray[dtype_t, ndim=2] A,
            np.ndarray[dtype_t, ndim=2] B,
            np.ndarray[dtype_t, ndim=2] out=None):
    cdef Py_ssize_t i, j, k
    cdef dtype_t s
    assert A.shape[1] == B.shape[0]
    for i in range(A.shape[0]):
        for j in range(B.shape[1]):
            s = 0
            for k in range(A.shape[1]):
                s += A[i,k] * B[k,j]
            out[i,j] = s
    return out
Content of stderr:
In file included from /usr/share/miniconda3/envs/runenv/lib/python3.13/site-packages/numpy/_core/include/numpy/ndarraytypes.h:1913,
                 from /usr/share/miniconda3/envs/runenv/lib/python3.13/site-packages/numpy/_core/include/numpy/ndarrayobject.h:12,
                 from /usr/share/miniconda3/envs/runenv/lib/python3.13/site-packages/numpy/_core/include/numpy/arrayobject.h:5,
                 from /home/runner/.cache/ipython/cython/_cython_magic_7e0dbe5655114b52cbb349af229e638dc8c7f90a81796d4f3538259d20565b72.c:1146:
/usr/share/miniconda3/envs/runenv/lib/python3.13/site-packages/numpy/_core/include/numpy/npy_1_7_deprecated_api.h:17:2: warning: #warning "Using deprecated NumPy API, disable it with " "#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION" [-Wcpp]
   17 | #warning "Using deprecated NumPy API, disable it with " \
      |  ^~~~~~~
import numpy as np
from timeit import timeit
A = np.random.random_sample((64,64))
B = np.random.random_sample((64,64))
C = np.zeros((64,64))
%timeit matmul1(A,B,C)
64.8 ms ± 352 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit matmul2(A,B,C)
277 μs ± 255 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

Tuning indexing#

The array lookups are still slowed down by two factors:

  • Bounds checking is performed.

  • Negative indices are checked for and handled correctly.

The code doesn’t use negative indices, and always access to arrays within bounds. We can add a decorator to disable bounds checking:

%%cython
cimport cython                                       # cython tools
import numpy as np
cimport numpy as np
ctypedef np.float64_t dtype_t
@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False)  # turn off negative index wrapping for entire function
def matmul3(np.ndarray[dtype_t, ndim=2] A,
            np.ndarray[dtype_t, ndim=2] B,
            np.ndarray[dtype_t, ndim=2] out=None):
    cdef Py_ssize_t i, j, k
    cdef dtype_t s
    assert A.shape[1] == B.shape[0]
    for i in range(A.shape[0]):
        for j in range(B.shape[1]):
            s = 0
            for k in range(A.shape[1]):
                s += A[i,k] * B[k,j]
            out[i,j] = s
    return out
Content of stderr:
In file included from /usr/share/miniconda3/envs/runenv/lib/python3.13/site-packages/numpy/_core/include/numpy/ndarraytypes.h:1913,
                 from /usr/share/miniconda3/envs/runenv/lib/python3.13/site-packages/numpy/_core/include/numpy/ndarrayobject.h:12,
                 from /usr/share/miniconda3/envs/runenv/lib/python3.13/site-packages/numpy/_core/include/numpy/arrayobject.h:5,
                 from /home/runner/.cache/ipython/cython/_cython_magic_addb67e864d33fbcf55f2d8579b244b02c3f9feab8d5709f2f31d7d9e3565394.c:1147:
/usr/share/miniconda3/envs/runenv/lib/python3.13/site-packages/numpy/_core/include/numpy/npy_1_7_deprecated_api.h:17:2: warning: #warning "Using deprecated NumPy API, disable it with " "#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION" [-Wcpp]
   17 | #warning "Using deprecated NumPy API, disable it with " \
      |  ^~~~~~~
%timeit matmul3(A,B,C)
187 μs ± 80.4 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

Cython Build Options#

  • boundcheck(True,False) : array bounds checking

  • wraparound(True,False) : negative indexing.

  • initializedcheck(True,False): checks that a memoryview is initialized

  • nonecheck(True,False) : Check if one argument is None

  • overflowcheck(True,False) : Check if int are too big

  • cdivision(True,False) : If False, adjust the remainder and quotient operators C types to match those of Python ints. Could be very effective when it is set to True.

  • profile (True / False) : Write hooks for Python profilers into the compiled C code. Default is False.

Cython Compiler directives

Numpy objects with external C program.#

Note that this can actually be slower because the C function is not the best implementation of matrix multiplication. Call cblas with same technique is an interesting exercise.

%%file mydgemm.c 
void my_dgemm( int m, int n, int k, 
              double a[m][n], double b[n][k], float c[m][k] )
{
  double ab = 0;
  for( int j = 0 ; j < m ; j++ ) {
    for( int i = 0 ; i < k ; i++ ) {
      for( int l = 0 ; l < n ; l++ ){
        ab += a[j][l] * b[l][i];
      }
      c[j][i] = ab ;
      ab = 0;
    }
  }
}
Writing mydgemm.c
  • The np.ndarray[double, ndim=2, mode="c"] assures that you get a C-contiguous numpy array of doubles

  • The &input[0,0] passed in the address of the beginning of the data array.

from pyximport import install
import os
here = os.getcwd()
%%cython -I {here}
# do not forget to change the file path
cdef extern from "mydgemm.c":
    void my_dgemm (int m, int n, int k, 
                          double *A, double *B, double *C)
cimport cython
import numpy as np
cimport numpy as np
ctypedef np.float64_t dtype_t
@cython.boundscheck(False)
@cython.wraparound(False) 
def matmul4(np.ndarray[dtype_t, ndim=2, mode="c"] A,
            np.ndarray[dtype_t, ndim=2, mode="c"] B,
            np.ndarray[dtype_t, ndim=2, mode="c"] C=None):
    cdef int m = A.shape[0]
    cdef int n = A.shape[1]
    cdef int k = B.shape[1]
    cdef dtype_t s
    
    my_dgemm(m, n, k, &A[0,0], &B[0,0], &C[0,0])
                                                  
    return C
Content of stderr:
In file included from /usr/share/miniconda3/envs/runenv/lib/python3.13/site-packages/numpy/_core/include/numpy/ndarraytypes.h:1913,
                 from /usr/share/miniconda3/envs/runenv/lib/python3.13/site-packages/numpy/_core/include/numpy/ndarrayobject.h:12,
                 from /usr/share/miniconda3/envs/runenv/lib/python3.13/site-packages/numpy/_core/include/numpy/arrayobject.h:5,
                 from /home/runner/.cache/ipython/cython/_cython_magic_ef109d78c19f8e2b8b39870be178fc722b925efc7c6ae8a708ff74527d1ae565.c:1151:
/usr/share/miniconda3/envs/runenv/lib/python3.13/site-packages/numpy/_core/include/numpy/npy_1_7_deprecated_api.h:17:2: warning: #warning "Using deprecated NumPy API, disable it with " "#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION" [-Wcpp]
   17 | #warning "Using deprecated NumPy API, disable it with " \
      |  ^~~~~~~
/home/runner/.cache/ipython/cython/_cython_magic_ef109d78c19f8e2b8b39870be178fc722b925efc7c6ae8a708ff74527d1ae565.c: In function '__pyx_pf_78_cython_magic_ef109d78c19f8e2b8b39870be178fc722b925efc7c6ae8a708ff74527d1ae565_matmul4':
/home/runner/.cache/ipython/cython/_cython_magic_ef109d78c19f8e2b8b39870be178fc722b925efc7c6ae8a708ff74527d1ae565.c:4909:46: error: passing argument 4 of 'my_dgemm' from incompatible pointer type [-Wincompatible-pointer-types]
 4909 |   my_dgemm(__pyx_v_m, __pyx_v_n, __pyx_v_k, (&(*__Pyx_BufPtrCContig2d(__pyx_t_78_cython_magic_ef109d78c19f8e2b8b39870be178fc722b925efc7c6ae8a708ff74527d1ae565_dtype_t *, __pyx_pybuffernd_A.rcbuffer->pybuffer.buf, __pyx_t_1, __pyx_pybuffernd_A.diminfo[0].strides, __pyx_t_2, __pyx_pybuffernd_A.diminfo[1].strides))), (&(*__Pyx_BufPtrCContig2d(__pyx_t_78_cython_magic_ef109d78c19f8e2b8b39870be178fc722b925efc7c6ae8a708ff74527d1ae565_dtype_t *, __pyx_pybuffernd_B.rcbuffer->pybuffer.buf, __pyx_t_3, __pyx_pybuffernd_B.diminfo[0].strides, __pyx_t_4, __pyx_pybuffernd_B.diminfo[1].strides))), (&(*__Pyx_BufPtrCContig2d(__pyx_t_78_cython_magic_ef109d78c19f8e2b8b39870be178fc722b925efc7c6ae8a708ff74527d1ae565_dtype_t *, __pyx_pybuffernd_C.rcbuffer->pybuffer.buf, __pyx_t_5, __pyx_pybuffernd_C.diminfo[0].strides, __pyx_t_6, __pyx_pybuffernd_C.diminfo[1].strides))));
      |                                             ~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      |                                              |
      |                                              __pyx_t_78_cython_magic_ef109d78c19f8e2b8b39870be178fc722b925efc7c6ae8a708ff74527d1ae565_dtype_t * {aka double *}
In file included from /home/runner/.cache/ipython/cython/_cython_magic_ef109d78c19f8e2b8b39870be178fc722b925efc7c6ae8a708ff74527d1ae565.c:1145:
/home/runner/work/python-notebooks/python-notebooks/notebooks/mydgemm.c:2:22: note: expected 'double (*)[n]' but argument is of type '__pyx_t_78_cython_magic_ef109d78c19f8e2b8b39870be178fc722b925efc7c6ae8a708ff74527d1ae565_dtype_t *' {aka 'double *'}
    2 |               double a[m][n], double b[n][k], float c[m][k] )
      |               ~~~~~~~^~~~~~~
/home/runner/.cache/ipython/cython/_cython_magic_ef109d78c19f8e2b8b39870be178fc722b925efc7c6ae8a708ff74527d1ae565.c:4909:318: error: passing argument 5 of 'my_dgemm' from incompatible pointer type [-Wincompatible-pointer-types]
 4909 |   my_dgemm(__pyx_v_m, __pyx_v_n, __pyx_v_k, (&(*__Pyx_BufPtrCContig2d(__pyx_t_78_cython_magic_ef109d78c19f8e2b8b39870be178fc722b925efc7c6ae8a708ff74527d1ae565_dtype_t *, __pyx_pybuffernd_A.rcbuffer->pybuffer.buf, __pyx_t_1, __pyx_pybuffernd_A.diminfo[0].strides, __pyx_t_2, __pyx_pybuffernd_A.diminfo[1].strides))), (&(*__Pyx_BufPtrCContig2d(__pyx_t_78_cython_magic_ef109d78c19f8e2b8b39870be178fc722b925efc7c6ae8a708ff74527d1ae565_dtype_t *, __pyx_pybuffernd_B.rcbuffer->pybuffer.buf, __pyx_t_3, __pyx_pybuffernd_B.diminfo[0].strides, __pyx_t_4, __pyx_pybuffernd_B.diminfo[1].strides))), (&(*__Pyx_BufPtrCContig2d(__pyx_t_78_cython_magic_ef109d78c19f8e2b8b39870be178fc722b925efc7c6ae8a708ff74527d1ae565_dtype_t *, __pyx_pybuffernd_C.rcbuffer->pybuffer.buf, __pyx_t_5, __pyx_pybuffernd_C.diminfo[0].strides, __pyx_t_6, __pyx_pybuffernd_C.diminfo[1].strides))));
      |                                                                                                                                                                                                                                                                                                                             ~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      |                                                                                                                                                                                                                                                                                                                              |
      |                                                                                                                                                                                                                                                                                                                              __pyx_t_78_cython_magic_ef109d78c19f8e2b8b39870be178fc722b925efc7c6ae8a708ff74527d1ae565_dtype_t * {aka double *}
/home/runner/work/python-notebooks/python-notebooks/notebooks/mydgemm.c:2:38: note: expected 'double (*)[k]' but argument is of type '__pyx_t_78_cython_magic_ef109d78c19f8e2b8b39870be178fc722b925efc7c6ae8a708ff74527d1ae565_dtype_t *' {aka 'double *'}
    2 |               double a[m][n], double b[n][k], float c[m][k] )
      |                               ~~~~~~~^~~~~~~
/home/runner/.cache/ipython/cython/_cython_magic_ef109d78c19f8e2b8b39870be178fc722b925efc7c6ae8a708ff74527d1ae565.c:4909:590: error: passing argument 6 of 'my_dgemm' from incompatible pointer type [-Wincompatible-pointer-types]
 4909 |   my_dgemm(__pyx_v_m, __pyx_v_n, __pyx_v_k, (&(*__Pyx_BufPtrCContig2d(__pyx_t_78_cython_magic_ef109d78c19f8e2b8b39870be178fc722b925efc7c6ae8a708ff74527d1ae565_dtype_t *, __pyx_pybuffernd_A.rcbuffer->pybuffer.buf, __pyx_t_1, __pyx_pybuffernd_A.diminfo[0].strides, __pyx_t_2, __pyx_pybuffernd_A.diminfo[1].strides))), (&(*__Pyx_BufPtrCContig2d(__pyx_t_78_cython_magic_ef109d78c19f8e2b8b39870be178fc722b925efc7c6ae8a708ff74527d1ae565_dtype_t *, __pyx_pybuffernd_B.rcbuffer->pybuffer.buf, __pyx_t_3, __pyx_pybuffernd_B.diminfo[0].strides, __pyx_t_4, __pyx_pybuffernd_B.diminfo[1].strides))), (&(*__Pyx_BufPtrCContig2d(__pyx_t_78_cython_magic_ef109d78c19f8e2b8b39870be178fc722b925efc7c6ae8a708ff74527d1ae565_dtype_t *, __pyx_pybuffernd_C.rcbuffer->pybuffer.buf, __pyx_t_5, __pyx_pybuffernd_C.diminfo[0].strides, __pyx_t_6, __pyx_pybuffernd_C.diminfo[1].strides))));
      |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             ~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
      |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              __pyx_t_78_cython_magic_ef109d78c19f8e2b8b39870be178fc722b925efc7c6ae8a708ff74527d1ae565_dtype_t * {aka double *}
/home/runner/work/python-notebooks/python-notebooks/notebooks/mydgemm.c:2:53: note: expected 'float (*)[k]' but argument is of type '__pyx_t_78_cython_magic_ef109d78c19f8e2b8b39870be178fc722b925efc7c6ae8a708ff74527d1ae565_dtype_t *' {aka 'double *'}
    2 |               double a[m][n], double b[n][k], float c[m][k] )
      |                                               ~~~~~~^~~~~~~
%timeit matmul4(A,B,C)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[37], line 1
----> 1 get_ipython().run_line_magic('timeit', 'matmul4(A,B,C)')

File /usr/share/miniconda3/envs/runenv/lib/python3.13/site-packages/IPython/core/interactiveshell.py:2488, in InteractiveShell.run_line_magic(self, magic_name, line, _stack_depth)
   2486     kwargs['local_ns'] = self.get_local_scope(stack_depth)
   2487 with self.builtin_trap:
-> 2488     result = fn(*args, **kwargs)
   2490 # The code below prevents the output from being displayed
   2491 # when using magics with decorator @output_can_be_silenced
   2492 # when the last Python token in the expression is a ';'.
   2493 if getattr(fn, magic.MAGIC_OUTPUT_CAN_BE_SILENCED, False):

File /usr/share/miniconda3/envs/runenv/lib/python3.13/site-packages/IPython/core/magics/execution.py:1225, in ExecutionMagics.timeit(self, line, cell, local_ns)
   1223 for index in range(0, 10):
   1224     number = 10 ** index
-> 1225     time_number = timer.timeit(number)
   1226     if time_number >= 0.2:
   1227         break

File /usr/share/miniconda3/envs/runenv/lib/python3.13/site-packages/IPython/core/magics/execution.py:183, in Timer.timeit(self, number)
    181 gc.disable()
    182 try:
--> 183     timing = self.inner(it, self.timer)
    184 finally:
    185     if gcold:

File <magic-timeit>:1, in inner(_it, _timer)

NameError: name 'matmul4' is not defined

Exercise : Find prime numbers < 10000#

# %load solutions/cython/is_prime0.py

def is_prime0(n):
    if n < 4: return True
    if n % 2 == 0 : return False
    k = 3
    while k*k <= n:
        if n % k == 0: return False
        k += 2
    return True
[ p for p in range(20) if is_prime0(p)]
L = list(range(10000))
%timeit [ p for p in L if is_prime0(p)]
%%cython
def is_prime1(n):
    if n < 4: return True
    if n % 2 == 0 : return False
    k = 3
    while k*k <= n:
        if n % k == 0: return False
        k += 2
    return True
[ p for p in range(20) if is_prime1(p)]
%timeit [p  for p in L if is_prime1(p)]

Add Cython types without modifying the Python Code#

%%cython
import cython
@cython.locals(n=int, k=int)
def is_prime2(n):
    if n < 4: return True
    if n % 2 == 0 : return False
    k = 3
    while k*k <= n:
        if n % k == 0: return False
        k += 2
    return True
[ p for p in range(20) if is_prime2(p)]
%timeit [p for p in L if is_prime2(p) ]

Cython function#

%%cython
import cython
cdef bint is_prime3(int n):
    if n < 4: return True
    if n % 2 == 0: return False
    cdef int k = 3
    while k*k <= n:
        if n % k == 0: return False
        k += 2
    return True
def prime_list(L):
    return [p for p in L if is_prime3(p)]
prime_list(list(range(20)))
%timeit prime_list(L)
%%cython
import cython
from numpy cimport ndarray
import numpy

cdef bint is_prime3(int n):
    if n < 4: return True
    if n % 2 == 0: return False
    cdef int k = 3
    while k*k <= n:
        if n % k == 0: return False
        k += 2
    return True

def prime_array(ndarray[int, ndim=1] L):
    cdef ndarray[int, ndim=1] res = ndarray(shape=(L.shape[0]),dtype=numpy.int32)
    cdef int i
    for i in range(L.shape[0]):
        res[i] = is_prime3(L[i])
    return L[res==1]
import numpy as np
prime_array(np.arange(20,dtype=np.int32))
npL = numpy.array(L,dtype=np.int32)
%timeit prime_array(npL)

Using Parallelism#

  • Cython supports native parallelism via OpenMP

  • by default, Python’s Global Interpreter Lock (GIL) prevents that several threads use the Python interpreter simultaneously

  • to use this kind of parallelism, the GIL must be released

If you have a default compiler with openmp support you can use this magic command in your notebook.

%%cython --compile-args=-fopenmp --link-args=-fopenmp
%%file cython_omp.pyx
import cython
from cython.parallel cimport parallel, prange  # import parallel functions
import numpy as np
from numpy cimport ndarray

cdef bint is_prime4(int n) nogil:      #release the gil 
    if n < 4: return True
    if n % 2 == 0: return False
    cdef int k = 3
    while k*k <= n:
        if n % k == 0: return False
        k += 2
    return True

@cython.boundscheck(False)
def prime_array_omp(ndarray[int, ndim=1] L):
    cdef ndarray[int, ndim=1] res = ndarray(shape=(L.shape[0]),dtype=np.int32)
    cdef Py_ssize_t i
    with nogil, parallel(num_threads=4):
        for i in prange(L.shape[0]):     #Parallel loop
            res[i] = is_prime4(L[i])
    return L[res==1]

To use the OpenMP support, you need to enable OpenMP. For gcc this can be done as follows in a setup.py:

%%file setup.py
from distutils.core import setup
from distutils.extension import Extension
from Cython.Build import cythonize
import os, sys
import numpy

if sys.platform == "darwin": # for omp, use gcc installed with brew
    os.environ["CC"]="gcc-10"
    os.environ["CXX"]="g++-10"

ext_modules = [
    Extension(
        "cython_omp",
        ["cython_omp.pyx"],
        extra_compile_args=['-fopenmp'],
        extra_link_args=['-fopenmp'],
        include_dirs=[numpy.get_include()]
    )
]

setup(
    name='Cython OpenMP Example',
    ext_modules=cythonize(ext_modules),
)
# python setup.py build_ext --inplace
%run setup.py build_ext --inplace --quiet
from cython_omp import prime_array_omp
prime_array_omp(np.arange(20,dtype=np.int32))
%timeit prime_array_omp(npL)

References#