Cython#

%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (10,6)
%config InlineBackend.figure_format = 'retina'
import numpy as np
import warnings
warnings.filterwarnings("ignore")

Cython logo

  • Cython provides extra syntax allowing for static type declarations (remember: Python is generally dynamically typed)

  • Python code gets translated into optimised C/C++ code and compiled as Python extension modules

  • Cython allows you to write fast C code in a Python-like syntax.

  • Furthermore, linking to existing C libraries is simplified.

  • Pure Python Function

\(f(x)=-2x^3+5x^2+x\),

def f(x):
    return -4*x**3 +3*x**2 +2*x

x = np.linspace(-1,1,100)
ax = plt.subplot(1,1,1)
ax.plot(x, f(x))
ax.fill_between(x, 0, f(x));
_images/17-Cython_6_0.png

we compute integral \(\int_a^b f(x)dx\) numerically with \(N\) points.

def integrate_f_py(a,b,N):
    s  = 0
    dx = (b - a) / (N-1)
    for i in range(N-1): # we intentionally use the bad way to do this with a loop
        x = a + i*dx
        s += (f(x)+f(x+dx))/2
    return s*dx
%timeit integrate_f_py(-1,1,10**3)
print(integrate_f_py(-1,1,1000))
741 µs ± 2.8 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
2.0000040080120174
%load_ext heat
%%heat
def f(x):
    return -4*x**3 +3*x**2 +2*x
def integrate_f(a, b, N):
    s  = 0
    dx = (b - a) / (N-1)
    for i in range(N-1):
        x = a + i*dx
        s += (f(x)+f(x+dx))/2
    return s*dx

integrate_f(0, 10, 1000)
_images/17-Cython_11_0.png
  • Pure C function

%%file integral_f_c.c

#include <stdlib.h>
#include <stdio.h>
#include <time.h>

#define NB_RUNS 1000

double f(double x) {
    return -4*x*x*x +3*x*x +2*x;
}

double integrate_f_c(double a, double b, int N) {
    double s  = 0;
    double dx = (b - a) / (N-1);
    for(int i=0; i<N-1; ++i){
        double x = a + i*dx;
        s += (f(x)+f(x+dx))/2.0;
    }
    return s*dx;
}

int main(int argc, char **argv)
{
  
  double a =  atof(argv[1]);
  double b =  atof(argv[2]);
  int N    =  atoi(argv[3]);
  double  res = 0;

  clock_t begin = clock();

  for (int i=0; i<NB_RUNS; ++i)
      res += integrate_f_c( a, b, N );
    
  clock_t end = clock();
     
  fprintf( stdout, "integral_f(%3.1f, %3.1f, %d) = %f \n", a, b, N, res / NB_RUNS );
  fprintf( stdout, "time = %e ms \n",  (double)(end - begin) / CLOCKS_PER_SEC );

  return 0;
}
Writing integral_f_c.c
!gcc -O3 integral_f_c.c; ./a.out -1 1 1000
integral_f(-1.0, 1.0, 1000) = 2.000004 
time = 4.632000e-03 ms 

Cython compilation: Generating C code#

Load Cython in jupyter notebook.

%load_ext Cython

C Variable and Type definitions#

In general, use cdef to declare C variables. The command :

$ cython -a mycode.pyx

outputs an html file. It shows what parts of your code are C, which parts are Python, and where C-Python conversion occurs.

%%cython -a
cdef int i, j = 2, k = 3      # assigning values at declaration
i = 1                         # assigning values afterwards
# avoid Python-C conversion! It's expensive:
a = 5
i = a
# same with C-Python conversion:
b = j
print("a = %d" % a)
print("i = %d" % i)
print("b = %d" % b)
a = 5
i = 5
b = 2
Cython: _cython_magic_132c4357e1dc505a5a4b91c4f7627576.pyx

Generated by Cython 0.29.32

Yellow lines hint at Python interaction.
Click on a line that starts with a "+" to see the C code that Cython generated for it.

+01: cdef int i, j = 2, k = 3      # assigning values at declaration
  __pyx_v_46_cython_magic_132c4357e1dc505a5a4b91c4f7627576_j = 2;
  __pyx_v_46_cython_magic_132c4357e1dc505a5a4b91c4f7627576_k = 3;
+02: i = 1                         # assigning values afterwards
  __pyx_v_46_cython_magic_132c4357e1dc505a5a4b91c4f7627576_i = 1;
 03: # avoid Python-C conversion! It's expensive:
+04: a = 5
  if (PyDict_SetItem(__pyx_d, __pyx_n_s_a, __pyx_int_5) < 0) __PYX_ERR(0, 4, __pyx_L1_error)
+05: i = a
  __Pyx_GetModuleGlobalName(__pyx_t_1, __pyx_n_s_a); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 5, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 5, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  __pyx_v_46_cython_magic_132c4357e1dc505a5a4b91c4f7627576_i = __pyx_t_2;
 06: # same with C-Python conversion:
+07: b = j
  __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_46_cython_magic_132c4357e1dc505a5a4b91c4f7627576_j); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 7, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  if (PyDict_SetItem(__pyx_d, __pyx_n_s_b, __pyx_t_1) < 0) __PYX_ERR(0, 7, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+08: print("a = %d" % a)
  __Pyx_GetModuleGlobalName(__pyx_t_1, __pyx_n_s_a); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 8, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  __pyx_t_3 = __Pyx_PyUnicode_FormatSafe(__pyx_kp_u_a_d, __pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 8, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_3);
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_builtin_print, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 8, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+09: print("i = %d" % i)
  __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_46_cython_magic_132c4357e1dc505a5a4b91c4f7627576_i); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 9, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  __pyx_t_3 = PyUnicode_Format(__pyx_kp_u_i_d, __pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 9, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_3);
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_builtin_print, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 9, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+10: print("b = %d" % b)
  __Pyx_GetModuleGlobalName(__pyx_t_1, __pyx_n_s_b); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  __pyx_t_3 = __Pyx_PyUnicode_FormatSafe(__pyx_kp_u_b_d, __pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 10, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_3);
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_builtin_print, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;

Another Python vs. Cython coloring guide#

%%cython -a
cdef int m, n
cdef double cy_total = 0.0
for m in range(10):
    n = 2*m
    cy_total += n
a, b = 0, 0
py_total = 0.0
for a in range(10):
    b = 2*a
    py_total += b
print(cy_total, py_total)
90.0 90.0
Cython: _cython_magic_6c1c529595d0157b648d279c74560506.pyx

Generated by Cython 0.29.32

Yellow lines hint at Python interaction.
Click on a line that starts with a "+" to see the C code that Cython generated for it.

 01: cdef int m, n
+02: cdef double cy_total = 0.0
  __pyx_v_46_cython_magic_6c1c529595d0157b648d279c74560506_cy_total = 0.0;
+03: for m in range(10):
  for (__pyx_t_1 = 0; __pyx_t_1 < 10; __pyx_t_1+=1) {
    __pyx_v_46_cython_magic_6c1c529595d0157b648d279c74560506_m = __pyx_t_1;
+04:     n = 2*m
    __pyx_v_46_cython_magic_6c1c529595d0157b648d279c74560506_n = (2 * __pyx_v_46_cython_magic_6c1c529595d0157b648d279c74560506_m);
+05:     cy_total += n
    __pyx_v_46_cython_magic_6c1c529595d0157b648d279c74560506_cy_total = (__pyx_v_46_cython_magic_6c1c529595d0157b648d279c74560506_cy_total + __pyx_v_46_cython_magic_6c1c529595d0157b648d279c74560506_n);
  }
+06: a, b = 0, 0
  __pyx_t_2 = __pyx_int_0;
  __Pyx_INCREF(__pyx_t_2);
  __pyx_t_3 = __pyx_int_0;
  __Pyx_INCREF(__pyx_t_3);
  if (PyDict_SetItem(__pyx_d, __pyx_n_s_a, __pyx_t_2) < 0) __PYX_ERR(0, 6, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
  if (PyDict_SetItem(__pyx_d, __pyx_n_s_b, __pyx_t_3) < 0) __PYX_ERR(0, 6, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+07: py_total = 0.0
  if (PyDict_SetItem(__pyx_d, __pyx_n_s_py_total, __pyx_float_0_0) < 0) __PYX_ERR(0, 7, __pyx_L1_error)
+08: for a in range(10):
  for (__pyx_t_4 = 0; __pyx_t_4 < 10; __pyx_t_4+=1) {
    __pyx_t_3 = __Pyx_PyInt_From_long(__pyx_t_4); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 8, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_3);
    if (PyDict_SetItem(__pyx_d, __pyx_n_s_a, __pyx_t_3) < 0) __PYX_ERR(0, 8, __pyx_L1_error)
    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+09:     b = 2*a
    __Pyx_GetModuleGlobalName(__pyx_t_3, __pyx_n_s_a); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 9, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_3);
    __pyx_t_2 = PyNumber_Multiply(__pyx_int_2, __pyx_t_3); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 9, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_2);
    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
    if (PyDict_SetItem(__pyx_d, __pyx_n_s_b, __pyx_t_2) < 0) __PYX_ERR(0, 9, __pyx_L1_error)
    __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+10:     py_total += b
    __Pyx_GetModuleGlobalName(__pyx_t_2, __pyx_n_s_py_total); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 10, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_2);
    __Pyx_GetModuleGlobalName(__pyx_t_3, __pyx_n_s_b); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 10, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_3);
    __pyx_t_5 = PyNumber_InPlaceAdd(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 10, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_5);
    __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
    if (PyDict_SetItem(__pyx_d, __pyx_n_s_py_total, __pyx_t_5) < 0) __PYX_ERR(0, 10, __pyx_L1_error)
    __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
  }
+11: print(cy_total, py_total)
  __pyx_t_5 = PyFloat_FromDouble(__pyx_v_46_cython_magic_6c1c529595d0157b648d279c74560506_cy_total); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 11, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_5);
  __Pyx_GetModuleGlobalName(__pyx_t_3, __pyx_n_s_py_total); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 11, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_3);
  __pyx_t_2 = PyTuple_New(2); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 11, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_2);
  __Pyx_GIVEREF(__pyx_t_5);
  PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_5);
  __Pyx_GIVEREF(__pyx_t_3);
  PyTuple_SET_ITEM(__pyx_t_2, 1, __pyx_t_3);
  __pyx_t_5 = 0;
  __pyx_t_3 = 0;
  __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_print, __pyx_t_2, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 11, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_3);
  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
%%cython -a
cdef struct Grail:
    int age
    float volume
cdef union Food:
    char *spam
    float *eggs
cdef enum CheeseType:
    cheddar, edam,
    camembert
cdef enum CheeseState:
    hard = 1
    soft = 2
    runny = 3
cdef Grail holy
holy.age    = 500
holy.volume = 10.0
print (holy.age, holy.volume)
500 10.0
Cython: _cython_magic_d8a0c5a491b5046e35e0044f8c9c4e89.pyx

Generated by Cython 0.29.32

Yellow lines hint at Python interaction.
Click on a line that starts with a "+" to see the C code that Cython generated for it.

+01: cdef struct Grail:
struct __pyx_t_46_cython_magic_d8a0c5a491b5046e35e0044f8c9c4e89_Grail {
  int age;
  float volume;
};
 02:     int age
 03:     float volume
 04: cdef union Food:
 05:     char *spam
 06:     float *eggs
+07: cdef enum CheeseType:
enum __pyx_t_46_cython_magic_d8a0c5a491b5046e35e0044f8c9c4e89_CheeseType {
  __pyx_e_46_cython_magic_d8a0c5a491b5046e35e0044f8c9c4e89_cheddar,
  __pyx_e_46_cython_magic_d8a0c5a491b5046e35e0044f8c9c4e89_edam,
  __pyx_e_46_cython_magic_d8a0c5a491b5046e35e0044f8c9c4e89_camembert
};
 08:     cheddar, edam,
 09:     camembert
+10: cdef enum CheeseState:
enum __pyx_t_46_cython_magic_d8a0c5a491b5046e35e0044f8c9c4e89_CheeseState {
  __pyx_e_46_cython_magic_d8a0c5a491b5046e35e0044f8c9c4e89_hard = 1,
  __pyx_e_46_cython_magic_d8a0c5a491b5046e35e0044f8c9c4e89_soft = 2,
  __pyx_e_46_cython_magic_d8a0c5a491b5046e35e0044f8c9c4e89_runny = 3
};
 11:     hard = 1
 12:     soft = 2
 13:     runny = 3
 14: cdef Grail holy
+15: holy.age    = 500
  __pyx_v_46_cython_magic_d8a0c5a491b5046e35e0044f8c9c4e89_holy.age = 0x1F4;
+16: holy.volume = 10.0
  __pyx_v_46_cython_magic_d8a0c5a491b5046e35e0044f8c9c4e89_holy.volume = 10.0;
+17: print (holy.age, holy.volume)
  __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_46_cython_magic_d8a0c5a491b5046e35e0044f8c9c4e89_holy.age); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 17, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  __pyx_t_2 = PyFloat_FromDouble(__pyx_v_46_cython_magic_d8a0c5a491b5046e35e0044f8c9c4e89_holy.volume); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 17, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_2);
  __pyx_t_3 = PyTuple_New(2); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 17, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_3);
  __Pyx_GIVEREF(__pyx_t_1);
  PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_1);
  __Pyx_GIVEREF(__pyx_t_2);
  PyTuple_SET_ITEM(__pyx_t_3, 1, __pyx_t_2);
  __pyx_t_1 = 0;
  __pyx_t_2 = 0;
  __pyx_t_2 = __Pyx_PyObject_Call(__pyx_builtin_print, __pyx_t_3, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 17, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_2);
  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;

Cython Functions#

Use cdef to define a Cython function.

  • Cython function can accept either (inclusive) Python and C values as well as return either Python or C values,

  • Within a Cython module Python and Cython functions can call each other freely. However, only Python functions can be called from outside the module by Python code. (i.e. importing/exporting a Cython module into some Python code)

cpdef define a Cython function with a simple Python wrapper. However, when called from Cython the Cython / C code is called directly, bypassing the Python wrapper.

Writing pure code in Cython gives a small speed boost. Note that none of the code below is Cython-specific. Just add .pyx instead of .py extension.

%%file cython_f_example.pyx
def f(x):
    return -4*x**3 +3*x**2 +2*x
def integrate_f(a, b, N):
    s  = 0
    dx = (b - a) / (N-1)
    for i in range(N-1):
        x = a + i*dx
        s += (f(x)+f(x+dx))/2
    return s*dx
Writing cython_f_example.pyx

Cython Compilation#

  • The .pyx source file is compiled by Cython to a .c file.

  • The .c source file contains the code of a Python extension module.

  • The .c file is compiled by a C compiler to a .so (shared object library) file which can be imported directly into a Python session.

Build with CMake#

project(cython_f_example CXX)
include(UseCython)  # Load Cython functions
# Set C++ output
set_source_file_properties(cython_f_example.pyx PROPERTIES CYTHON_IS_CXX TRUE )
# Build the extension module
cython_add_module( modname cython_f_example.pyx cython_f_example.cpp )

C/C++ generation with cython application#

cython -3 cython_f_example.pyx   # create the C file for Python 3
cython -3 --cplus cython_f_example.pyx  # create the C++ file for Python 3

build with a C/C++ compiler#

To build use the Makefile:

CC=gcc
CFLAGS=`python-config --cflags` 
LDFLAGS=`python-config --ldflags`
cython_f_example:
	 ${CC} -c $@.c ${CFLAGS}
	 ${CC} $@.o -o $@.so -shared ${LDFLAGS}

Import the module in Python session

import cython_f_example

pyximport#

import Cython .pyx files as if they were .py files:

import pyximport
pyximport.install()
import cython_f_example
%timeit cython_f_example.integrate_f(-1,1,10**3)
print(cython_f_example.integrate_f(-1,1,1000))
553 µs ± 67.6 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
2.0000040080120174

Building a Cython module using distutils#

Create the setup.py script:

%%file setup.py
from distutils.core import setup
from Cython.Build import cythonize

setup(
  name = 'Cython Example Integrate f Function',
  ext_modules = cythonize("cython_f_example.pyx"),
)
Writing setup.py
%run setup.py  build_ext --inplace --quiet
Compiling cython_f_example.pyx because it changed.
[1/1] Cythonizing cython_f_example.pyx
running build_ext
building 'cython_f_example' extension
INFO: C compiler: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC
creating build
creating build/temp.linux-x86_64-cpython-310
INFO: compile options: '-I/usr/share/miniconda3/envs/runenv/include/python3.10 -c'
INFO: x86_64-conda-linux-gnu-cc: cython_f_example.c
creating build/lib.linux-x86_64-cpython-310
INFO: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -shared -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include build/temp.linux-x86_64-cpython-310/cython_f_example.o -o build/lib.linux-x86_64-cpython-310/cython_f_example.cpython-310-x86_64-linux-gnu.so
copying build/lib.linux-x86_64-cpython-310/cython_f_example.cpython-310-x86_64-linux-gnu.so -> 
<Figure size 1000x600 with 0 Axes>
from cython_f_example import integrate_f
%timeit integrate_f(-1,1,10**3)
integrate_f(-1,1,10**3)
552 µs ± 202 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
2.0000040080120174

Why is it faster with Cython ?#

  • Python code is interpreted at every execution to machine code.

  • Compiled C code is already in machine code.

  • C is a statically-typed language. It gives to the compiler more information which allows it to optimize both computations and memory access.

  • To add two variables, Python checks the type before calling the right add function and store it to a value that can be new.

  • C just add the variables and return the result.

Add Cython types#

We coerce Python types to C types when calling the function. Still a “Python function” so callable from the global namespace.

%%cython
def f(x):
    return -4*x**3 +3*x**2 +2*x
def cy_integrate_f(double a, double b, int N):
    cdef int i
    cdef double s, x, dx
    s  = 0
    dx = (b - a) / (N-1)
    for i in range(N-1):
        x = a + i*dx
        s += (f(x)+f(x+dx))/2
    return s*dx
building '_cython_magic_e2206c17cfc7df224a573bc547e7c635' extension
INFO: C compiler: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC
INFO: compile options: '-I/usr/share/miniconda3/envs/runenv/include/python3.10 -c'
INFO: x86_64-conda-linux-gnu-cc: /home/runner/.cache/ipython/cython/_cython_magic_e2206c17cfc7df224a573bc547e7c635.c
INFO: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -shared -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include /home/runner/.cache/ipython/cython/home/runner/.cache/ipython/cython/_cython_magic_e2206c17cfc7df224a573bc547e7c635.o -o /home/runner/.cache/ipython/cython/_cython_magic_e2206c17cfc7df224a573bc547e7c635.cpython-310-x86_64-linux-gnu.so
  • typing the iterator variable i with C semantics, tells Cython to compile the for-loop to pure C code.

  • typing a, s and dx is important as they are involved in arithmetic within the for-loop

  • Cython type declarations can make the source code less readable

  • Do not use them without good reason, i.e. only in performance critical sections.

%timeit cy_integrate_f(-1,1,10**3)
print(cy_integrate_f(-1,1,1000))
483 µs ± 1.19 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
2.0000040080120174

Finally, we integrate a Cython function instead of a Python function. This eliminates the Python-C conversion at the function call as seen above thus giving a pure Cython/C algorithm.

The primary downside is not being allowed to call the function cy_f, from Python unless cpdef is used.

%%cython
cdef double cy_f(double x):
    return -4*x**3 +3*x**2 +2*x
def cycy_integrate_f(double a, double b, int N):
    cdef int i
    cdef double s, x, dx
    s  = 0
    dx = (b - a) / (N-1)
    for i in range(N-1):
        x = a + i*dx
        s += (cy_f(x)+cy_f(x+dx))/2
    return s*dx
building '_cython_magic_771b85cb3308c54a0a536b0ad92f7de6' extension
INFO: C compiler: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC
INFO: compile options: '-I/usr/share/miniconda3/envs/runenv/include/python3.10 -c'
INFO: x86_64-conda-linux-gnu-cc: /home/runner/.cache/ipython/cython/_cython_magic_771b85cb3308c54a0a536b0ad92f7de6.c
INFO: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -shared -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include /home/runner/.cache/ipython/cython/home/runner/.cache/ipython/cython/_cython_magic_771b85cb3308c54a0a536b0ad92f7de6.o -o /home/runner/.cache/ipython/cython/_cython_magic_771b85cb3308c54a0a536b0ad92f7de6.cpython-310-x86_64-linux-gnu.so
%timeit cycy_integrate_f(-1,1,10**3)
print(cycy_integrate_f(-1,1,1000))
50.2 µs ± 10.7 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
2.0000040080120174

Exercise : Cythonize the trivial exponential function.#

%%cython -a
def exp_python(x,terms=50):
    sum = 0.
    power = 1.
    fact = 1.
    for i in range(terms):
        sum += power/fact
        power *= x
        fact *= i+1
    return sum
building '_cython_magic_ad9ee808f280caa4b8920e8bbc858924' extension
INFO: C compiler: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC
INFO: compile options: '-I/usr/share/miniconda3/envs/runenv/include/python3.10 -c'
INFO: x86_64-conda-linux-gnu-cc: /home/runner/.cache/ipython/cython/_cython_magic_ad9ee808f280caa4b8920e8bbc858924.c
INFO: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -shared -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include /home/runner/.cache/ipython/cython/home/runner/.cache/ipython/cython/_cython_magic_ad9ee808f280caa4b8920e8bbc858924.o -o /home/runner/.cache/ipython/cython/_cython_magic_ad9ee808f280caa4b8920e8bbc858924.cpython-310-x86_64-linux-gnu.so
Cython: _cython_magic_ad9ee808f280caa4b8920e8bbc858924.pyx

Generated by Cython 0.29.32

Yellow lines hint at Python interaction.
Click on a line that starts with a "+" to see the C code that Cython generated for it.

+1: def exp_python(x,terms=50):
/* Python wrapper */
static PyObject *__pyx_pw_46_cython_magic_ad9ee808f280caa4b8920e8bbc858924_1exp_python(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
static PyMethodDef __pyx_mdef_46_cython_magic_ad9ee808f280caa4b8920e8bbc858924_1exp_python = {"exp_python", (PyCFunction)(void*)(PyCFunctionWithKeywords)__pyx_pw_46_cython_magic_ad9ee808f280caa4b8920e8bbc858924_1exp_python, METH_VARARGS|METH_KEYWORDS, 0};
static PyObject *__pyx_pw_46_cython_magic_ad9ee808f280caa4b8920e8bbc858924_1exp_python(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {
  PyObject *__pyx_v_x = 0;
  PyObject *__pyx_v_terms = 0;
  PyObject *__pyx_r = 0;
  __Pyx_RefNannyDeclarations
  __Pyx_RefNannySetupContext("exp_python (wrapper)", 0);
  {
    static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_x,&__pyx_n_s_terms,0};
    PyObject* values[2] = {0,0};
    values[1] = ((PyObject *)__pyx_int_50);
    if (unlikely(__pyx_kwds)) {
      Py_ssize_t kw_args;
      const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args);
      switch (pos_args) {
        case  2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
        CYTHON_FALLTHROUGH;
        case  1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
        CYTHON_FALLTHROUGH;
        case  0: break;
        default: goto __pyx_L5_argtuple_error;
      }
      kw_args = PyDict_Size(__pyx_kwds);
      switch (pos_args) {
        case  0:
        if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_x)) != 0)) kw_args--;
        else goto __pyx_L5_argtuple_error;
        CYTHON_FALLTHROUGH;
        case  1:
        if (kw_args > 0) {
          PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_terms);
          if (value) { values[1] = value; kw_args--; }
        }
      }
      if (unlikely(kw_args > 0)) {
        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "exp_python") < 0)) __PYX_ERR(0, 1, __pyx_L3_error)
      }
    } else {
      switch (PyTuple_GET_SIZE(__pyx_args)) {
        case  2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
        CYTHON_FALLTHROUGH;
        case  1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
        break;
        default: goto __pyx_L5_argtuple_error;
      }
    }
    __pyx_v_x = values[0];
    __pyx_v_terms = values[1];
  }
  goto __pyx_L4_argument_unpacking_done;
  __pyx_L5_argtuple_error:;
  __Pyx_RaiseArgtupleInvalid("exp_python", 0, 1, 2, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 1, __pyx_L3_error)
  __pyx_L3_error:;
  __Pyx_AddTraceback("_cython_magic_ad9ee808f280caa4b8920e8bbc858924.exp_python", __pyx_clineno, __pyx_lineno, __pyx_filename);
  __Pyx_RefNannyFinishContext();
  return NULL;
  __pyx_L4_argument_unpacking_done:;
  __pyx_r = __pyx_pf_46_cython_magic_ad9ee808f280caa4b8920e8bbc858924_exp_python(__pyx_self, __pyx_v_x, __pyx_v_terms);
  int __pyx_lineno = 0;
  const char *__pyx_filename = NULL;
  int __pyx_clineno = 0;

  /* function exit code */
  __Pyx_RefNannyFinishContext();
  return __pyx_r;
}

static PyObject *__pyx_pf_46_cython_magic_ad9ee808f280caa4b8920e8bbc858924_exp_python(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_x, PyObject *__pyx_v_terms) {
  PyObject *__pyx_v_sum = NULL;
  PyObject *__pyx_v_power = NULL;
  PyObject *__pyx_v_fact = NULL;
  PyObject *__pyx_v_i = NULL;
  PyObject *__pyx_r = NULL;
  __Pyx_RefNannyDeclarations
  __Pyx_RefNannySetupContext("exp_python", 0);
/* … */
  /* function exit code */
  __pyx_L1_error:;
  __Pyx_XDECREF(__pyx_t_1);
  __Pyx_XDECREF(__pyx_t_2);
  __Pyx_XDECREF(__pyx_t_5);
  __Pyx_AddTraceback("_cython_magic_ad9ee808f280caa4b8920e8bbc858924.exp_python", __pyx_clineno, __pyx_lineno, __pyx_filename);
  __pyx_r = NULL;
  __pyx_L0:;
  __Pyx_XDECREF(__pyx_v_sum);
  __Pyx_XDECREF(__pyx_v_power);
  __Pyx_XDECREF(__pyx_v_fact);
  __Pyx_XDECREF(__pyx_v_i);
  __Pyx_XGIVEREF(__pyx_r);
  __Pyx_RefNannyFinishContext();
  return __pyx_r;
}
/* … */
  __pyx_tuple_ = PyTuple_Pack(6, __pyx_n_s_x, __pyx_n_s_terms, __pyx_n_s_sum, __pyx_n_s_power, __pyx_n_s_fact, __pyx_n_s_i); if (unlikely(!__pyx_tuple_)) __PYX_ERR(0, 1, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_tuple_);
  __Pyx_GIVEREF(__pyx_tuple_);
/* … */
  __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_46_cython_magic_ad9ee808f280caa4b8920e8bbc858924_1exp_python, NULL, __pyx_n_s_cython_magic_ad9ee808f280caa4b8); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 1, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  if (PyDict_SetItem(__pyx_d, __pyx_n_s_exp_python, __pyx_t_1) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+2:     sum = 0.
  __Pyx_INCREF(__pyx_float_0_);
  __pyx_v_sum = __pyx_float_0_;
+3:     power = 1.
  __Pyx_INCREF(__pyx_float_1_);
  __pyx_v_power = __pyx_float_1_;
+4:     fact = 1.
  __Pyx_INCREF(__pyx_float_1_);
  __pyx_v_fact = __pyx_float_1_;
+5:     for i in range(terms):
  __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_builtin_range, __pyx_v_terms); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 5, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  if (likely(PyList_CheckExact(__pyx_t_1)) || PyTuple_CheckExact(__pyx_t_1)) {
    __pyx_t_2 = __pyx_t_1; __Pyx_INCREF(__pyx_t_2); __pyx_t_3 = 0;
    __pyx_t_4 = NULL;
  } else {
    __pyx_t_3 = -1; __pyx_t_2 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 5, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_2);
    __pyx_t_4 = Py_TYPE(__pyx_t_2)->tp_iternext; if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 5, __pyx_L1_error)
  }
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  for (;;) {
    if (likely(!__pyx_t_4)) {
      if (likely(PyList_CheckExact(__pyx_t_2))) {
        if (__pyx_t_3 >= PyList_GET_SIZE(__pyx_t_2)) break;
        #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
        __pyx_t_1 = PyList_GET_ITEM(__pyx_t_2, __pyx_t_3); __Pyx_INCREF(__pyx_t_1); __pyx_t_3++; if (unlikely(0 < 0)) __PYX_ERR(0, 5, __pyx_L1_error)
        #else
        __pyx_t_1 = PySequence_ITEM(__pyx_t_2, __pyx_t_3); __pyx_t_3++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 5, __pyx_L1_error)
        __Pyx_GOTREF(__pyx_t_1);
        #endif
      } else {
        if (__pyx_t_3 >= PyTuple_GET_SIZE(__pyx_t_2)) break;
        #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
        __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_2, __pyx_t_3); __Pyx_INCREF(__pyx_t_1); __pyx_t_3++; if (unlikely(0 < 0)) __PYX_ERR(0, 5, __pyx_L1_error)
        #else
        __pyx_t_1 = PySequence_ITEM(__pyx_t_2, __pyx_t_3); __pyx_t_3++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 5, __pyx_L1_error)
        __Pyx_GOTREF(__pyx_t_1);
        #endif
      }
    } else {
      __pyx_t_1 = __pyx_t_4(__pyx_t_2);
      if (unlikely(!__pyx_t_1)) {
        PyObject* exc_type = PyErr_Occurred();
        if (exc_type) {
          if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear();
          else __PYX_ERR(0, 5, __pyx_L1_error)
        }
        break;
      }
      __Pyx_GOTREF(__pyx_t_1);
    }
    __Pyx_XDECREF_SET(__pyx_v_i, __pyx_t_1);
    __pyx_t_1 = 0;
/* … */
  }
  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+6:         sum += power/fact
    __pyx_t_1 = __Pyx_PyNumber_Divide(__pyx_v_power, __pyx_v_fact); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 6, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_1);
    __pyx_t_5 = PyNumber_InPlaceAdd(__pyx_v_sum, __pyx_t_1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 6, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_5);
    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
    __Pyx_DECREF_SET(__pyx_v_sum, __pyx_t_5);
    __pyx_t_5 = 0;
+7:         power *= x
    __pyx_t_5 = PyNumber_InPlaceMultiply(__pyx_v_power, __pyx_v_x); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 7, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_5);
    __Pyx_DECREF_SET(__pyx_v_power, __pyx_t_5);
    __pyx_t_5 = 0;
+8:         fact *= i+1
    __pyx_t_5 = __Pyx_PyInt_AddObjC(__pyx_v_i, __pyx_int_1, 1, 0, 0); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 8, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_5);
    __pyx_t_1 = PyNumber_InPlaceMultiply(__pyx_v_fact, __pyx_t_5); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 8, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_1);
    __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
    __Pyx_DECREF_SET(__pyx_v_fact, __pyx_t_1);
    __pyx_t_1 = 0;
+9:     return sum
  __Pyx_XDECREF(__pyx_r);
  __Pyx_INCREF(__pyx_v_sum);
  __pyx_r = __pyx_v_sum;
  goto __pyx_L0;
%timeit exp_python(1.,50)
3.65 µs ± 2.44 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
%%cython

# %load solutions/cython/exponential.pyx
#cython: profile=False
#cython: cdivision=True
def exp_cython(double x, int terms = 50):
   cdef double sum
   cdef double power
   cdef double fact
   cdef int i
   sum = 0.
   power = 1.
   fact = 1.
   for i in range(terms):
      sum += power/fact
      power *= x
      fact *= i+1
   return sum
building '_cython_magic_68081c7e8272dfed20aa464238eed323' extension
INFO: C compiler: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC
INFO: compile options: '-I/usr/share/miniconda3/envs/runenv/include/python3.10 -c'
INFO: x86_64-conda-linux-gnu-cc: /home/runner/.cache/ipython/cython/_cython_magic_68081c7e8272dfed20aa464238eed323.c
INFO: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -shared -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include /home/runner/.cache/ipython/cython/home/runner/.cache/ipython/cython/_cython_magic_68081c7e8272dfed20aa464238eed323.o -o /home/runner/.cache/ipython/cython/_cython_magic_68081c7e8272dfed20aa464238eed323.cpython-310-x86_64-linux-gnu.so
%timeit exp_cython(1.,50)
136 ns ± 0.0385 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)

Cython and Numpy#

The Numpy library contains many fast numerics routines. Their speed comes from manipulating the low-level C-arrays that the numpy.array object wraps rather than computing over slow Python lists. Using Cython one can access those low-level arrays and implement their own fast algorithms while allowing the easy interaction afforded by Python + Numpy.

The examples below are various implementations of the naive matrix multiplication algorithm. We will start with a pure Python implementation and then incrementally add structures that allow Cython to exploit the low-level speed of the numpy.array object.

Pure Python implementation compiled in Cython without specific optimizations.#

%%cython
def matmul1(A, B, out=None):
    assert A.shape[1] == B.shape[0]
    for i in range(A.shape[0]):
        for j in range(B.shape[1]):
            s = 0
            for k in range(A.shape[1]):
                s += A[i,k] * B[k,j]
            out[i,j] = s
    return out
building '_cython_magic_dd429e0ab02e8e1848cbe425d676f5ab' extension
INFO: C compiler: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC
INFO: compile options: '-I/usr/share/miniconda3/envs/runenv/include/python3.10 -c'
INFO: x86_64-conda-linux-gnu-cc: /home/runner/.cache/ipython/cython/_cython_magic_dd429e0ab02e8e1848cbe425d676f5ab.c
INFO: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -shared -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include /home/runner/.cache/ipython/cython/home/runner/.cache/ipython/cython/_cython_magic_dd429e0ab02e8e1848cbe425d676f5ab.o -o /home/runner/.cache/ipython/cython/_cython_magic_dd429e0ab02e8e1848cbe425d676f5ab.cpython-310-x86_64-linux-gnu.so

Import numpy as a Cython module#

We now take advantage of the ability to access the underlying C arrays in the numpy.array object from Cython, thanks to a special numpy.pxd file included with Cython. (The Cython developers worked closely with Numpy developers to make this optimal.)

To begin with, we have to cimport numpy: that is, import numpy as a Cython module rather than a Python module. To do so, simply type:

cimport numpy as np

Another important thing to note is the type of Numpy indexers. There is a special Numpy variable type used for numpy.array indices called Py_ssize_t. To take full advantage of the speedups that Cython can provide we should make sure to type the variables used for indexing as such.

%%cython
import numpy as np
cimport numpy as np
ctypedef np.float64_t dtype_t      # shorthand type. easy to change
def matmul2(np.ndarray[dtype_t, ndim=2] A,
            np.ndarray[dtype_t, ndim=2] B,
            np.ndarray[dtype_t, ndim=2] out=None):
    cdef Py_ssize_t i, j, k
    cdef dtype_t s
    assert A.shape[1] == B.shape[0]
    for i in range(A.shape[0]):
        for j in range(B.shape[1]):
            s = 0
            for k in range(A.shape[1]):
                s += A[i,k] * B[k,j]
            out[i,j] = s
    return out
building '_cython_magic_4e45a0b7bfe01179c777581a00217466' extension
INFO: C compiler: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC
INFO: compile options: '-I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/include/python3.10 -c'
INFO: x86_64-conda-linux-gnu-cc: /home/runner/.cache/ipython/cython/_cython_magic_4e45a0b7bfe01179c777581a00217466.c
In file included from /usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/ndarraytypes.h:1948,
                 from /usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/ndarrayobject.h:12,
                 from /usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/arrayobject.h:5,
                 from /home/runner/.cache/ipython/cython/_cython_magic_4e45a0b7bfe01179c777581a00217466.c:769:
/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h:17:2: warning: #warning "Using deprecated NumPy API, disable it with " "#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION" [-Wcpp]
   17 | #warning "Using deprecated NumPy API, disable it with " \
      |  ^~~~~~~
INFO: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -shared -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include /home/runner/.cache/ipython/cython/home/runner/.cache/ipython/cython/_cython_magic_4e45a0b7bfe01179c777581a00217466.o -o /home/runner/.cache/ipython/cython/_cython_magic_4e45a0b7bfe01179c777581a00217466.cpython-310-x86_64-linux-gnu.so
import numpy as np
from timeit import timeit
A = np.random.random_sample((64,64))
B = np.random.random_sample((64,64))
C = np.zeros((64,64))
%timeit matmul1(A,B,C)
70.5 ms ± 43.5 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit matmul2(A,B,C)
359 µs ± 55.1 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

Tuning indexing#

The array lookups are still slowed down by two factors:

  • Bounds checking is performed.

  • Negative indices are checked for and handled correctly.

The code doesn’t use negative indices, and always access to arrays within bounds. We can add a decorator to disable bounds checking:

%%cython
cimport cython                                       # cython tools
import numpy as np
cimport numpy as np
ctypedef np.float64_t dtype_t
@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False)  # turn off negative index wrapping for entire function
def matmul3(np.ndarray[dtype_t, ndim=2] A,
            np.ndarray[dtype_t, ndim=2] B,
            np.ndarray[dtype_t, ndim=2] out=None):
    cdef Py_ssize_t i, j, k
    cdef dtype_t s
    assert A.shape[1] == B.shape[0]
    for i in range(A.shape[0]):
        for j in range(B.shape[1]):
            s = 0
            for k in range(A.shape[1]):
                s += A[i,k] * B[k,j]
            out[i,j] = s
    return out
building '_cython_magic_e56092fb4727525b8c555f699543ff80' extension
INFO: C compiler: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC
INFO: compile options: '-I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/include/python3.10 -c'
INFO: x86_64-conda-linux-gnu-cc: /home/runner/.cache/ipython/cython/_cython_magic_e56092fb4727525b8c555f699543ff80.c
In file included from /usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/ndarraytypes.h:1948,
                 from /usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/ndarrayobject.h:12,
                 from /usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/arrayobject.h:5,
                 from /home/runner/.cache/ipython/cython/_cython_magic_e56092fb4727525b8c555f699543ff80.c:770:
/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h:17:2: warning: #warning "Using deprecated NumPy API, disable it with " "#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION" [-Wcpp]
   17 | #warning "Using deprecated NumPy API, disable it with " \
      |  ^~~~~~~
INFO: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -shared -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include /home/runner/.cache/ipython/cython/home/runner/.cache/ipython/cython/_cython_magic_e56092fb4727525b8c555f699543ff80.o -o /home/runner/.cache/ipython/cython/_cython_magic_e56092fb4727525b8c555f699543ff80.cpython-310-x86_64-linux-gnu.so
%timeit matmul3(A,B,C)
252 µs ± 73.6 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

Cython Build Options#

  • boundcheck(True,False) : array bounds checking

  • wraparound(True,False) : negative indexing.

  • initializedcheck(True,False): checks that a memoryview is initialized

  • nonecheck(True,False) : Check if one argument is None

  • overflowcheck(True,False) : Check if int are too big

  • cdivision(True,False) : If False, adjust the remainder and quotient operators C types to match those of Python ints. Could be very effective when it is set to True.

  • profile (True / False) : Write hooks for Python profilers into the compiled C code. Default is False.

Cython Compiler directives

Numpy objects with external C program.#

Note that this can actually be slower because the C function is not the best implementation of matrix multiplication. Call cblas with same technique is an interesting exercise.

%%file mydgemm.c 
void my_dgemm( int m, int n, int k, 
              double a[m][n], double b[n][k], float c[m][k] )
{
  double ab = 0;
  for( int j = 0 ; j < m ; j++ ) {
    for( int i = 0 ; i < k ; i++ ) {
      for( int l = 0 ; l < n ; l++ ){
        ab += a[j][l] * b[l][i];
      }
      c[j][i] = ab ;
      ab = 0;
    }
  }
}
Writing mydgemm.c
  • The np.ndarray[double, ndim=2, mode="c"] assures that you get a C-contiguous numpy array of doubles

  • The &input[0,0] passed in the address of the beginning of the data array.

from pyximport import install
import os
here = os.getcwd()
%%cython -I {here}
# do not forget to change the file path
cdef extern from "mydgemm.c":
    void my_dgemm (int m, int n, int k, 
                          double *A, double *B, double *C)
cimport cython
import numpy as np
cimport numpy as np
ctypedef np.float64_t dtype_t
@cython.boundscheck(False)
@cython.wraparound(False) 
def matmul4(np.ndarray[dtype_t, ndim=2, mode="c"] A,
            np.ndarray[dtype_t, ndim=2, mode="c"] B,
            np.ndarray[dtype_t, ndim=2, mode="c"] C=None):
    cdef int m = A.shape[0]
    cdef int n = A.shape[1]
    cdef int k = B.shape[1]
    cdef dtype_t s
    
    my_dgemm(m, n, k, &A[0,0], &B[0,0], &C[0,0])
                                                  
    return C
building '_cython_magic_4341c8542397f443eab54ed5120d50e2' extension
INFO: C compiler: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC
INFO: compile options: '-I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/home/runner/work/python-notebooks/python-notebooks/notebooks -I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/include/python3.10 -c'
INFO: x86_64-conda-linux-gnu-cc: /home/runner/.cache/ipython/cython/_cython_magic_4341c8542397f443eab54ed5120d50e2.c
In file included from /usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/ndarraytypes.h:1948,
                 from /usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/ndarrayobject.h:12,
                 from /usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/arrayobject.h:5,
                 from /home/runner/.cache/ipython/cython/_cython_magic_4341c8542397f443eab54ed5120d50e2.c:774:
/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h:17:2: warning: #warning "Using deprecated NumPy API, disable it with " "#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION" [-Wcpp]
   17 | #warning "Using deprecated NumPy API, disable it with " \
      |  ^~~~~~~
/home/runner/.cache/ipython/cython/_cython_magic_4341c8542397f443eab54ed5120d50e2.c: In function '__pyx_pf_46_cython_magic_4341c8542397f443eab54ed5120d50e2_matmul4':
/home/runner/.cache/ipython/cython/_cython_magic_4341c8542397f443eab54ed5120d50e2.c:2023:46: warning: passing argument 4 of 'my_dgemm' from incompatible pointer type [-Wincompatible-pointer-types]
 2023 |   my_dgemm(__pyx_v_m, __pyx_v_n, __pyx_v_k, (&(*__Pyx_BufPtrCContig2d(__pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t *, __pyx_pybuffernd_A.rcbuffer->pybuffer.buf, __pyx_t_1, __pyx_pybuffernd_A.diminfo[0].strides, __pyx_t_2, __pyx_pybuffernd_A.diminfo[1].strides))), (&(*__Pyx_BufPtrCContig2d(__pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t *, __pyx_pybuffernd_B.rcbuffer->pybuffer.buf, __pyx_t_3, __pyx_pybuffernd_B.diminfo[0].strides, __pyx_t_4, __pyx_pybuffernd_B.diminfo[1].strides))), (&(*__Pyx_BufPtrCContig2d(__pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t *, __pyx_pybuffernd_C.rcbuffer->pybuffer.buf, __pyx_t_5, __pyx_pybuffernd_C.diminfo[0].strides, __pyx_t_6, __pyx_pybuffernd_C.diminfo[1].strides))));
      |                                             ~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      |                                              |
      |                                              __pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t * {aka double *}
In file included from /home/runner/.cache/ipython/cython/_cython_magic_4341c8542397f443eab54ed5120d50e2.c:771:
/home/runner/work/python-notebooks/python-notebooks/notebooks/mydgemm.c:2:22: note: expected 'double (*)[(sizetype)(n)]' but argument is of type '__pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t *' {aka 'double *'}
    2 |               double a[m][n], double b[n][k], float c[m][k] )
      |               ~~~~~~~^~~~~~~
/home/runner/.cache/ipython/cython/_cython_magic_4341c8542397f443eab54ed5120d50e2.c:2023:286: warning: passing argument 5 of 'my_dgemm' from incompatible pointer type [-Wincompatible-pointer-types]
 2023 |   my_dgemm(__pyx_v_m, __pyx_v_n, __pyx_v_k, (&(*__Pyx_BufPtrCContig2d(__pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t *, __pyx_pybuffernd_A.rcbuffer->pybuffer.buf, __pyx_t_1, __pyx_pybuffernd_A.diminfo[0].strides, __pyx_t_2, __pyx_pybuffernd_A.diminfo[1].strides))), (&(*__Pyx_BufPtrCContig2d(__pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t *, __pyx_pybuffernd_B.rcbuffer->pybuffer.buf, __pyx_t_3, __pyx_pybuffernd_B.diminfo[0].strides, __pyx_t_4, __pyx_pybuffernd_B.diminfo[1].strides))), (&(*__Pyx_BufPtrCContig2d(__pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t *, __pyx_pybuffernd_C.rcbuffer->pybuffer.buf, __pyx_t_5, __pyx_pybuffernd_C.diminfo[0].strides, __pyx_t_6, __pyx_pybuffernd_C.diminfo[1].strides))));
      |                                                                                                                                                                                                                                                                                             ~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      |                                                                                                                                                                                                                                                                                              |
      |                                                                                                                                                                                                                                                                                              __pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t * {aka double *}
In file included from /home/runner/.cache/ipython/cython/_cython_magic_4341c8542397f443eab54ed5120d50e2.c:771:
/home/runner/work/python-notebooks/python-notebooks/notebooks/mydgemm.c:2:38: note: expected 'double (*)[(sizetype)(k)]' but argument is of type '__pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t *' {aka 'double *'}
    2 |               double a[m][n], double b[n][k], float c[m][k] )
      |                               ~~~~~~~^~~~~~~
/home/runner/.cache/ipython/cython/_cython_magic_4341c8542397f443eab54ed5120d50e2.c:2023:526: warning: passing argument 6 of 'my_dgemm' from incompatible pointer type [-Wincompatible-pointer-types]
 2023 |   my_dgemm(__pyx_v_m, __pyx_v_n, __pyx_v_k, (&(*__Pyx_BufPtrCContig2d(__pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t *, __pyx_pybuffernd_A.rcbuffer->pybuffer.buf, __pyx_t_1, __pyx_pybuffernd_A.diminfo[0].strides, __pyx_t_2, __pyx_pybuffernd_A.diminfo[1].strides))), (&(*__Pyx_BufPtrCContig2d(__pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t *, __pyx_pybuffernd_B.rcbuffer->pybuffer.buf, __pyx_t_3, __pyx_pybuffernd_B.diminfo[0].strides, __pyx_t_4, __pyx_pybuffernd_B.diminfo[1].strides))), (&(*__Pyx_BufPtrCContig2d(__pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t *, __pyx_pybuffernd_C.rcbuffer->pybuffer.buf, __pyx_t_5, __pyx_pybuffernd_C.diminfo[0].strides, __pyx_t_6, __pyx_pybuffernd_C.diminfo[1].strides))));
      |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             ~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
      |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              __pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t * {aka double *}
In file included from /home/runner/.cache/ipython/cython/_cython_magic_4341c8542397f443eab54ed5120d50e2.c:771:
/home/runner/work/python-notebooks/python-notebooks/notebooks/mydgemm.c:2:53: note: expected 'float (*)[(sizetype)(k)]' but argument is of type '__pyx_t_46_cython_magic_4341c8542397f443eab54ed5120d50e2_dtype_t *' {aka 'double *'}
    2 |               double a[m][n], double b[n][k], float c[m][k] )
      |                                               ~~~~~~^~~~~~~
INFO: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -shared -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include /home/runner/.cache/ipython/cython/home/runner/.cache/ipython/cython/_cython_magic_4341c8542397f443eab54ed5120d50e2.o -o /home/runner/.cache/ipython/cython/_cython_magic_4341c8542397f443eab54ed5120d50e2.cpython-310-x86_64-linux-gnu.so
%timeit matmul4(A,B,C)
239 µs ± 113 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

Exercise : Find prime numbers < 10000#

# %load solutions/cython/is_prime0.py

def is_prime0(n):
    if n < 4: return True
    if n % 2 == 0 : return False
    k = 3
    while k*k <= n:
        if n % k == 0: return False
        k += 2
    return True
[ p for p in range(20) if is_prime0(p)]
[0, 1, 2, 3, 5, 7, 11, 13, 17, 19]
L = list(range(10000))
%timeit [ p for p in L if is_prime0(p)]
6.09 ms ± 971 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%cython
def is_prime1(n):
    if n < 4: return True
    if n % 2 == 0 : return False
    k = 3
    while k*k <= n:
        if n % k == 0: return False
        k += 2
    return True
building '_cython_magic_aa9811354ee3c4d01a6231f3078a753d' extension
INFO: C compiler: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC
INFO: compile options: '-I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/include/python3.10 -c'
INFO: x86_64-conda-linux-gnu-cc: /home/runner/.cache/ipython/cython/_cython_magic_aa9811354ee3c4d01a6231f3078a753d.c
INFO: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -shared -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include /home/runner/.cache/ipython/cython/home/runner/.cache/ipython/cython/_cython_magic_aa9811354ee3c4d01a6231f3078a753d.o -o /home/runner/.cache/ipython/cython/_cython_magic_aa9811354ee3c4d01a6231f3078a753d.cpython-310-x86_64-linux-gnu.so
[ p for p in range(20) if is_prime1(p)]
[0, 1, 2, 3, 5, 7, 11, 13, 17, 19]
%timeit [p  for p in L if is_prime1(p)]
3.11 ms ± 4.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

Add Cython types without modifying the Python Code#

%%cython
import cython
@cython.locals(n=int, k=int)
def is_prime2(n):
    if n < 4: return True
    if n % 2 == 0 : return False
    k = 3
    while k*k <= n:
        if n % k == 0: return False
        k += 2
    return True
building '_cython_magic_ffcb8d83f7d9187218aa269cf4a20513' extension
INFO: C compiler: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC
INFO: compile options: '-I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/include/python3.10 -c'
INFO: x86_64-conda-linux-gnu-cc: /home/runner/.cache/ipython/cython/_cython_magic_ffcb8d83f7d9187218aa269cf4a20513.c
INFO: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -shared -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include /home/runner/.cache/ipython/cython/home/runner/.cache/ipython/cython/_cython_magic_ffcb8d83f7d9187218aa269cf4a20513.o -o /home/runner/.cache/ipython/cython/_cython_magic_ffcb8d83f7d9187218aa269cf4a20513.cpython-310-x86_64-linux-gnu.so
[ p for p in range(20) if is_prime2(p)]
[0, 1, 2, 3, 5, 7, 11, 13, 17, 19]
%timeit [p for p in L if is_prime2(p) ]
532 µs ± 114 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

Cython function#

%%cython
import cython
cdef bint is_prime3(int n):
    if n < 4: return True
    if n % 2 == 0: return False
    cdef int k = 3
    while k*k <= n:
        if n % k == 0: return False
        k += 2
    return True
def prime_list(L):
    return [p for p in L if is_prime3(p)]
building '_cython_magic_9d904024e8d0b344e2f9c97a0bea1faf' extension
INFO: C compiler: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC
INFO: compile options: '-I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/include/python3.10 -c'
INFO: x86_64-conda-linux-gnu-cc: /home/runner/.cache/ipython/cython/_cython_magic_9d904024e8d0b344e2f9c97a0bea1faf.c
INFO: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -shared -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include /home/runner/.cache/ipython/cython/home/runner/.cache/ipython/cython/_cython_magic_9d904024e8d0b344e2f9c97a0bea1faf.o -o /home/runner/.cache/ipython/cython/_cython_magic_9d904024e8d0b344e2f9c97a0bea1faf.cpython-310-x86_64-linux-gnu.so
prime_list(list(range(20)))
[0, 1, 2, 3, 5, 7, 11, 13, 17, 19]
%timeit prime_list(L)
289 µs ± 51.1 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
%%cython
import cython
from numpy cimport ndarray
import numpy

cdef bint is_prime3(int n):
    if n < 4: return True
    if n % 2 == 0: return False
    cdef int k = 3
    while k*k <= n:
        if n % k == 0: return False
        k += 2
    return True

def prime_array(ndarray[int, ndim=1] L):
    cdef ndarray[int, ndim=1] res = ndarray(shape=(L.shape[0]),dtype=numpy.int32)
    cdef int i
    for i in range(L.shape[0]):
        res[i] = is_prime3(L[i])
    return L[res==1]
building '_cython_magic_fe1737e5897872041914915a6c8ea617' extension
INFO: C compiler: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC
INFO: compile options: '-I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/include/python3.10 -c'
INFO: x86_64-conda-linux-gnu-cc: /home/runner/.cache/ipython/cython/_cython_magic_fe1737e5897872041914915a6c8ea617.c
In file included from /usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/ndarraytypes.h:1948,
                 from /usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/ndarrayobject.h:12,
                 from /usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/arrayobject.h:5,
                 from /home/runner/.cache/ipython/cython/_cython_magic_fe1737e5897872041914915a6c8ea617.c:771:
/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h:17:2: warning: #warning "Using deprecated NumPy API, disable it with " "#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION" [-Wcpp]
   17 | #warning "Using deprecated NumPy API, disable it with " \
      |  ^~~~~~~
INFO: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -shared -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include /home/runner/.cache/ipython/cython/home/runner/.cache/ipython/cython/_cython_magic_fe1737e5897872041914915a6c8ea617.o -o /home/runner/.cache/ipython/cython/_cython_magic_fe1737e5897872041914915a6c8ea617.cpython-310-x86_64-linux-gnu.so
import numpy as np
prime_array(np.arange(20,dtype=np.int32))
array([ 0,  1,  2,  3,  5,  7, 11, 13, 17, 19], dtype=int32)
npL = numpy.array(L,dtype=np.int32)
%timeit prime_array(npL)
281 µs ± 284 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

Using Parallelism#

  • Cython supports native parallelism via OpenMP

  • by default, Python’s Global Interpreter Lock (GIL) prevents that several threads use the Python interpreter simultaneously

  • to use this kind of parallelism, the GIL must be released

If you have a default compiler with openmp support you can use this magic command in your notebook.

%%cython --compile-args=-fopenmp --link-args=-fopenmp
%%file cython_omp.pyx
import cython
from cython.parallel cimport parallel, prange  # import parallel functions
import numpy as np
from numpy cimport ndarray

cdef bint is_prime4(int n) nogil:      #release the gil 
    if n < 4: return True
    if n % 2 == 0: return False
    cdef int k = 3
    while k*k <= n:
        if n % k == 0: return False
        k += 2
    return True

@cython.boundscheck(False)
def prime_array_omp(ndarray[int, ndim=1] L):
    cdef ndarray[int, ndim=1] res = ndarray(shape=(L.shape[0]),dtype=np.int32)
    cdef Py_ssize_t i
    with nogil, parallel(num_threads=4):
        for i in prange(L.shape[0]):     #Parallel loop
            res[i] = is_prime4(L[i])
    return L[res==1]
Writing cython_omp.pyx

To use the OpenMP support, you need to enable OpenMP. For gcc this can be done as follows in a setup.py:

%%file setup.py
from distutils.core import setup
from distutils.extension import Extension
from Cython.Build import cythonize
import os, sys
import numpy

if sys.platform == "darwin": # for omp, use gcc installed with brew
    os.environ["CC"]="gcc-10"
    os.environ["CXX"]="g++-10"

ext_modules = [
    Extension(
        "cython_omp",
        ["cython_omp.pyx"],
        extra_compile_args=['-fopenmp'],
        extra_link_args=['-fopenmp'],
        include_dirs=[numpy.get_include()]
    )
]

setup(
    name='Cython OpenMP Example',
    ext_modules=cythonize(ext_modules),
)
# python setup.py build_ext --inplace
Overwriting setup.py
%run setup.py build_ext --inplace --quiet
Compiling cython_omp.pyx because it changed.
[1/1] Cythonizing cython_omp.pyx
running build_ext
building 'cython_omp' extension
INFO: C compiler: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC -O2 -isystem /usr/share/miniconda3/envs/runenv/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include -fPIC
INFO: compile options: '-I/usr/share/miniconda3/envs/runenv/lib/python3.10/site-packages/numpy/core/include -I/usr/share/miniconda3/envs/runenv/include/python3.10 -c'
extra options: '-fopenmp'
INFO: x86_64-conda-linux-gnu-cc: cython_omp.c
INFO: /usr/share/miniconda3/envs/runenv/bin/x86_64-conda-linux-gnu-cc -shared -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/usr/share/miniconda3/envs/runenv/lib -Wl,-rpath-link,/usr/share/miniconda3/envs/runenv/lib -L/usr/share/miniconda3/envs/runenv/lib -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /usr/share/miniconda3/envs/runenv/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /usr/share/miniconda3/envs/runenv/include build/temp.linux-x86_64-cpython-310/cython_omp.o -o build/lib.linux-x86_64-cpython-310/cython_omp.cpython-310-x86_64-linux-gnu.so -fopenmp
copying build/lib.linux-x86_64-cpython-310/cython_omp.cpython-310-x86_64-linux-gnu.so -> 
from cython_omp import prime_array_omp
prime_array_omp(np.arange(20,dtype=np.int32))
array([ 0,  1,  2,  3,  5,  7, 11, 13, 17, 19], dtype=int32)
%timeit prime_array_omp(npL)
202 µs ± 6.19 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

References#