I would like to optimize this Python code with Cython:

import numpy as np

def updated_centers(point, start, center):
    return np.array([__cluster_mean(point[start[c]:start[c + 1]], center[c]) for c in range(center.shape[0])])

def __cluster_mean(point, center):
    return (np.sum(point, axis=0) + center) / (point.shape[0] + 1)
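For reference, a representative setup (the sizes here are illustrative; the answers below benchmark with the same shapes):

import numpy as np

n, m = 100000, 5                              # points and dimensions
k = n // 2                                    # number of clusters
point = np.random.rand(n, m)
start = 2 * np.arange(k + 1, dtype=np.intc)   # start[c]:start[c+1] delimits cluster c
center = np.random.rand(k, m)

new_center = updated_centers(point, start, center)  # shape (k, m)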

My Cython code:

cimport cython
cimport numpy as np
import numpy as np

# C-compatible Numpy integer type.
DTYPE = np.intc

@cython.boundscheck(False)  # Deactivate bounds checking
@cython.wraparound(False)   # Deactivate negative indexing.
@cython.cdivision(True)     # Deactivate division by 0 checking.
def updated_centers(double [:,:] point, int [:] label, double [:,:] center):
    if (point.shape[0] != label.size) or (point.shape[1] != center.shape[1]) or (center.shape[0] > point.shape[0]):
        raise ValueError("Incompatible dimensions")

    cdef Py_ssize_t i, c, j
    cdef Py_ssize_t n = point.shape[0]
    cdef Py_ssize_t m = point.shape[1]
    cdef Py_ssize_t nc = center.shape[0]

    # Updated centers. We accumulate point and center contributions into this array.
    # Start by adding the (unscaled) center contributions.
    new_center = np.zeros([nc, m])
    new_center[:] = center

    # Counter array. Will contain cluster sizes (including the center, whose
    # contribution is also counted here) at the end of the point loop.
    cluster_size = np.ones([nc], dtype=DTYPE)

    # Add point contributions.
    for i in range(n):
        c = label[i]
        cluster_size[c] += 1
        for j in range(m):
            new_center[c, j] += point[i, j]

    # Scale the center+point summation to be a mean.
    for c in range(nc):
        for j in range(m):
            new_center[c, j] /= cluster_size[c]

    return new_center
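I build the extension with a standard setup.py along these lines (fx.pyx is the file holding the Cython code above, compiled into the fx module used in the timings; annotate=True produces the HTML report mentioned below):

# setup.py -- minimal build script for the extension above.
from setuptools import setup
from Cython.Build import cythonize
import numpy as np

setup(
    ext_modules=cythonize("fx.pyx", annotate=True),  # annotate=True writes the HTML report
    include_dirs=[np.get_include()],                 # header path needed for "cimport numpy"
)

# Build in place with: python setup.py build_ext --inplace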

However, the Cython version is slower than the Python one:

Python: %timeit f.updated_centers(point, start, center)
331 ms ± 11.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Cython: %timeit fx.updated_centers(point, label, center)
433 ms ± 14 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

The annotated HTML output from cython -a shows that almost all lines are yellow: the array allocation and the += and /= updates. I expected Cython to be an order of magnitude faster. What am I doing wrong?


3 Answers


You need to tell Cython that new_center and cluster_size are typed arrays (memoryviews):

cdef double[:, :] new_center = np.zeros((nc, m))
...
cdef int[:] cluster_size = np.ones((nc,), dtype=DTYPE)
...

Without these type annotations Cython cannot generate efficient C code and has to call into the Python interpreter every time you access those arrays. This is why the lines in the HTML output of cython -a where you access these arrays are yellow.
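Put together, here is a sketch of the full function with those two declarations applied (otherwise identical to the question's code; the return goes through np.asarray so the caller gets an ndarray rather than a raw memoryview):

cimport cython
import numpy as np

DTYPE = np.intc

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
def updated_centers(double [:, :] point, int [:] label, double [:, :] center):
    if (point.shape[0] != label.size) or (point.shape[1] != center.shape[1]) or (center.shape[0] > point.shape[0]):
        raise ValueError("Incompatible dimensions")

    cdef Py_ssize_t i, c, j
    cdef Py_ssize_t n = point.shape[0]
    cdef Py_ssize_t m = point.shape[1]
    cdef Py_ssize_t nc = center.shape[0]

    # Typed memoryviews: element access now compiles to plain C indexing.
    cdef double[:, :] new_center = np.zeros((nc, m))
    cdef int[:] cluster_size = np.ones((nc,), dtype=DTYPE)

    # Seed the accumulator with the (unscaled) center contributions.
    new_center[:, :] = center

    # Add point contributions.
    for i in range(n):
        c = label[i]
        cluster_size[c] += 1
        for j in range(m):
            new_center[c, j] += point[i, j]

    # Scale the center+point summation to be a mean.
    for c in range(nc):
        for j in range(m):
            new_center[c, j] /= cluster_size[c]

    # Convert the memoryview back to an ndarray for the caller.
    return np.asarray(new_center)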

With just these two small modifications we immediately see the speedup we want:

%timeit python_updated_centers(point, start, center)
392 ms ± 41.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

%timeit cython_updated_centers(point, start, center)
1.18 ms ± 145 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

1 Comment

Joe -- works like a charm! I'm getting 828 µs on my machine with the timing reported below (i.e., a further 30x speedup) with the cdef double[:, :] change.

For such simple kernels, you can also use pythran to get nice speedups:

#pythran export updated_centers(float64 [:, :], int32 [:] , float64 [:, :] )
import numpy as np
def updated_centers(point, start, center):
    return np.array([__cluster_mean(point[start[c]:start[c + 1]], center[c]) for c in range(center.shape[0])])

def __cluster_mean(point, center):
    return (np.sum(point, axis=0) + center) / (point.shape[0] + 1)

Compile it with pythran updated_centers.py and one gets the following timings:

Numpy code (same code, not compiled):

$ python -m perf timeit -s 'import numpy as np; n, m = 100000, 5; k = n//2; point = np.random.rand(n, m); start = 2*np.arange(k+1, dtype=np.int32); center=np.random.rand(k, m); from updated_centers import updated_centers' 'updated_centers(point, start, center)'
.....................
Mean +- std dev: 271 ms +- 12 ms

Pythran (after compilation):

$ python -m perf timeit -s 'import numpy as np; n, m = 100000, 5; k = n//2; point = np.random.rand(n, m); start = 2*np.arange(k+1, dtype=np.int32); center=np.random.rand(k, m); from updated_centers import updated_centers' 'updated_centers(point, start, center)'
.....................
Mean +- std dev: 12.8 ms +- 0.3 ms



The key is to structure the Cython code like the Python code, accessing the untyped NumPy arrays only when necessary: each output element is written exactly once, with all the accumulation done in plain C variables.

cimport cython
cimport numpy as np
import numpy as np

# C-compatible Numpy integer type.
DTYPE = np.intc


@cython.boundscheck(False)  # Deactivate bounds checking
@cython.wraparound(False)  # Deactivate negative indexing.
@cython.cdivision(True)  # Deactivate division by 0 checking.
def updated_centers(double [:, :] point, int [:] start, double [:, :] center):
    """Returns the updated list of cluster centers (damped center of mass Pahkira scheme).
    Cluster c (and center[c]) corresponds to the point range point[start[c]:start[c+1]]."""
    if (point.shape[1] != center.shape[1]) or (center.shape[0] > point.shape[0]) or (start.size != center.shape[0] + 1):
        raise ValueError("Incompatible dimensions")

    # Py_ssize_t is the proper C type for Python array indices.
    cdef Py_ssize_t i, c, j, cluster_start, cluster_stop, cluster_size
    cdef Py_ssize_t n = point.shape[0]
    cdef Py_ssize_t m = point.shape[1]
    cdef Py_ssize_t nc = center.shape[0]
    cdef double center_of_mass

    # Updated centers. Each element is written exactly once, after the sum for
    # its cluster has been accumulated in the C variable center_of_mass.
    new_center = np.zeros([nc, m])

    cluster_start = start[0]
    for c in range(nc):
        cluster_stop = start[c + 1]
        cluster_size = cluster_stop - cluster_start + 1  # +1 for the center's own contribution
        for j in range(m):
            center_of_mass = center[c, j]
            for i in range(cluster_start, cluster_stop):
                center_of_mass += point[i, j]
            new_center[c, j] = center_of_mass / cluster_size
        cluster_start = cluster_stop

    return np.asarray(new_center)

With the same API as the original Python version, we get:

n, m = 100000, 5; k = n//2; point = np.random.rand(n, m); start = 2*np.arange(k+1, dtype=np.intc); center=np.random.rand(k, m);

%timeit fx.updated_centers(point, start, center)
31 ms ± 2.15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

%timeit f.updated_centers(point, start, center)
734 ms ± 17.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
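As a quick sanity check (f and fx being the Python and Cython modules from the timings above), the two implementations should agree to floating-point precision:

import numpy as np
assert np.allclose(f.updated_centers(point, start, center),
                   fx.updated_centers(point, start, center))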

