I would like to optimize this Python code with Cython:
def updated_centers(point, start, center):
return np.array([__cluster_mean(point[start[c]:start[c + 1]], center[c]) for c in range(center.shape[0])])
def __cluster_mean(point, center):
return (np.sum(point, axis=0) + center) / (point.shape[0] + 1)
My Cython code:
cimport cython
cimport numpy as np
import numpy as np
# C-compatible Numpy integer type.
DTYPE = np.intc
@cython.boundscheck(False) # Deactivate bounds checking
@cython.wraparound(False) # Deactivate negative indexing.
@cython.cdivision(True) # Deactivate division by 0 checking.
def updated_centers(double [:,:] point, int [:] label, double [:,:] center):
if (point.shape[0] != label.size) or (point.shape[1] != center.shape[1]) or (center.shape[0] > point.shape[0]):
raise ValueError("Incompatible dimensions")
cdef Py_ssize_t i, c, j
cdef Py_ssize_t n = point.shape[0]
cdef Py_ssize_t m = point.shape[1]
cdef Py_ssize_t nc = center.shape[0]
# Updated centers. We accumulate point and center contributions into this array.
# Start by adding the (unscaled) center contributions.
new_center = np.zeros([nc, m])
new_center[:] = center
# Counter array. Will contain cluster sizes (including center, whose contribution
# is again added here) at the end of the point loop.
cluster_size = np.ones([nc], dtype=DTYPE)
# Add point contributions.
for i in range(n):
c = label[i]
cluster_size[c] += 1
for j in range(m):
new_center[c, j] += point[i, j]
# Scale center+point summation to be a mean.
for c in range(nc):
for j in range(m):
new_center[c, j] /= cluster_size[c]
return new_center
However, Cython is slower than python:
Python: %timeit f.updated_centers(point, start, center)
331 ms ± 11.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Cython: %timeit fx.updated_centers(point, label, center)
433 ms ± 14 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
The HTML reveals that almost all lines are yellow: allocating the array, +=, /=. I expected Cython to be an order of magnitude faster. What am I doing wrong?