While this is very similar to "Find indexes of matching rows in two 2-D arrays", I don't have the reputation to leave a comment there.
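However, based on that question there appear to be two clear possibilities for a large matrix like yours: a searchsorted-based lookup, and a brute-force search (shown below both as an explicit loop and as a list comprehension):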
def find_rows_searchsorted(a, b):
    """Locate every row of ``a`` inside ``b`` with a sort + binary search.

    Each row is reinterpreted as a single opaque ``np.void`` item so that
    whole rows can be sorted and binary-searched as if they were scalars.
    Rows are therefore compared byte-for-byte after both inputs have been
    converted to a common dtype.

    Parameters
    ----------
    a : array_like, 2-D
        Rows to look up.
    b : array_like, 2-D
        Rows to search in; must have the same number of columns as ``a``.

    Returns
    -------
    numpy.ndarray
        Integer array ``idx`` with one entry per row of ``a`` such that
        ``b[idx[i]]`` is byte-identical to ``a[i]``.

    Raises
    ------
    ValueError
        If the inputs are not 2-D with matching column counts, or if some
        row of ``a`` does not occur in ``b``.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    if a.ndim != 2 or b.ndim != 2 or a.shape[1] != b.shape[1]:
        raise ValueError(
            "`a` and `b` must be 2-D arrays with the same number of columns"
        )

    # Both arrays must share one dtype: the void view below reinterprets raw
    # bytes, so viewing `b` with a row size derived from `a` alone would
    # silently misread it whenever the dtypes differ.
    common = np.result_type(a, b)
    a_rows = np.ascontiguousarray(a, dtype=common)
    b_rows = np.ascontiguousarray(b, dtype=common)

    if a_rows.shape[0] == 0:
        return np.empty(0, dtype=np.intp)
    if b_rows.shape[0] == 0:
        raise ValueError("`b` has no rows to match against")

    row_dt = np.dtype((np.void, common.itemsize * a_rows.shape[1]))
    a_view = a_rows.view(row_dt).ravel()
    b_view = b_rows.view(row_dt).ravel()

    sort_b = np.argsort(b_view)
    where_in_b = np.searchsorted(b_view, a_view, sorter=sort_b)

    # searchsorted returns an insertion point; for a row greater than every
    # row of `b` that is len(b), which is not a valid index. Clamp it and
    # let the equality check below reject it.
    np.minimum(where_in_b, sort_b.size - 1, out=where_in_b)
    result = sort_b[where_in_b]

    # An insertion point only says where the row *would* go, not that it is
    # present, so confirm that every candidate is a real byte-wise match.
    if not np.array_equal(b_rows.view(np.uint8)[result],
                          a_rows.view(np.uint8)):
        raise ValueError("at least one row of `a` does not occur in `b`")
    return result
def find_rows_iterative(a, b):
    """Brute-force lookup: scan ``b`` once for every row of ``a``.

    Parameters
    ----------
    a : numpy.ndarray, 2-D
        Rows to look up.
    b : numpy.ndarray, 2-D
        Rows to search in.

    Returns
    -------
    numpy.ndarray
        Integer array with one entry per row of ``a``; entry ``i`` is the
        index of the first row of ``b`` that equals ``a[i]`` in every column.

    Raises
    ------
    ValueError
        If some row of ``a`` has no equal row in ``b``.
    """
    answer = np.empty(a.shape[0], dtype=int)
    for idx, row in enumerate(a):
        # Indices of the rows of `b` that equal `row` in every column.
        matches = np.flatnonzero(np.equal(b, row).all(axis=1))
        if matches.size == 0:
            raise ValueError("row %d of `a` does not occur in `b`" % idx)
        # Store a scalar (the first hit) rather than the whole match array:
        # assigning an array into a single slot fails when `b` contains
        # duplicate rows and relies on a deprecated size-1 conversion
        # otherwise.
        answer[idx] = matches[0]
    return answer
def find_rows_list_comprehension(a, b):
    """Brute-force lookup of each row of ``a`` in ``b`` as a list comprehension.

    Parameters
    ----------
    a : numpy.ndarray, 2-D
        Rows to look up.
    b : numpy.ndarray, 2-D
        Rows to search in.

    Returns
    -------
    list
        One integer per row of ``a``: the index of the first row of ``b``
        that equals it in every column.

    Raises
    ------
    IndexError
        If some row of ``a`` has no equal row in ``b``.
    """
    # `.all(axis=1)` is essential: without it `np.where(b == x)[0]` lists the
    # rows where *any* single column matches, so a partial match on an
    # earlier row would be returned instead of the true row.
    return [np.where((b == x).all(axis=1))[0][0] for x in a]
A little timing with a 10000-row matrix (10000 × 3) shows that the searchsorted-based method is significantly faster than the brute-force iterative methods:
# Benchmark fixture: `arr2` holds the rows of `arr1` in shuffled order, and
# `new_inds[i]` is the row of `arr1` that ended up at `arr2[i]` -- i.e. the
# answer every find_rows_* function is expected to reproduce.
arr1 = np.random.randn(10000, 3)
new_inds = np.arange(arr1.shape[0])
np.random.shuffle(new_inds)  # shuffles in place
arr2 = arr1[new_inds, :]
np.array_equal(find_rows_searchsorted(arr2, arr1), new_inds)
>> True
np.array_equal(find_rows_iterative(arr2, arr1), new_inds)
>> True
np.array_equal(find_rows_list_comprehension(arr2, arr1), new_inds)
>> True
%timeit find_rows_iterative(arr2, arr1)
>> 1 loops, best of 3: 2.62 s per loop
%timeit find_rows_list_comprehension(arr2, arr1)
>> 1 loops, best of 3: 1.61 s per loop
%timeit find_rows_searchsorted(arr2, arr1)
>> 100 loops, best of 3: 6.53 ms per loop
Based on HYRY's great responses, I also added lexsort and kdball (k-d tree) tests, as well as a test of argsort on structured arrays.
def find_rows_lexsort(a, b):
    """Match rows of ``a`` to rows of ``b`` by their rank in lexsort order.

    Both arrays are lexsorted independently (columns are the sort keys) and
    the row of ``a`` with rank ``k`` is paired with the row of ``b`` with
    rank ``k``. No row contents are compared across the two arrays, so the
    result identifies equal rows only when ``a`` and ``b`` contain the same
    rows in a different order.

    Parameters
    ----------
    a, b : numpy.ndarray, 2-D
        Arrays with the same number of rows.

    Returns
    -------
    numpy.ndarray
        Integer array with one entry per row of ``a``: the index of the
        row of ``b`` that has the same rank.
    """
    order_a = np.lexsort(a.T)
    order_b = np.lexsort(b.T)

    # Invert the sorting permutation of `a`: rank_of_a[i] is the position
    # that row i of `a` occupies once `a` is sorted.
    rank_of_a = np.empty_like(order_a)
    rank_of_a[order_a] = np.arange(order_a.size)

    return order_b[rank_of_a]
def find_rows_argsort(a, b):
    """Match rows of ``a`` to rows of ``b`` by rank, using record-array sorts.

    Each 2-D array is converted to a record array with one field per column,
    then argsorted over all fields in column order. The row of ``a`` with
    rank ``k`` is paired with the row of ``b`` with rank ``k``. No row
    contents are compared across the two arrays, so the result identifies
    equal rows only when ``a`` and ``b`` contain the same rows in a
    different order.

    Parameters
    ----------
    a, b : numpy.ndarray, 2-D
        Arrays with the same number of rows.

    Returns
    -------
    numpy.ndarray
        Integer array with one entry per row of ``a``: the index of the
        row of ``b`` that has the same rank.
    """
    # `np.rec` is the public home of `fromarrays`; the `np.core.records`
    # spelling goes through a private namespace that NumPy 2.0 deprecates.
    a_rec = np.rec.fromarrays(a.transpose())
    b_rec = np.rec.fromarrays(b.transpose())
    # Double argsort turns the sorting permutation into per-row ranks.
    idx1 = a_rec.argsort(order=a_rec.dtype.names).argsort()
    return b_rec.argsort(order=b_rec.dtype.names)[idx1]
def find_rows_kdball(a, b):
    """Match rows of ``a`` to rows of ``b`` with a k-d tree nearest-neighbour query.

    A ``cKDTree`` is built over the rows of ``b`` and queried once with all
    rows of ``a``. The query reports the nearest row of ``b`` for every row
    of ``a``; the distances are discarded, so a row of ``a`` that is absent
    from ``b`` still receives the index of its closest neighbour.

    Parameters
    ----------
    a : array_like, 2-D
        Rows to look up.
    b : array_like, 2-D
        Rows to search in.

    Returns
    -------
    numpy.ndarray
        Integer array with one entry per row of ``a``: the index of the
        nearest row of ``b``.
    """
    from scipy.spatial import cKDTree

    # query() returns a (distances, indices) pair; only the indices matter.
    nearest = cKDTree(b).query(a)[1]
    return nearest
%timeit find_rows_lexsort(arr2, arr1)
>> 100 loops, best of 3: 4.63 ms per loop
%timeit find_rows_argsort(arr2, arr1)
>> 100 loops, best of 3: 7.37 ms per loop
%timeit find_rows_kdball(arr2, arr1)
>> 100 loops, best of 3: 18.5 ms per loop