Fast way to remove array of specific row values from 2D numpy array

Question

I have a 2D array like this:

a = np.array([[25, 83, 18, 71],
       [75,  7,  0, 85],
       [25, 83, 18, 71],
       [25, 83, 18, 71],
       [75, 48,  8, 43],
       [ 7, 47, 96, 94],
       [ 7, 47, 96, 94],
       [56, 75, 50,  0],
       [19, 49, 92, 57],
       [52, 93, 58,  9]])

and I want to remove rows that has specific values, for example:

b = np.array([[56, 75, 50,  0], [52, 93, 58,  9], [25, 83, 18, 71]])

What is the most efficient way to do this in numpy or pandas? Expected output:

np.array([[75,  7,  0, 85],
       [75, 48,  8, 43],
       [ 7, 47, 96, 94],
       [ 7, 47, 96, 94],
       [19, 49, 92, 57]])

Update

The fastest approach is dimensionality reduction but it requires quite strict limitations of ranges of columns in general. There is my perfplot:

import pandas as pd
import numexpr as ne
import perfplot
from time import time

def remove_pd(data):
    a,b = data
    dfa, dfb = pd.DataFrame(a), pd.DataFrame(b)
    return dfa.merge(dfb, how='left', indicator=True)\
    .query('_merge == "left_only"').drop(columns='_merge').values
    
def remove_smalldata(data):
    a,b = data
    return a[(a[None,:,:] != b[:,None,:]).any(-1).all(0)]

'''def remove_nploop(data):
    a, b = data
    for arr in b:
        a = a[np.all(~np.equal(a, arr), axis=1)]
    return a'''
        
def remove_looped(data): 
    a, b = data
    to_remain = [True]*len(a)
    ind = 0
    for vec_a in a:
        for vec_b in b:
            if np.array_equal(vec_a, vec_b):
                to_remain[ind] = False
                break
        ind += 1
    return a[to_remain]

def remove_looped_boost(data): 
    a, b = data
    to_remain = [True]*len(a)
    a_map = list(map(tuple, a.tolist()))
    b_map = set(map(tuple, b.tolist()))
    for i in range(len(a)):
        to_remain[i] = not(a_map[i] in b_map)
    return a[to_remain]

def remove_reducedim(data):
    a,b = data
    a, b = a.astype(np.int64), b.astype(np.int64) #make sure box is not too small
    ma, MA = np.min(a, axis=0), np.max(a, axis=0)
    mb, MB = np.min(b, axis=0), np.max(b, axis=0)
    m, M = np.min([ma, mb], axis=0), np.max([MA, MB],axis=0)
    ravel_a = np.ravel_multi_index((a-m).T, M - m + 1)
    ravel_b = np.ravel_multi_index((b-m).T, M - m + 1)
    return a[~np.isin(ravel_a, ravel_b)]

def remove_reducedim_boost(data):
    a,b = data
    a, b = a.astype(np.int64), b.astype(np.int64) #make sure box is not too small
    ma, MA = np.min(a, axis=0), np.max(a, axis=0)
    mb, MB = np.min(b, axis=0), np.max(b, axis=0)
    m1,m2,m3,m4 = np.min([ma, mb], axis=0)
    M1,M2,M3,M4 = np.max([MA, MB], axis=0)
    s1,s2,s3,s4 = M1-m1+1, M2-m2+1, M3-m3+1, M4-m4+1
    a1,a2,a3,a4 = a.T
    b1,b2,b3,b4 = b.T
    d = {'a1':a1, 'a2':a2, 'a3':a3, 'a4':a4, 'b1':b1, 'b2':b2, 'b3':b3, 'b4':b4,
        's1':s1, 's2':s2, 's3':s3, 'm1':m1, 'm2':m2, 'm3':m3, 'm4':m4}
    ravel_a = ne.evaluate('(a1-m1)+(a2-m2)*s1+(a3-m3)*s1*s2+(a4-m4)*s1*s2*s3',d)
    ravel_b = ne.evaluate('(b1-m1)+(b2-m2)*s1+(b3-m3)*s1*s2+(b4-m4)*s1*s2*s3',d)
    return a[~np.isin(ravel_a, ravel_b)]
    
def setup(x):
    a1 = np.random.randint(50000, size=(x,4))
    a2 = a1[np.random.randint(x, size=x)]
    return a1, a2
    
def build_args(figure):
    kernels = [remove_reducedim, remove_reducedim_boost, remove_pd, remove_looped, remove_looped_boost, remove_smalldata]
    return {'setup': setup,
    'kernels': {'A': kernels, 'B': kernels[:3]}[figure],
    'n_range': {'A': [2 ** k for k in range(12)], 'B': [2 ** k for k in range(11, 25)]}[figure],
     'xlabel': 'Remowing n rows from n rows',
     'title' : {'A':'Testing removal of small dataset', 'B':'Testing removal of large dataset'}[figure],
     'show_progress': False,
     'equality_check': lambda x,y: np.array_equal(x, y)}
    
t = time()
outs = [perfplot.bench(**build_args(n)) for n in ('A','B')]
fig = plt.figure(figsize=(20, 20))
for i in range(len(outs)):
    ax = fig.add_subplot(2, 1, i+1)
    ax.grid(True, which="both")
    outs[i].plot()
plt.show()
print('Overall testing time:', time()-t)

Output:

Overall testing time: 529.2596168518066

If multiple rows in a are the same as one row in b, are all of them expected to be removed? — GZ0
– GZ0, Commented Oct 22, 2020 at 13:56

Erfan · Accepted Answer · 2020-10-13 23:32:44Z

7

Here's a pandas approach doing a "anti join" using merge and query.

dfa = pd.DataFrame(a)
dfb = pd.DataFrame(b)

df = (
    dfa.merge(dfb, how='left', indicator=True)
    .query('_merge == "left_only"')
    .drop(columns='_merge')
)

    0   1   2   3
1  75   7   0  85
4  75  48   8  43
5   7  47  96  94
6   7  47  96  94
8  19  49  92  57

Note: a plain numpy solution should be faster, but this should do fine.

Plain numpy but with a single loop:

for arr in b:
    a = a[np.all(~np.equal(a, arr), axis=1)]

array([[75,  7,  0, 85],
       [75, 48,  8, 43],
       [ 7, 47, 96, 94],
       [ 7, 47, 96, 94],
       [19, 49, 92, 57]])

answered Oct 13, 2020 at 23:32

Erfan

43.3k10 gold badges75 silver badges86 bronze badges

Sign up to request clarification or add additional context in comments.

2 Comments

mathfux Over a year ago

Thanks. It's better to have multiple versions including in order to perfplot or time it other ways.

mathfux Over a year ago

So pandas did the best of all solutions that doesn't apply dimensionality reduction. It might be a fastest solution if integers are larger.

Divakar · Accepted Answer · 2020-10-22 08:36:20Z

3

+50

Approach #1 : Views + searchsorted

One approach with array-views to view each row as a scalar each -

# https://stackoverflow.com/a/45313353/ @Divakar
def view1D(a, b): # a, b are arrays
    a = np.ascontiguousarray(a)
    b = np.ascontiguousarray(b)
    void_dt = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))
    return a.view(void_dt).ravel(),  b.view(void_dt).ravel()

A,B = view1D(a,b)

sidx = B.argsort()
idx = np.searchsorted(B, A, sorter=sidx)
idx[idx==len(B)] = 0
out = a[B[sidx[idx]] != A]

A variant of it would be with sorting B -

Bs = np.sort(B)
idx = np.searchsorted(Bs, A)
idx[idx==len(B)] = 0
out = a[Bs[idx] != A]

Another variant would using searchsorted on A given B -

sidx = A.argsort()
As = A[sidx]
idx = np.searchsorted(A, B, sorter=sidx)

id_ar = np.zeros(len(a), dtype=int)
id_accum = np.cumsum(np.r_[False, As[:-1] != As[1:]])
count = np.bincount(id_accum)
idx_end = idx + count[id_accum[idx]]

id_ar[idx] = 1
id_ar[idx_end] -= 1
out = a[sidx[id_ar.cumsum()==0]]

For the last step, if you need to maintain order, use :

a[np.sort(sidx[id_ar.cumsum()==0])]

Approach #2 : KDtree

Another with SciPy's cKDTree -

from scipy.spatial import cKDTree

dist,idx = cKDTree(b).query(a,k=1)
out = a[dist!=0]

Approach #3 : Masking with array-assignment

For smaller range of numbers, we can go for a boolean array assignment one :

s = np.maximum(a.max(0)-np.minimum(0,a.min(0)),b.max(0)-np.minimum(0,b.min(0)))+1
mask = np.empty(s, dtype=bool)
mask[tuple(a.T)] = 1
mask[tuple(b.T)] = 0
out = a[mask[tuple(a.T)]]

Note that this will initialize an array of shape (n,n,n,n) for that range of values n on 4 cols input datasets.

Approach #4 : With sorting

# https://stackoverflow.com/a/44999009/ @Divakar
def view1D_onevar(a): # a is array
    a = np.ascontiguousarray(a)
    void_dt = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))
    return a.view(void_dt).ravel()

def app4(a,b):
    ba = np.vstack((b,a))
    sidx = np.lexsort(ba.T)
    basi = ba[sidx]
    v = view1D_onevar(basi)
    idar = np.r_[False,v[:-1] != v[1:]].cumsum()
    mapar = np.ones(idar[-1]+1, dtype=bool)
    mapar[idar[sidx<len(b)]] = 0
    out = basi[mapar[idar]]
    return out

Two more variants :

def app4_v2(a,b):
    ba = np.vstack((b,a))
    v0 = view1D_onevar(ba)
    sidx = v0.argsort()
    v = v0[sidx]
    idar = np.r_[False,v[:-1] != v[1:]].cumsum()
    mapar = np.ones(idar[-1]+1, dtype=bool)
    mapar[idar[sidx<len(b)]] = 0
    out = a[sidx[mapar[idar]]-len(b)]
    return out

def app4_v3(a,b):
    ba = np.vstack((b,a))
    sidx = view1D_onevar(ba).argsort()
    basi = ba[sidx]
    v = view1D_onevar(basi)
    idar = np.r_[False,v[:-1] != v[1:]].cumsum()
    mapar = np.ones(idar[-1]+1, dtype=bool)
    mapar[idar[sidx<len(b)]] = 0
    out = basi[mapar[idar]]
    return out

Note that order is not maintained.

That lexsort/argsort is the bottleneck, which could be improved if we are willing to go dimensionality-reduction.

edited Oct 22, 2020 at 8:36

answered Oct 17, 2020 at 17:04

Divakar

222k19 gold badges273 silver badges374 bronze badges

4 Comments

Darkonaut Over a year ago

div_app3() seems to be the killer.

Darkonaut Over a year ago

Seems I have to row back on div_app3(), it can't handle more than 5 columns.

Divakar Over a year ago

@Darkonaut Should be good, as long as you can allocate [n]*a.shape[1] boolean shaped array as noted there.

Darkonaut Over a year ago

It fails with MemoryError already with (2, 6)-shape for a because s is e.g. trying to "allocate array with shape (83, 93, 75, 75, 88, 100) and data type bool". So your numbers in the array would have to remain tiny to get this to work with more columns.

Quang Hoang · Accepted Answer · 2020-10-14 00:54:58Z

2

If the data are not too big, broadcast is another option:

a[(a[None,:,:] != b[:,None,:]).any(-1).all(0)]

Output:

array([[75,  7,  0, 85],
       [75, 48,  8, 43],
       [ 7, 47, 96, 94],
       [ 7, 47, 96, 94],
       [19, 49, 92, 57]])

answered Oct 14, 2020 at 0:54

Quang Hoang

151k11 gold badges64 silver badges86 bronze badges

1 Comment

mathfux Over a year ago

It appears that a simple Python beats this on small data :(

mathfux · Accepted Answer · 2020-10-16 14:24:36Z

The only way I can think of is based on dimensionality reduction:

def remove(a, b):
    a, b = a.astype(np.int64), b.astype(np.int64) #make sure box is not too small
    ma, MA = np.min(a, axis=0), np.max(a, axis=0)
    mb, MB = np.min(b, axis=0), np.max(b, axis=0)
    m, M = np.min([ma, mb], axis=0), np.max([MA, MB],axis=0)
    ravel_a = np.ravel_multi_index((a-m).T, M - m + 1)
    ravel_b = np.ravel_multi_index((b-m).T, M - m + 1)
    return a[~np.isin(ravel_a, ravel_b)]

Since we need to do a lot of elementary algebra here, some performance boost can be achieved with numexpr:

import numexpr as ne
def remove_boost(a,b):
    a, b = a.astype(np.int64), b.astype(np.int64) #make sure box is not too small
    ma, MA = np.min(a, axis=0), np.max(a, axis=0)
    mb, MB = np.min(b, axis=0), np.max(b, axis=0)
    m1,m2,m3,m4 = np.min([ma, mb], axis=0)
    M1,M2,M3,M4 = np.max([MA, MB], axis=0)
    s1,s2,s3,s4 = M1-m1+1, M2-m2+1, M3-m3+1, M4-m4+1
    a1,a2,a3,a4 = a.T
    b1,b2,b3,b4 = b.T
    d = {'a1':a1, 'a2':a2, 'a3':a3, 'a4':a4, 'b1':b1, 'b2':b2, 'b3':b3, 'b4':b4,
        's1':s1, 's2':s2, 's3':s3, 'm1':m1, 'm2':m2, 'm3':m3, 'm4':m4}
    ravel_a = ne.evaluate('(a1-m1)+(a2-m2)*s1+(a3-m3)*s1*s2+(a4-m4)*s1*s2*s3',d)
    ravel_b = ne.evaluate('(b1-m1)+(b2-m2)*s1+(b3-m3)*s1*s2+(b4-m4)*s1*s2*s3',d)
    return a[~np.isin(ravel_a, ravel_b)]

It's quite unexpected to see that dimensionality reduction is the only working way in numpy suitable to do this multidimensional removal on larger data :)

Output of both remove(a, b) and remove_boost(a, b):

[[75  7  0 85]
 [75 48  8 43]
 [ 7 47 96 94]
 [ 7 47 96 94]
 [19 49 92 57]]

Disadvantage: it's capable to work only with the boxes that are not larger than 2^63 (s1*s2*s3*s4 = np.prod(np.ptp(np.r_[a,b], axis=0)+1) should be less than 2^63).

QuantStats · Accepted Answer · 2020-10-13 23:13:50Z

1

You can try my solution with a loop comparison between the vectors in the two matrices.

Code

import np

a = np.array([[25, 83, 18, 71],
       [75,  7,  0, 85],
       [25, 83, 18, 71],
       [25, 83, 18, 71],
       [75, 48,  8, 43],
       [ 7, 47, 96, 94],
       [ 7, 47, 96, 94],
       [56, 75, 50,  0],
       [19, 49, 92, 57],
       [52, 93, 58,  9]])

b = np.array([[56, 75, 50,  0], [52, 93, 58,  9], [25, 83, 18, 71]])

to_remain = [True]*len(a)

ind = 0

for vec_a in a:
  for vec_b in b:
    if np.array_equal(vec_a, vec_b):
      to_remain[ind] = False
      break
  
  ind += 1

output = a[to_remain]

print(output)

Output

[[75  7  0 85]
 [75 48  8 43]
 [ 7 47 96 94]
 [ 7 47 96 94]
 [19 49 92 57]]

answered Oct 13, 2020 at 23:13

QuantStats

1,5061 gold badge10 silver badges15 bronze badges

5 Comments

mathfux Over a year ago

This is not efficient because it uses looping. I expect some vectorised solution.

tclarke13 Over a year ago

What is a "vectorised" solution?

mathfux Over a year ago

@tclarke13 vectorized solution is a combination of numpy actions where all the loops or comprehension are replaced in order to work in C level (therefore much faster). You can search over the internet because this definition might be not ideal.

mathfux Over a year ago

@QuantStats a second loop is a real pain in performance. But you could use a tuple search in a set of tuples instead which is instant. In this case it outperforms a solution of Quang (look at my example).

QuantStats Over a year ago

@mathfux Thank you for the comment, and your time in compiling the performance time for all the suggested inputs. It is definitely educating.

meTchaikovsky · Accepted Answer · 2020-10-19 01:26:29Z

Here is another way of implementing dimensionality reduction

scales = np.array([1,10,100,1000])
a_scaled = np.sum(a * scales,axis=1)[:,None]
b_scaled = np.sum(b * scales,axis=1)[None,:]
a_slice = a[~np.any(a_scaled == b_scaled,axis=1)]

as you can see, we multiply numbers of different columns with different scales, and basically casting this problem into comparing two 1d arrays.

If the order of the columns does not matter, you also need to add np.sort

scales = np.array([1,10,100,1000])
a_scaled = np.sum(np.sort(a,axis=1) * scales,axis=1)[:,None]
b_scaled = np.sum(np.sort(b,axis=1) * scales,axis=1)[None,:]
a_slice = a[~np.any(a_scaled == b_scaled,axis=1)]

I guess there exist cases where this approach fails, so I would be happy about feedback.

4 revs · Accepted Answer · 2020-10-22 12:23:46Z

Benchmarking post

We will follow OP's strategy of first benchmarking of small to decent sized datasets as stage-1 and then dropping few among them for the larger datasets as stage-2. Also, we will keep it as 4 columns as used in OP's benchmarking codes.

Stage-1 : Upto 5000 sized datasets with b of length 5%-90% of length of a
Stage-2: Drop last two and then run for datasets of size upto 1000000 again for the same percentages for b.

Using benchit package (few benchmarking tools packaged together; disclaimer: I am its author) to benchmark proposed solutions.

Proposed working solutions :

import numpy as np
import pandas as pd
import numexpr as ne

def remove_pd(a,b):
    dfa, dfb = pd.DataFrame(a), pd.DataFrame(b)
    return dfa.merge(dfb, how='left', indicator=True)\
    .query('_merge == "left_only"').drop(columns='_merge').values
    
def remove_smalldata(a,b):
    return a[(a[None,:,:] != b[:,None,:]).any(-1).all(0)]
        
def remove_looped(a, b): 
    to_remain = [True]*len(a)
    ind = 0
    for vec_a in a:
        for vec_b in b:
            if np.array_equal(vec_a, vec_b):
                to_remain[ind] = False
                break
        ind += 1
    return a[to_remain]

def remove_looped_boost(a, b): 
    to_remain = [True]*len(a)
    a_map = list(map(tuple, a.tolist()))
    b_map = set(map(tuple, b.tolist()))
    for i in range(len(a)):
        to_remain[i] = not(a_map[i] in b_map)
    return a[to_remain]

def remove_reducedim(a, b):
    a, b = a.astype(np.int64), b.astype(np.int64) #make sure box is not too small
    ma, MA = np.min(a, axis=0), np.max(a, axis=0)
    mb, MB = np.min(b, axis=0), np.max(b, axis=0)
    m, M = np.min([ma, mb], axis=0), np.max([MA, MB],axis=0)
    ravel_a = np.ravel_multi_index((a-m).T, M - m + 1)
    ravel_b = np.ravel_multi_index((b-m).T, M - m + 1)
    return a[~np.isin(ravel_a, ravel_b)]

def remove_reducedim_boost(a,b):
    a, b = a.astype(np.int64), b.astype(np.int64) #make sure box is not too small
    ma, MA = np.min(a, axis=0), np.max(a, axis=0)
    mb, MB = np.min(b, axis=0), np.max(b, axis=0)
    m1,m2,m3,m4 = np.min([ma, mb], axis=0)
    M1,M2,M3,M4 = np.max([MA, MB], axis=0)
    s1,s2,s3,_ = M1-m1+1, M2-m2+1, M3-m3+1, M4-m4+1
    a1,a2,a3,a4 = a.T
    b1,b2,b3,b4 = b.T
    d = {'a1':a1, 'a2':a2, 'a3':a3, 'a4':a4, 'b1':b1, 'b2':b2, 'b3':b3, 'b4':b4,
        's1':s1, 's2':s2, 's3':s3, 'm1':m1, 'm2':m2, 'm3':m3, 'm4':m4}
    ravel_a = ne.evaluate('(a1-m1)+(a2-m2)*s1+(a3-m3)*s1*s2+(a4-m4)*s1*s2*s3',d)
    ravel_b = ne.evaluate('(b1-m1)+(b2-m2)*s1+(b3-m3)*s1*s2+(b4-m4)*s1*s2*s3',d)
    return a[~np.isin(ravel_a, ravel_b)]

def view1D(a, b): # a, b are arrays
    a = np.ascontiguousarray(a)
    b = np.ascontiguousarray(b)
    void_dt = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))
    return a.view(void_dt).ravel(),  b.view(void_dt).ravel()

def div_app1(a, b):
    A,B = view1D(a,b)
    sidx = B.argsort()
    idx = np.searchsorted(B, A, sorter=sidx)
    idx[idx==len(B)] = 0
    return a[B[sidx[idx]] != A]

def div_app1_v2(a, b):
    A,B = view1D(a,b)
    Bs = np.sort(B)
    idx = np.searchsorted(Bs, A)
    idx[idx==len(B)] = 0
    return a[Bs[idx] != A]

from scipy.spatial import cKDTree

def div_app2(a, b):
    dist,idx = cKDTree(b).query(a,k=1)
    return a[dist!=0]

def div_app3(a, b):
    s = np.maximum(a.max(0)-np.minimum(0,a.min(0)), b.max(0)-np.minimum(0,b.min(0)))+1
    mask = np.empty(s, dtype=bool)
    mask[tuple(a.T)] = 1
    mask[tuple(b.T)] = 0
    return a[mask[tuple(a.T)]]

def Darkonaut(a, b):
    a_rows = a.view([('', a.dtype)] * a.shape[1])  # '' default fieldname
    b_rows = b.view([('', b.dtype)] * b.shape[1])        
    return np.setdiff1d(a_rows, b_rows, assume_unique=True)

Benchmarking code :

def setup(array_len, b_len_percentage):
    a = np.random.randint(0,100,(array_len,4))
    b_len = int(len(a)*b_len_percentage/100.)
    b = np.unique(a[np.random.choice(len(a), b_len, replace=False)], axis=0)
    return a,b

import benchit
benchit.setparams(rep=1) # use rep=2 or bigger if you have time & patience

funcs = [remove_pd, remove_smalldata, remove_looped, remove_looped_boost, 
         remove_reducedim, remove_reducedim_boost, div_app1, div_app1_v2,
         div_app2, div_app3, app4, app4_v2, app4_v3, Darkonaut]
percentages = np.r_[5,list(range(10,100,20))]
in_ = {(n,x):setup(n,x) for n in [100, 500, 1000, 2000, 5000] for x in percentages}
t = benchit.timings(funcs, in_, multivar=True, input_name=['Len_a', 'Len_b_as_Percentage'])
t.plot(logx=True, sp_ncols=2, legend_fontsize=8, rot=90, save='timings1.png',dpi=200)

funcs.remove(remove_smalldata)
funcs.remove(remove_looped)
in_ = {(n,x):setup(n,x) for n in np.linspace(10**4,10**6,5,dtype=int) for x in percentages}
t2 = benchit.timings(funcs, in_, multivar=True, input_name=['Len_a', 'Len_b_as_Percentage'])
t2.plot(logx=True, sp_ncols=2, legend_fontsize=8, rot=90, save='timings2.png',dpi=200)

Collectives™ on Stack Overflow

Fast way to remove array of specific row values from 2D numpy array

Update

7 Answers 7

2 Comments

4 Comments

1 Comment

Comments

5 Comments

Comments

Benchmarking post

Comments

Your Answer

Linked

Hot Network Questions

Collectives™ on Stack Overflow

Update

7 Answers 7

2 Comments

4 Comments

1 Comment

Comments

5 Comments

Comments

Benchmarking post

Comments

Your Answer

Sign up or log in

Post as a guest

Linked

Related