6

I have a 2D array like this:

a = np.array([[25, 83, 18, 71],
       [75,  7,  0, 85],
       [25, 83, 18, 71],
       [25, 83, 18, 71],
       [75, 48,  8, 43],
       [ 7, 47, 96, 94],
       [ 7, 47, 96, 94],
       [56, 75, 50,  0],
       [19, 49, 92, 57],
       [52, 93, 58,  9]])

and I want to remove rows that has specific values, for example:

b = np.array([[56, 75, 50,  0], [52, 93, 58,  9], [25, 83, 18, 71]])

What is the most efficient way to do this in numpy or pandas? Expected output:

np.array([[75,  7,  0, 85],
       [75, 48,  8, 43],
       [ 7, 47, 96, 94],
       [ 7, 47, 96, 94],
       [19, 49, 92, 57]])

Update

The fastest approach is dimensionality reduction but it requires quite strict limitations of ranges of columns in general. There is my perfplot:

import pandas as pd
import numexpr as ne
import perfplot
from time import time

def remove_pd(data):
    a,b = data
    dfa, dfb = pd.DataFrame(a), pd.DataFrame(b)
    return dfa.merge(dfb, how='left', indicator=True)\
    .query('_merge == "left_only"').drop(columns='_merge').values
    
def remove_smalldata(data):
    a,b = data
    return a[(a[None,:,:] != b[:,None,:]).any(-1).all(0)]

'''def remove_nploop(data):
    a, b = data
    for arr in b:
        a = a[np.all(~np.equal(a, arr), axis=1)]
    return a'''
        
def remove_looped(data): 
    a, b = data
    to_remain = [True]*len(a)
    ind = 0
    for vec_a in a:
        for vec_b in b:
            if np.array_equal(vec_a, vec_b):
                to_remain[ind] = False
                break
        ind += 1
    return a[to_remain]

def remove_looped_boost(data): 
    a, b = data
    to_remain = [True]*len(a)
    a_map = list(map(tuple, a.tolist()))
    b_map = set(map(tuple, b.tolist()))
    for i in range(len(a)):
        to_remain[i] = not(a_map[i] in b_map)
    return a[to_remain]

def remove_reducedim(data):
    a,b = data
    a, b = a.astype(np.int64), b.astype(np.int64) #make sure box is not too small
    ma, MA = np.min(a, axis=0), np.max(a, axis=0)
    mb, MB = np.min(b, axis=0), np.max(b, axis=0)
    m, M = np.min([ma, mb], axis=0), np.max([MA, MB],axis=0)
    ravel_a = np.ravel_multi_index((a-m).T, M - m + 1)
    ravel_b = np.ravel_multi_index((b-m).T, M - m + 1)
    return a[~np.isin(ravel_a, ravel_b)]

def remove_reducedim_boost(data):
    a,b = data
    a, b = a.astype(np.int64), b.astype(np.int64) #make sure box is not too small
    ma, MA = np.min(a, axis=0), np.max(a, axis=0)
    mb, MB = np.min(b, axis=0), np.max(b, axis=0)
    m1,m2,m3,m4 = np.min([ma, mb], axis=0)
    M1,M2,M3,M4 = np.max([MA, MB], axis=0)
    s1,s2,s3,s4 = M1-m1+1, M2-m2+1, M3-m3+1, M4-m4+1
    a1,a2,a3,a4 = a.T
    b1,b2,b3,b4 = b.T
    d = {'a1':a1, 'a2':a2, 'a3':a3, 'a4':a4, 'b1':b1, 'b2':b2, 'b3':b3, 'b4':b4,
        's1':s1, 's2':s2, 's3':s3, 'm1':m1, 'm2':m2, 'm3':m3, 'm4':m4}
    ravel_a = ne.evaluate('(a1-m1)+(a2-m2)*s1+(a3-m3)*s1*s2+(a4-m4)*s1*s2*s3',d)
    ravel_b = ne.evaluate('(b1-m1)+(b2-m2)*s1+(b3-m3)*s1*s2+(b4-m4)*s1*s2*s3',d)
    return a[~np.isin(ravel_a, ravel_b)]
    
def setup(x):
    a1 = np.random.randint(50000, size=(x,4))
    a2 = a1[np.random.randint(x, size=x)]
    return a1, a2
    
def build_args(figure):
    kernels = [remove_reducedim, remove_reducedim_boost, remove_pd, remove_looped, remove_looped_boost, remove_smalldata]
    return {'setup': setup,
    'kernels': {'A': kernels, 'B': kernels[:3]}[figure],
    'n_range': {'A': [2 ** k for k in range(12)], 'B': [2 ** k for k in range(11, 25)]}[figure],
     'xlabel': 'Remowing n rows from n rows',
     'title' : {'A':'Testing removal of small dataset', 'B':'Testing removal of large dataset'}[figure],
     'show_progress': False,
     'equality_check': lambda x,y: np.array_equal(x, y)}
    
t = time()
outs = [perfplot.bench(**build_args(n)) for n in ('A','B')]
fig = plt.figure(figsize=(20, 20))
for i in range(len(outs)):
    ax = fig.add_subplot(2, 1, i+1)
    ax.grid(True, which="both")
    outs[i].plot()
plt.show()
print('Overall testing time:', time()-t)

Output:

Overall testing time: 529.2596168518066

enter image description here

enter image description here

2
  • If multiple rows in a are the same as one row in b, are all of them expected to be removed? Commented Oct 22, 2020 at 13:56
  • 1
    @GZ0. Yes, it is illustrated in my example. Commented Oct 22, 2020 at 14:44

7 Answers 7

7

Here's a pandas approach doing a "anti join" using merge and query.

dfa = pd.DataFrame(a)
dfb = pd.DataFrame(b)

df = (
    dfa.merge(dfb, how='left', indicator=True)
    .query('_merge == "left_only"')
    .drop(columns='_merge')
)

    0   1   2   3
1  75   7   0  85
4  75  48   8  43
5   7  47  96  94
6   7  47  96  94
8  19  49  92  57

Note: a plain numpy solution should be faster, but this should do fine.


Plain numpy but with a single loop:

for arr in b:
    a = a[np.all(~np.equal(a, arr), axis=1)]

array([[75,  7,  0, 85],
       [75, 48,  8, 43],
       [ 7, 47, 96, 94],
       [ 7, 47, 96, 94],
       [19, 49, 92, 57]])
Sign up to request clarification or add additional context in comments.

2 Comments

Thanks. It's better to have multiple versions including in order to perfplot or time it other ways.
So pandas did the best of all solutions that doesn't apply dimensionality reduction. It might be a fastest solution if integers are larger.
3
+50

Approach #1 : Views + searchsorted

One approach with array-views to view each row as a scalar each -

# https://stackoverflow.com/a/45313353/ @Divakar
def view1D(a, b): # a, b are arrays
    a = np.ascontiguousarray(a)
    b = np.ascontiguousarray(b)
    void_dt = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))
    return a.view(void_dt).ravel(),  b.view(void_dt).ravel()

A,B = view1D(a,b)

sidx = B.argsort()
idx = np.searchsorted(B, A, sorter=sidx)
idx[idx==len(B)] = 0
out = a[B[sidx[idx]] != A]

A variant of it would be with sorting B -

Bs = np.sort(B)
idx = np.searchsorted(Bs, A)
idx[idx==len(B)] = 0
out = a[Bs[idx] != A]

Another variant would using searchsorted on A given B -

sidx = A.argsort()
As = A[sidx]
idx = np.searchsorted(A, B, sorter=sidx)

id_ar = np.zeros(len(a), dtype=int)
id_accum = np.cumsum(np.r_[False, As[:-1] != As[1:]])
count = np.bincount(id_accum)
idx_end = idx + count[id_accum[idx]]

id_ar[idx] = 1
id_ar[idx_end] -= 1
out = a[sidx[id_ar.cumsum()==0]]

For the last step, if you need to maintain order, use :

a[np.sort(sidx[id_ar.cumsum()==0])]

Approach #2 : KDtree

Another with SciPy's cKDTree -

from scipy.spatial import cKDTree

dist,idx = cKDTree(b).query(a,k=1)
out = a[dist!=0]

Approach #3 : Masking with array-assignment

For smaller range of numbers, we can go for a boolean array assignment one :

s = np.maximum(a.max(0)-np.minimum(0,a.min(0)),b.max(0)-np.minimum(0,b.min(0)))+1
mask = np.empty(s, dtype=bool)
mask[tuple(a.T)] = 1
mask[tuple(b.T)] = 0
out = a[mask[tuple(a.T)]]

Note that this will initialize an array of shape (n,n,n,n) for that range of values n on 4 cols input datasets.

Approach #4 : With sorting

# https://stackoverflow.com/a/44999009/ @Divakar
def view1D_onevar(a): # a is array
    a = np.ascontiguousarray(a)
    void_dt = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))
    return a.view(void_dt).ravel()

def app4(a,b):
    ba = np.vstack((b,a))
    sidx = np.lexsort(ba.T)
    basi = ba[sidx]
    v = view1D_onevar(basi)
    idar = np.r_[False,v[:-1] != v[1:]].cumsum()
    mapar = np.ones(idar[-1]+1, dtype=bool)
    mapar[idar[sidx<len(b)]] = 0
    out = basi[mapar[idar]]
    return out

Two more variants :

def app4_v2(a,b):
    ba = np.vstack((b,a))
    v0 = view1D_onevar(ba)
    sidx = v0.argsort()
    v = v0[sidx]
    idar = np.r_[False,v[:-1] != v[1:]].cumsum()
    mapar = np.ones(idar[-1]+1, dtype=bool)
    mapar[idar[sidx<len(b)]] = 0
    out = a[sidx[mapar[idar]]-len(b)]
    return out

def app4_v3(a,b):
    ba = np.vstack((b,a))
    sidx = view1D_onevar(ba).argsort()
    basi = ba[sidx]
    v = view1D_onevar(basi)
    idar = np.r_[False,v[:-1] != v[1:]].cumsum()
    mapar = np.ones(idar[-1]+1, dtype=bool)
    mapar[idar[sidx<len(b)]] = 0
    out = basi[mapar[idar]]
    return out

Note that order is not maintained.

That lexsort/argsort is the bottleneck, which could be improved if we are willing to go dimensionality-reduction.

4 Comments

div_app3() seems to be the killer.
Seems I have to row back on div_app3(), it can't handle more than 5 columns.
@Darkonaut Should be good, as long as you can allocate [n]*a.shape[1] boolean shaped array as noted there.
It fails with MemoryError already with (2, 6)-shape for a because s is e.g. trying to "allocate array with shape (83, 93, 75, 75, 88, 100) and data type bool". So your numbers in the array would have to remain tiny to get this to work with more columns.
2

If the data are not too big, broadcast is another option:

a[(a[None,:,:] != b[:,None,:]).any(-1).all(0)]

Output:

array([[75,  7,  0, 85],
       [75, 48,  8, 43],
       [ 7, 47, 96, 94],
       [ 7, 47, 96, 94],
       [19, 49, 92, 57]])

1 Comment

It appears that a simple Python beats this on small data :(
2

The only way I can think of is based on dimensionality reduction:

def remove(a, b):
    a, b = a.astype(np.int64), b.astype(np.int64) #make sure box is not too small
    ma, MA = np.min(a, axis=0), np.max(a, axis=0)
    mb, MB = np.min(b, axis=0), np.max(b, axis=0)
    m, M = np.min([ma, mb], axis=0), np.max([MA, MB],axis=0)
    ravel_a = np.ravel_multi_index((a-m).T, M - m + 1)
    ravel_b = np.ravel_multi_index((b-m).T, M - m + 1)
    return a[~np.isin(ravel_a, ravel_b)]

Since we need to do a lot of elementary algebra here, some performance boost can be achieved with numexpr:

import numexpr as ne
def remove_boost(a,b):
    a, b = a.astype(np.int64), b.astype(np.int64) #make sure box is not too small
    ma, MA = np.min(a, axis=0), np.max(a, axis=0)
    mb, MB = np.min(b, axis=0), np.max(b, axis=0)
    m1,m2,m3,m4 = np.min([ma, mb], axis=0)
    M1,M2,M3,M4 = np.max([MA, MB], axis=0)
    s1,s2,s3,s4 = M1-m1+1, M2-m2+1, M3-m3+1, M4-m4+1
    a1,a2,a3,a4 = a.T
    b1,b2,b3,b4 = b.T
    d = {'a1':a1, 'a2':a2, 'a3':a3, 'a4':a4, 'b1':b1, 'b2':b2, 'b3':b3, 'b4':b4,
        's1':s1, 's2':s2, 's3':s3, 'm1':m1, 'm2':m2, 'm3':m3, 'm4':m4}
    ravel_a = ne.evaluate('(a1-m1)+(a2-m2)*s1+(a3-m3)*s1*s2+(a4-m4)*s1*s2*s3',d)
    ravel_b = ne.evaluate('(b1-m1)+(b2-m2)*s1+(b3-m3)*s1*s2+(b4-m4)*s1*s2*s3',d)
    return a[~np.isin(ravel_a, ravel_b)]

It's quite unexpected to see that dimensionality reduction is the only working way in numpy suitable to do this multidimensional removal on larger data :)

Output of both remove(a, b) and remove_boost(a, b):

[[75  7  0 85]
 [75 48  8 43]
 [ 7 47 96 94]
 [ 7 47 96 94]
 [19 49 92 57]]

Disadvantage: it's capable to work only with the boxes that are not larger than 2^63 (s1*s2*s3*s4 = np.prod(np.ptp(np.r_[a,b], axis=0)+1) should be less than 2^63).

Comments

1

You can try my solution with a loop comparison between the vectors in the two matrices.

Code

import np

a = np.array([[25, 83, 18, 71],
       [75,  7,  0, 85],
       [25, 83, 18, 71],
       [25, 83, 18, 71],
       [75, 48,  8, 43],
       [ 7, 47, 96, 94],
       [ 7, 47, 96, 94],
       [56, 75, 50,  0],
       [19, 49, 92, 57],
       [52, 93, 58,  9]])

b = np.array([[56, 75, 50,  0], [52, 93, 58,  9], [25, 83, 18, 71]])

to_remain = [True]*len(a)

ind = 0

for vec_a in a:
  for vec_b in b:
    if np.array_equal(vec_a, vec_b):
      to_remain[ind] = False
      break
  
  ind += 1

output = a[to_remain]

print(output)

Output

[[75  7  0 85]
 [75 48  8 43]
 [ 7 47 96 94]
 [ 7 47 96 94]
 [19 49 92 57]]

5 Comments

This is not efficient because it uses looping. I expect some vectorised solution.
What is a "vectorised" solution?
@tclarke13 vectorized solution is a combination of numpy actions where all the loops or comprehension are replaced in order to work in C level (therefore much faster). You can search over the internet because this definition might be not ideal.
@QuantStats a second loop is a real pain in performance. But you could use a tuple search in a set of tuples instead which is instant. In this case it outperforms a solution of Quang (look at my example).
@mathfux Thank you for the comment, and your time in compiling the performance time for all the suggested inputs. It is definitely educating.
1

Here is another way of implementing dimensionality reduction

scales = np.array([1,10,100,1000])
a_scaled = np.sum(a * scales,axis=1)[:,None]
b_scaled = np.sum(b * scales,axis=1)[None,:]
a_slice = a[~np.any(a_scaled == b_scaled,axis=1)]

as you can see, we multiply numbers of different columns with different scales, and basically casting this problem into comparing two 1d arrays.

If the order of the columns does not matter, you also need to add np.sort

scales = np.array([1,10,100,1000])
a_scaled = np.sum(np.sort(a,axis=1) * scales,axis=1)[:,None]
b_scaled = np.sum(np.sort(b,axis=1) * scales,axis=1)[None,:]
a_slice = a[~np.any(a_scaled == b_scaled,axis=1)]

I guess there exist cases where this approach fails, so I would be happy about feedback.

Comments

1

Benchmarking post

We will follow OP's strategy of first benchmarking of small to decent sized datasets as stage-1 and then dropping few among them for the larger datasets as stage-2. Also, we will keep it as 4 columns as used in OP's benchmarking codes.

  • Stage-1 : Upto 5000 sized datasets with b of length 5%-90% of length of a
  • Stage-2: Drop last two and then run for datasets of size upto 1000000 again for the same percentages for b.

Using benchit package (few benchmarking tools packaged together; disclaimer: I am its author) to benchmark proposed solutions.

Proposed working solutions :

import numpy as np
import pandas as pd
import numexpr as ne

def remove_pd(a,b):
    dfa, dfb = pd.DataFrame(a), pd.DataFrame(b)
    return dfa.merge(dfb, how='left', indicator=True)\
    .query('_merge == "left_only"').drop(columns='_merge').values
    
def remove_smalldata(a,b):
    return a[(a[None,:,:] != b[:,None,:]).any(-1).all(0)]
        
def remove_looped(a, b): 
    to_remain = [True]*len(a)
    ind = 0
    for vec_a in a:
        for vec_b in b:
            if np.array_equal(vec_a, vec_b):
                to_remain[ind] = False
                break
        ind += 1
    return a[to_remain]

def remove_looped_boost(a, b): 
    to_remain = [True]*len(a)
    a_map = list(map(tuple, a.tolist()))
    b_map = set(map(tuple, b.tolist()))
    for i in range(len(a)):
        to_remain[i] = not(a_map[i] in b_map)
    return a[to_remain]

def remove_reducedim(a, b):
    a, b = a.astype(np.int64), b.astype(np.int64) #make sure box is not too small
    ma, MA = np.min(a, axis=0), np.max(a, axis=0)
    mb, MB = np.min(b, axis=0), np.max(b, axis=0)
    m, M = np.min([ma, mb], axis=0), np.max([MA, MB],axis=0)
    ravel_a = np.ravel_multi_index((a-m).T, M - m + 1)
    ravel_b = np.ravel_multi_index((b-m).T, M - m + 1)
    return a[~np.isin(ravel_a, ravel_b)]

def remove_reducedim_boost(a,b):
    a, b = a.astype(np.int64), b.astype(np.int64) #make sure box is not too small
    ma, MA = np.min(a, axis=0), np.max(a, axis=0)
    mb, MB = np.min(b, axis=0), np.max(b, axis=0)
    m1,m2,m3,m4 = np.min([ma, mb], axis=0)
    M1,M2,M3,M4 = np.max([MA, MB], axis=0)
    s1,s2,s3,_ = M1-m1+1, M2-m2+1, M3-m3+1, M4-m4+1
    a1,a2,a3,a4 = a.T
    b1,b2,b3,b4 = b.T
    d = {'a1':a1, 'a2':a2, 'a3':a3, 'a4':a4, 'b1':b1, 'b2':b2, 'b3':b3, 'b4':b4,
        's1':s1, 's2':s2, 's3':s3, 'm1':m1, 'm2':m2, 'm3':m3, 'm4':m4}
    ravel_a = ne.evaluate('(a1-m1)+(a2-m2)*s1+(a3-m3)*s1*s2+(a4-m4)*s1*s2*s3',d)
    ravel_b = ne.evaluate('(b1-m1)+(b2-m2)*s1+(b3-m3)*s1*s2+(b4-m4)*s1*s2*s3',d)
    return a[~np.isin(ravel_a, ravel_b)]

def view1D(a, b): # a, b are arrays
    a = np.ascontiguousarray(a)
    b = np.ascontiguousarray(b)
    void_dt = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))
    return a.view(void_dt).ravel(),  b.view(void_dt).ravel()

def div_app1(a, b):
    A,B = view1D(a,b)
    sidx = B.argsort()
    idx = np.searchsorted(B, A, sorter=sidx)
    idx[idx==len(B)] = 0
    return a[B[sidx[idx]] != A]

def div_app1_v2(a, b):
    A,B = view1D(a,b)
    Bs = np.sort(B)
    idx = np.searchsorted(Bs, A)
    idx[idx==len(B)] = 0
    return a[Bs[idx] != A]

from scipy.spatial import cKDTree

def div_app2(a, b):
    dist,idx = cKDTree(b).query(a,k=1)
    return a[dist!=0]

def div_app3(a, b):
    s = np.maximum(a.max(0)-np.minimum(0,a.min(0)), b.max(0)-np.minimum(0,b.min(0)))+1
    mask = np.empty(s, dtype=bool)
    mask[tuple(a.T)] = 1
    mask[tuple(b.T)] = 0
    return a[mask[tuple(a.T)]]

def Darkonaut(a, b):
    a_rows = a.view([('', a.dtype)] * a.shape[1])  # '' default fieldname
    b_rows = b.view([('', b.dtype)] * b.shape[1])        
    return np.setdiff1d(a_rows, b_rows, assume_unique=True)

Benchmarking code :

def setup(array_len, b_len_percentage):
    a = np.random.randint(0,100,(array_len,4))
    b_len = int(len(a)*b_len_percentage/100.)
    b = np.unique(a[np.random.choice(len(a), b_len, replace=False)], axis=0)
    return a,b

import benchit
benchit.setparams(rep=1) # use rep=2 or bigger if you have time & patience

funcs = [remove_pd, remove_smalldata, remove_looped, remove_looped_boost, 
         remove_reducedim, remove_reducedim_boost, div_app1, div_app1_v2,
         div_app2, div_app3, app4, app4_v2, app4_v3, Darkonaut]
percentages = np.r_[5,list(range(10,100,20))]
in_ = {(n,x):setup(n,x) for n in [100, 500, 1000, 2000, 5000] for x in percentages}
t = benchit.timings(funcs, in_, multivar=True, input_name=['Len_a', 'Len_b_as_Percentage'])
t.plot(logx=True, sp_ncols=2, legend_fontsize=8, rot=90, save='timings1.png',dpi=200)

funcs.remove(remove_smalldata)
funcs.remove(remove_looped)
in_ = {(n,x):setup(n,x) for n in np.linspace(10**4,10**6,5,dtype=int) for x in percentages}
t2 = benchit.timings(funcs, in_, multivar=True, input_name=['Len_a', 'Len_b_as_Percentage'])
t2.plot(logx=True, sp_ncols=2, legend_fontsize=8, rot=90, save='timings2.png',dpi=200)

enter image description here

enter image description here

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.