Search code examples
pythonnumpymatrixset-difference

Matrix row difference, output a boolean vector


I have an m x 3 matrix A and its row subset B (n x 3). Both are sets of indices into another, large 4D matrix; their data type is dtype('int64'). I would like to generate a boolean vector x, where x[i] = True if B does not contain row A[i,:].

There are no duplicate rows in either A or B.

I was wondering if there's an efficient way how to do this in Numpy? I found an answer that's somewhat related: https://stackoverflow.com/a/11903368/265289; however, it returns the actual rows (not a boolean vector).


Solution

  • You could follow the same pattern as shown in jterrace's answer, except use np.in1d instead of np.setdiff1d:

    import numpy as np
    np.random.seed(2015)
    
    m, n = 10, 5
    A = np.random.randint(10, size=(m,3))
    B = A[np.random.choice(m, n, replace=False)]
    print(A)
    # [[2 2 9]
    #  [6 8 5]
    #  [7 8 0]
    #  [6 7 8]
    #  [3 8 6]
    #  [9 2 3]
    #  [1 2 6]
    #  [2 9 8]
    #  [5 8 4]
    #  [8 9 1]]
    
    print(B)
    # [[2 2 9]
    #  [1 2 6]
    #  [2 9 8]
    #  [3 8 6]
    #  [9 2 3]]
    
    def using_view(A, B, assume_unique=False):
        Ad = np.ascontiguousarray(A).view([('', A.dtype)] * A.shape[1])
        Bd = np.ascontiguousarray(B).view([('', B.dtype)] * B.shape[1])
        return ~np.in1d(Ad, Bd, assume_unique=assume_unique)
    
    print(using_view(A, B, assume_unique=True))
    

    yields

    [False  True  True  True False False False False  True  True]
    

    You can use assume_unique=True (which can speed up the calculation) since there are no duplicate rows in A or B.


    Beware that A.view(...) will raise

    ValueError: new type not compatible with array.
    

    if A.flags['C_CONTIGUOUS'] is False (i.e. if A is not a C-contiguous array). Therefore, in general we need to use np.ascontiguous(A) before calling view.


    As B.M. suggests, you could instead view each row using the "void" dtype:

    def using_void(A, B):
        dtype = 'V{}'.format(A.dtype.itemsize * A.shape[-1])
        Ad = np.ascontiguousarray(A).view(dtype)
        Bd = np.ascontiguousarray(B).view(dtype)
        return ~np.in1d(Ad, Bd, assume_unique=True)
    

    This is safe to use with integer dtypes. However, note that

    In [342]: np.array([-0.], dtype='float64').view('V8') == np.array([0.], dtype='float64').view('V8')
    Out[342]: array([False], dtype=bool)
    

    so using np.in1d after viewing as void may return incorrect results for arrays with float dtype.


    Here is a benchmark of some of the proposed methods:

    import numpy as np
    np.random.seed(2015)
    
    m, n = 10000, 5000
    # Note A may contain duplicate rows, 
    # so don't use assume_unique=True for these benchmarks. 
    # In this case, using assume_unique=False does not improve the speed much anyway.
    A = np.random.randint(10, size=(2*m,3))
    # make A not C_CONTIGUOUS; the view methods fail for non-contiguous arrays
    A = A[::2]  
    B = A[np.random.choice(m, n, replace=False)]
    
    def using_view(A, B, assume_unique=False):
        Ad = np.ascontiguousarray(A).view([('', A.dtype)] * A.shape[1])
        Bd = np.ascontiguousarray(B).view([('', B.dtype)] * B.shape[1])
        return ~np.in1d(Ad, Bd, assume_unique=assume_unique)
    
    from scipy.spatial import distance
    def using_distance(A, B):
        return ~np.any(distance.cdist(A,B)==0,1)
    
    from functools import reduce 
    def using_loop(A, B):
        pred = lambda i: A[:, i:i+1] == B[:, i]
        return ~reduce(np.logical_and, map(pred, range(A.shape[1]))).any(axis=1)
    
    from pandas.core.groupby import get_group_index, _int64_overflow_possible
    from functools import partial
    def using_pandas(A, B):
        shape = [1 + max(A[:, i].max(), B[:, i].max()) for i in range(A.shape[1])]
        assert not _int64_overflow_possible(shape)
    
        encode = partial(get_group_index, shape=shape, sort=False, xnull=False)
        a1, b1 = map(encode, (A.T, B.T))
        return ~np.in1d(a1, b1)
    
    def using_void(A, B):
        dtype = 'V{}'.format(A.dtype.itemsize * A.shape[-1])
        Ad = np.ascontiguousarray(A).view(dtype)
        Bd = np.ascontiguousarray(B).view(dtype)
        return ~np.in1d(Ad, Bd)
    
    # Sanity check: make sure all the functions return the same result
    for func in (using_distance, using_loop, using_pandas, using_void):
        assert (func(A, B) == using_view(A, B)).all()
    

    In [384]: %timeit using_pandas(A, B)
    100 loops, best of 3: 1.99 ms per loop
    
    In [381]: %timeit using_void(A, B)
    100 loops, best of 3: 6.72 ms per loop
    
    In [378]: %timeit using_view(A, B)
    10 loops, best of 3: 35.6 ms per loop
    
    In [383]: %timeit using_loop(A, B)
    1 loops, best of 3: 342 ms per loop
    
    In [379]: %timeit using_distance(A, B)
    1 loops, best of 3: 502 ms per loop