Search code examples
arraysnumpypython-itertoolsnumpy-indexed

Efficiently find full row permutations in 2d array having repeated row pairs


Consider the array:

import numpy as np
import numpy_indexed as npi
from itertools import permutations

arr = np.array([[1, 2, 3, 4],
                [3, 3, 3, 6],
                [2, 0, 0, 2],
                [2, 0, 0, 2],  
                [8, 2, 8, 2],
                [4, 5, 4, 5], 
                [3, 3, 3, 6],
                [4, 5, 4, 5],
                [0, 9, 8, 7],
                [1, 2, 3, 4]])

I need to find all unique row permutations. What I currently do is find all 10! row permutations, then use npi.unique. Like this:

arr_perms = np.array([arr[i, :] for i in permutations(range(len(arr)))])
u, index = npi.unique(arr_perms, return_index = True, axis=0)

This works as expected, but it seems expensive because of the nature of the arrays I am working with. These arrays all have at least one pair (usually several pairs) of identical rows.

In the small example shown, the 10 rows hold 4 identical row pairs, so the total number of unique row permutqtions is only 10!/2^4 = 226800, a big reduction.

QUESTION: Is there a way to efficiently find the unique row permutations without first having to find the full set of permutations?


Solution

  • This here might help you. It still creates all permutations, but it is 10 times faster than the code you posted, since each subarray of length 4 is converted to an int using bit shifting.

    import math
    import numpy as np
    import numpy_indexed as npi
    
    
    def generate_permutations_matrix(n, k):
        """
        Generate a matrix of permutations for given values of n and k.
    
        Parameters:
        - n (int): Total number of elements.
        - k (int): Length of each permutation.
    
        Returns:
        - np.ndarray: Matrix of permutations.
        """
        result_matrix = np.zeros((math.perm(n, k), k), np.uint8)
        f = 1
        for m in range(n - k + 1, n + 1):
            sub_matrix = result_matrix[:f, n - m + 1:]
            for i in range(1, m):
                result_matrix[i * f: (i + 1) * f, n - m] = i
                result_matrix[i * f: (i + 1) * f, n - m + 1:] = sub_matrix + (
                        sub_matrix >= i
                )
            sub_matrix += 1
            f *= m
        return result_matrix
    
    
    def convert_to_integer(arr):
        """
        Convert a 2D array of integers to a single integer value.
    
        Parameters:
        - arr (np.ndarray): Input 2D array.
    
        Returns:
        - int: Integer representation of the array.
        """
        arr = arr.astype(np.uint64)
        integer_value = arr[..., 0].copy()
        for ival in range(1, arr.shape[1]):
            integer_value += arr[..., ival] << ival * 8
        return integer_value
    
    
    def get_permutations(arr, len_of_each):
        """
        Get unique permutations of rows in a 2D array.
    
        Parameters:
        - arr (np.ndarray): Input 2D array.
        - len_of_each (int): Length of each permutation.
    
        Returns:
        - np.ndarray: Permutations matrix.
        """
        integer_value = convert_to_integer(arr)
    
        # Create dictionaries for mapping integer values to unique identifiers
        lookup_dict = {}
        lookup_dict2 = {}
        alpha1 = []
        identifier = 0
    
        # Iterate through integer values and create mappings
        for ini, x in enumerate(integer_value):
            if x in lookup_dict:
                value = lookup_dict.get(x)
                alpha1.append((value, x))
                lookup_dict2[ini] = value
            else:
                alpha1.append((identifier, x))
                lookup_dict[x] = identifier
                lookup_dict2[ini] = identifier
                identifier += 1
    
        # Generate permutations matrix
        permutations_matrix = generate_permutations_matrix(len(alpha1), len_of_each)
    
        # Map original identifiers back to permutations matrix
        cond_list = []
        choice_list = []
        for nom in lookup_dict2.items():
            cond_list.append(permutations_matrix == nom[0])
            choice_list.append(nom[1])
    
        choice_list = np.array(choice_list, dtype=np.uint8)
        unique_permutations = npi.unique(
            np.select(cond_list, choice_list, 0), return_index=False, axis=0
        )
    
        # Map back to original identifiers
        cond_list2 = []
        choice_list2 = []
        for nom in lookup_dict.items():
            cond_list2.append(unique_permutations == nom[1])
            choice_list2.append(nom[0])
    
        result = np.select(cond_list2, choice_list2, 0)
    
        # Convert back to the original 2D array format
        return np.stack(
            [
                np.dstack([(result[..., g] >> rx * 8) & 255 for rx in range(arr.shape[1])])
                for g in range(len_of_each)
            ],
            axis=2,
        ).squeeze()
    
    
    # Example usage
    arr = np.array(
        [
            [1, 2, 3, 4],
            [3, 3, 3, 6],
            [2, 0, 0, 2],
            [2, 0, 0, 2],
            [8, 2, 8, 2],
            [4, 5, 4, 5],
            [3, 3, 3, 6],
            [4, 5, 4, 5],
            [0, 9, 8, 7],
            [1, 2, 3, 4],
        ]
    )
    result_permutations = get_permutations(arr=arr, len_of_each=10)
    
    from itertools import permutations
    
    
    def test(arr):
        arr_perms = np.array([arr[i, :] for i in permutations(range(len(arr)))])
    u, index = npi.unique(arr_perms, return_index=True, axis=0)
    return u, index
    
    %timeit result_permutations = get_permutations(arr=arr, len_of_each=10)
    1.52 s ± 27.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    %timeit test(arr)
    13.6 s ± 52.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)