Search code examples
c#pythonarraysbitmask

C# masking an array to exclude indexes as fast as in python


What I have:

//This data set contains columns (second index) having the same value in each row (first index)
double[][] dataSet = new double[][]
{
  new double[] {1, 2, 3, 4},
  new double[] {5, 6, 7, 4},
  new double[] {8, 9, 10, 4},
}; 

What i want to get:

// This data set has no column where the value in each row is the same
double[][] reducedDataSet = new double[][]
{
  new double[] {1, 2, 3},
  new double[] {5, 6, 7},
  new double[] {8, 9, 10},
}; 

In python this can be easily done by:

all_equal_value_indices = numpy.all(data_set == data_set[0, :], axis=0) // Finds the indices of all columns that have equal values in each row
reduced_data_set = data_set[:, ~all_equal_value_indices] // Takes all rows with only those columns where all_equal_value_indices is not 1

In C# I can get an array containing the indices that should be excluded relatively fast, but how can I use these indices as mask to get only those columns not contained in these indices?

What i tried:

var numberOfDeletedColumns = 0;
var reducedDataSet = dataSet;

foreach (var columnToDelete in columnsToDelete)
{
  reducedDataSet = reducedDataSet.RemoveColumn(columnToDelete - numberOfDeletedColumns++);
}

RemoveColumn is an extension provided by Accord.Net and has the following code:

/// <summary>Returns a new matrix without one of its columns.</summary>
public static T[][] RemoveColumn<T>(this T[][] matrix, int index)
{
  T[][] objArray = new T[matrix.Length][];
  for (int index1 = 0; index1 < matrix.Length; ++index1)
  {
    objArray[index1] = new T[matrix[index1].Length - 1];
    for (int index2 = 0; index2 < index; ++index2)
      objArray[index1][index2] = matrix[index1][index2];
    for (int index2 = index + 1; index2 < matrix[index1].Length; ++index2)
      objArray[index1][index2 - 1] = matrix[index1][index2];
  }
  return objArray;
}

But this is much slower than the implementation in python. Could someone suggest a faster method to achieve a reduced data set?


Solution

  • Array.Copy helps it run about 2x faster on my computer.

    static T[][] FastRemoveColumn<T>(T[][] matrix, int index)
    {
        T[][] objArray = new T[matrix.Length][];
        for (int i = 0; i < matrix.Length; i++)
        {
            var line = matrix[i];
            var reducedline = new T[line.Length - 1];
            Array.Copy(line, 0, reducedline, 0, index);
            Array.Copy(line, index + 1, reducedline, index, line.Length - index - 1);
            objArray[i] = reducedline;                
        }
        return objArray;
    }
    

    and I also tried multithread. It runs very slow:

    static T[][] MultiThreadRemoveColumn<T>(T[][] matrix, int index)
    {
        T[][] objArray = new T[matrix.Length][];
        Parallel.For(0, matrix.Length, i =>
        {
            var line = matrix[i];
            var reducedline = new T[line.Length - 1];
            Array.Copy(line, 0, reducedline, 0, index);
            Array.Copy(line, index + 1, reducedline, index, line.Length - index - 1);
            objArray[i] = reducedline;                
        });
        return objArray;
    }
    

    Test:

    // init
    double[][] arr = new double[2000][];
    for (int i = 0; i < arr.Length; i++)            
        arr[i] = new double[2000];
    
    double v = 0;
    for (int i = 0; i < arr.Length; i++)
    {
        for (int j = 0; j < arr[i].Length; j++)
        {
            arr[i][j] = v;
            v++;
        }
    }
    
    Stopwatch sw = Stopwatch.StartNew();
    var reducedArr = RemoveColumn(arr, 200);
    sw.Stop();
    Console.WriteLine(sw.ElapsedMilliseconds);
    sw.Restart();
    var reducedArr2 = FastRemoveColumn(arr, 200);    
    sw.Stop();        
    Console.WriteLine(sw.ElapsedMilliseconds);
    sw.Restart();
    var reducedArr3 = MultiThreadRemoveColumn(arr, 200); 
    sw.Stop();     
    Console.WriteLine(sw.ElapsedMilliseconds);
    
    // Check the result
    for (int i = 0; i < reducedArr.Length; i++)
    {
        for (int j = 0; j < reducedArr[i].Length; j++)
        {
            if(reducedArr[i][j] != reducedArr2[i][j]) throw new Exception();
            if(reducedArr[i][j] != reducedArr3[i][j]) throw new Exception();   
        }
    }
    

    Update

    Solution to remove several columns:

    public static T[][] DeleteColumns<T>(T[][] matrix, int[] columns)
    {
        if (matrix.Length == 0) return new T[0][];
        bool[] delColumns = new bool[matrix[0].Length];
        foreach (int col in columns) delColumns[col] = true;
        List<int> remainCols = new List<int>();
        for (int i = 0; i < delColumns.Length; i++)
        {
            if (!delColumns[i]) remainCols.Add(i);
        }
        var target = new T[matrix.Length][];
        for (int rowIndex = 0; rowIndex < matrix.Length; rowIndex++)
        {
            T[] sourceRow = matrix[rowIndex];
            T[] targetRow = new T[remainCols.Count];
            for (int i = 0; i < remainCols.Count; i++)
            {
                targetRow[i] = sourceRow[remainCols[i]];
            }
            target[rowIndex] = targetRow;    
        }
        return target;
    }
    

    Test on a 2000x2000 matrix. Comparing with Adam Brown's solution, testing removing all columns is absolutely unfair, but my solution is faster even if removing only one column.