What I have:
//This data set contains columns (second index) having the same value in each row (first index)
double[][] dataSet = new double[][]
{
new double[] {1, 2, 3, 4},
new double[] {5, 6, 7, 4},
new double[] {8, 9, 10, 4},
};
What i want to get:
// This data set has no column where the value in each row is the same
double[][] reducedDataSet = new double[][]
{
new double[] {1, 2, 3},
new double[] {5, 6, 7},
new double[] {8, 9, 10},
};
In python this can be easily done by:
all_equal_value_indices = numpy.all(data_set == data_set[0, :], axis=0) // Finds the indices of all columns that have equal values in each row
reduced_data_set = data_set[:, ~all_equal_value_indices] // Takes all rows with only those columns where all_equal_value_indices is not 1
In C# I can get an array containing the indices that should be excluded relatively fast, but how can I use these indices as mask to get only those columns not contained in these indices?
What i tried:
var numberOfDeletedColumns = 0;
var reducedDataSet = dataSet;
foreach (var columnToDelete in columnsToDelete)
{
reducedDataSet = reducedDataSet.RemoveColumn(columnToDelete - numberOfDeletedColumns++);
}
RemoveColumn
is an extension provided by Accord.Net and has the following code:
/// <summary>Returns a new matrix without one of its columns.</summary>
public static T[][] RemoveColumn<T>(this T[][] matrix, int index)
{
T[][] objArray = new T[matrix.Length][];
for (int index1 = 0; index1 < matrix.Length; ++index1)
{
objArray[index1] = new T[matrix[index1].Length - 1];
for (int index2 = 0; index2 < index; ++index2)
objArray[index1][index2] = matrix[index1][index2];
for (int index2 = index + 1; index2 < matrix[index1].Length; ++index2)
objArray[index1][index2 - 1] = matrix[index1][index2];
}
return objArray;
}
But this is much slower than the implementation in python. Could someone suggest a faster method to achieve a reduced data set?
Array.Copy
helps it run about 2x faster on my computer.
static T[][] FastRemoveColumn<T>(T[][] matrix, int index)
{
T[][] objArray = new T[matrix.Length][];
for (int i = 0; i < matrix.Length; i++)
{
var line = matrix[i];
var reducedline = new T[line.Length - 1];
Array.Copy(line, 0, reducedline, 0, index);
Array.Copy(line, index + 1, reducedline, index, line.Length - index - 1);
objArray[i] = reducedline;
}
return objArray;
}
and I also tried multithread. It runs very slow:
static T[][] MultiThreadRemoveColumn<T>(T[][] matrix, int index)
{
T[][] objArray = new T[matrix.Length][];
Parallel.For(0, matrix.Length, i =>
{
var line = matrix[i];
var reducedline = new T[line.Length - 1];
Array.Copy(line, 0, reducedline, 0, index);
Array.Copy(line, index + 1, reducedline, index, line.Length - index - 1);
objArray[i] = reducedline;
});
return objArray;
}
Test:
// init
double[][] arr = new double[2000][];
for (int i = 0; i < arr.Length; i++)
arr[i] = new double[2000];
double v = 0;
for (int i = 0; i < arr.Length; i++)
{
for (int j = 0; j < arr[i].Length; j++)
{
arr[i][j] = v;
v++;
}
}
Stopwatch sw = Stopwatch.StartNew();
var reducedArr = RemoveColumn(arr, 200);
sw.Stop();
Console.WriteLine(sw.ElapsedMilliseconds);
sw.Restart();
var reducedArr2 = FastRemoveColumn(arr, 200);
sw.Stop();
Console.WriteLine(sw.ElapsedMilliseconds);
sw.Restart();
var reducedArr3 = MultiThreadRemoveColumn(arr, 200);
sw.Stop();
Console.WriteLine(sw.ElapsedMilliseconds);
// Check the result
for (int i = 0; i < reducedArr.Length; i++)
{
for (int j = 0; j < reducedArr[i].Length; j++)
{
if(reducedArr[i][j] != reducedArr2[i][j]) throw new Exception();
if(reducedArr[i][j] != reducedArr3[i][j]) throw new Exception();
}
}
Solution to remove several columns:
public static T[][] DeleteColumns<T>(T[][] matrix, int[] columns)
{
if (matrix.Length == 0) return new T[0][];
bool[] delColumns = new bool[matrix[0].Length];
foreach (int col in columns) delColumns[col] = true;
List<int> remainCols = new List<int>();
for (int i = 0; i < delColumns.Length; i++)
{
if (!delColumns[i]) remainCols.Add(i);
}
var target = new T[matrix.Length][];
for (int rowIndex = 0; rowIndex < matrix.Length; rowIndex++)
{
T[] sourceRow = matrix[rowIndex];
T[] targetRow = new T[remainCols.Count];
for (int i = 0; i < remainCols.Count; i++)
{
targetRow[i] = sourceRow[remainCols[i]];
}
target[rowIndex] = targetRow;
}
return target;
}
Test on a 2000x2000 matrix. Comparing with Adam Brown's solution, testing removing all columns is absolutely unfair, but my solution is faster even if removing only one column.