Let's say I have two matrices, A and B (called `m1` and `m2` in the code below), given by:
# Fix the RNG seed so the example matrices are reproducible.
set.seed(123)
# Two 10 x 5 matrices of uniform random numbers; with 50 values and 10 rows
# the column count (5) follows automatically.
m1 <- matrix(runif(50), nrow = 10)
m2 <- matrix(runif(50), nrow = 10)
For each row in matrix A, I want to find the row in matrix B that is closest to it. I know I can do this by looping over each row in A and comparing it to each row in B, like this:
# For every row of m1, print the index of the row of m2 that is closest to it.
# The distance between two rows is sqrt(sum(abs(difference))).
for (i in seq_len(nrow(m1))) {
  # Start from Inf so that any finite distance beats it; a fixed sentinel such
  # as 9999 would leave `index` at -1 whenever every distance exceeds it.
  dist <- Inf
  index <- -1
  # seq_len() iterates zero times for an empty matrix, unlike 1:nrow(m2),
  # which would yield c(1, 0) and index out of bounds.
  for (j in seq_len(nrow(m2))) {
    test <- sqrt(sum(abs(m1[i, ] - m2[j, ])))
    # Strict `<` keeps the first row of m2 on ties.
    if (test < dist) {
      dist <- test
      index <- j
    }
  }
  print(index)
}
However, I have a million rows and it takes forever. I'm struggling to find an efficient way. Any ideas?
Here is one base R solution using `apply`:
# For each row of m1, subtract it from every row of m2 at once (the row is
# recycled down the columns of t(m2)), then pick the index of the smallest
# sqrt(sum(abs(difference))).
apply(m1, 1, function(query_row) {
  gaps <- abs(query_row - t(m2))
  which.min(sqrt(colSums(gaps)))
})
#[1] 8 3 2 3 3 1 2 3 6 10
Comparing it with your current solution, it fares well:
# Recreate the example data with a fixed seed so the benchmark is reproducible.
set.seed(123)
# Two 10 x 5 matrices of uniform random numbers; the column count (5) is
# implied by 50 values spread over 10 rows.
m1 <- matrix(runif(50), nrow = 10)
m2 <- matrix(runif(50), nrow = 10)
#' Index of the nearest row of `m2` for every row of `m1`
#'
#' The distance between two rows is `sqrt(sum(abs(row1 - row2)))`. Each row of
#' `m1` is compared against all rows of `m2` in one vectorised step.
#'
#' @param m1 Numeric matrix; each row is a query point.
#' @param m2 Numeric matrix; each row is a candidate. It is expected to have
#'   the same number of columns as `m1`.
#' @return The result of `apply()`: for well-formed input, an integer vector
#'   with one entry per row of `m1` giving the index of the closest row of
#'   `m2` (the first one on ties, as `which.min()` does).
baseR_sol <- function(m1, m2) {
  # Transpose once up front: t(m2) does not depend on the row being processed,
  # so recomputing it inside the closure would repeat the work for every row.
  m2_t <- t(m2)
  # `x` (length ncol) is recycled down each column of m2_t, i.e. subtracted
  # from every row of m2; colSums() then gives one distance per row of m2.
  apply(m1, 1, \(x) which.min(sqrt(colSums(abs(x - m2_t)))))
}
#' Brute-force nearest-row search (reference implementation)
#'
#' For each row of `m1`, scans every row of `m2` and prints the index of the
#' row with the smallest distance, where the distance between two rows is
#' `sqrt(sum(abs(row1 - row2)))`. On ties the first matching row of `m2` wins.
#'
#' @param m1 Numeric matrix; each row is a query point.
#' @param m2 Numeric matrix; each row is a candidate. It is expected to have
#'   the same number of columns as `m1`.
#' @return `NULL`, invisibly. One index is printed per row of `m1`; `-1` is
#'   printed when `m2` has no rows.
for_loop_sol <- function(m1, m2) {
  # seq_len() iterates zero times for an empty matrix, whereas 1:nrow(m)
  # would produce c(1, 0) and fail with an out-of-bounds subscript.
  for (i in seq_len(nrow(m1))) {
    # Inf guarantees that any finite distance is accepted; a fixed sentinel
    # such as 9999 would miss matches whose distance is larger than it.
    best_dist <- Inf
    best_index <- -1
    for (j in seq_len(nrow(m2))) {
      candidate <- sqrt(sum(abs(m1[i, ] - m2[j, ])))
      # Strict `<` keeps the earliest row on ties.
      if (candidate < best_dist) {
        best_dist <- candidate
        best_index <- j
      }
    }
    print(best_index)
  }
}
# Time both implementations on the same inputs, 10 evaluations each.
microbenchmark::microbenchmark(
  baseR_sol = baseR_sol(m1, m2),
  for_loop_sol = for_loop_sol(m1, m2),
  times = 10L
)
# expr min lq mean median uq max neval
# baseR_sol 158.0 185.2 865.81 195.35 224.8 6902.8 10
# for_loop_sol 764.6 830.2 1051.29 973.45 1312.0 1348.9 10