I have a large dataset, and it takes forever to get the results using for loops. It seems I can use lapply instead, but I'm having trouble using it for my analysis.
Sample code is below. I am using a data.table instead of a data frame.
library(data.table)
# Toy panel: 3 countries x 3 years, 3 observations per pair (27 rows)
allCountries <- rep(LETTERS[1:3], times = 9)
allYears <- rep(rep(1991:1993, each = 3), times = 3)
myData <- data.table(allCountries, allYears)
# .N is the number of rows of the table being modified
myData[, variable1 := rnorm(.N)]
myData[, variable2 := rnorm(.N)]
# One row per (country, year) pair; this table receives the results
myData2 <- myData[
  ,
  .(variable3 = mean(variable1)),
  by = .(allCountries, allYears)
]
myData2[, variable4 := rnorm(.N)]
# Fit the linear model y ~ x and return its summary (a "summary.lm" object).
myFunction <- function(x, y) {
  fit <- lm(y ~ x)
  summary(fit)
}
# Fit the model separately for every (country, year) pair and store the
# estimated slope in the matching row of myData2.
for (ii in unique(myData$allCountries)) {
  for (jj in unique(myData$allYears)) {
    # subset once and reuse it for both variables
    grp <- myData[allCountries == ii & allYears == jj]
    # a pair that never occurs in the data has nothing to fit
    if (nrow(grp) == 0L) next
    test <- myFunction(grp$variable1, grp$variable2)
    # Select the slope by name: summary.lm() omits aliased coefficients, so
    # a positional index could silently return a different statistic.
    a <- test$coefficients["x", "Estimate"]
    myData2[allCountries == ii & allYears == jj, result := a]
  }
}
I'm trying to fit the model to each subset of the data and record the result in another dataset. I understand the logic of lapply, but I'm struggling to implement it. Any help would be much appreciated!
The trick is to split the data by `allCountries` and `allYears`. This creates a list of data.tables, and `lapply` can operate on them.
library(data.table)
# original code: toy panel of 3 countries x 3 years, 3 observations per pair
allCountries <- rep(LETTERS[1:3], times = 9)
allYears <- rep(rep(1991:1993, each = 3), times = 3)
# fix the seed so the random draws are reproducible
set.seed(2023)
myData <- data.table(allCountries, allYears)
# .N is the number of rows of the table being modified
myData[, variable1 := rnorm(.N)]
myData[, variable2 := rnorm(.N)]
# one row per (country, year) pair; this table receives the results
myData2 <- myData[
  ,
  .(variable3 = mean(variable1)),
  by = .(allCountries, allYears)
]
myData2[, variable4 := rnorm(.N)]
# Regress y on x by least squares and return the model summary
# (an object of class "summary.lm").
myFunction <- function(x, y) {
  model <- lm(y ~ x)
  summary(model)
}
# Baseline: fit the model for every (country, year) pair with nested loops
# and store the estimated slope in the matching row of myData2.
for (ii in unique(myData$allCountries)) {
  for (jj in unique(myData$allYears)) {
    # subset once and reuse it for both variables
    grp <- myData[allCountries == ii & allYears == jj]
    # a pair that never occurs in the data has nothing to fit
    if (nrow(grp) == 0L) next
    test <- myFunction(grp$variable1, grp$variable2)
    # Select the slope by name: summary.lm() omits aliased coefficients, so
    # a positional index could silently return a different statistic.
    a <- test$coefficients["x", "Estimate"]
    myData2[allCountries == ii & allYears == jj, result := a]
  }
}
# save to compare later
# NOTE: `<-` does not copy a data.table, so md2 is the same object as
# myData2. That is fine here only because myData2 is removed and rebuilt
# from scratch below rather than modified in place.
md2 <- myData2
# lapply code starts here
# Remove the two tables by name; a pattern-based rm(list = ls(...)) would
# also delete any unrelated object whose name starts with "myData".
rm(myData, myData2)
# restart the pseudo-RNG and reproduce the data
set.seed(2023)
myData <- data.table(allCountries, allYears)
myData[, variable1 := rnorm(nrow(myData))]
myData[, variable2 := rnorm(nrow(myData))]
# rebuild the results table, one row per (country, year) pair
myData2 <- myData[
  ,
  .(variable3 = mean(variable1)),
  by = .(allCountries, allYears)
]
myData2[, variable4 := rnorm(nrow(myData2))]
# One data.table per (allCountries, allYears) group. Splitting with `by` on
# these non-factor columns yields only the groups that occur in the data,
# so lm() is never handed an empty subset.
sp <- split(myData, by = c("allCountries", "allYears"))
# `:=` updates myData2 by reference, so 'res' is not strictly needed,
# but assigning it avoids printing lapply's output
res <- lapply(sp, \(X) {
  test <- myFunction(X$variable1, X$variable2)
  # Select the slope by name: summary.lm() omits aliased coefficients, so
  # a positional index could silently return a different statistic.
  a <- test$coefficients["x", "Estimate"]
  # every row of X belongs to one group, so its first row identifies it
  myData2[
    allCountries == X$allCountries[1] & allYears == X$allYears[1],
    result := a
  ]
})
identical(md2, myData2)
#> [1] TRUE
rm(sp, res) # final clean-up
Created on 2023-02-24 with reprex v2.0.2
Here is a simplification of the `lapply` loop above.
# This code is simpler than the lapply code above and writes the same
# slopes into myData2's `result` column.
# `sp` was removed by the final clean-up above, so rebuild the list of
# per-group data.tables before looping over it.
sp <- split(myData, by = c("allCountries", "allYears"))
res <- lapply(sp, \(X) {
  # with() evaluates the call using X's columns directly
  test <- with(X, myFunction(variable1, variable2))
  # Select the slope by name: summary.lm() omits aliased coefficients, so
  # a positional index could silently return a different statistic.
  a <- test$coefficients["x", "Estimate"]
  # every row of X belongs to one group, so its first row identifies it
  myData2[
    allCountries == X$allCountries[1] & allYears == X$allYears[1],
    result := a
  ]
})