So I have a data frame of 16 columns and ~17 million rows.
I would first like to do some ddply
on the data frame and then look at the correlations between the different columns. What’s the best and most efficient way to achieve this? My current approach takes too long:
The data frame is all_df
and column names are A
, B
, C
, ..., N
, O
, P
avB <- ddply(all_df, c(“A”), summarise, NB_av=mean(B), NB_sd=sd(B))
avC <- ddply(all_df, c(“A”), summarise, NC_av=mean(C), NC_sd=sd(C))
avD <- ddply(all_df, c(“A”), summarise, ND_av=mean(D), ND_sd=sd(D))
avE <- ddply(all_df, c(“A”), summarise, NE_av=mean(E), NE_sd=sd(E))
avF <- ddply(all_df, c(“A”), summarise, NF_av=mean(F), NF_sd=sd(F))
avG <- ddply(all_df, c(“A”), summarise, NG_av=mean(G), NG_sd=sd(G))
summary_df <- avB
summary_df <- merge(summary_df, avC, by=c(“A”))
summary_df <- merge(summary_df, avD, by=c(“A”))
summary_df <- merge(summary_df, avE, by=c(“A”))
summary_df <- merge(summary_df, avF, by=c(“A”))
summary_df <- merge(summary_df, avG, by=c(“A”))
#quick look at the correlation
plot((summary_df[,c(2,4,6,8,10,12)]), gap=0)
So, as it is, I am resolving to doing a lot of these, averages, standard deviations etc in MySQL and then doing the final co-relation analyses in R. However, I don't find that very elegant.
Why do I use a data frame instead of a data table? Because I am reading a MySQL table in to R, and the syntax dbGetQuery(con,"select * from mysql_table")
returns a data frame.
You could try
library(dplyr)
all_df %>%
group_by(A) %>%
summarise_each(funs(mean, sd), B:G)
Or another option is data.table
library(data.table)
setDT(all_df)[, lapply(.SD, function(x) c(mean(x), sd(x))), by = A,
.SDcols=LETTERS[2:6]][,var:= c('mean', 'sd')][]
NOTE: The results in the first form is in the wide format, while in the second we get the 'mean', 'sd' as alternative rows.
all_df1 <- all_df[rep(1:nrow(all_df), 1e5),]
system.time(all_df1%>% group_by(A) %>% summarise_each(funs(mean, sd), B:G))
# user system elapsed
# 0.189 0.000 0.189
DT1 <- as.data.table(all_df1)
system.time(DT1[,lapply(.SD, function(x) c(mean(x), sd(x))),
A, .SDcols=LETTERS[2:6]][,var:= c('mean', 'sd')][])
# user system elapsed
#0.232 0.002 0.235
set.seed(25)
m1 <- matrix(sample(1:20, 15*20, replace=TRUE), ncol=15)
set.seed(353)
all_df <- data.frame(sample(letters[1:3], 20, replace=TRUE), m1)
colnames(d1) <- LETTERS[1:ncol(d1)]