I usually have to work with large spatial data sets, so high speed and memory efficiency are expected. Suppose I want to modify some numeric columns of a data frame with a self-defined function in Rcpp; I am confused about the reference and copy mechanisms of C++ and Rcpp. Using the three minimal examples below, would you please help me clarify the following questions:
Is updateDF3 the best function for such a task, with the highest speed and lowest memory use? This function is modified from a similar question here, but I do not understand the warning given by its author: "There are issues associated with this approach. Your original data frame and the one you created share the same vectors and so bad things can happen." If I only use this approach in a helper function such as updateDF3 that is called from R, is it safe?
Why is the difference of performance of updateDF1 and updateDF2 not significant? What is the difference between passing the parameter with or without reference (&)?
Is the function coded poorly, and is there a better way to write statements such as DataFrame out=clone(df) and tmpstr=as<std::string>(colnames[v])?
Thanks in advance.
#include <Rcpp.h>
#include <iostream>
using namespace Rcpp;
using namespace std;
// Reports whether the string `y` occurs among the elements of `x`.
// Performs a linear scan with std::find over the character vector.
// [[Rcpp::export]]
bool contains(CharacterVector x, std::string y) {
    CharacterVector::iterator last = x.end();
    CharacterVector::iterator hit = std::find(x.begin(), last, y);
    return hit != last;
}
// Returns a deep copy of `df` in which every column named in `vars` has been
// incremented by 1.0. With `vars` left as NULL the copy is returned unchanged.
//
// df   : input data frame (taken by value; Rcpp objects are thin handles, so
//        this does not copy the underlying R data).
// vars : optional character vector naming the columns to modify.
// [[Rcpp::export]]
DataFrame updateDF1(DataFrame df, Nullable<Rcpp::CharacterVector> vars=R_NilValue) {
    // Deep copy: everything below operates on `out` only.
    DataFrame out = clone(df);
    if (vars.isNotNull()) {
        CharacterVector selvars(vars);
        const int nsel = selvars.size();
        for (int v = 0; v < nsel; ++v) {
            const std::string name = as<std::string>(selvars[v]);
            // Read the column from the clone, NOT from `df`: for a double
            // column the NumericVector aliases the underlying R vector, and
            // assigning a same-length sugar expression writes into that
            // storage. Reading from `df` would therefore modify the caller's
            // data in place.
            NumericVector column = out[name];
            column = column + 1.0;
            out[name] = column;
        }
    }
    return out;
}
// Same contract as a copy-then-modify update: returns a deep copy of `df` in
// which every column named in `vars` has been incremented by 1.0. With `vars`
// left as NULL the copy is returned unchanged.
//
// df   : input data frame, taken by reference. The function clones it before
//        making any change, so the reference only avoids copying the handle.
// vars : optional character vector naming the columns to modify.
// [[Rcpp::export]]
DataFrame updateDF2(DataFrame& df, Nullable<Rcpp::CharacterVector> vars=R_NilValue) {
    // Deep copy: everything below operates on `out` only.
    DataFrame out = clone(df);
    if (vars.isNotNull()) {
        CharacterVector selvars(vars);
        const int nsel = selvars.size();
        for (int v = 0; v < nsel; ++v) {
            const std::string name = as<std::string>(selvars[v]);
            // Read the column from the clone, NOT from `df`: for a double
            // column the NumericVector aliases the underlying R vector, and
            // assigning a same-length sugar expression writes into that
            // storage. Reading from `df` would therefore modify the caller's
            // data in place.
            NumericVector column = out[name];
            column = column + 1.0;
            out[name] = column;
        }
    }
    return out;
}
// Builds a new data-frame-like list from `df` without deep-copying it.
// Columns named in `vars` are replaced by freshly allocated numeric vectors
// holding the original values plus 1.0; every other column is stored as-is,
// i.e. the result shares those column vectors with `df`.
//
// df   : input data frame (not modified).
// vars : optional character vector naming the columns to increment; when
//        NULL, all columns are passed through unchanged.
// Returns a List carrying the "class", "row.names" and "names" attributes of
// `df`.
// [[Rcpp::export]]
List updateDF3(DataFrame& df, Nullable<Rcpp::CharacterVector> vars=R_NilValue) {
    const int ncols = df.size();
    List out(ncols);
    CharacterVector colnames = df.attr("names");

    // Materialise the selection once rather than on every column.
    const bool hasSelection = vars.isNotNull();
    CharacterVector selvars = hasSelection ? CharacterVector(vars) : CharacterVector(0);

    for (int v = 0; v < ncols; ++v) {
        bool selected = false;
        if (hasSelection) {
            const std::string name = as<std::string>(colnames[v]);
            selected = contains(selvars, name);
        }
        if (selected) {
            NumericVector values = df[v];
            // Construct a NEW vector from the sugar expression. Assigning the
            // expression back to `values` would write into the existing
            // storage, which for a double column is the caller's own vector.
            NumericVector bumped = values + 1.0;
            out[v] = bumped;
        } else {
            // Index by position: the column name is only resolved when a
            // selection exists, so a name-based lookup here would use an
            // empty string whenever `vars` is NULL.
            SEXP column = df[v];
            out[v] = column;
        }
    }

    out.attr("class") = df.attr("class");
    out.attr("row.names") = df.attr("row.names");
    out.attr("names") = df.attr("names");
    return out;
}
/*** R
df=as.data.frame(matrix(1:120000000,nrow=10000000))
names(df)=paste("band",1:ncol(df),sep="_")
df=cbind(x="charcol",df)
microbenchmark::microbenchmark(
x1<<-updateDF1(df,vars=names(df)[-1]),
x2<<-updateDF2(df,vars=names(df)[-1]),
x3<<-updateDF3(df,vars=names(df)[-1]),
times=10
)
identical(x1,x2)
identical(x1,x3)
*/
##performance
#Unit: milliseconds
# expr min lq mean median
# x1 <<- updateDF1(df, vars = names(df)[-1]) 587.6023 604.9242 711.8981 651.1242
# x2 <<- updateDF2(df, vars = names(df)[-1]) 581.7129 641.2876 882.9999 766.9354
# x3 <<- updateDF3(df, vars = names(df)[-1]) 406.1824 417.5892 542.2559 420.8485
Following the suggestion of @Roland, the best approach is to work on the data frame by reference, obtained by modifying updateDF2; the code is as follows:
// Increments every column named in `vars` by 1.0 directly in `df` and returns
// `df`. No clone is taken, so the data frame passed in is the one that gets
// updated. With `vars` left as NULL, `df` is returned untouched.
// [[Rcpp::export]]
DataFrame updateDF(DataFrame& df, Nullable<Rcpp::CharacterVector> vars=R_NilValue) {
    // Nothing selected: hand the input straight back.
    if (vars.isNull()) {
        return df;
    }

    CharacterVector selected(vars);
    for (int i = 0; i < selected.size(); ++i) {
        std::string column;
        column = selected[i];
        NumericVector values = df[column];
        values = values + 1.0;
        df[column] = values;
    }
    return df;
}
with the performance of:
Unit: milliseconds
expr min lq mean median
x1 <<- updateDF1(df, vars = names(df)[-1]) 573.8246 728.4211 990.8680 951.3108
x2 <<- updateDF2(df, vars = names(df)[-1]) 595.7339 694.0645 935.4226 941.7450
x3 <<- updateDF3(df, vars = names(df)[-1]) 197.7855 206.4767 377.4378 225.0290
x4 <<- updateDF(df, vars = names(df)[-1]) 148.5119 149.7321 247.1329 152.3744