Search code examples
rrcpprcpparmadillo

Is there an efficient way to obtain "pmax" other than using the R base function?


I would like to create a function using Rcpp that could outperform the pmax function from R base. I also tried to handle missing values inside of Rcpp function, and this might not be a good idea. All vector must have some missing values and they are all positive values. That is the reason I recoded missing to -1, so I could add it back in case a maximum value does not exists if all values are missing.

This was my first attempt, but no success yet:

library("benchr")
library("Rcpp")

Pmax <- function(...) {
  argd_list <- list(...)
  cppFunction("
  NumericVector cpp_pmax(List args) {
    List args0 = args[0];
    int n_arg = args.length();
    int n_vec = args0.length();
    NumericVector out(n_vec);
    out = args[0];
    for (int i = 1; i < n_arg; ++i) {
        NumericVector pa(n_vec);
        pa = args[i];
        for (int j = 0; j < n_vec; ++j) {
          if (R_IsNA(out[j])) {
            out[j] = -1;
          }
          if (R_IsNA(pa[j])) {
            pa[j] = -1;
          }
          out[j] = std::max(out[j], pa[j]);
        }
    }
    for (int j = 0; j < n_vec; ++j) {
      if (out[j] == -1) {
        out[j] = NA_REAL;
      }
    }
    return out;
  }
")
  output <- cpp_pmax(argd_list)
  return(output)
}


n <- 200000
x1 <- sample(0:1, n, replace = TRUE)
y1 <- sample(0:1, n, replace = TRUE)
z1 <- sample(0:1, n, replace = TRUE)
x1[sample(1:n, 90)]<-NA
y1[sample(1:n, 60)]<-NA
z1[sample(1:n, 70)]<-NA

pm1 <- Pmax(x1, y1, z1)
pm2 <- pmax(x1, y1, z1, na.rm = TRUE)

all(pm1 == pm2)

benchr::benchmark(pmax(x1, y1, z1, na.rm = TRUE),
                  Pmax(x1, y1, z1))

Benchmark summary:
  Time units : milliseconds 
expr                           n.eval   min lw.qu median  mean up.qu   max total relative
pmax(x1, y1, z1, na.rm = TRUE)    100  1.34  1.37   1.39  1.44  1.46  1.74   144     1.00
Pmax(x1, y1, z1)                  100 13.30 13.50  13.80 19.90 15.70 67.50  1990     9.88

Edit:

I've removed some loops and just replaced the -1 by NA outside of Rcpp, and it speed up a little bit, but still not outperform the R base pmax.

Although Rcpp::pmax is a nice implementation it only handles two vector and not sure if it can handle missing values. I got different results when there are missing vales.

The second attempt was:

Pmax1 <- function(...) {
  args_list <- list(...)
  cppFunction("
  NumericVector cpp_pmax(List args) {
    List args0 = args[0];
    int n_arg = args.length();
    int n_vec = args0.length();
    NumericVector out(n_vec);
    out = args[0];
    for (int i = 1; i < n_arg; ++i) {
        NumericVector pa(n_vec);
        pa = args[i];
        for (int j = 0; j < n_vec; ++j) {
          if (R_IsNA(out[j])) {
            out[j] = -1;
          }
          if (R_IsNA(pa[j])) {
            pa[j] = -1;
          }
          out[j] = std::max(out[j], pa[j]);
        }
    }
    return out;
  }
")
  output <- cpp_pmax(args_list)
  output[output == -1] <- NA
  return(output)
}

Pmax2 <- function(...) {
  args_list <- list(...)
  cppFunction("
  NumericVector cpp_pmax(List args) {
    NumericVector out = args[0];
    int n_arg = args.length();
    int n_vec = out.length();
    for (int j = 0; j < n_vec; ++j) {
      if (NumericVector::is_na(out[j])) out[j] = -1;
    }
    for (int i = 1; i < n_arg; ++i) {
      NumericVector pa = args[i];
      for (int j = 0; j < n_vec; ++j) {
        if (NumericVector::is_na(pa[j])) pa[j] = -1;
        out[j] = std::max(out[j], pa[j]);
      }
    }
    return out;
  }
")
  output <- cpp_pmax(args_list)
  output[output == -1] <- NA
  return(output)
}

n <- 200000
x <- sample(0:5, n, replace = TRUE)
y <- sample(0:5, n, replace = TRUE)
z <- sample(0:5, n, replace = TRUE)
w <- sample(0:5, n, replace = TRUE)
x[sample(1:n, 900)]<-NA
y[sample(1:n, 600)]<-NA
z[sample(1:n, 700)]<-NA
z[sample(1:n, 800)]<-NA

benchr::benchmark(pmax(x,  y, z, w, na.rm = TRUE),
                  Pmax1(x,  y, z, w),
                  Pmax2(x, y, z, w))

Benchmark summary:
  Time units : milliseconds 
                          expr n.eval   min lw.qu median  mean up.qu  max total relative
pmax(x, y, z, w, na.rm = TRUE)    100  2.38  2.43   2.46  2.46  2.48  2.6   246     1.00
Pmax1(x, y, z, w)                 100 16.00 16.90  17.20 19.40 17.70 61.2  1940     6.98
Pmax2(x, y, z, w)                 100  9.44  9.74   9.90 11.30 10.10 45.6  1130     4.02

Does anyone has an idea on how to make it faster than the R base pmax?

The idea was to have a generalized function to handle different number of vectors, all inside Rcpp function.

Update based on @DirkEddelbuettel and @Cole answer

Thank you for helping in optimize the code. Inspired by @DirkEddelbuettel and @Cole answers I just add the Rcpp::pmax to remove one of the loop and it helped also to speed it up.

library("bench")
library("Rcpp")

cppFunction("
  IntegerVector cpp_pmax1(List args) {
    IntegerVector tmp = args[0];
    IntegerVector out = clone(tmp);
    int n_arg = args.length();
    int n_vec = out.length();
    for (int i = 1; i < n_arg; ++i) {
      IntegerVector pa = args[i];
      for (int j = 0; j < n_vec; ++j) {
        if (pa[j] > out[j]) out[j] = pa[j];
      }
    }
    return out;
  }
")

cppFunction("
  IntegerVector cpp_pmax2(List args) {
    IntegerVector tmp = args[0];
    IntegerVector out = clone(tmp);
    int n_arg = args.length();
    int n_vec = out.length();
    for (int i = 1; i < n_arg; ++i) {
      IntegerVector pa = args[i];
      out = pmax(out, pa);
    }
    return out;
  }
")

Pmax1 <- function(...) {
  cpp_pmax1(list(...))
}


Pmax2 <- function(...) {
  cpp_pmax2(list(...))
}


n <- 200000
x <- sample(0:5, n, replace = TRUE)
y <- sample(0:5, n, replace = TRUE)
z <- sample(0:5, n, replace = TRUE)
w <- sample(0:5, n, replace = TRUE)
k <- sample(0:5, n, replace = TRUE)
x[sample(1:n, 900)] <- NA
y[sample(1:n, 600)] <- NA
z[sample(1:n, 700)] <- NA
w[sample(1:n, 800)] <- NA
k[sample(1:n, 800)] <- NA

pm0 <- pmax(x,  y, z, w, k, na.rm = TRUE)
pm1 <- Pmax1(x, y, z, w, k)
pm2 <- Pmax2(x, y, z, w, k)

benchr::benchmark(pmax(x,  y, z, w, k, na.rm = TRUE),
                  Pmax1(x, y, z, w, k),
                  Pmax2(x, y, z, w, k))


Benchmark summary:
  Time units : microseconds 
                             expr n.eval  min lw.qu median mean up.qu  max  total relative
pmax(x, y, z, w, k, na.rm = TRUE)    100 2880  2900   2920 3050  3080 8870 305000     5.10
Pmax1(x, y, z, w, k)                 100 2150  2180   2200 2310  2350 8060 231000     3.85
Pmax2(x, y, z, w, k)                 100  527   558    572  812   719 7870  81200     1.00
  

Thank you!


Solution

  • There seem to be a few issues that memory allocations that can be seen from bench::mark uncover.

    bench::mark(pmax(x,  y, z, w, na.rm = TRUE),
                Pmax2(x, y, z, w))
    
    ## # A tibble: 2 x 13
    ##   expression                         min  median `itr/sec` mem_alloc
    ##   <bch:expr>                     <bch:t> <bch:t>     <dbl> <bch:byt>
    ## 1 pmax(x, y, z, w, na.rm = TRUE)  5.79ms  6.28ms     157.    781.3KB
    ## 2 Pmax2(x, y, z, w)              39.56ms 54.48ms      19.7    9.18MB
    

    Memory Coercion

    There is 10 times the memory allocation in comparison to base pmax(). Your is relatively straight forward, so this hints that there is some kind of coercion. And when looking at your sample data, you are sending integer vectors to a numeric signature. This creates a costly coercion. Let's update the signature and code to expect IntegerVectors. I simply changed everything from NumericVector to IntegerVector for this.

      expression                         min  median `itr/sec` mem_alloc
      <bch:expr>                     <bch:t> <bch:t>     <dbl> <bch:byt>
    1 pmax(x, y, z, w, na.rm = TRUE)  1.89ms  2.33ms     438.    781.3KB
    2 Pmax2_int(x, y, z, w)          37.42ms 49.88ms      17.6    2.32MB
    

    Re-Compilation

    The OP code includes cppFunction within the larger function code. Unless we need to recompile it every loop, we can instead compile and then call the compiled code from R. This is the biggest performance boost for this dataset size.

    cppFunction("
      IntegerVector cpp_pmax_pre(List args) {
        IntegerVector out = args[0];
        int n_arg = args.length();
        int n_vec = out.length();
        for (int j = 0; j < n_vec; ++j) {
          if (IntegerVector::is_na(out[j])) out[j] = -1;
        }
        for (int i = 1; i < n_arg; ++i) {
          IntegerVector pa = args[i];
          for (int j = 0; j < n_vec; ++j) {
            if (IntegerVector::is_na(pa[j])) pa[j] = -1;
            out[j] = std::max(out[j], pa[j]);
          }
        }
        return out;
      }
    ")
    
    Pmax2_int_pre <- function(...) {
      args_list <- list(...)
      output <- cpp_pmax_pre(args_list)
      output[output == -1] <- NA
      return(output)
    }
    
    bench::mark(pmax(x,  y, z, w, na.rm = TRUE),
                Pmax2_int_pre(x, y, z, w))
    
    ## # A tibble: 2 x 13
    ##   expression                        min median `itr/sec` mem_alloc
    ##   <bch:expr>                     <bch:> <bch:>     <dbl> <bch:byt>
    ## 1 pmax(x, y, z, w, na.rm = TRUE) 2.31ms 2.42ms      397.   781.3KB
    ## 2 Pmax2_int_pre(x, y, z, w)      2.48ms 3.55ms      270.    2.29MB
    

    More memory and small optimizations

    Finally, we still have more memory allocated. That hints we can do more - in this case we should update NA_REAL in . Related, we can optimize the loop assignment some.

    cppFunction("
      IntegerVector cpp_pmax_final(List args) {
        IntegerVector out = args[0];
        int n_arg = args.length();
        int n_vec = out.length();
        for (int j = 0; j < n_vec; ++j) {
          if (IntegerVector::is_na(out[j])) out[j] = -1;
        }
        for (int i = 1; i < n_arg; ++i) {
          IntegerVector pa = args[i];
          for (int j = 0; j < n_vec; ++j) {
    // simplify logic; if the element is not na and is greater than the out, update out.
            if (!IntegerVector::is_na(pa[j]) & pa[j] > out[j]) out[j] = pa[j];
          }
        }
    // update now in Rcpp instead of allocating vectors in R
        for (int i = 0; i < n_vec; i++) {
          if(out[i] == -1) out[i] = NA_INTEGER;
        }
        return out;
      }
    ")
    
    Pmax2_final <- function(...) {
      cpp_pmax_final(list(...))
    }
    
    bench::mark(pmax(x,  y, z, w, na.rm = TRUE),
                Pmax2_final(x, y, z, w))
    
    ## # A tibble: 2 x 13
    ##   expression                        min median `itr/sec` mem_alloc
    ##   <bch:expr>                     <bch:> <bch:>     <dbl> <bch:byt>
    ## 1 pmax(x, y, z, w, na.rm = TRUE)    2ms 2.08ms      460.   781.3KB
    ## 2 Pmax2_final(x, y, z, w)        1.19ms 1.45ms      671.    2.49KB
    

    We did it*! I am sure there could be small optimizations - we access pa[j] three times so it may be worthwhile to assign to a variable.

    Bonus - NA_INTEGER

    According to Rcpp for Everyone, the NA_INTEGER should be equivalent to the lowest integer value of -2147483648. Using this, we can remove the replacement of NA's because we can compare directly to NA when dealing with int data types.

    During this realization, I also found an issue with the previous part - we need to clone the initial argument so that we are not accidently changing it by reference. Still, we're still slightly faster than base pmax().

    cppFunction("
      IntegerVector cpp_pmax_last(List args) {
        IntegerVector tmp = args[0];
        IntegerVector out = clone(tmp);
        int n_arg = args.length();
        int n_vec = out.length();
        for (int i = 1; i < n_arg; ++i) {
          IntegerVector pa = args[i];
          for (int j = 0; j < n_vec; ++j) {
            if (pa[j] > out[j]) out[j] = pa[j];
          }
        }
        return out;
      }
    ")
    
    Pmax2_last <- function(...) {
      cpp_pmax_last(list(...))
    }
    
    bench::mark(pmax(x,  y, z, w, na.rm = TRUE),
                Pmax2_last(x, y, z, w),
    )
    
    ## # A tibble: 2 x 13
    ##   expression                        min median `itr/sec` mem_alloc `gc/sec`
    ##   <bch:expr>                     <bch:> <bch:>     <dbl> <bch:byt>    <dbl>
    ## 1 pmax(x, y, z, w, na.rm = TRUE) 5.98ms 6.36ms      154.     781KB        0
    ## 2 Pmax2_last(x, y, z, w)         5.09ms 5.46ms      177.     784KB        0