I have a vector that consists of integers and NA values, and I want to replace each stretch of NA values with an incrementing sequence of integers that starts from the previous non-NA value.
This works but it's slow:
> v=c(NA,NA,5,NA,NA,2,8,NA)
> Reduce(\(i,j)if(is.na(j))i+1 else j,v,accumulate=T)
[1] NA NA 5 6 7 2 8 9
This was about ten times faster in my benchmark but it's a bit clunky:
> r=rle(is.na(v));w=which(r$values[-1]);s=cumsum(r$lengths);v2=v
> v2[bit::vecseq(s[w]+1,s[w+1])]=bit::vecseq(v[s[w]]+1,v[s[w]]+r$lengths[w+1]);v2
[1] NA NA 5 6 7 2 8 9
Is there some faster or simpler option?
Benchmark (edited to add answers to this question):
set.seed(0);v=sample(2e4);v[sample(2e4,1e4)]=NA
vnum=as.numeric(v)
# last observation incremented forward
loif=\(x){r=rle(is.na(x));w=which(r$values[-1]);s=cumsum(r$lengths)
x[bit::vecseq(s[w]+1,s[w+1])]=bit::vecseq(v[s[w]]+1,v[s[w]]+r$lengths[w+1]);x}
fill_incr <- function(v){
idx = which(complete.cases(v))
le = length(v)
d <- diff(c(idx, le + 1))
v[idx[1]:le] <- sequence(d, v[complete.cases(v)])
v
}
library(collapse)
fill_incr_collapse <- function(v){
idx = whichNA(v, invert = TRUE)
le = length(v)
d <- diff(c(idx, le + 1))
v[idx[1]:le] <- sequence(d, na_rm(v))
v
}
# replace two `complete_cases` functions with a single `!is.na`
fill_incr2=\(x){nn=!is.na(x);w=which(nn);l=length(x);d=diff(c(w,l+1))
x[w[1]:l]=sequence(d,x[nn]);x}
cpp11::cpp_source(,'#include "cpp11.hpp"
using namespace cpp11;
[[cpp11::register]]
doubles loif_cpp(doubles xs) {
bool hadnonna = false;
int n = xs.size();
writable::doubles out(n) ;
for (int i = 0; i < n; ++i) {
if (hadnonna && ISNA(xs[i])) {
out[i] = out[i-1] + 1;
} else {
out[i] = xs[i];
if (!ISNA(xs[i])) hadnonna = true;
}
}
return out;
}')
cppFunction('NumericVector na_locf_numeric(NumericVector x) {
int n = x.size();
LogicalVector ina = is_na(x);
for(int i = 1; i<n; i++) {
if(ina[i] == TRUE) {
x[i] = x[i-1] + 1;
}
}
return x;
}')
b=microbenchmark::microbenchmark(times=100,
Reduce(\(i,j)if(is.na(j))i+1 else j,v,accumulate=T),
loif(v),
fill_incr(v),
fill_incr2(v),
fill_incr_collapse(v),
loif_cpp(vnum),
na_locf_numeric(vnum))
o=sort(tapply(b$time,gsub(" +"," ",b$expr),median))
writeLines(sprintf("%.2f %s",o/min(o),names(o)))
The output shows the median time of a hundred runs relative to the fastest option:
1.00 na_locf_numeric(vnum)
2.26 loif_cpp(vnum)
9.80 fill_incr_collapse(v)
11.63 fill_incr2(v)
15.63 fill_incr(v)
20.41 loif(v)
211.83 Reduce(function(i, j) if (is.na(j)) i + 1 else j, v, accumulate = T)
Here's a way with diff
and sequence
:
fill_incr <- function(v){
idx = which(complete.cases(v))
le = length(v)
d <- diff(c(idx, le + 1))
v[idx[1]:le] <- sequence(d, v[complete.cases(v)])
v
}
fill_incr(v)
#[1] NA NA 5 6 7 2 8 9
One can also use collapse
's fast functions:
library(collapse)
fill_incr_collapse <- function(v){
idx = whichNA(v, invert = TRUE)
le = length(v)
d <- diff(c(idx, le + 1))
v[idx[1]:le] <- sequence(d, na_rm(v))
v
}
fill_incr_collapse(v)
Benchmark:
Unit: microseconds
expr min lq mean median uq max neval
loif(v) 1892.7 2702.95 4962.194 3509.45 4521.35 261297.5 1000
fill_incr(v) 1693.6 2466.65 3797.575 3106.20 4044.25 32057.1 1000
fill_incr_collapse(v) 985.2 1434.35 2844.424 1838.70 2377.05 194761.9 1000
Code:
loif=\(x){r=rle(is.na(x));w=which(r$values[-1]);s=cumsum(r$lengths);o=x
o[bit::vecseq(s[w]+1,s[w+1])]=bit::vecseq(x[s[w]]+1,x[s[w]]+r$lengths[w+1]);o}
library(microbenchmark)
v=sample(2e4);v[sample(2e4,1e4)]=NA
b <- microbenchmark(loif(v), fill_incr(v),
fill_incr_collapse(v),
times = 1000)
b