I have a data frame like this:
require(dplyr)
# Five columns of 10 standard-normal draws each: three "x_" and two "y_".
x_1 <- rnorm(10, mean = 0, sd = 1)
x_2 <- rnorm(10, mean = 0, sd = 1)
x_3 <- rnorm(10, mean = 0, sd = 1)
y_1 <- rnorm(10, mean = 0, sd = 1)
y_2 <- rnorm(10, mean = 0, sd = 1)

# Build the data frame straight from the vectors; column names come from
# the variable names.
data <- data.frame(x_1, x_2, x_3, y_1, y_2)

# Inject missing values: rows 1 and 2 of x_1, row 5 of x_2.
data[1:2, "x_1"] <- NA
data[5, "x_2"] <- NA
> data
x_1 x_2 x_3 y_1 y_2
1 NA 0.9272000 0.29439845 -1.7856567 1.6579091
2 NA 0.2346621 1.09837343 0.3731092 0.6111779
3 0.7315300 -0.5579094 -0.08524311 -2.8661310 1.1545358
4 -0.9469221 0.6929277 -2.67173898 0.6391045 -0.5114099
5 1.5408777 NA 1.33386146 -0.5581233 -2.5733381
6 -0.2852210 -0.9532492 0.03750860 -1.0129503 0.3929722
7 -1.3821487 -2.1865094 -0.03039062 0.3960388 -1.5332137
8 -0.9447420 0.2669902 0.65167163 0.4310705 -1.5300816
9 -0.9023479 0.2068130 0.10868635 -1.1652238 -0.4892178
10 -0.9739177 -0.8094084 0.64103491 0.6063812 0.7248394
I need to create a new variable which counts the number of non-missing values in each row for the variables starting with "x_". To do that I used the `mutate` and `across` functions from `dplyr`.
# NOTE: across() applies the lambda to each selected "x_" column separately,
# so sum(is.na(.x)) is computed over a whole column, not over a row.
# Also, is.na() flags the missing values, while the stated goal is to count
# the non-missing ones.
data=data %>% mutate(sum_no_miss=across(.cols = starts_with("x_"),~ sum(is.na(.x))))
I ran the code without getting an error, but I am not getting the output that I want. I am getting this.
Would it be possible to tell what I'm doing wrong?
We may use `rowSums`, which is vectorized and efficient compared to `rowwise` with `sum`:
library(dplyr)
# across() without a function returns the selected "x_" columns as a data
# frame; !is.na() turns it into a logical matrix and rowSums() counts the
# TRUE (non-missing) cells of every row in one vectorized call.
data %>%
  mutate(
    sum_no_miss = rowSums(
      !is.na(across(starts_with("x_")))
    )
  )
-output
x_1 x_2 x_3 y_1 y_2 sum_no_miss
1 NA 0.9272000 0.29439845 -1.7856567 1.6579091 2
2 NA 0.2346621 1.09837343 0.3731092 0.6111779 2
3 0.7315300 -0.5579094 -0.08524311 -2.8661310 1.1545358 3
4 -0.9469221 0.6929277 -2.67173898 0.6391045 -0.5114099 3
5 1.5408777 NA 1.33386146 -0.5581233 -2.5733381 2
6 -0.2852210 -0.9532492 0.03750860 -1.0129503 0.3929722 3
7 -1.3821487 -2.1865094 -0.03039062 0.3960388 -1.5332137 3
8 -0.9447420 0.2669902 0.65167163 0.4310705 -1.5300816 3
9 -0.9023479 0.2068130 0.10868635 -1.1652238 -0.4892178 3
10 -0.9739177 -0.8094084 0.64103491 0.6063812 0.7248394 3
If we want to use `sum`, then we need `rowwise`:
# sum() collapses its whole input to a single number, so switch to row-wise
# evaluation first; c_across() then passes the "x_" values of one row at a
# time. ungroup() drops the row-wise grouping afterwards.
data %>%
  rowwise() %>%
  mutate(
    sum_no_miss = sum(!is.na(c_across(starts_with("x_"))))
  ) %>%
  ungroup()
-output
# A tibble: 10 × 6
x_1 x_2 x_3 y_1 y_2 sum_no_miss
<dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 NA 0.927 0.294 -1.79 1.66 2
2 NA 0.235 1.10 0.373 0.611 2
3 0.732 -0.558 -0.0852 -2.87 1.15 3
4 -0.947 0.693 -2.67 0.639 -0.511 3
5 1.54 NA 1.33 -0.558 -2.57 2
6 -0.285 -0.953 0.0375 -1.01 0.393 3
7 -1.38 -2.19 -0.0304 0.396 -1.53 3
8 -0.945 0.267 0.652 0.431 -1.53 3
9 -0.902 0.207 0.109 -1.17 -0.489 3
10 -0.974 -0.809 0.641 0.606 0.725 3
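In the OP's code, the function `sum` is used within `across`, and `across` loops over each column; thus the result is a sum over each column instead of across a row. In addition, the OP's lambda uses `is.na(.x)` without negation, so it counts the NA elements; counting the non-NA elements requires `!is.na(.x)`.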