Using the across() function in dplyr on a subset of variables

I have a data frame like this:

require(dplyr)

# Example data: five numeric columns of 10 standard-normal draws each.
# The draws are made in the same order as the columns appear (x_1 ... y_2).
x_1 <- rnorm(10, mean = 0, sd = 1)
x_2 <- rnorm(10, mean = 0, sd = 1)
x_3 <- rnorm(10, mean = 0, sd = 1)
y_1 <- rnorm(10, mean = 0, sd = 1)
y_2 <- rnorm(10, mean = 0, sd = 1)

# Build the data frame directly from the vectors. Going through cbind() first
# creates an intermediate matrix, which would force every column to a single
# common type if the columns were ever of mixed types.
data <- data.frame(x_1, x_2, x_3, y_1, y_2)

# Plant missing values: rows 1-2 of x_1 and row 5 of x_2.
data[1:2, "x_1"] <- NA
data[5, "x_2"] <- NA

> data
          x_1        x_2         x_3        y_1        y_2
1          NA  0.9272000  0.29439845 -1.7856567  1.6579091
2          NA  0.2346621  1.09837343  0.3731092  0.6111779
3   0.7315300 -0.5579094 -0.08524311 -2.8661310  1.1545358
4  -0.9469221  0.6929277 -2.67173898  0.6391045 -0.5114099
5   1.5408777         NA  1.33386146 -0.5581233 -2.5733381
6  -0.2852210 -0.9532492  0.03750860 -1.0129503  0.3929722
7  -1.3821487 -2.1865094 -0.03039062  0.3960388 -1.5332137
8  -0.9447420  0.2669902  0.65167163  0.4310705 -1.5300816
9  -0.9023479  0.2068130  0.10868635 -1.1652238 -0.4892178
10 -0.9739177 -0.8094084  0.64103491  0.6063812  0.7248394

I need to create a new variable that counts the number of non-missing values in each row for the variables starting with "x_". To do that, I used the mutate() and across() functions from dplyr.

# Original attempt (does not produce the intended per-row count): across()
# applies the lambda to each selected "x_" column separately, so
# sum(is.na(.x)) is evaluated once per column rather than once per row.
# Note also that is.na() without `!` counts missing, not non-missing, values.
data=data %>% mutate(sum_no_miss=across(.cols = starts_with("x_"),~ sum(is.na(.x))))

I ran the code without getting an error, but I am not getting the output that I want. I am getting this.

Would it be possible to tell me what I'm doing wrong?

Solution

We may use rowSums(), which is vectorized and more efficient than rowwise() with sum():

library(dplyr)
# Per-row count of non-missing values among the "x_" columns.
# across() with only a column selection returns those columns as a data frame;
# !is.na() turns it into a logical matrix and rowSums() counts the TRUEs in
# each row in a single vectorized call.
data %>%
  mutate(
    sum_no_miss = rowSums(!is.na(across(.cols = starts_with("x_"))))
  )

-output

          x_1        x_2         x_3        y_1        y_2 sum_no_miss
1          NA  0.9272000  0.29439845 -1.7856567  1.6579091           2
2          NA  0.2346621  1.09837343  0.3731092  0.6111779           2
3   0.7315300 -0.5579094 -0.08524311 -2.8661310  1.1545358           3
4  -0.9469221  0.6929277 -2.67173898  0.6391045 -0.5114099           3
5   1.5408777         NA  1.33386146 -0.5581233 -2.5733381           2
6  -0.2852210 -0.9532492  0.03750860 -1.0129503  0.3929722           3
7  -1.3821487 -2.1865094 -0.03039062  0.3960388 -1.5332137           3
8  -0.9447420  0.2669902  0.65167163  0.4310705 -1.5300816           3
9  -0.9023479  0.2068130  0.10868635 -1.1652238 -0.4892178           3
10 -0.9739177 -0.8094084  0.64103491  0.6063812  0.7248394           3

If we want to use sum(), then we need rowwise():

# Same per-row count, computed one row at a time.
# rowwise() makes mutate() evaluate its expression separately for every row,
# c_across() gathers that row's "x_" values into a single vector, and
# sum(!is.na(...)) counts the non-missing ones. ungroup() then drops the
# row-wise grouping so later verbs operate on whole columns again.
data %>%
  rowwise() %>%
  mutate(
    sum_no_miss = sum(!is.na(c_across(starts_with("x_"))))
  ) %>%
  ungroup()

-output

# A tibble: 10 × 6
      x_1    x_2     x_3    y_1    y_2 sum_no_miss
    <dbl>  <dbl>   <dbl>  <dbl>  <dbl>       <int>
 1 NA      0.927  0.294  -1.79   1.66            2
 2 NA      0.235  1.10    0.373  0.611           2
 3  0.732 -0.558 -0.0852 -2.87   1.15            3
 4 -0.947  0.693 -2.67    0.639 -0.511           3
 5  1.54  NA      1.33   -0.558 -2.57            2
 6 -0.285 -0.953  0.0375 -1.01   0.393           3
 7 -1.38  -2.19  -0.0304  0.396 -1.53            3
 8 -0.945  0.267  0.652   0.431 -1.53            3
 9 -0.902  0.207  0.109  -1.17  -0.489           3
10 -0.974 -0.809  0.641   0.606  0.725           3

In the OP's code, sum() is used within across(), and across() loops over each column. The result is therefore one sum per column (with is.na() and no negation, the number of NA elements in that column) instead of a count across each row.