Search code examples
rfunctioncomposition

R fuction composition for the substitution of values in dataframe


given the following reproducible example

my objective is to row-wise substitute the original values with NA in adjacent columns of a data frame; I know it's a problem (with so many variants) already posted but I've not yet found the solution with the approach I'm trying to accomplish: i.e. by applying a function composition

in the reproducible example the column driving the substitution with NA of the original values is column a

this is what I've done so far

the very last code snippet is a failing attempt of what I'm actually searching for...

#-----------------------------------------------------------
# ifelse approach, it works but...
# it's error prone: i.e. copy and paste for all columns can introduce a lot of troubles

df<-data.frame(a=c(1, 2, NA), b=c(3, NA, 4), c=c(NA, 5, 6))
df

df$b<-ifelse(is.na(df$a), NA, df$b)
df$c<-ifelse(is.na(df$a), NA, df$c)

df

#--------------------------------------------------------
# extraction and subsitution approach
# same as above

df<-data.frame(a=c(1, 2, NA), b=c(3, NA, 4), c=c(NA, 5, 6))
df

df$b[is.na(df$a)]<-NA
df$c[is.na(df$a)]<-NA

df

#----------------------------------------------------------
# definition of a function
# it's a bit better, but still error prone because of the copy and paste

df<-data.frame(a=c(1, 2, NA), b=c(3, NA, 4), c=c(NA, 5, 6))
df

fix<-function(x,y){
  ifelse(is.na(x), NA, y)
}

df$b<-fix(df$a, df$b)
df$c<-fix(df$a, df$c)

df

#------------------------------------------------------------
# this approach is not working as expected!
# the idea behind is of function composition;
# lapply does the fix to some columns of data frame

df<-data.frame(a=c(1, 2, NA), b=c(3, NA, 4), c=c(NA, 5, 6))
df

fix2<-function(x){
  x[is.na(x[1])]<-NA
  x
}

df[]<-lapply(df, fix2)

df

any help for this particular approach? I'm stuck on how to properly conceive the substitute function passed to lapply

thanx


Solution

  • Using lexical closure

    If you use lexical closureing - you define a function which generates first the function you need. And then you can use this function as you wish.

    # given a column all other columns' values at that row should become NA
    # if the driver column's value at that row is NA
    
    # using lexical scoping of R function definitions, one can reach that.
    
    df<-data.frame(a=c(1, 2, NA), b=c(3, NA, 4), c=c(NA, 5, 6))
    df
    
    # whatever vector given, this vector's value should be changed
    # according to first column's value
    
    na_accustomizer <- function(df, driver_col) {
      ## Returns a function which will accustomize any vector/column
      ## to driver column's NAs
      function(vec) {
        vec[is.na(df[, driver_col])] <- NA
        vec
      }
    }
    
    df[] <- lapply(df, na_accustomizer(df, "a"))
    
    df
    ##    a  b  c
    ## 1  1  3 NA
    ## 2  2 NA  5
    ## 3 NA NA NA
    
    # 
    # na_accustomizer(df, "a") returns
    # 
    #   function(vec) {
    #     vec[is.na(df[, "a"])] <- NA
    #     vec
    #   }
    # 
    # which then can be used like you want:
    # df[] <- lapply(df, na_accustomize(df, "a"))
    

    Using normal functions

    df<-data.frame(a=c(1, 2, NA), b=c(3, NA, 4), c=c(NA, 5, 6))
    df
    
    # define it for one column
    overtake_NA <- function(df, driver_col, target_col) {
      df[, target_col] <- ifelse(is.na(df[, driver_col]), NA, df[, target_col])
      df
    }
    
    # define it for all columns of df
    overtake_driver_col_NAs <- function(df, driver_col) {
      for (i in 1:ncol(df)) {
        df <- overtake_NA(df, driver_col, i)
      }
      df
    }
    
    overtake_driver_col_NAs(df, "a")
    #    a  b  c
    # 1  1  3 NA
    # 2  2 NA  5
    # 3 NA NA NA
    
    

    Generalize for any predicate function

    driver_col_to_other_cols <- function(df, driver_col, pred) {
      ## overtake any value of the driver column to the other columns of df,
      ## whenever predicate function (pred) is fulfilled.
      # define it for one column
      overtake_ <- function(df, driver_col, target_col, pred) {
        selectors <- do.call(pred, list(df[, driver_col]))
        if (deparse(substitute(pred)) != "is.na") {
          # this is to 'recorrect' NA's which intrude into the selector vector
          # then driver_col has NAs. For sure "is.na" is not the only possible
          # way to check for NA - so this edge case is not covered fully
          selectors[is.na(selectors)] <- FALSE
        }
        df[, target_col] <- ifelse(selectors, df[, driver_col], df[, target_col])
        df
      }
      for (i in 1:ncol(df)) {
        df <- overtake_(df, driver_col, i, pred)
      }
      df
    }
    
    
    driver_col_to_other_cols(df, "a", function(x) x == 1)
    #    a  b c
    # 1  1  1 1
    # 2  2 NA 5
    # 3 NA  4 6
    
    ## if the "is.na" check is not done, then this would give
    ## (because of NA in selectorvector):
    #    a  b  c
    # 1  1  1  1
    # 2  2 NA  5
    # 3 NA NA NA
    ## hence in the case that pred doesn't check for NA in 'a',
    ## these NA vlaues have to be reverted to the original columns' value.
    
    driver_col_to_other_cols(df, "a", is.na)
    #    a  b  c
    # 1  1  3 NA
    # 2  2 NA  5
    # 3 NA NA NA