Search code examples
rdplyrdummy-variableinversion

Assigning values based on varying and mutable criteria


I am generating a dataframe with dummy variables using the fastDummies for later modeling using two character variables.

The original looking something like this:

PID Flag1   Flag2   Var_1   Var_2
0001    0   0   AB  AB
0001    0   0   CD  CD
0001    1   0   EF  EF
0001    0   0   GH  GH
0001    0   0   IJ  IJ
0001    0   1   KL  KL
0001    0   0   MN  MN
0001    1   0   OP  OP
0001    0   0   QR  QR
0001    0   1   ST  ST
0001    0   0   UV  UV
0001    1   0   WX  WX
0001    0   0   YZ  YZ

And fastDummies will generate an output of Var_1 and Var_2 like the following:

PID Flag1   Flag2   Var_1   Var_2   Var_1_AB    Var_1_CD    Var_1_EF    Var_1_GH    Var_1_IJ    Var_1_KL    Var_1_MN    Var_1_OP    Var_1_QR    Var_1_ST    Var_1_UV    Var_1_WX    Var_1_YZ    Var_2_AB    Var_2_CD    Var_2_EF    Var_2_GH    Var_2_IJ    Var_2_KL    Var_2_MN    Var_2_OP    Var_2_QR    Var_2_ST    Var_2_UV    Var_2_WX    Var_2_YZ
0001    0   0   AB  AB  1   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0
0001    0   0   CD  CD  0   1   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0
0001    1   0   EF  EF  0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0
0001    0   0   GH  GH  0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0
0001    0   0   IJ  IJ  0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0
0001    0   1   KL  KL  0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0
0001    0   0   MN  MN  0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0
0001    1   0   OP  OP  0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0
0001    0   0   QR  QR  0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
0001    0   1   ST  ST  0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
0001    0   0   UV  UV  0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0
0001    1   0   WX  WX  0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   1   0
0001    0   0   YZ  YZ  0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   1

However, the data contains flags for encoding whether or not a variable needs to be inverted (in this case, flipped from 0 to 1 or vise-verse), indicated with Flag_1 and Flag_2.

Which would need to look something like this:

PID Flag1   Flag2   Var_1   Var_2   Var_1_AB    Var_1_CD    Var_1_EF    Var_1_GH    Var_1_IJ    Var_1_KL    Var_1_MN    Var_1_OP    Var_1_QR    Var_1_ST    Var_1_UV    Var_1_WX    Var_1_YZ    Var_2_AB    Var_2_CD    Var_2_EF    Var_2_GH    Var_2_IJ    Var_2_KL    Var_2_MN    Var_2_OP    Var_2_QR    Var_2_ST    Var_2_UV    Var_2_WX    Var_2_YZ
1   0   0   AB  AB  1   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0
1   0   0   CD  CD  0   1   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0
1   1   0   EF  EF  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0
1   0   0   GH  GH  0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0
1   0   0   IJ  IJ  0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0
1   0   1   KL  KL  0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
1   0   0   MN  MN  0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0
1   1   0   OP  OP  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0
1   0   0   QR  QR  0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
1   0   1   ST  ST  0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
1   0   0   UV  UV  0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0
1   1   0   WX  WX  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0
1   0   0   YZ  YZ  0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   1

Where a 1 on Flag_1 indicates a flip from 0 to 1 on the dummy variable created using Var_1 and a 1 on Flag_2 is indicative of a 0 to 1 flip on the dummy variable created using Var_2.

I was originally going to create a function to do this that reads in the dataframe and parses values based on strings and column names and deploy it via lapply, but I feel like there's a way to do this using dplyr/tidyverse via mutate_at or mutate_all that can be manipulated with the standard variable names that I'm just missing.

Any suggestions at all on how to accomplish this in a simpler manner would be appreciated.

Some code to generate the aforementioned dataframe:

list.of.packages <-
  c("data.table", "tidyverse", "stringr","janitor","fastDummies")
new.packages <-
  list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
if (length(new.packages))
  install.packages(new.packages)

lapply(list.of.packages, require, character.only = TRUE)

df = data.frame(PID = c(0001,0001,0001,0001,0001,0001,0001,0001,0001,0001,0001,0001,0001), Flag1 = c(0,0,1,0,0,0,0,1,0,0,0,1,0), Flag2 = c(0,0,0,0,0,1,0,0,0,1,0,0,0), Var_1 = c("AB","CD","EF","GH","IJ","KL","MN","OP","QR","ST","UV","WX","YZ"), Var_2 = c("AB","CD","EF","GH","IJ","KL","MN","OP","QR","ST","UV","WX","YZ"))
df <- dummy_cols(df, select_columns = "Var_1", remove_selected_columns=FALSE, omit_colname_prefix=FALSE)
df <- dummy_cols(df, select_columns = "Var_2", remove_selected_columns=FALSE, omit_colname_prefix=FALSE)

Solution

  • Try a two-across method:

    library(dplyr)
    dat1new <- dat1 %>%
      mutate(
        across(starts_with("Var_1_"),
          ~ if_else(Var_1 == sub(".*_", "", cur_column()) & Flag1 > 0, +!., .)),
        across(starts_with("Var_2_"),
          ~ if_else(Var_2 == sub(".*_", "", cur_column()) & Flag2 > 0, +!., .))
      )
    dat1new
    #    PID Flag1 Flag2 Var_1 Var_2 Var_1_AB Var_1_CD Var_1_EF Var_1_GH Var_1_IJ Var_1_KL Var_1_MN Var_1_OP Var_1_QR Var_1_ST Var_1_UV Var_1_WX Var_1_YZ Var_2_AB Var_2_CD Var_2_EF Var_2_GH Var_2_IJ Var_2_KL Var_2_MN Var_2_OP Var_2_QR Var_2_ST Var_2_UV Var_2_WX Var_2_YZ
    # 1    1     0     0    AB    AB        1        0        0        0        0        0        0        0        0        0        0        0        0        1        0        0        0        0        0        0        0        0        0        0        0        0
    # 2    1     0     0    CD    CD        0        1        0        0        0        0        0        0        0        0        0        0        0        0        1        0        0        0        0        0        0        0        0        0        0        0
    # 3    1     1     0    EF    EF        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0        1        0        0        0        0        0        0        0        0        0        0
    # 4    1     0     0    GH    GH        0        0        0        1        0        0        0        0        0        0        0        0        0        0        0        0        1        0        0        0        0        0        0        0        0        0
    # 5    1     0     0    IJ    IJ        0        0        0        0        1        0        0        0        0        0        0        0        0        0        0        0        0        1        0        0        0        0        0        0        0        0
    # 6    1     0     1    KL    KL        0        0        0        0        0        1        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0
    # 7    1     0     0    MN    MN        0        0        0        0        0        0        1        0        0        0        0        0        0        0        0        0        0        0        0        1        0        0        0        0        0        0
    # 8    1     1     0    OP    OP        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0        1        0        0        0        0        0
    # 9    1     0     0    QR    QR        0        0        0        0        0        0        0        0        1        0        0        0        0        0        0        0        0        0        0        0        0        1        0        0        0        0
    # 10   1     0     1    ST    ST        0        0        0        0        0        0        0        0        0        1        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0
    # 11   1     0     0    UV    UV        0        0        0        0        0        0        0        0        0        0        1        0        0        0        0        0        0        0        0        0        0        0        0        1        0        0
    # 12   1     1     0    WX    WX        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0        0        1        0
    # 13   1     0     0    YZ    YZ        0        0        0        0        0        0        0        0        0        0        0        0        1        0        0        0        0        0        0        0        0        0        0        0        0        1
    all.equal(dat1new, dat2)
    # [1] TRUE
    

    Walkthrough:

    • across is the preferred method over mutate_at, it should look similar-enough.
    • Using the trailing _ in starts_with("Var_1_") should match only the columns we want and skip the literal Var_1 column.
    • sub(...) converts each column name to just the two-letter code found in Var_1
    • when the two ("this" column and Var_1) match and Flag1 is set, then use +!. which is a shortcut way to convert an integer to logical, invert it, then back to integer.
    • repeat for _2_ variables

    Data: dat1 is your original (second data code block), dat2 is your desired output (third data code block).

    dat1 <- structure(list(PID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Flag1 = c(0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L), Flag2 = c(0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L), Var_1 = c("AB", "CD", "EF", "GH", "IJ", "KL", "MN", "OP", "QR", "ST", "UV", "WX", "YZ"), Var_2 = c("AB", "CD", "EF", "GH", "IJ", "KL", "MN", "OP", "QR", "ST", "UV", "WX", "YZ"), Var_1_AB = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_CD = c(0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,  0L, 0L), Var_1_EF = c(0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_GH = c(0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_IJ = c(0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_KL = c(0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_MN = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_OP = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L), Var_1_QR = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L), Var_1_ST = c(0L, 0L, 0L, 0L, 0L,  0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L), Var_1_UV = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L), Var_1_WX = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L), Var_1_YZ = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), Var_2_AB = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_CD = c(0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_EF = c(0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_GH = c(0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_IJ = c(0L,  0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_KL = c(0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_MN = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_OP = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L), Var_2_QR = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L), Var_2_ST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L), Var_2_UV = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L), Var_2_WX = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,  1L, 0L), Var_2_YZ = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L)), class = "data.frame", row.names = c(NA, -13L))
    dat2 <- structure(list(PID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Flag1 = c(0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L), Flag2 = c(0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L), Var_1 = c("AB", "CD", "EF", "GH", "IJ", "KL", "MN", "OP", "QR", "ST", "UV", "WX", "YZ"), Var_2 = c("AB", "CD", "EF", "GH", "IJ", "KL", "MN", "OP", "QR", "ST", "UV", "WX", "YZ"), Var_1_AB = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_CD = c(0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,  0L, 0L), Var_1_EF = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_GH = c(0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_IJ = c(0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_KL = c(0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_MN = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_OP = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_QR = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L), Var_1_ST = c(0L, 0L, 0L, 0L, 0L,  0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L), Var_1_UV = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L), Var_1_WX = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_YZ = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), Var_2_AB = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_CD = c(0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_EF = c(0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_GH = c(0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_IJ = c(0L,  0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_KL = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_MN = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_OP = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L), Var_2_QR = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L), Var_2_ST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_UV = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L), Var_2_WX = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,  1L, 0L), Var_2_YZ = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L)), class = "data.frame", row.names = c(NA, -13L))