Search code examples
r dataframe recode categorization

Creating one merged variable from multiple separate ones


Any help would be greatly appreciated

I have a file exported from a PCR plate software. I have already coded the call for all alleles and have now merged them into one data frame.

I need to create a new variable merging the 3 alleles (G1-1, G1-2, and G2) to get a final genotype.

I then need to count the occurrences of the alleles to generate the other 3 APOL1 risk variables that I need.


Allele logic for final genotype:

+/G2 = (G1-1-1(+) & G1-1-2(+)) & (G1-2-1(+) & G1-2-2(+)) & (occurrence of (G2) at either G2-1 or G2-2)

+/+ = (G1-1-1(+) & G1-1-2(+)) & (G1-2-1(+) & G1-2-2(+)) & (G2-1(+) & G2-2(+))

G2/G2 = (G1-1-1(+) & G1-1-2(+)) & (G1-2-1(+) & G1-2-2(+)) & (G2-1(G2) & G2-2(G2))

G1^GM/+ = (occurrence of (G1^S342G) at either G1-1-1 or G1-1-2) & (occurrence of (G1^I384M) at either G1-2-1 or G1-2-2) & (G2-1(+) & G2-2(+))

G1^G+/+ = (occurrence of (G1^S342G) at either G1-1-1 or G1-1-2) & (G1-2-1(+) & G1-2-2(+)) & (G2-1(+) & G2-2(+))

G1^GM/G1^GM = (occurrence of (G1^S342G) at both G1-1-1 and G1-1-2) & (occurrence of (G1^I384M) at both G1-2-1 and G1-2-2) & (G2-1(+) & G2-2(+))

G1^GM/G2 = (occurrence of (G1^S342G) at either G1-1-1 or G1-1-2) & (occurrence of (G1^I384M) at either G1-2-1 or G1-2-2) & (occurrence of (G2) at either G2-1 or G2-2)

G1^G+/G2 = (occurrence of (G1^S342G) at either G1-1-1 or G1-1-2) & (G1-2-1(+) & G1-2-2(+)) & (occurrence of (G2) at either G2-1 or G2-2)

Original dataframe

Final dataframe needed

Original Dataframe structure

Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   28 obs. of  6 variables:
 $ G1-1-1   : chr  "+" "+" "+" "+" ...
 $ G1-1-2   : chr  "+" "+" "+" "+" ...
 $ G1-2-1   : chr  "+" "+" "+" "+" ...
 $ G1-2-2   : chr  "+" "+" "+" "+" ...
 $ G2-1     : chr  "+" "+" "+" "+" ...
 $ G2-2     : chr  "G2" "+" "G2" "G2" ...
The APOL1 Risk variables logic is below:

If (+/+) categorize as 1 in "no APOL1 Risk Alleles"

If (+/G2) or (G1^GM/+) or (G1^G+/+) categorize as 1 in "1 APOL1 Risk Alleles"

If (G1^GM/G1^GM) or (G1^GM/G2) or (G2/G2) categorize as 1 in "2 APOL1 Risk Alleles" 

Solution

  • You can implement the logic easily using dplyr functions.

    Given the logic you described, the following code implements it:

    library(dplyr)
    
    # Example genotype calls for 28 samples, one column per allele call.
    # Every call defaults to the wild type "+"; only the sample positions that
    # carry a variant allele are listed explicitly.
    data <- local({
      n_samples <- 28L

      # Wild-type vector with `allele` written at the sample positions `at`.
      calls_with <- function(allele, at) {
        calls <- rep("+", n_samples)
        calls[at] <- allele
        calls
      }

      data.frame(
        G1_1_1 = calls_with("G1S342G", c(8, 10, 15, 16, 18, 22, 25, 27, 28)),
        G1_1_2 = calls_with("G1S342G", 8),
        G1_2_1 = calls_with("G1I384M", c(8, 10, 15, 16, 18, 25, 27, 28)),
        G1_2_2 = calls_with("G1I384M", 8),
        G2_1 = rep("+", n_samples),
        G2_2 = calls_with("G2", c(1, 3, 4, 5, 9, 10, 12, 14, 15, 19, 21)),
        stringsAsFactors = FALSE
      )
    })
    
    # Derive the combined APOL1 genotype from the six allele calls, then flag
    # which risk-allele category each genotype falls into. Everything is built
    # inside local() so the predicate helpers do not leak into the caller's
    # environment.
    result <- local({
      # TRUE where both calls of a locus equal `allele`.
      both <- function(call_1, call_2, allele) {
        call_1 == allele & call_2 == allele
      }

      # TRUE where at least one call of a locus equals `allele`.
      either <- function(call_1, call_2, allele) {
        call_1 == allele | call_2 == allele
      }

      data %>%
        mutate(
          # case_when() takes the first matching rule, so each "both" rule
          # must stay ahead of the looser "either" rule that would also match.
          `Final genotype of APOL1` = case_when(
            # +/+ : wild type at G1-1, G1-2 and G2
            both(G1_1_1, G1_1_2, "+") &
              both(G1_2_1, G1_2_2, "+") &
              both(G2_1, G2_2, "+") ~ "+/+",

            # G2/G2 : wild type at G1-1 and G1-2, G2 on both G2 calls
            both(G1_1_1, G1_1_2, "+") &
              both(G1_2_1, G1_2_2, "+") &
              both(G2_1, G2_2, "G2") ~ "G2/G2",

            # G1^GM/G1^GM : S342G on both G1-1 calls, I384M on both G1-2
            # calls, wild type at G2
            both(G1_1_1, G1_1_2, "G1S342G") &
              both(G1_2_1, G1_2_2, "G1I384M") &
              both(G2_1, G2_2, "+") ~ "G1GM/G1GM",

            # +/G2 : wild type at G1-1 and G1-2, G2 on either G2 call
            both(G1_1_1, G1_1_2, "+") &
              both(G1_2_1, G1_2_2, "+") &
              either(G2_1, G2_2, "G2") ~ "+/G2",

            # G1^G+/+ : S342G on either G1-1 call, wild type at G1-2 and G2
            either(G1_1_1, G1_1_2, "G1S342G") &
              both(G1_2_1, G1_2_2, "+") &
              both(G2_1, G2_2, "+") ~ "G1G+/+",

            # G1^G+/G2 : S342G on either G1-1 call, wild type at G1-2,
            # G2 on either G2 call
            either(G1_1_1, G1_1_2, "G1S342G") &
              both(G1_2_1, G1_2_2, "+") &
              either(G2_1, G2_2, "G2") ~ "G1G+/G2",

            # G1^GM/+ : S342G on either G1-1 call, I384M on either G1-2
            # call, wild type at G2
            either(G1_1_1, G1_1_2, "G1S342G") &
              either(G1_2_1, G1_2_2, "G1I384M") &
              both(G2_1, G2_2, "+") ~ "G1GM/+",

            # G1^GM/G2 : S342G on either G1-1 call, I384M on either G1-2
            # call, G2 on either G2 call
            either(G1_1_1, G1_1_2, "G1S342G") &
              either(G1_2_1, G1_2_2, "G1I384M") &
              either(G2_1, G2_2, "G2") ~ "G1GM/G2",

            # Any combination not covered above gets no genotype.
            TRUE ~ NA_character_
          )
        ) %>%
        mutate(
          # Each flag is 1 when the genotype is in its category, NA otherwise.
          # NOTE(review): "G1G+/G2" appears in none of the lists below, so
          # those rows are NA in all three columns - confirm this is intended.
          `no APOL1 Risk Alleles` =
            ifelse(`Final genotype of APOL1` == "+/+", 1, NA),
          `1 APOL1 Risk Alleles` =
            ifelse(
              `Final genotype of APOL1` %in% c("+/G2", "G1GM/+", "G1G+/+"),
              1,
              NA
            ),
          `2 APOL1 Risk Alleles` =
            ifelse(
              `Final genotype of APOL1` %in%
                c("G1GM/G1GM", "G1GM/G2", "G2/G2"),
              1,
              NA
            )
        )
    })
    
    # Print a compact, column-wise preview of `result`; the output this call
    # produced is reproduced in the comments that follow.
    glimpse(result)
    
    # Observations: 28
    # Variables: 10
    # $ G1_1_1                    <chr> "+", "+", "+", "+", "+", "+", "+", "G1S342G", "+", "G1S342G", ...
    # $ G1_1_2                    <chr> "+", "+", "+", "+", "+", "+", "+", "G1S342G", "+", "+", "+", "...
    # $ G1_2_1                    <chr> "+", "+", "+", "+", "+", "+", "+", "G1I384M", "+", "G1I384M", ...
    # $ G1_2_2                    <chr> "+", "+", "+", "+", "+", "+", "+", "G1I384M", "+", "+", "+", "...
    # $ G2_1                      <chr> "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+...
    # $ G2_2                      <chr> "G2", "+", "G2", "G2", "G2", "+", "+", "+", "G2", "G2", "+", "...
    # $ `Final genotype of APOL1` <chr> "+/G2", "+/+", "+/G2", "+/G2", "+/G2", "+/+", "+/+", "G1GM/G1G...
    # $ `no APOL1 Risk Alleles`   <dbl> NA, 1, NA, NA, NA, 1, 1, NA, NA, NA, 1, NA, 1, NA, NA, NA, 1, ...
    # $ `1 APOL1 Risk Alleles`    <dbl> 1, NA, 1, 1, 1, NA, NA, NA, 1, NA, NA, 1, NA, 1, NA, 1, NA, 1,...
    # $ `2 APOL1 Risk Alleles`    <dbl> NA, NA, NA, NA, NA, NA, NA, 1, NA, 1, NA, NA, NA, NA, 1, NA, N...