Search code examples
rspotfire

R - generate dynamic number of columns and substring column values


looking for some help with data manipulation in R. I have data in the following format;

ID  L1  L2  L3
1   BBCBCACCBCB CBCBBBB BEBBBAAB
2   BBCBCCCCBCB CBCCCBC BBAACCCB
3   BBCBCACCBCB CBCBBBB BEBBBAAB
4   BBCBCACCBCB CBCBBBB BEBBBAAB
5   BBCBACBCCCB BBCCCBC BBCBAAAAB
6   BBCBBCCBBCB BBCBCEB BBBBCAACB
7   BBCBBCCBBCB BBCBCEB BBBBCAACB
8           
9   BBCBCACCBCB CBCBBBB BEBBBAAB
10  BBCBBCCBBCB BBCBCEB BBBBCAACB
11  BBCBBCCBBCB BBCBCEB BBBBCAACB

The values in each column will be strings of varying length. I want an R function that for each column above, will

1) generate a dynamic number of columns based on the maximum length of any string in the column e.g. L1 max length = 11, therefore 11 new columns each labelled L1_1:L1_11

2) then split the strings into triplets, e.g.

ID  L1  L2  L3  L1_1    L1_2    L1_3    L1_4    L1_5    L1_6    L1_7    L1_8    L1_9
1   BBCBCACCBCB CBCBBBB BEBBBAAB    BBC BCB CBC BCA CAC ACC CCB CBC BCB

3) perform a calculation on this triplet i.e. (number of 'a' * 1) + (number of 'b' * 3) + (number of 'c'*7) in the triplet.

4) return the value of this calculation in the new column.

I have found that the code suggested does exactly what I need when run for columns L1, L2 but does not work for L3. The error I receive is 'Error in as.data.frame.matrix(passed.args[[i]], stringsAsFactors = st : missing value where TRUE/FALSE needed'

Any ideas? Thanks very much.

EDIT

dput(df):

structure(list(ID = 1:11, L1 = structure(c(4L, 5L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 3L, 3L), .Label = c("", "BBCBACBCCCB","BBCBBCCBBCB","BBCBCACCBCB", "BBCBCCCCBCB"), class = "factor"), L2 = structure(c(4L, 5L, 4L, 4L, 3L, 2L, 2L, 1L, 4L, 2L, 2L), .Label = c("","BBCBCEB","BBCCCBC", "CBCBBBB", "CBCCCBC"), class = "factor"), L3 = structure(c(5L,2L, 5L, 5L, 4L, 3L, 3L, 1L, 5L, 3L, 3L), .Label = c("", "BBAACCCB", "BBBBCAACB", "BBCBAAAAB", "BEBBBAAB"), class = "factor")), .Names = c("ID", "L1", "L2", "L3"), class = "data.frame", row.names = c(NA,-11L))

structure(list(ID = 1:11, L1 = structure(c(4L, 5L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 3L, 3L), .Label = c("", "BBCBACBCCCB","BBCBBCCBBCB","BBCBCACCBCB", "BBCBCCCCBCB"), class = "factor"), L2 = structure(c(4L, 5L, 4L, 4L, 3L, 2L, 2L, 1L, 4L, 2L, 2L), .Label = c("","BBCBCEB","BBCCCBC", "CBCBBBB", "CBCCCBC"), class = "factor"), L3 = structure(c(5L,2L, 5L, 5L, 4L, 3L, 3L, 1L, 5L, 3L, 3L), .Label = c("", "BBAACCCB", "BBBBCAACB", "BBCBAAAAB", "BEBBBAAB"), class = "factor")), .Names = c("ID", "L1", "L2", "L3"), class = "data.frame", row.names = c(NA,-11L))


Solution

  • #DATA
    df = structure(list(ID = 1:4, L1 = c("abbbcc", "aabacd", "abbda", 
    "bbad")), .Names = c("ID", "L1"), class = "data.frame", row.names = c(NA, 
    -4L))
    
    #Go through the strings and split into subgroups of 3 characters.
    #Put the substrings in a list
    temp = lapply(df$L1, function(x) sapply(3:nchar(x), function(i) substr(x, i-2, i)))
    
    #Obtain the length of the subgroup with the most triplets
    temp_l = max(lengths(temp))
    
    #Subset the subgroups from 1 to temp_l so that remianing values are NA
    cbind(df, setNames(data.frame(do.call(rbind, lapply(temp, function(a)
        a[1:temp_l]))), nm = paste0("L1_",1:temp_l)))
    #  ID     L1 L1_1 L1_2 L1_3 L1_4
    #1  1 abbbcc  abb  bbb  bbc  bcc
    #2  2 aabacd  aab  aba  bac  acd
    #3  3  abbda  abb  bbd  bda <NA>
    #4  4   bbad  bba  bad <NA> <NA>
    

    If you want calculation based on triplets, run the following before doing the cbind step

    temp_L1 = lapply(df$L1, function(x) sapply(3:nchar(x), function(i) substr(x, i-2, i)))
    temp_L1_length = max(lengths(temp_L1))
    temp_L1 = lapply(temp_L1, function(x)
                 sapply(x, function(y){
                         num_a = unlist(gregexpr(pattern = "a", text = y))
                         num_a = sum(num_a > 0)  #length of positive match
                         num_b = unlist(gregexpr(pattern = "b", text = y))
                         num_b = sum(num_b > 0)
                         num_c = unlist(gregexpr(pattern = "c", text = y))
                         num_c = sum(num_c > 0)
                         num_a * 1 + num_b * 3 + num_c * 7
                     })
             )
    temp_L1 = setNames(data.frame(do.call(rbind, lapply(temp_L1, function(a)
                  a[1:temp_L1_length]))), nm = paste0("L1_",1:temp_L1_length))
    
    #REPEAT FOR L2, L3, ...
    
    cbind(df, temp_L1)   #Run cbind(df, temp_L1, temp_L2, ...)
    #  ID     L1 L1_1 L1_2 L1_3 L1_4
    #1  1 abbbcc    7    9   13   17
    #2  2 aabacd    5    5   11    8
    #3  3  abbda    7    6    4   NA
    #4  4   bbad    7    4   NA   NA
    

    UPDATE

    You could create a function and use it like shown below

    #FUNCTION
    foo = function(data, column){
        temp_L1 = lapply(as.character(data[[column]]), function(x) sapply(3:nchar(x), function(i) substr(x, i-2, i)))
        temp_L1_length = max(lengths(temp_L1))
        temp_L1 = lapply(temp_L1, function(x)
            sapply(x, function(y){
                num_a = unlist(gregexpr(pattern = "a", text = y, ignore.case = TRUE))
                num_a = sum(num_a > 0)  #length of positive match
                num_b = unlist(gregexpr(pattern = "b", text = y, ignore.case = TRUE))
                num_b = sum(num_b > 0)
                num_c = unlist(gregexpr(pattern = "c", text = y, ignore.case = TRUE))
                num_c = sum(num_c > 0)
                num_a * 1 + num_b * 3 + num_c * 7
            })
        )
        temp_L1 = setNames(data.frame(do.call(rbind, lapply(temp_L1, function(a)
            a[1:temp_L1_length]))), nm = paste0(column,"_",1:temp_L1_length))
        return(temp_L1)
    }
    
    #USING ON NEW DATA
    cbind(df, do.call(cbind, lapply(colnames(df)[-1], function(x) foo(data = df, column = x))))