Search code examples
rrowcolumnsorting

R move columns to be rows of other columns


I am converting a messy excel sheet into usable data. The basic structure is a repeating block of variables with similar items in rows. I want to move columns so they become rows of the first block of columns.

Difficult to describe so here is mwe.

d1 = structure(list(...1 = c("p", "w", "s", NA, "A Ex", NA, "Pres", 
"Time", "HR"), ...2 = c("1", "1", "1", NA, "Walk", NA, NA, NA, 
NA), ...3 = c(NA, NA, NA, NA, NA, NA, "Done", "Time", "HR"), 
    ...4 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA), ...5 = c("p", 
    "w", "s", NA, "A Ex", NA, "Pres", "Time", "HR"), ...6 = c("1", 
    "1", "2", NA, "Walk", NA, NA, NA, NA), ...7 = c(NA, NA, NA, 
    NA, NA, NA, "Done", "Time", "HR"), ...8 = c(NA, NA, NA, NA, 
    NA, NA, NA, NA, NA), ...9 = c("p", "w", "s", NA, "A Ex", 
    NA, "Pres", "Time", "HR"), ...10 = c("1", "1", "3", NA, "Walk", 
    NA, NA, NA, NA), ...11 = c(NA, NA, NA, NA, NA, NA, "Done", 
    "Time", "HR"), ...12 = c(NA, NA, NA, NA, NA, NA, NA, NA, 
    NA)), row.names = c(NA, -9L), class = c("tbl_df", "tbl", 
"data.frame"))

I want to move columns 5:8 and 6:12 so they become additional rows of 1:4 so result is:

df2 = structure(list(...1 = c("p", "w", "s", NA, "A Ex", NA, "Pres", 
"Time", "HR", "p", "w", "s", NA, "A Ex", NA, "Pres", "Time", 
"HR", "p", "w", "s", NA, "A Ex", NA, "Pres", "Time", "HR"), ...2 = c("1", 
"1", "1", NA, "Walk", NA, NA, NA, NA, "1", "1", "2", NA, "Walk", 
NA, NA, NA, NA, "1", "1", "3", NA, "Walk", NA, NA, NA, NA), ...3 = c(NA, 
NA, NA, NA, NA, NA, "Done", "Time", "HR", NA, NA, NA, NA, NA, 
NA, "Done", "Time", "HR", NA, NA, NA, NA, NA, NA, "Done", "Time", 
"HR"), ...4 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA, 
-27L), class = c("tbl_df", "tbl", "data.frame"))

Solution

  • base R

    The basic premise is to split by groups of 4 columns:

    splitframes <- split.default(d1, (seq_along(d1)-1) %/% 4)
    splitframes
    # $`0`
    # # A tibble: 9 x 4
    #   ...1  ...2  ...3  ...4 
    #   <chr> <chr> <chr> <lgl>
    # 1 p     1     <NA>  NA   
    # 2 w     1     <NA>  NA   
    # 3 s     1     <NA>  NA   
    # 4 <NA>  <NA>  <NA>  NA   
    # 5 A Ex  Walk  <NA>  NA   
    # 6 <NA>  <NA>  <NA>  NA   
    # 7 Pres  <NA>  Done  NA   
    # 8 Time  <NA>  Time  NA   
    # 9 HR    <NA>  HR    NA   
    # $`1`
    # # A tibble: 9 x 4
    #   ...1  ...2  ...3  ...4 
    #   <chr> <chr> <chr> <lgl>
    # 1 p     1     <NA>  NA   
    # 2 w     1     <NA>  NA
    

    Unfortunately, rbind.data.frame needs all arguments to have the same names,

    lapply(splitframes, names)
    # $`0`
    # [1] "...1" "...2" "...3" "...4"
    # $`1`
    # [1] "...5" "...6" "...7" "...8"
    # $`2`
    # [1] "...9"  "...10" "...11" "...12"
    

    so we need to fix that first:

    splitframes <- lapply(splitframes, `colnames<-`, names(splitframes[[1]]))
    do.call(rbind, splitframes)
    # # A tibble: 27 x 4
    #    ...1  ...2  ...3  ...4 
    #  * <chr> <chr> <chr> <lgl>
    #  1 p     1     <NA>  NA   
    #  2 w     1     <NA>  NA   
    #  3 s     1     <NA>  NA   
    #  4 <NA>  <NA>  <NA>  NA   
    #  5 A Ex  Walk  <NA>  NA   
    #  6 <NA>  <NA>  <NA>  NA   
    #  7 Pres  <NA>  Done  NA   
    #  8 Time  <NA>  Time  NA   
    #  9 HR    <NA>  HR    NA   
    # 10 p     1     <NA>  NA   
    # # ... with 17 more rows
    

    dplyr

    This package is a little more accommodating:

    splitframes <- split.default(d1, (seq_along(d1)-1) %/% 4)
    dplyr::bind_rows(splitframes)
    # New names:
    # * ...5 -> ...1
    # * ...6 -> ...2
    # * ...7 -> ...3
    # * ...8 -> ...4
    # New names:
    # * ...9 -> ...1
    # * ...10 -> ...2
    # * ...11 -> ...3
    # * ...12 -> ...4
    # # A tibble: 27 x 4
    #    ...1  ...2  ...3  ...4 
    #    <chr> <chr> <chr> <lgl>
    #  1 p     1     <NA>  NA   
    #  2 w     1     <NA>  NA   
    #  3 s     1     <NA>  NA   
    #  4 <NA>  <NA>  <NA>  NA   
    #  5 A Ex  Walk  <NA>  NA   
    #  6 <NA>  <NA>  <NA>  NA   
    #  7 Pres  <NA>  Done  NA   
    #  8 Time  <NA>  Time  NA   
    #  9 HR    <NA>  HR    NA   
    # 10 p     1     <NA>  NA   
    # # ... with 17 more rows
    

    data.table

    splitframes <- split.default(d1, (seq_along(d1)-1) %/% 4)
    library(data.table)
    rbindlist(splitframes)
    # Column 1 ['...5'] of item 2 is missing in item 1. Use fill=TRUE to fill with NA (NULL for list columns), or use.names=FALSE to ignore column names. use.names='check' (default from v1.12.2) emits this message and proceeds as if use.names=FALSE for  backwards compatibility. See news item 5 in v1.12.2 for options to control this message.
    #       ...1   ...2   ...3   ...4
    #     <char> <char> <char> <lgcl>
    #  1:      p      1   <NA>     NA
    #  2:      w      1   <NA>     NA
    #  3:      s      1   <NA>     NA
    #  4:   <NA>   <NA>   <NA>     NA
    

    To silence that message, use instead rbindlist(..., use.names=FALSE).