Search code examples
Tags: r, function, for-loop, lapply, mapply

Using lapply with multiple lists of DFs as input to a function


I have 3 subjects who underwent an intervention - so I have pre- and post-treatment DFs for all of them. The 6 final datasets are as follows: df_pre_sub101, df_post_sub101, df_pre_sub202, df_post_sub202, df_pre_sub303, df_post_sub303

Below is the code I currently have where I've manually gone in and typed out the dataset names. I'd like to automate this process for when I have more subjects so I don't have to manually go in and type all of these steps for new datasets. Here is the code:

# Read the pre- and post-treatment CSV file of each subject.
df_pre_sub101 <- read_csv("sub101_pre.csv")
df_post_sub101 <- read_csv("sub101_post.csv")
df_pre_sub202 <- read_csv("sub202_pre.csv")
df_post_sub202 <- read_csv("sub202_post.csv")
df_pre_sub303 <- read_csv("sub303_pre.csv")
df_post_sub303 <- read_csv("sub303_post.csv")

# Keep only the columns "State" and "MEP AMP (mV)" in every data frame and
# rename the amplitude column to mepAMP_pre (pre files) or mepAMP_post
# (post files).
df_pre_sub101 <- df_pre_sub101[, c("State", "MEP AMP (mV)")] %>%
  rename("mepAMP_pre" = "MEP AMP (mV)")
df_post_sub101 <- df_post_sub101[, c("State", "MEP AMP (mV)")] %>%
  rename("mepAMP_post" = "MEP AMP (mV)")

df_pre_sub202 <- df_pre_sub202[, c("State", "MEP AMP (mV)")] %>%
  rename("mepAMP_pre" = "MEP AMP (mV)")
df_post_sub202 <- df_post_sub202[, c("State", "MEP AMP (mV)")] %>%
  rename("mepAMP_post" = "MEP AMP (mV)")

df_pre_sub303 <- df_pre_sub303[, c("State", "MEP AMP (mV)")] %>%
  rename("mepAMP_pre" = "MEP AMP (mV)")
# The post column has to be called mepAMP_post: the cbind() step below reads
# df_post_sub303$mepAMP_post by that name.
df_post_sub303 <- df_post_sub303[, c("State", "MEP AMP (mV)")] %>%
  rename("mepAMP_post" = "MEP AMP (mV)")

# Combine the pre and post data per subject (rows are paired by position),
# then prepend a subID column holding the subject's ID.
df_sub101 <- cbind(df_pre_sub101, mepAMP_post = df_post_sub101$mepAMP_post)
df_sub101 <- cbind(subID = 101, df_sub101)

df_sub202 <- cbind(df_pre_sub202, mepAMP_post = df_post_sub202$mepAMP_post)
df_sub202 <- cbind(subID = 202, df_sub202)

df_sub303 <- cbind(df_pre_sub303, mepAMP_post = df_post_sub303$mepAMP_post)
df_sub303 <- cbind(subID = 303, df_sub303)

# Stack the per-subject tables into one data frame.
df_all <- bind_rows(df_sub101, df_sub202, df_sub303)

I was told that instead of a for loop, I can try using lapply. This is what I have so far:

# Read the pre- and post-treatment CSV file of each subject.
df_pre_sub101 <- read_csv("sub101_pre.csv")
df_post_sub101 <- read_csv("sub101_post.csv")
df_pre_sub202 <- read_csv("sub202_pre.csv")
df_post_sub202 <- read_csv("sub202_post.csv")
df_pre_sub303 <- read_csv("sub303_pre.csv")
df_post_sub303 <- read_csv("sub303_post.csv")

# Collect the data frames read above into two parallel lists: element i of
# df.list.pre and element i of df.list.post belong to the same subject.
df.list.pre <- list(df_pre_sub101, df_pre_sub202, df_pre_sub303)
df.list.post <- list(df_post_sub101, df_post_sub202, df_post_sub303)

# Build one subject's table from its pre- and post-treatment data frames.
#
# x: pre-treatment data frame; must contain "State" and "MEP AMP (mV)".
# y: post-treatment data frame; must contain "State" and "MEP AMP (mV)".
#
# Returns the State column of x together with mepAMP_pre (from x) and
# mepAMP_post (from y). Rows are paired by position, not by matching State.
myfunction <- function(x, y) {
  df.pre <- x[, c("State", "MEP AMP (mV)")] %>%
    rename("mepAMP_pre" = "MEP AMP (mV)")
  df.post <- y[, c("State", "MEP AMP (mV)")] %>%
    rename("mepAMP_post" = "MEP AMP (mV)")
  # End on the value itself rather than on an assignment, so the result is
  # returned visibly.
  cbind(df.pre, mepAMP_post = df.post$mepAMP_post)
}

# use lapply to feed in my pre and post DF lists
# NOTE(review): this is the call that fails. lapply()'s first argument is
# named `X` (upper case), so `x = df.list.pre` is absorbed by `...` and `X`
# stays unset. `y = df.list.post` would also hand the whole post list to
# every call instead of one element at a time.
result <- lapply(x = df.list.pre, FUN = myfunction, y = df.list.post)

However, this gives me an error that "X" is missing. I originally got the above code partially working when I did pre and post DFs separately (so not trying to feed in two lists). I found online that I can use mapply, however, the structure this outputs for me is weird. I'd like to get the following dataframe as an output for the above:

enter image description here

However, I haven't been able to

  1. Pass my function over multiple lists (pre and post DFs) using lapply
  2. Incorporate df_sub101 <- cbind(subID = 101, df_sub101) from my original code. This adds a subID column that repeats that subject's ID for all 11 states.
  3. I'm not sure if I still need to include df_all <- bind_rows(df_sub101, df_sub202, df_sub303) or if lapply would automatically append the subjects it iterates through.

Would appreciate some help.

Reproducible dataset example (all follow the same structure):

df_pre_sub101

structure(list(State = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), 
    `Pulse Time (ms)` = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA), `MEP AMP (mV)` = c(0.197647215643277, 0.0954348832989732, 
    0.523944806307554, 0.625025630825096, 0.924895880371332, 
    0.898288637399673, 0.918388723582029, 1.43350288197398, 2.1276653945446, 
    1.6496229916811, 1.64261059297456), `MEP Max (mV)` = c(0.112193062901497, 
    0.0476949736475945, 0.337814380414784, 0.382357756296794, 
    0.551612275838852, 0.599131107330322, 0.5466592207551, 0.875852793455124, 
    1.34147183597088, 0.968559554219246, 0.960105763541328), 
    `MEP Min (mV)` = c(-0.0854541527417799, -0.0477399096513788, 
    -0.18613042589277, -0.242667874528302, -0.37328360453248, 
    -0.299157530069351, -0.371729502826929, -0.557650088518858, 
    -0.786193558573723, -0.681063437461853, -0.682504829433229
    ), `MEP Max T. (ms)` = c(158.066666666667, 158.866666666667, 
    159.7, 160.244444444444, 160, 160.64, 159.9, 160.74, 159.82, 
    161.28, 160.355555555556), `MEP Min T. (ms)` = c(165.066666666667, 
    166.866666666667, 164.4, 163.577777777778, 163.84, 164.5, 
    165.4, 165.18, 165.58, 166.74, 165.822222222222), `MEP Latency (ms)` = c(28.0666666666667, 
    28.8666666666667, 29.7, 30.2444444444444, 30, 30.64, 29.9, 
    30.74, 29.82, 31.28, 30.3555555555555), `# Trials` = c(3, 
    3, 4, 9, 10, 10, 10, 10, 10, 10, 9), `# Rejected` = c(7, 
    7, 6, 1, 0, 0, 0, 0, 0, 0, 1), `Rejected Trials` = c("8 22 34 57 73 76 96", 
    "3 14 20 42 45 67 77", "2 6 25 26 41 92", "38", NA, NA, NA, 
    NA, NA, NA, "1")), row.names = c(NA, -11L), spec = structure(list(
    cols = list(State = structure(list(), class = c("collector_double", 
    "collector")), `Pulse Time (ms)` = structure(list(), class = c("collector_logical", 
    "collector")), `MEP AMP (mV)` = structure(list(), class = c("collector_double", 
    "collector")), `MEP Max (mV)` = structure(list(), class = c("collector_double", 
    "collector")), `MEP Min (mV)` = structure(list(), class = c("collector_double", 
    "collector")), `MEP Max T. (ms)` = structure(list(), class = c("collector_double", 
    "collector")), `MEP Min T. (ms)` = structure(list(), class = c("collector_double", 
    "collector")), `MEP Latency (ms)` = structure(list(), class = c("collector_double", 
    "collector")), `# Trials` = structure(list(), class = c("collector_double", 
    "collector")), `# Rejected` = structure(list(), class = c("collector_double", 
    "collector")), `Rejected Trials` = structure(list(), class = c("collector_character", 
    "collector"))), default = structure(list(), class = c("collector_guess", 
    "collector")), delim = ","), class = "col_spec"), problems = <pointer: 0x10d9c71f0>, class = c("spec_tbl_df", 
"tbl_df", "tbl", "data.frame"))

Solution

  • There is actually no need for lapply, because readr::read_csv can import multiple files at once. To do this, create a list or vector containing your file names, e.g. via list.files. You can then read all files in one call and bind them by row; the id argument adds an identifier column holding the file name for each file. Afterwards, clean up that identifier to get the subject ID and the type of treatment. Finally, select the desired columns and reshape to wide format.

    The code below first creates four example files based on the example data you provided.

    library(readr)
    library(dplyr, warn.conflicts = FALSE)
    library(tidyr)

    # Make example files and save them in a temporary directory.
    # `dat` is the example data frame defined in the DATA section below.
    fns <- paste0(
      "sub", rep(c(101, 202), each = 2), "_", rep(c("pre", "post"), 2), ".csv"
    )
    path <- tempdir()
    # Plain loop because the files are written for the side effect only.
    for (fn in fns) {
      write.csv(dat, file.path(path, fn), row.names = FALSE)
    }

    # Get the full paths of the files to read. The "$" anchors the extension
    # to the end of the file name.
    files <- list.files(path, pattern = "\\.csv$", full.names = TRUE)

    # Read all files in one call; `id` adds a column holding each row's file
    read_csv(files, id = "file") |>
      # Reduce the path to the bare file name, e.g. "sub101_pre"
      mutate(file = tools::file_path_sans_ext(basename(file))) |>
      # Separate into subject and treatment columns
      separate(file, into = c("subject", "treatment"), sep = "_") |>
      # Select and rename
      select(subject, treatment, State, mepAMP = "MEP AMP (mV)") |>
      # One row per subject and State, one column per treatment
      pivot_wider(
        names_from = treatment,
        values_from = mepAMP,
        names_prefix = "mepAMP_"
      )
    
    #> # A tibble: 22 × 4
    #>    subject State mepAMP_post mepAMP_pre
    #>    <chr>   <dbl>       <dbl>      <dbl>
    #>  1 sub101      1      0.198      0.198 
    #>  2 sub101      2      0.0954     0.0954
    #>  3 sub101      3      0.524      0.524 
    #>  4 sub101      4      0.625      0.625 
    #>  5 sub101      5      0.925      0.925 
    #>  6 sub101      6      0.898      0.898 
    #>  7 sub101      7      0.918      0.918 
    #>  8 sub101      8      1.43       1.43  
    #>  9 sub101      9      2.13       2.13  
    #> 10 sub101     10      1.65       1.65  
    #> # ℹ 12 more rows
    

    DATA

    # Example table for one subject (11 rows, one per State). It looks like
    # dput() output of a tibble read with readr, hence the `spec` attribute
    # and the "spec_tbl_df" class. The solution code writes this same data
    # to every example CSV file.
    dat <- structure(list(
      State = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
      `Pulse Time (ms)` = c(
        NA, NA, NA, NA, NA, NA, NA, NA, NA,
        NA, NA
      ), `MEP AMP (mV)` = c(
        0.197647215643277, 0.0954348832989732,
        0.523944806307554, 0.625025630825096, 0.924895880371332,
        0.898288637399673, 0.918388723582029, 1.43350288197398, 2.1276653945446,
        1.6496229916811, 1.64261059297456
      ), `MEP Max (mV)` = c(
        0.112193062901497,
        0.0476949736475945, 0.337814380414784, 0.382357756296794,
        0.551612275838852, 0.599131107330322, 0.5466592207551, 0.875852793455124,
        1.34147183597088, 0.968559554219246, 0.960105763541328
      ),
      `MEP Min (mV)` = c(
        -0.0854541527417799, -0.0477399096513788,
        -0.18613042589277, -0.242667874528302, -0.37328360453248,
        -0.299157530069351, -0.371729502826929, -0.557650088518858,
        -0.786193558573723, -0.681063437461853, -0.682504829433229
      ), `MEP Max T. (ms)` = c(
        158.066666666667, 158.866666666667,
        159.7, 160.244444444444, 160, 160.64, 159.9, 160.74, 159.82,
        161.28, 160.355555555556
      ), `MEP Min T. (ms)` = c(
        165.066666666667,
        166.866666666667, 164.4, 163.577777777778, 163.84, 164.5,
        165.4, 165.18, 165.58, 166.74, 165.822222222222
      ), `MEP Latency (ms)` = c(
        28.0666666666667,
        28.8666666666667, 29.7, 30.2444444444444, 30, 30.64, 29.9,
        30.74, 29.82, 31.28, 30.3555555555555
      ), `# Trials` = c(
        3,
        3, 4, 9, 10, 10, 10, 10, 10, 10, 9
      ), `# Rejected` = c(
        7,
        7, 6, 1, 0, 0, 0, 0, 0, 0, 1
      ), `Rejected Trials` = c(
        "8 22 34 57 73 76 96",
        "3 14 20 42 45 67 77", "2 6 25 26 41 92", "38", NA, NA, NA,
        NA, NA, NA, "1"
      )
    ), row.names = c(NA, -11L), spec = structure(list(
      cols = list(State = structure(list(), class = c(
        "collector_double",
        "collector"
      )), `Pulse Time (ms)` = structure(list(), class = c(
        "collector_logical",
        "collector"
      )), `MEP AMP (mV)` = structure(list(), class = c(
        "collector_double",
        "collector"
      )), `MEP Max (mV)` = structure(list(), class = c(
        "collector_double",
        "collector"
      )), `MEP Min (mV)` = structure(list(), class = c(
        "collector_double",
        "collector"
      )), `MEP Max T. (ms)` = structure(list(), class = c(
        "collector_double",
        "collector"
      )), `MEP Min T. (ms)` = structure(list(), class = c(
        "collector_double",
        "collector"
      )), `MEP Latency (ms)` = structure(list(), class = c(
        "collector_double",
        "collector"
      )), `# Trials` = structure(list(), class = c(
        "collector_double",
        "collector"
      )), `# Rejected` = structure(list(), class = c(
        "collector_double",
        "collector"
      )), `Rejected Trials` = structure(list(), class = c(
        "collector_character",
        "collector"
      ))), default = structure(list(), class = c(
        "collector_guess",
        "collector"
      )), delim = ","
    ), class = "col_spec"), class = c(
      "spec_tbl_df",
      "tbl_df", "tbl", "data.frame"
    ))