Search code examples
rlistloopsdata-analysis

in R: loop over multiple input files and save the results file according the name of each input file


I'm a beginner with R. I would need your help to automate these analyses over all the files contained in the list "scores" (~50 files) and get (and save) a summary output with the results corresponding to each of the input file (summary output to be named similarly to each input file).

Now, with my code (attached) I can get the results for one input file at a time. For example, if I consider the second element of the list named "scores":

    file.names <- dir(pattern ="*.my.files") ### set the pattern name of the input files
    scores<-list() ### create a variable (list) that will contain the input data 
    for (k in 1:length(file.names)){scores[[k]] <- read.table(file.names[k],h=T)}

    ### scores[[2]] is the second element of the list named "scores"
    f1<-function(threshold) { glm(pheno ~ threshold + PC1 + PC2 + PC3 + PC4 +PC5 +PC6 +PC7+ PC8 + PC9 + C1+C2+C3+C4+C5+C6+C7+C8+C9, family='binomial',data=scores[[2]]) }


    m1<-apply(scores[[2]][2:9],2,f1) ### scores[[2]] is the second element of the list named "scores", 
                                     ###    the columns from 2 to 9 corresponds to the scores at the different p thresholds
                                     ###    after the first comma --> MARGIN: a vector giving the subscripts which the function 
                                     ###    will be applied over. E.g., for a matrix 1 indicates rows, 2 indicates columns, 
                                     ###    c(1, 2) indicates rows and columns. Where X has named dimnames, it can be a character vector 
                                     ###    selecting dimension names.

    ### calculate OR and R2:
    out1<-do.call(rbind,lapply(m1,function(z)summary(z)$coefficients[2,]))
    out1<-data.frame(out1)
    out1$OR<-exp(out1$Estimate)
    out1$ci_l<-exp(out1$Estimate-(1.96 * out1$Std..Error))
    out1$ci_u<-exp(out1$Estimate+(1.96 * out1$Std..Error))

    write.table(out1, file="estimates.txt", quote=F, sep=" ", dec=".", na="NA", row.names=T, col.names=T)

    ### calculate nagelkerkeR2: 
    library(sizeMat)
    full.nagel.r2<-do.call(rbind,lapply(m1,function(z)nagelkerkeR2(z)))
    full.nagel.r2<-data.frame(full.nagel.r2)
    n1<-glm(as.factor(pheno) ~ PC1 + PC2 + PC3 + PC4 +PC5 +PC6 +PC7+ PC8 + PC9 + C1+C2+C3+C4+C5+C6+C7+C8+C9, family='binomial',data=scores[[2]]) # null model
    full.nagel.r2$prs.r2<-full.nagel.r2[,1] - nagelkerkeR2(n1)

    write.table(full.nagel.r2, file="variance.txt", quote=F, sep=" ", dec=".", na="NA", row.names=T, col.names=T)

I was also wondering if I could merge the results of the "estimate.txt" and "variance.txt" files in order to get a single output file for each input file of the list named "scores".

Many thanks in advance.


Solution

  • Simply generalize your process with a data frame input or build a named list of data frames and iterate off names for distinct .txt files. Below shows how lapply can handle your iteration and data handling needs:

    library(sizeMat)
    ...
    
    # BUILD NAMED LIST OF DATA FRAMES
    file.names <- dir(pattern ="*.my.files")
    scores <- setNames(lapply(file.names, read.table, header=TRUE),
                       gsub(".my.files", "", file.names))
    
    # DEFINED FUNCTION RECEIVING DF NAME PARAM
    proc_scores <- function(df_name) {
       df <- scores[[df_name]]
    
       models <- lapply(df, function(threshold) {
           glm(pheno ~ threshold + PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9 + C1+C2+C3+C4+C5+C6+C7+C8+C9, 
               family='binomial', data=df)
       })
    
       ### calculate OR and R2:
       coeff_mat <- do.call(rbind, lapply(models, function(z) summary(z)$coefficients[2,]))
       coeff_df <- transform(data.frame(coeff_mat), 
          OR = exp(Estimate),
          ci_l = exp(Estimate - (1.96 * Std..Error)),
          ci_u = exp(Estimate + (1.96 * Std..Error)),
          score = df_name
      )
    
       # SAVE OUTPUT WITH DATA FRAME NAME
       write.table(coeff_df, file=paste0("estimates_", df_name, "_.txt"), quote=FALSE, 
                   sep=" ", dec=".", na="NA", row.names=TRUE, col.names=TRUE)
    
    
       ### calculate nagelkerkeR2
       null.model <- glm(as.factor(pheno) ~ PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9 + C1+C2+C3+C4+C5+C6+C7+C8+C9, 
                         family='binomial', data=df)
    
       nagel_mat <- do.call(rbind, lapply(models, function(z) nagelkerkeR2(z)))
       nagel_df <- data.frame(nagel_mat)
       nagel_df$prs.r2 <- nagel_df[,1] - nagelkerkeR2(null.model)
       nagel_df$score <- df_name
    
       # SAVE OUTPUT WITH DATA FRAME NAME
       write.table(nagel_df, file=paste0("variance_", df_name,"_.txt"), quote=FALSE, 
                   sep=" ", dec=".", na="NA", row.names=TRUE, col.names=TRUE)
    
       # RETURN LIST OF DATA FRAMES
       return(list(coeff=coeff_df, nagel=nagel_df))
    }
    
    
    # ITERATELY CALL FUNCTION
    results <- lapply(names(scores), proc_scores)
    

    Since process exports data to .txt and returns the same objects in list, you can access them accordingly:

    results[[1]]$coeff
    results[[2]]$coeff   
    results[[3]]$coeff
    ...
    
    results[[1]]$nagel
    results[[2]]$nagel
    results[[3]]$nagel
    ...
    

    For master compilation of results:

    master_coeff <- do.call(rbind, lapply(results, "[[", "coeff"))
    
    # SAVE OUTPUT
    write.table(master_coeff, file="estimates.txt", quote=FALSE, sep=" ",
                dec=".", na="NA", row.names=TRUE, col.names=TRUE)
    
    
    master_nagel <- do.call(rbind, lapply(results, "[[", "nagel"))
    
    # SAVE OUTPUT
    write.table(master_nagel, file="variance.txt", quote=FALSE, 
                sep=" ", dec=".", na="NA", row.names=TRUE, col.names=TRUE)