Search code examples
rlapplyrbind

Modify the code of dataframes for multiple txt files


Following code is for dataframes df1, df2. The code is reading the data columns (var) and look in each dataframe, if the var column is not present, it adds it and put NA in that particular column.

dfs1 <- c('df1','df2')

var <- c('City_Name',  'Temp',  'Pres' , 'Wind_Hor' , 'Wind_Ver' , 'Rainf' , 'S_Moist')

lapply(dfs1, \(x) {
  dfn <- get(x, envir = .GlobalEnv)
  dfn[[var[which(is.na(match(var,names(dfn))))]]] <- NA
  dfn <- dfn %>% select(all_of(var))
  return(assign(x,dfn,envir = .GlobalEnv))
})

If I have a list of files, how can I modify the above code?

I tried following way

dfs1 <- list.files(path = 'D:/Test3', pattern = "*txt", recursive = TRUE)
var <- c('D/T', 'City_Name', 'Temp', 'Pres', 'Wind_Hor', 'Wind_Ver', 'Rainf', 'S_Moist')
lapply(dfs1, \(x) {
  dfn <- get(x, envir = .GlobalEnv)
  dfn[[var[which(is.na(match(var,names(dfn))))]]] <- NA
  dfn <- dfn %>% select(all_of(var))
  return(assign(x,dfn,envir = .GlobalEnv))
})

But it returns an error:

Error in get(x, envir = .GlobalEnv) :
object 'File/File1.txt' not found

Could anyone please respond how to modify the code for files.


Solution

  • library(dplyr)
    library(data.table)
    # I find it easier to use data.table in this case when it comes to assigning NA's using dt[, (character_vector) := NA]
    
    var <- c('City_Name',  'Temp',  'Pres' , 'Wind_Hor' , 'Wind_Ver' , 'Rainf' , 'S_Moist')
    
    ## comment 2 columns 
    df1 = data.frame(City_Name = "NYC", 
                     Temp = 20,
                     Pres = 10,
                     #Wind_Hor = 5,
                     Wind_Ver = 5,
                     # Rainf = 10,
                     S_Moist = 5)
    
    ## Comment 3
    df2 = data.frame(#City_Name = "NYC", 
      Temp = 15,
      #Pres = 15,
      Wind_Hor = 5,
      Wind_Ver = 5,
      Rainf = 15)
    #S_Moist = 5)
    ## put the dfs as a list
    
    
    dfs1 <- list(df1, df2)
    
    
    ## loop through 
    
    
    
    processed_dfs <- lapply(seq_along(dfs1), function(x) {
      
      
      dfn = dfs1[[x]]
      dfn_nms = names(dfn)  
      #get missing column names
      var_missing = var[!var %in% dfn_nms]
      
      setDT(dfn) # convert to data.table 
      
      dfn[, (var_missing) := NA] # asign NA to missing
      
      dfn[, ..var] ## data.table select statement 
    })
    ## combine final output
    ## dplyr method
    final_df <- bind_rows(processed_dfs)
    
    ## if you want final as data.table
    final_df <- rbindlist(processed_dfs)
    
    
    # I made the above code to make it reproducible if I wanted to combine it by from reading text files from my disk then
    
    ## this will give you the files 
    ## you need to read in the files
    ## to read in files create file_paths
    
    dfs1 <- list.files(path = 'D:/Test3',
                       pattern = "*txt", 
                       recursive = TRUE)
    
    ## this will create file paths ie appending the name of the file to folder name
    dfs1_file_paths = file.path( 'D:/Test3', dfs1)
    
    var <- c('D/T', 'City_Name', 'Temp', 
             'Pres', 'Wind_Hor', 
             'Wind_Ver', 'Rainf', 
             'S_Moist')
    
    processed_dfs <- lapply(seq_along(dfs1), function(x) {
      
      
      file_x = dfs1_file_paths[[x]] ## file path i
    
      ## read the file
      dfn <- fread(file_x) ## use can also use read.table but you need one more step to convert to data.table
      
      dfn_nms = names(dfn)  
      
      #get missing column names
      var_missing = var[!var %in% dfn_nms]
      
    
      dfn[, (var_missing) := NA] # asign NA to missing
      
      dfn[, ..var] ## data.table select statement 
    })
    ## combine final output
    ## dplyr method
    final_df <- bind_rows(processed_dfs)
    
    ## if you want final as data.table
    final_df <- rbindlist(processed_dfs)
    
    # Hope this helps