I have multiple .gz files (1 file for each month, each containing per-minute data) stored in multiple folders (1 folder per state). The folder structure is as follows:
C:/Users/Desktop/Data/Test_OP (i.e., Main folder or wd)
Ala (i.e. Folder 1)
Ala_2021-03_0100_JC_qualitya.txt.gz
Ala_2021-05_0100_JC_qualitya.txt.gz
Ala_2022-02_0100_JC_qualitya.txt.gz
Chi (i.e. Folder 2)
TEX_2019-06_0100+0500_JC_qualitya.txt.gz
TEX_2020-07_0100+0300_JC_qualitya.txt.gz
...
...
I have un-gzipped them to get various .txt files in the respective folders and processed them to get the 15-row mean of all columns. I am using the following code in R to achieve it. However, as output I am getting one .csv file containing the 15-row means for all the files (i.e., the data of all the files is clubbed together). Instead, I want the same output in .csv files but segregated per station, as shown below. Also, the code I am using reads all files in one go; I want to process the files iteratively through each state. Could anyone please help me modify the code to achieve the desired output?
Desired output
Ala
Time_sp C1 [C1c] C2 [C2c] C3 [C3c] C4 [C3c] C5 [h]
2021-03-01 00:00:00+00 257 285 255 .....
2021-03-01 00:15:00+00 .....
2021-03-01 00:30:00+00 .....
..
Chi
Time_sp C1 [CC] C2 [C2] C3 [C2] C4 C5 [h]
2020-03-01 00:00:00+00 267 215 250 .....
2020-03-01 00:15:00+00 .....
2020-03-01 00:30:00+00 .....
..
The code that I am using in R is as follows:
setwd("D:/Test2/")
library(data.table)
library(tidyr)
#List of files
filelist <- list.files("D:/Test2/", full.names = TRUE, recursive
= TRUE, pattern = ".txt$")
#Read the files
dt <- lapply(filelist, function(file) {
lines <- readLines(file)
comment_end = match("*/", lines)
fread(file, skip = comment_end)
})
# Adjust column names: harmonise the degree symbol and swap the bracketed
# unit notation "[unit]" for "(unit)".
dt.tidied <- lapply(dt, FUN = function(x){
# Some files encode the degree sign as "?"; rename to the proper "°C".
# skip_absent = TRUE makes this a no-op for files without that column.
setnames(x, old = "T2 [?C]", new = "T2 [°C]", skip_absent = TRUE)
colnames(x) <- gsub("\\[", "(", colnames(x))
colnames(x) <- gsub("\\]", ")", colnames(x))
if ("C1c" %in% colnames(x)) {
# Quality control, applied in place via data.table's `:=`:
# invalidate C wherever its quality flag C1c is non-zero, then set
# out-of-range values (outside [-5, 5]) to NA.
# NOTE(review): column `C` is not present in the sample dput — confirm
# it exists whenever `C1c` does, otherwise these lines error.
x[C1c != 0, C := NA]
x[C < -5 | C > 5, C := NA]
}
#return
return(x)
})
#bind, filling missing columns to NA
merged <- rbindlist(dt.tidied, fill = TRUE, use.names = TRUE)
#mn <- merged %>% separate(`Date/Time`, into = c("Date", "Time"), sep =
#"T")
# Columns to keep for the summary.
# NOTE(review): these names do not all appear in the sample dput — verify
# they match the real files, otherwise the `[, cn, with = FALSE]` below fails.
cn <- c('Date/Time', 'DIF (W/m**2)', 'DIR (W/m**2)', 'LWD (W/m**2)',
'LWU (W/m**2)', 'PoPoPoPo (hPa)', 'RH (%)', 'SWD (W/m**2)', 'SWU
(W/m**2)', 'SumSW (W/m**2)', 'SWDTOA (W/m**2)', 'T2 (°C)')
# Group every 15 consecutive rows (gl() builds the run-length group id) and
# take the column means; a column that is >80% NA within a group yields NA.
# Note this groups by row position, not by timestamp.
mnf <- merged[, cn, with = FALSE ] %>%
as_tibble() %>%
group_by(group = as.integer(gl(n(), 15, n()))) %>%
summarise(across(everything(), ~ if(mean(is.na(.x)) > 0.8) NA else
mean(.x, na.rm = TRUE)))
mnf
write.csv(mnf, 'D:/Test2/15min_full.csv')
The dput() of a sample input .txt file (excluding the initial rows) is as follows:
structure(list(`Date/Time` = structure(c(1614556800, 1614556860,
1614556920, 1614556980, 1614557040, 1614557100), tzone = "UTC", class =
c("POSIXct", "POSIXt")), `XY [XY]` = c(0.990641, 0.990641, 0.990641,
0.990641, 0.990641, 0.990641), `C1 [C1]` = c(257, 257, 257, 256, 255,
255), Cc = c(0, 0, 0, 0, 0, 0), `C2 [C2]` = c(285, 284, 289, 264,
231, 223), Dc = c(0, 0, 0, 0, 0, 0), `C3 [C3]` = c(255, 255,
255, 255, 254, 254), Ac = c(0, 0, 0, 0, 0, 0), C4 = c(0.463735,
0.465678, 0.467612, 0.469561, 0.471472, 0.473374), `C5 [h]` = c(1013,
NA, NA, NA, NA, NA), `C6 [%]` = c(43, NA, NA, NA, NA, NA), `C7 [E2]` =
c(390, 390, 393, 380, 365, 361), Jc = c(0, 0, 0, 0, 0, 0), `D [S]` =
c(62.3716, 62.2459, 62.1206, 61.9942, 61.8701, 61.7465), `Sw [S2]` =
c(1392.95, 1392.95, 1392.95, 1392.95, 1392.95, 1392.95), `SW [Q2]` =
c(389.164, 389.253, 392.14, 379.964, 363.91, 360.562), `QA [H2]` =
c(646.61, 649.313, 652.002, 654.712, 657.371, 660.016), `T2 [C]` =
c(3.7, NA, NA, NA, NA, NA), Lc = c(0, 0, 0, 0, 0, 0)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
Initial rows are as follows:
/* DESCRIPTION:
Alg
Checks: V1.0
Const: 90 W
Const: 4 W
Para(s):DATE/TIME (Date/Time)
XY [XY] (XY)
C1 [CC] (C1)
Cc (Cc)
C2 [C2] (C2)
Dc (Dc)
C3 [C3] (C3)
Ac (Ac)
C4 (C4)
C5 [h] (C5)
C6 [%] (C6)
C7 [E2] (C7)
Jc (Jc)
D [S] (D)
Sw [S2] (Sw)
SW [Q2] (SW)
QA [H2] (QA)
T2 [C] (T2)
Lc (Lc)
*/
As it's quite a different question than the original, here's another take. There are still some contradictory bits between the provided data, the existing code and the desired output, but if the core of this question is how to cycle through directories and get a summary for each of those, the logic is hopefully here.
Tried to keep as close to provided code as possible, but had to adjust according to provided data, also dropped the extraction part. And refactored some blocks into functions.
library(dplyr)
library(data.table)
# Read every file in `filelist` into a data.table.
# The "/* ... */" description header is skipped by telling fread() to start
# at the line containing the header-row fragment "Date/Time\tXY".
read_the_files <- function(filelist){
  out <- vector("list", length(filelist))
  for (i in seq_along(filelist)) {
    out[[i]] <- fread(filelist[[i]], skip = "Date/Time\tXY")
  }
  out
}
# Normalise column names in a list of data.tables: fix the mis-encoded
# degree sign and replace the "[unit]" brackets with "(unit)".
adjust_column_names <- function(dt_list){
  lapply(dt_list, function(tbl) {
    # "?" is a garbled degree sign in some files; skip_absent = TRUE makes
    # the rename a no-op when the column is missing.
    setnames(tbl, old = "T2 [?C]", new = "T2 [°C]", skip_absent = TRUE)
    # chartr translates both bracket characters in a single pass
    # (equivalent to gsub-ing "[" -> "(" and "]" -> ")").
    colnames(tbl) <- chartr("[]", "()", colnames(tbl))
    tbl
  })
}
# directory with samples generated from provided data
setwd(file.path(tempdir(), "Test_OP"))
fs::dir_tree()
#> .
#> ├── Ala
#> │ ├── 1.txt
#> │ ├── 2.txt
#> │ ├── 3.txt
#> │ ├── 4.txt
#> │ └── 5.txt
#> └── Chi
#> ├── 1.txt
#> ├── 2.txt
#> ├── 3.txt
#> ├── 4.txt
#> └── 5.txt
# List only the immediate sub-directories. list.files() would also return
# regular files (e.g. the "*_15min_full.csv" summaries written by a previous
# run), which the loop below would then mistake for state directories.
dirlist <- list.dirs(full.names = FALSE, recursive = FALSE)
dirlist
#> [1] "Ala" "Chi"
# Cycle through directories in the current working directory,
# create a summary file for files in each dir.
for (idx in seq_along(dirlist)){
  # Escape the dot: `pattern` is a regex, so "\\.txt$" matches only
  # genuine *.txt files.
  filelist <- list.files(path = dirlist[idx], full.names = TRUE, recursive = TRUE, pattern = "\\.txt$")
  dt_ <- read_the_files(filelist)
  dt.tidied <- adjust_column_names(dt_)
  #bind, filling missing columns to NA
  merged <- rbindlist(dt.tidied, fill = TRUE, use.names = TRUE)
  # fread() should have parsed datetime strings, so at that point separate() should just
  # rename `Date/Time` to Date and create Time filled with NAs
  # mn <- merged %>% separate(`Date/Time`, into = c("Date", "Time"), sep = "T")
  # Create 15 minute groups with lubridate, summarise only numeric columns;
  # a column that is >80% NA within a group is reported as NA.
  mnf <- merged %>%
    as_tibble() %>%
    group_by(Time_sp = lubridate::floor_date(`Date/Time`, "15 mins")) %>%
    summarise(across(where(is.numeric), ~ if(mean(is.na(.x)) > 0.8) NA else mean(.x, na.rm = TRUE)))
  # One CSV per state directory, e.g. "Ala_15min_full.csv".
  write.csv(mnf, paste0(dirlist[idx],"_15min_full.csv"), row.names = FALSE)
}
# Last processed directory:
# (`idx` leaks from the for-loop above, so this echoes the final iteration)
dirlist[idx]
#> [1] "Chi"
# Last generated summary dataframe:
mnf
#> # A tibble: 3 × 19
#> Time_sp `XY (XY)` `C1 (CC)` Cc `C2 (C2)` Dc `C3 (C2)` Ac
#> <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 2021-03-01 00:00:00 0.991 257 0 285 0 255 0
#> 2 2021-03-01 00:15:00 0.991 257 0 285 0 255 0
#> 3 2021-03-01 00:30:00 0.991 257 0 285 0 255 0
#> # ℹ 11 more variables: C4 <dbl>, `C5 (h)` <dbl>, `C6 (%)` <dbl>,
#> # `C7 (E2)` <dbl>, Jc <dbl>, `D (S)` <dbl>, `Sw (S2)` <dbl>, `SW (Q2)` <dbl>,
#> # `OA (H2)` <dbl>, `T2 (C)` <dbl>, Lc <dbl>
# Files in working directory:
# (one "<dir>_15min_full.csv" summary was written next to each directory)
fs::dir_info()[1:3]
#> # A tibble: 4 × 3
#> path type size
#> <fs::path> <fct> <fs::bytes>
#> 1 Ala directory 0
#> 2 Ala_15min_full.csv file 480
#> 3 Chi directory 0
#> 4 Chi_15min_full.csv file 480
Reprex setup:
# reprex setup ------------------------------------------------------------
library(fs)
library(purrr)
# Sample is crafted by combining
# https://stackoverflow.com/revisions/76255679/9 and
# https://stackoverflow.com/revisions/76334740/6
# Includes some manual changes so may or may not apply
# One raw input file: a "/* ... */" description header followed by
# tab-separated per-minute records (some fields deliberately empty to
# exercise the NA handling).
file_sample <- c(
"/* DESCRIPTION:",
"Algo:\tSolp",
"Checks:\tV2.0",
"Const:\t89 W",
"Const1:\t9 W",
"Para(s):\tDATE/TIME (Date/Time)",
"\tXY [XY] (XY)",
"\tC1 [C1] (C1)", "\tCc (Cc)",
"\tC2 [C2] (C2)", "\tAc (Ac)",
"\tC3 [C3] (C3)", "\tDc (Dc)",
"\tC4 (C4)",
"\tC5 [h] (h)","\tC6DH [%] (C6DH)",
"\tC7 [E2] (C7)","\tJc (Jc)","\tD [S] (D)", "\tSw [S2] (Sw)",
"\tSW [Q2] (SW)", "\tQA [H2] (QA)","\tT2 [C] (T2)", "\tLc (Lc)",
"*/",
"Date/Time\tXY [XY]\tC1 [C1]\tCc\tC2 [C2]\tAc\tC3 [C3]\tDc\tC4\tC5 [h]\tC6DH [%]\tC7 [E2]\tJc\tS [D]\tSw [S2]\tQ2 [SW]\tQA [H2]\tT2 [C]\tLc",
"2021-03-01T00:00:00\t0.990641\t257\t0\t285\t0\t255\t0\t0.463735\t1013\t43\t390\t0 \t62.3716\t1392.95\t389.164\t646.61\t3.7\t0",
"2021-03-01T00:01:00\t0.990641\t257\t0\t284\t0\t255\t0\t0.465678\t\t\t390\t0\t62.2 459\t1392.95\t389.253\t649.313\t\t0",
"2021-03-01T00:02:00\t0.990641\t257\t0\t289\t0\t255\t0\t0.467612\t\t\t393\t0\t62.1 206\t1392.95\t392.14\t652.002\t\t0"
)
# Create Test_OP/Ala and Test_OP/Chi under tempdir().
main_d <- path(tempdir(), "Test_OP")
walk(path(main_d, c("Ala/", "Chi/")), dir_create)
# Recycling the 2 directory prefixes over rep(1:5, each = 2) yields
# files 1.txt .. 5.txt in each of the two directories (10 files total).
paste0(c("Ala/", "Chi/"), rep(1:5, each =2), ".txt") %>%
walk(\(fname) writeLines(file_sample, path(main_d, fname)))
# Verify the generated directory tree and one sample file's raw content.
dir_tree(main_d)
#> Temp/RtmpEdFDdc/Test_OP
#> ├── Ala
#> │ ├── 1.txt
#> │ ├── 2.txt
#> │ ├── 3.txt
#> │ ├── 4.txt
#> │ └── 5.txt
#> └── Chi
#> ├── 1.txt
#> ├── 2.txt
#> ├── 3.txt
#> ├── 4.txt
#> └── 5.txt
# file content:
readLines(path(main_d, "Ala/1.txt")) %>% paste0(collapse = "\n") %>% cat()
#> /* DESCRIPTION:
#> Algo: Solp
#> Checks: V2.0
#> Const: 89 W
#> Const1: 9 W
#> Para(s): DATE/TIME (Date/Time)
#> XY [XY] (XY)
#> C1 [C1] (C1)
#> Cc (Cc)
#> C2 [C2] (C2)
#> Ac (Ac)
#> C3 [C3] (C3)
#> Dc (Dc)
#> C4 (C4)
#> C5 [h] (h)
#> C6DH [%] (C6DH)
#> C7 [E2] (C7)
#> Jc (Jc)
#> D [S] (D)
#> Sw [S2] (Sw)
#> SW [Q2] (SW)
#> QA [H2] (QA)
#> T2 [C] (T2)
#> Lc (Lc)
#> */
#> Date/Time XY [XY] C1 [C1] Cc C2 [C2] Ac C3 [C3] Dc C4 C5 [h] C6DH [%] C7 [E2] Jc S [D] Sw [S2] Q2 [SW] QA [H2] T2 [C] Lc
#> 2021-03-01T00:00:00 0.990641 257 0 285 0 255 0 0.463735 1013 43 390 0 62.3716 1392.95 389.164 646.61 3.7 0
#> 2021-03-01T00:01:00 0.990641 257 0 284 0 255 0 0.465678 390 0 62.2 459 1392.95 389.253 649.313 0
#> 2021-03-01T00:02:00 0.990641 257 0 289 0 255 0 0.467612 393 0 62.1 206 1392.95 392.14 652.002 0
Created on 2023-05-20 with reprex v2.0.2