I have multiple .gz files (1 file for each month, each containing per-minute data) stored in multiple folders (1 folder per state). The folder structure is as follows:
C:/Users/Desktop/Data/Test_OP (i.e., Main folder or wd)
Ala (i.e. Folder 1)
Ala_2021-03_0100_JC_qualitya.txt.gz
Ala_2021-05_0100_JC_qualitya.txt.gz
Ala_2022-02_0100_JC_qualitya.txt.gz
Chi (i.e. Folder 2)
TEX_2019-06_0100+0500_JC_qualitya.txt.gz
TEX_2020-07_0100+0300_JC_qualitya.txt.gz
...
...
I have un-gzipped them to get various .txt files in the respective folders and processed them to get the 15-row mean of all columns. I am using the following code in R to achieve it. However, as output I am getting one .csv file containing the 15-row means for all the files (i.e., the data of all the files is clubbed together). Instead, I want the same output in .csv files but segregated per station, as shown below. Also, the code I am using reads all files in one go; I want to process the files iteratively through each state. Could anyone please help me modify the code to achieve the desired output?
Desired output
Ala
Time_sp C1 [C1c] C2 [C2c] C3 [C3c] C4 [C3c] C5 [h]
2021-03-01 00:00:00+00 257 285 255 .....
2021-03-01 00:15:00+00 .....
2021-03-01 00:30:00+00 .....
..
Chi
Time_sp C1 [CC] C2 [C2] C3 [C2] C4 C5 [h]
2020-03-01 00:00:00+00 267 215 250 .....
2020-03-01 00:15:00+00 .....
2020-03-01 00:30:00+00 .....
..
The code that I am using in R is as follows:
setwd("D:/Test2/")
library(data.table)
library(tidyr)
#List of files
filelist <- list.files("D:/Test2/", full.names = TRUE, recursive
= TRUE, pattern = ".txt$")
#Read the files
dt <- lapply(filelist, function(file) {
lines <- readLines(file)
comment_end = match("*/", lines)
fread(file, skip = comment_end)
})
# Adjust column names: harmonise the degree symbol and swap the bracketed
# unit notation "[unit]" for "(unit)".
dt.tidied <- lapply(dt, FUN = function(x){
# Some files encode the degree sign as "?"; rename to the proper "°C".
# skip_absent = TRUE makes this a no-op for files without that column.
setnames(x, old = "T2 [?C]", new = "T2 [°C]", skip_absent = TRUE)
colnames(x) <- gsub("\\[", "(", colnames(x))
colnames(x) <- gsub("\\]", ")", colnames(x))
if ("C1c" %in% colnames(x)) {
# Quality control, applied in place via data.table's `:=`:
# invalidate C wherever its quality flag C1c is non-zero, then set
# out-of-range values (outside [-5, 5]) to NA.
# NOTE(review): column `C` is not present in the sample dput — confirm
# it exists whenever `C1c` does, otherwise these lines error.
x[C1c != 0, C := NA]
x[C < -5 | C > 5, C := NA]
}
#return
return(x)
})
#bind, filling missing columns to NA
merged <- rbindlist(dt.tidied, fill = TRUE, use.names = TRUE)
#mn <- merged %>% separate(`Date/Time`, into = c("Date", "Time"), sep =
#"T")
# Columns to keep for the summary.
# NOTE(review): these names do not all appear in the sample dput — verify
# they match the real files, otherwise the `[, cn, with = FALSE]` below fails.
cn <- c('Date/Time', 'DIF (W/m**2)', 'DIR (W/m**2)', 'LWD (W/m**2)',
'LWU (W/m**2)', 'PoPoPoPo (hPa)', 'RH (%)', 'SWD (W/m**2)', 'SWU
(W/m**2)', 'SumSW (W/m**2)', 'SWDTOA (W/m**2)', 'T2 (°C)')
# Group every 15 consecutive rows (gl() builds the run-length group id) and
# take the column means; a column that is >80% NA within a group yields NA.
# Note this groups by row position, not by timestamp.
mnf <- merged[, cn, with = FALSE ] %>%
as_tibble() %>%
group_by(group = as.integer(gl(n(), 15, n()))) %>%
summarise(across(everything(), ~ if(mean(is.na(.x)) > 0.8) NA else
mean(.x, na.rm = TRUE)))
mnf
write.csv(mnf, 'D:/Test2/15min_full.csv')
The dput() of a sample input .txt file (excluding the initial rows) is as follows:
structure(list(`Date/Time` = structure(c(1614556800, 1614556860,
1614556920, 1614556980, 1614557040, 1614557100), tzone = "UTC", class =
c("POSIXct", "POSIXt")), `XY [XY]` = c(0.990641, 0.990641, 0.990641,
0.990641, 0.990641, 0.990641), `C1 [C1]` = c(257, 257, 257, 256, 255,
255), Cc = c(0, 0, 0, 0, 0, 0), `C2 [C2]` = c(285, 284, 289, 264,
231, 223), Dc = c(0, 0, 0, 0, 0, 0), `C3 [C3]` = c(255, 255,
255, 255, 254, 254), Ac = c(0, 0, 0, 0, 0, 0), C4 = c(0.463735,
0.465678, 0.467612, 0.469561, 0.471472, 0.473374), `C5 [h]` = c(1013,
NA, NA, NA, NA, NA), `C6 [%]` = c(43, NA, NA, NA, NA, NA), `C7 [E2]` =
c(390, 390, 393, 380, 365, 361), Jc = c(0, 0, 0, 0, 0, 0), `D [S]` =
c(62.3716, 62.2459, 62.1206, 61.9942, 61.8701, 61.7465), `Sw [S2]` =
c(1392.95, 1392.95, 1392.95, 1392.95, 1392.95, 1392.95), `SW [Q2]` =
c(389.164, 389.253, 392.14, 379.964, 363.91, 360.562), `QA [H2]` =
c(646.61, 649.313, 652.002, 654.712, 657.371, 660.016), `T2 [C]` =
c(3.7, NA, NA, NA, NA, NA), Lc = c(0, 0, 0, 0, 0, 0)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
Initial rows are as follows:
/* DESCRIPTION:
Alg
Checks: V1.0
Const: 90 W
Const: 4 W
Para(s):DATE/TIME (Date/Time)
XY [XY] (XY)
C1 [CC] (C1)
Cc (Cc)
C2 [C2] (C2)
Dc (Dc)
C3 [C3] (C3)
Ac (Ac)
C4 (C4)
C5 [h] (C5)
C6 [%] (C6)
C7 [E2] (C7)
Jc (Jc)
D [S] (D)
Sw [S2] (Sw)
SW [Q2] (SW)
QA [H2] (QA)
T2 [C] (T2)
Lc (Lc)
*/
As it's quite a different question than the original, here's another take. There are still some contradictory bits between the provided data, the existing code and the desired output, but if the core of this question is how to cycle through directories and get a summary for each of those, the logic is hopefully here.
Tried to keep as close to provided code as possible, but had to adjust according to provided data, also dropped the extraction part. And refactored some blocks into functions.
library(dplyr)
library(data.table)
# Read every file in `filelist` into a data.table.
# The "/* ... */" description header is skipped by telling fread() to start
# at the line containing the header-row fragment "Date/Time\tXY".
read_the_files <- function(filelist){
  out <- vector("list", length(filelist))
  for (i in seq_along(filelist)) {
    out[[i]] <- fread(filelist[[i]], skip = "Date/Time\tXY")
  }
  out
}
# Normalise column names in a list of data.tables: fix the mis-encoded
# degree sign and replace the "[unit]" brackets with "(unit)".
adjust_column_names <- function(dt_list){
  lapply(dt_list, function(tbl) {
    # "?" is a garbled degree sign in some files; skip_absent = TRUE makes
    # the rename a no-op when the column is missing.
    setnames(tbl, old = "T2 [?C]", new = "T2 [°C]", skip_absent = TRUE)
    # chartr translates both bracket characters in a single pass
    # (equivalent to gsub-ing "[" -> "(" and "]" -> ")").
    colnames(tbl) <- chartr("[]", "()", colnames(tbl))
    tbl
  })
}
# directory with samples generated from provided data
setwd(file.path(tempdir(), "Test_OP"))
fs::dir_tree()
#> .
#> ├── Ala
#> │ ├── 1.txt
#> │ ├── 2.txt
#> │ ├── 3.txt
#> │ ├── 4.txt
#> │ └── 5.txt
#> └── Chi
#> ├── 1.txt
#> ├── 2.txt
#> ├── 3.txt
#> ├── 4.txt
#> └── 5.txt
# List only the immediate sub-directories. list.files() would also return
# regular files (e.g. the "*_15min_full.csv" summaries written by a previous
# run), which the loop below would then mistake for state directories.
dirlist <- list.dirs(full.names = FALSE, recursive = FALSE)
dirlist
#> [1] "Ala" "Chi"
# Cycle through directories in the current working directory,
# create a summary file for files in each dir.
for (idx in seq_along(dirlist)){
  # Escape the dot: `pattern` is a regex, so "\\.txt$" matches only
  # genuine *.txt files.
  filelist <- list.files(path = dirlist[idx], full.names = TRUE, recursive = TRUE, pattern = "\\.txt$")
  dt_ <- read_the_files(filelist)
  dt.tidied <- adjust_column_names(dt_)
  #bind, filling missing columns to NA
  merged <- rbindlist(dt.tidied, fill = TRUE, use.names = TRUE)
  # fread() should have parsed datetime strings, so at that point separate() should just
  # rename `Date/Time` to Date and create Time filled with NAs
  # mn <- merged %>% separate(`Date/Time`, into = c("Date", "Time"), sep = "T")
  # Create 15 minute groups with lubridate, summarise only numeric columns;
  # a column that is >80% NA within a group is reported as NA.
  mnf <- merged %>%
    as_tibble() %>%
    group_by(Time_sp = lubridate::floor_date(`Date/Time`, "15 mins")) %>%
    summarise(across(where(is.numeric), ~ if(mean(is.na(.x)) > 0.8) NA else mean(.x, na.rm = TRUE)))
  # One CSV per state directory, e.g. "Ala_15min_full.csv".
  write.csv(mnf, paste0(dirlist[idx],"_15min_full.csv"), row.names = FALSE)
}
# Last processed directory:
# (`idx` leaks from the for-loop above, so this echoes the final iteration)
dirlist[idx]
#> [1] "Chi"
# Last generated summary dataframe:
mnf
#> # A tibble: 3 × 19
#> Time_sp `XY (XY)` `C1 (CC)` Cc `C2 (C2)` Dc `C3 (C2)` Ac
#> <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 2021-03-01 00:00:00 0.991 257 0 285 0 255 0
#> 2 2021-03-01 00:15:00 0.991 257 0 285 0 255 0
#> 3 2021-03-01 00:30:00 0.991 257 0 285 0 255 0
#> # ℹ 11 more variables: C4 <dbl>, `C5 (h)` <dbl>, `C6 (%)` <dbl>,
#> # `C7 (E2)` <dbl>, Jc <dbl>, `D (S)` <dbl>, `Sw (S2)` <dbl>, `SW (Q2)` <dbl>,
#> # `OA (H2)` <dbl>, `T2 (C)` <dbl>, Lc <dbl>
# Files in working directory:
# (one "<dir>_15min_full.csv" summary was written next to each directory)
fs::dir_info()[1:3]
#> # A tibble: 4 × 3
#> path type size
#> <fs::path> <fct> <fs::bytes>
#> 1 Ala directory 0
#> 2 Ala_15min_full.csv file 480
#> 3 Chi directory 0
#> 4 Chi_15min_full.csv file 480
Reprex setup:
# reprex setup ------------------------------------------------------------
library(fs)
library(purrr)
# Sample is crafted by combining
# https://stackoverflow.com/revisions/76255679/9 and
# https://stackoverflow.com/revisions/76334740/6
# Includes some manual changes so may or may not apply
# One raw input file: a "/* ... */" description header followed by
# tab-separated per-minute records (some fields deliberately empty to
# exercise the NA handling).
file_sample <- c(
"/* DESCRIPTION:",
"Algo:\tSolp",
"Checks:\tV2.0",
"Const:\t89 W",
"Const1:\t9 W",
"Para(s):\tDATE/TIME (Date/Time)",
"\tXY [XY] (XY)",
"\tC1 [C1] (C1)", "\tCc (Cc)",
"\tC2 [C2] (C2)", "\tAc (Ac)",
"\tC3 [C3] (C3)", "\tDc (Dc)",
"\tC4 (C4)",
"\tC5 [h] (h)","\tC6DH [%] (C6DH)",
"\tC7 [E2] (C7)","\tJc (Jc)","\tD [S] (D)", "\tSw [S2] (Sw)",
"\tSW [Q2] (SW)", "\tQA [H2] (QA)","\tT2 [C] (T2)", "\tLc (Lc)",
"*/",
"Date/Time\tXY [XY]\tC1 [C1]\tCc\tC2 [C2]\tAc\tC3 [C3]\tDc\tC4\tC5 [h]\tC6DH [%]\tC7 [E2]\tJc\tS [D]\tSw [S2]\tQ2 [SW]\tQA [H2]\tT2 [C]\tLc",
"2021-03-01T00:00:00\t0.990641\t257\t0\t285\t0\t255\t0\t0.463735\t1013\t43\t390\t0 \t62.3716\t1392.95\t389.164\t646.61\t3.7\t0",
"2021-03-01T00:01:00\t0.990641\t257\t0\t284\t0\t255\t0\t0.465678\t\t\t390\t0\t62.2 459\t1392.95\t389.253\t649.313\t\t0",
"2021-03-01T00:02:00\t0.990641\t257\t0\t289\t0\t255\t0\t0.467612\t\t\t393\t0\t62.1 206\t1392.95\t392.14\t652.002\t\t0"
)
# Create Test_OP/Ala and Test_OP/Chi under tempdir().
main_d <- path(tempdir(), "Test_OP")
walk(path(main_d, c("Ala/", "Chi/")), dir_create)
# Recycling the 2 directory prefixes over rep(1:5, each = 2) yields
# files 1.txt .. 5.txt in each of the two directories (10 files total).
paste0(c("Ala/", "Chi/"), rep(1:5, each =2), ".txt") %>%
walk(\(fname) writeLines(file_sample, path(main_d, fname)))
# Verify the generated directory tree and one sample file's raw content.
dir_tree(main_d)
#> Temp/RtmpEdFDdc/Test_OP
#> ├── Ala
#> │ ├── 1.txt
#> │ ├── 2.txt
#> │ ├── 3.txt
#> │ ├── 4.txt
#> │ └── 5.txt
#> └── Chi
#> ├── 1.txt
#> ├── 2.txt
#> ├── 3.txt
#> ├── 4.txt
#> └── 5.txt
# file content:
readLines(path(main_d, "Ala/1.txt")) %>% paste0(collapse = "\n") %>% cat()
#> /* DESCRIPTION:
#> Algo: Solp
#> Checks: V2.0
#> Const: 89 W
#> Const1: 9 W
#> Para(s): DATE/TIME (Date/Time)
#> XY [XY] (XY)
#> C1 [C1] (C1)
#> Cc (Cc)
#> C2 [C2] (C2)
#> Ac (Ac)
#> C3 [C3] (C3)
#> Dc (Dc)
#> C4 (C4)
#> C5 [h] (h)
#> C6DH [%] (C6DH)
#> C7 [E2] (C7)
#> Jc (Jc)
#> D [S] (D)
#> Sw [S2] (Sw)
#> SW [Q2] (SW)
#> QA [H2] (QA)
#> T2 [C] (T2)
#> Lc (Lc)
#> */
#> Date/Time XY [XY] C1 [C1] Cc C2 [C2] Ac C3 [C3] Dc C4 C5 [h] C6DH [%] C7 [E2] Jc S [D] Sw [S2] Q2 [SW] QA [H2] T2 [C] Lc
#> 2021-03-01T00:00:00 0.990641 257 0 285 0 255 0 0.463735 1013 43 390 0 62.3716 1392.95 389.164 646.61 3.7 0
#> 2021-03-01T00:01:00 0.990641 257 0 284 0 255 0 0.465678 390 0 62.2 459 1392.95 389.253 649.313 0
#> 2021-03-01T00:02:00 0.990641 257 0 289 0 255 0 0.467612 393 0 62.1 206 1392.95 392.14 652.002 0
Created on 2023-05-20 with reprex v2.0.2