I built this function:
# Build a classifier-vs-validator contingency table for every cycle found in
# the working directory and save the combined result.
#
# For each "<cycle>_automat.lif" file, the matching "<cycle>.lif" file is read
# as well. Column 1 (image name) and column 18 (label) are taken from both
# files, the two tables are joined on `name`, and every combination of
# `classificador` x `validador` is counted (combinations that never occur are
# kept with N = 0).
#
# Returns a data.table with columns ciclo, validation_date, classificador,
# validador and N. The same table is written to "dados_brutos.csv" in the
# working directory. If no "*_automat.lif" file is found, a warning is raised,
# nothing is written and an empty table with the same columns is returned.
alterations <- function() {
  # Install missing packages, then attach them. `require()` alone is not
  # enough: after a fresh install.packages() the package is still not attached.
  for (pkg in c("readr", "stringr", "data.table")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg)
    }
    library(pkg, character.only = TRUE)
  }

  # `pattern` is a regular expression, not a glob: anchor it at the end and
  # escape the dot so only real "..._automat.lif" names are picked up.
  auto_files <- list.files(pattern = "_automat\\.lif$")
  if (length(auto_files) == 0L) {
    warning("No '*_automat.lif' files found in ", getwd(), call. = FALSE)
    return(data.table(
      ciclo = character(),
      validation_date = character(),
      classificador = character(),
      validador = character(),
      N = integer()
    ))
  }

  per_file <- lapply(auto_files, function(auto_file) {
    # Only strip the suffix, never an "_automat" occurring earlier in the name.
    validated_file <- str_replace(auto_file, "_automat\\.lif$", ".lif")
    if (!file.exists(validated_file)) {
      stop("Missing validated file '", validated_file, "' for '", auto_file,
           "'", call. = FALSE)
    }

    # Always pass paths through `file =` so fread never has to guess whether
    # the string is a file name or something else (names here contain spaces).
    df1 <- fread(file = auto_file,
                 select = c(1, 18),
                 col.names = c("name", "classificador"))
    df2 <- fread(file = validated_file,
                 select = c(1, 18),
                 col.names = c("name", "validador"))

    # Join on `name` instead of pairing rows by position: rows missing from
    # one of the files would otherwise shift every following pair.
    paired <- merge(df1, df2, by = "name")
    tb1 <- table(paired[, .(classificador, validador)])

    cbind(ciclo = auto_file,
          validation_date = format(file.mtime(auto_file), "%Y-%m-%d"),
          as.data.table(tb1))
  })

  # Bind once at the end instead of growing the result inside a loop.
  dados <- rbindlist(per_file)
  write_csv(dados, file = "dados_brutos.csv")
  dados
}
I need help to improve this function:
I have hundreds of files in a folder, named like: aaa.lif, aaa_automat.lif, bbb.lif, bbb_automat.lif, ccc.lif, ccc_automat.lif, ... They all have the same columns and thousands of rows, but different values in some columns (such as column 18, specified in the code). I need to bind column 18 (classificador) of each pair of files for the same column 1 (name). However, some files have a problem: some rows are missing from the *_automat.lif file. I tried using merge
in place of cbind
to merge the data.tables df1 and df2 by the column name, but the execution time of the function was much worse.
I don't know if my for(){} loop is efficient. Is there a better way?
data example:
# Example of df1: image names with the label read into `classificador`
# (seven rows, as produced by dput()).
dput(df1)
setDT(structure(list(name = c("2020-12-01_00_34_54.029_1009_1943.png",
"2020-12-01_00_34_54.029_1025_394.png", "2020-12-01_00_34_54.029_1077_1739.png",
"2020-12-01_00_34_54.029_1345_631.png", "2020-12-01_00_34_54.029_1360_1538.png",
"2020-12-01_00_34_54.029_1435_1340.png", "2020-12-01_00_34_54.029_1508_352.png"
), classificador = c("organism", "shadow", "coscinodiscus", "shadow",
"shadow", "shadow", "shadow")), row.names = c(NA, -7L), class = c("data.table",
"data.frame")))
# Example of df2: the same image names with the label read into `validador`;
# only the first row differs from df1 ("shadow" instead of "organism").
dput(df2)
setDT(structure(list(name = c("2020-12-01_00_34_54.029_1009_1943.png",
"2020-12-01_00_34_54.029_1025_394.png", "2020-12-01_00_34_54.029_1077_1739.png",
"2020-12-01_00_34_54.029_1345_631.png", "2020-12-01_00_34_54.029_1360_1538.png",
"2020-12-01_00_34_54.029_1435_1340.png", "2020-12-01_00_34_54.029_1508_352.png"
), validador = c("shadow", "shadow", "coscinodiscus", "shadow",
"shadow", "shadow", "shadow")), row.names = c(NA, -7L), class = c("data.table",
"data.frame")))
output:
ciclo validation_date classificador validador N
1: Basler_2020-12-01 00_34_52.441983_frames_automat.lif 2021-07-09 coscinodiscus coscinodiscus 1
2: Basler_2020-12-01 00_34_52.441983_frames_automat.lif 2021-07-09 organism coscinodiscus 0
3: Basler_2020-12-01 00_34_52.441983_frames_automat.lif 2021-07-09 shadow coscinodiscus 0
4: Basler_2020-12-01 00_34_52.441983_frames_automat.lif 2021-07-09 coscinodiscus shadow 0
5: Basler_2020-12-01 00_34_52.441983_frames_automat.lif 2021-07-09 organism shadow 1
6: Basler_2020-12-01 00_34_52.441983_frames_automat.lif 2021-07-09 shadow shadow 5
7: Basler_2020-12-01 01_35_01.902191_frames_automat.lif 2021-07-10 shadow shadow 7
thank you
I think this comes pretty close to what you are trying to achieve.
# Pair every "<cycle>_automat.lif" file with its "<cycle>.lif" counterpart,
# join them on the image name and count each (classificador, validador)
# combination per file. The combined counts are written to "dados_brutos.csv".

# `pattern` is a regular expression, not a glob: anchor it at the end and
# escape the dot so only real "..._automat.lif" names are picked up.
files <- list.files(pattern = "_automat\\.lif$")
if (length(files) == 0L) {
  stop("No '*_automat.lif' files found in ", getwd(), call. = FALSE)
}

dados <- lapply(files, function(file) {
  # Pass paths through `file =` so fread never has to guess what the string
  # is; strip only the "_automat" suffix to get the validated file's name.
  df1 <- fread(file = file, select = c(1, 18),
               col.names = c("name", "classificador"))
  df2 <- fread(file = str_replace(file, "_automat\\.lif$", ".lif"),
               select = c(1, 18), col.names = c("name", "validador"))

  # Inner join on the image name: rows present in only one file are dropped.
  tbl <- merge(df1, df2, by = "name")

  # from the output it seems you want to group and show counts; counting per
  # file keeps only the small summary in memory instead of every joined row
  counts <- tbl[, .(N = .N), by = .(classificador, validador)]
  counts[, ciclo := file]
  counts[, validation_date := format(file.mtime(file), "%Y-%m-%d")]
  counts
})
dados <- rbindlist(dados)
setcolorder(dados, c("ciclo", "validation_date", "classificador", "validador"))
write_csv(dados, file = "dados_brutos.csv")