I have some 38 csv files in a folder of with rainfall data for multiple years as follows:
Precp_1980.csv
Precp_1981.csv
Precp_1982.csv
Precp_1983.csv . . .
Precp_2017.csv
A leap year file looks like this:
Precp_1980 <- data.frame(matrix(runif(366*11299,min = 0, max = 4), ncol = 366, nrow = 11299))
names(Precp_1980) <- c(rep(paste0("d_",1:366)))
Precp_1980$ID1 <- seq(1:11299)
Precp_1980$ID2 <- seq(0.0,1.1, length.out = 11299)
Precp_1980$ID3 <- seq(1,10, length.out = 11299)
Precp_1980$ID4 <- seq(100,200, length.out = 11299)
Precp_1980$year<- 1980
One csv file has 11299 locations (11299 rows). The first four columns have some location specific information (ID1,ID2,ID3,ID4
), fifth column has
year
and column sixth till 371 column has daily rainfall data (i.e. 1 to 366 days).
A non-leap year file is exactly the same except it has one less column because of only 365 days:
Precp_1981 <- data.frame(matrix(runif(365*11299,min = 0, max = 4), ncol = 365, nrow = 11299))
names(Precp_1981) <- c(rep(paste0("d_",1:365)))
Precp_1981$ID1 <- seq(1:11299)
Precp_1981$ID2 <- seq(0.0,1.1, length.out = 11299)
Precp_1981$ID3 <- seq(1,10, length.out = 11299)
Precp_1981$ID4 <- seq(100,200, length.out = 11299)
Precp_1981$year<- 1981
My objective:
1) From all leap-year files, remove the doy 60 of the year so that all files have 365 days of the year.
2) Convert individual files from wide to long format
3) Combine all files into a single file
What I did was:
library(data.table)
library(dplyr)
library(reshape2)
year.list <- list() # create a list to save the outputs
yr.list <- c(1980:2017)
leap.yr <- c(1980,1984,1988,1992,1996,2000,2004,2008,2012,2016) # vector of leap years
for(y in seq_along(yr.list)){
yr <- yr.list[y]
if(yr %in% leap.yr){ # if a year is a leap year
dat <- fread("Precp_",yr,".csv"))
dat.up <- dat %>% dplyr::select(-d_60) # this removes the day 60 from the leap year
dat.up.m <- melt(dat.up, id.vars = c("ID1","ID2","ID3","ID4","year"), value.name = "rain", variable.name = "day") # converts the data into long format
dat.up.m <- dat.up.m %>% mutate(day = gsub("d_", "", day)) %>% # converts the "day_1" to numeric day of year
mutate(day = as.numeric(day)) %>%
mutate(day = ifelse(day >= 61, day - 1, day)) # this converts all days which were greater than 60 to previous day so that I have 365 days of year
year.list[[y]] <- dat.up.m
} else { # if a year is not a leap year
dat <- fread("Precp_",yr,".csv"))
dat.up.m <- melt(dat.up, id.vars = c("ID1","ID2","ID3","ID4","year"), value.name = "rain", variable.name = "day") # converts the data into long format
dat.up.m <- dat.up.m %>% mutate(day = gsub("d_", "", day)) %>% # converts the "day_1" to numeric day of year
mutate(day = as.numeric(day)) %>%
year.list[[y]] <- dat.up.m
}
stack.rain <- rbindlist(year.list)
I am looking for a shorter (maybe a faster?) solution. The script is too long
Don't perform any complex data manipulations within loop.
In loop: load and melt
all data (you won't be loosing much memory as you're removing only ~1/365 of the data).
Then outside the loop: using data.table
object filter (remove day 60) and modify your data ("day" column).
# Arguments
yearAll <- 1980:2017
yearLp <- seq(1980, 2016, 4)
# Libraries
library(data.table)
library(foreach)
# Load data
# It's possible to parallelize loop using %dopar%
result <- foreach(i = yearAll, .combine = rbind) %do% {
melt(fread(paste0("Precp_", i, ".csv")),
c("ID1", "ID2", "ID3", "ID4", "year"))
}
# Modify data
result <- result[!(year %in% yearLp & variable == "d_60")]
result[, day := as.numeric(sub("d_", "", variable))]
result[year %in% yearLp & day >= 61, day := day - 1]