I am creating nested for loop to forcast the data based on category wise. In my data categorical columns are age, gender, state and region. I have to do sales forecast based on above category, for example age contains male, female, undefined subcategory . The same have to do for remaining subcategories. In my nested for loop i am subsetting the data based on category and applying that subsetted data one by one for each category to my forecast function. While doing this my whole program taking 7 minutes to execute. I need to optimize this code. I tried lapply function also, but the problem is i am not able to apply the data which i subsetted with lapply because its giving output as sequence of array. So i am getting dimension error while fetching particular column in one category.
My nested for loop code,
forecasted_category <- list()
for( i in 1:length(categorical_columns))
{
if(categorical_columns[i] %in% names(data)==TRUE){
categorical_df_name <- paste(categorical_columns[i],"_df",sep="")
forecasted_by_categories <- list()
for(j in 1:length(unique(data[,categorical_columns[i]]))){
categorical_data <- (subset(data,data[,categorical_columns[i]] == unique(data[,categorical_columns[i]])[j]))
if (forecast_by == "sales"){
agg_day <- aggregate(categorical_data[,input_amt_column]~categorical_data[,input_date_column],categorical_data,sum)
names(agg_day) = c(input_date_column, input_amt_column)
forecast_input_column <- agg_day[,input_amt_column]
} else if (forecast_by == "customers") {
agg_day <- aggregate(categorical_data[,input_key_column]~categorical_data[,input_date_column],categorical_data,length)
names(agg_day) = c(input_date_column, input_key_column)
forecast_input_column <- agg_day[,input_key_column]
} else if (forecast_by == "average_sales") {
agg_day <-aggregate(categorical_data[,input_amt_column]~categorical_data[,input_date_column],categorical_data,mean)
names(agg_day) = c(input_date_column, input_amt_column)
forecast_input_column <- agg_day[,input_amt_column]
}
min_day <- min(agg_day[,input_date_column])
max_day <- max(agg_day[,input_date_column])
get_autoarima_model <- get_autoarima_model(forecast_input_column,period,min_day,freq)
if (is.null(get_autoarima_model)) {
category_forecast <- NULL
}else {
forecasted_date <- seq(as.Date(max_day)+1, by = "day", length.out = period)
forecasted_date <- as.data.frame(forecasted_date)
label <- sprintf("D-%s",seq(1:period))
if (forecast_by == "customers") {
category_forecast <- cbind.data.frame(forecasted_date=forecasted_date,label=label,value=round(get_autoarima_model$Point.Forecast))
}else {
category_forecast <- cbind.data.frame(forecasted_date=forecasted_date,label=label,value=get_autoarima_model$Point.Forecast)
}
}
forecasted_by_categories[[j]] <- list(sub_category=unique(categorical_data[,categorical_columns[i]]),category_forecast=category_forecast)
}
}
category <- list(category_name=categorical_columns[i])
category_name <- as.data.frame(category)
forecasted_category[[i]] <- list(categories=category_name,forecasted_by_categories=forecasted_by_categories)
}
Please let me know if my query is not clear.
My sample data
cust_id order_date amount quantity discount cost_price age gender state region
1 1 2014-10-27 215.53 9 3 172.424 57 M TN MidWest
3 3 2009-09-10 154.71 4 6 123.768 85 M FL west
4 4 2014-02-19 520.17 6 0 416.136 55 M OH NorthEast
5 5 2008-11-25 228.80 10 1 183.040 52 F AR west
6 6 2015-07-06 293.35 5 6 234.680 57 M CO MidWest
8 8 2014-11-05 537.96 9 5 430.368 53 M MN west
9 8 2011-05-28 316.21 4 2 252.968 53 M MN west
10 9 2010-03-01 1113.32 10 2 890.656 78 F OR west
11 9 2010-09-23 313.98 6 0 251.184 78 F OR west
12 10 2010-04-01 135.88 6 0 108.704 43 M NY west
I am passing my categorical columns dynamically as categorical_columns. Categorical column contains categorical_columns <- c(age, gender, state, region) input_amt_column is "amount" input_date_column is "order_date" input_key_column is "cust_id"
My auto arima model function
get_autoarima_model <- function(value,period,start_date,freq)
{
value <- round(value)
tsdata <- ts(value, start = start_date, freq = freq )
if (length(tsdata) >= 7) {
ts_data <-tsclean(tsdata)
adf_test <- adf.test(ts_data)
if((adf_test$p.value<0.05)==TRUE)
{
model <- auto.arima(ts_data)
fcast<-forecast(model,level=c(95),h=period)
fc <- data.frame(fcast)
}else {
adf.test(diff(diff(log(ts_data))))
model <- auto.arima(ts_data)
fcast<-forecast(model,level=c(95),h=period)
fc <- data.frame(fcast)
}
}else {
fc <- NULL
}
return(fc)
}
You could make age
a factor
and use a nested lapply()
approach:
data$age <- factor(data$age)
list_of_subsets <- lapply(data[c("age", "gender", "state", "region")], function(x){
lapply(levels(x), function(y){
subset(data, x == y)
})
})
To dynamically choose the categorical columns, change data[c("age", "gender", "state", "region")]
to data[sapply(data, is.factor)]
.
NEW CODE:
Here's an lapply
approach to your forecasting loop:
First define a function FOO
:
FOO <- function(var, data){
if(var %in% names(data)){
lapply(unique(data[, var]), function(y){
categorical_data <- subset(data, data[, var] == y)
if (forecast_by == "sales"){
agg_day <- aggregate(categorical_data[,input_amt_column]~categorical_data[,input_date_column],categorical_data,sum)
names(agg_day) = c(input_date_column, input_amt_column)
forecast_input_column <- agg_day[,input_amt_column]
} else if (forecast_by == "customers") {
agg_day <- aggregate(categorical_data[,input_key_column]~categorical_data[,input_date_column],categorical_data,length)
names(agg_day) = c(input_date_column, input_key_column)
forecast_input_column <- agg_day[,input_key_column]
} else if (forecast_by == "average_sales") {
agg_day <-aggregate(categorical_data[,input_amt_column]~categorical_data[,input_date_column],categorical_data,mean)
names(agg_day) = c(input_date_column, input_amt_column)
forecast_input_column <- agg_day[,input_amt_column]
}
min_day <- min(agg_day[,input_date_column])
max_day <- max(agg_day[,input_date_column])
autoarima_model <- get_autoarima_model(forecast_input_column,period,min_day,freq)
if (is.null(autoarima_model)) {
category_forecast <- NULL
}else {
forecasted_date <- seq(as.Date(max_day)+1, by = "day", length.out = period)
forecasted_date <- as.data.frame(forecasted_date)
label <- sprintf("D-%s",seq(1:period))
if (forecast_by == "customers") {
category_forecast <- cbind.data.frame(forecasted_date=forecasted_date,label=label,value=round(autoarima_model$Point.Forecast))
}else {
category_forecast <- cbind.data.frame(forecasted_date=forecasted_date,label=label,value=autoarima_model$Point.Forecast)
}
}
temp <- list(sub_category = y,
category_forecast = category_forecast)
return(temp)
})
} else {
temp <- "Column not in data!"
}
}
Now loop through your column names vector via lapply
:
forecasted_category <- lapply(categorical_columns, FOO, data = data)