I would like to calculate the share of production of each fossil fuel by the drilling type used in the production. Starting point is the following data.table
library(data.table)
dt <- structure(list(Global.Company.Key = c(1380L, 1380L, 1380L, 1380L, 1380L)
, Calendar.Data.Year.and.Quarter = structure(c(2000, 2000, 2000, 2000, 2000), class = "yearqtr")
, Current.Assets.Total = c(2218, 2218, 2218, 2218, 2218)
, DRILL_TYPE = c("U", "D", "V", "H", "U")
, DI.Oil.Prod.Quarter = c(18395.6792379842, 1301949.24041659, 235.311086392291, 27261.8049684835, 4719.27956989249)
, DI.Gas.Prod.Quarter = c(1600471.27107983, 4882347.22928982, 2611.60215053765, 9634.76418242493, 27648.276603634)), .Names = c("Global.Company.Key", "Calendar.Data.Year.and.Quarter", "Current.Assets.Total", "DRILL_TYPE", "DI.Oil.Prod.Quarter", "DI.Gas.Prod.Quarter"), row.names = c(NA, -5L), class = c("data.table", "data.frame"), sorted = c("Global.Company.Key", "Calendar.Data.Year.and.Quarter"))
I can then calculate the total of the production for each of the two fossil fuel types, based on the drilling type.
# Oil Production per Drilling Type and Total Sum
dcast(dt, Global.Company.Key + Calendar.Data.Year.and.Quarter + Current.Assets.Total ~ DRILL_TYPE , value.var = c("DI.Oil.Prod.Quarter"), fun = list(sum))[, Total.Sum :=rowSums(.SD, na.rm = TRUE), by=.(Global.Company.Key, Calendar.Data.Year.and.Quarter), .SDcols=c("U","D", "V", "H")][]
# Gas Production per Drilling Type and Total Sum
dcast(dt, Global.Company.Key + Calendar.Data.Year.and.Quarter + Current.Assets.Total ~ DRILL_TYPE , value.var = c("DI.Gas.Prod.Quarter"), fun = list(sum))[, Total.Sum :=rowSums(.SD, na.rm = TRUE), by=.(Global.Company.Key, Calendar.Data.Year.and.Quarter), .SDcols=c("U","D", "V", "H")][]
# Combined calculation of the production for both fossil fuels with dynamic naming.
dcast(dt, Global.Company.Key + Calendar.Data.Year.and.Quarter + Current.Assets.Total ~ DRILL_TYPE , value.var = c("DI.Oil.Prod.Quarter", "DI.Gas.Prod.Quarter"), fun = list(sum))[, Total.Sum :=rowSums(.SD, na.rm = TRUE), by=.(Global.Company.Key, Calendar.Data.Year.and.Quarter)][]
Does anyone have an idea how to calculate the total sum grouped for the different fossil fuel types? As you can see in the last case of the dcast
command it concatenates the names of the new columns, therefore making it impossible to group the columns, by selecting the columns directly.
Basically, I would like to get the output of the last example, although enhanced by additional columns with a sum for the total oil and gas production. I would then would like to use these sums to to calculate the share of oil and gas production coming from one of the four well types.
There is an alternative approach using data.table
and dcast()
which is about twice as fast as OP's merge approach
molten <- melt(dt, measure.vars = patterns("^DI"))
molten
# Global.Company.Key Calendar.Data.Year.and.Quarter Current.Assets.Total DRILL_TYPE variable value
# 1: 1380 2000 2218 U DI.Oil.Prod.Quarter 18395.6792
# 2: 1380 2000 2218 D DI.Oil.Prod.Quarter 1301949.2404
# 3: 1380 2000 2218 V DI.Oil.Prod.Quarter 235.3111
# 4: 1380 2000 2218 H DI.Oil.Prod.Quarter 27261.8050
# 5: 1380 2000 2218 U DI.Oil.Prod.Quarter 4719.2796
# 6: 1380 2000 2218 U DI.Gas.Prod.Quarter 1600471.2711
# 7: 1380 2000 2218 D DI.Gas.Prod.Quarter 4882347.2293
# 8: 1380 2000 2218 V DI.Gas.Prod.Quarter 2611.6022
# 9: 1380 2000 2218 H DI.Gas.Prod.Quarter 9634.7642
#10: 1380 2000 2218 U DI.Gas.Prod.Quarter 27648.2766
totals <- molten[, .(DRILL_TYPE = "Total.Sum", value = sum(value)),
by = .(Global.Company.Key, Calendar.Data.Year.and.Quarter,
Current.Assets.Total, variable)]
totals
# Global.Company.Key Calendar.Data.Year.and.Quarter Current.Assets.Total variable DRILL_TYPE value
#1: 1380 2000 2218 DI.Oil.Prod.Quarter Total.Sum 1352561
#2: 1380 2000 2218 DI.Gas.Prod.Quarter Total.Sum 6522713
molten <- rbind(molten, totals)
molten
# Global.Company.Key Calendar.Data.Year.and.Quarter Current.Assets.Total DRILL_TYPE variable value
# 1: 1380 2000 2218 U DI.Oil.Prod.Quarter 18395.6792
# 2: 1380 2000 2218 D DI.Oil.Prod.Quarter 1301949.2404
# 3: 1380 2000 2218 V DI.Oil.Prod.Quarter 235.3111
# 4: 1380 2000 2218 H DI.Oil.Prod.Quarter 27261.8050
# 5: 1380 2000 2218 U DI.Oil.Prod.Quarter 4719.2796
# 6: 1380 2000 2218 U DI.Gas.Prod.Quarter 1600471.2711
# 7: 1380 2000 2218 D DI.Gas.Prod.Quarter 4882347.2293
# 8: 1380 2000 2218 V DI.Gas.Prod.Quarter 2611.6022
# 9: 1380 2000 2218 H DI.Gas.Prod.Quarter 9634.7642
#10: 1380 2000 2218 U DI.Gas.Prod.Quarter 27648.2766
#11: 1380 2000 2218 Total.Sum DI.Oil.Prod.Quarter 1352561.3153
#12: 1380 2000 2218 Total.Sum DI.Gas.Prod.Quarter 6522713.1433
# reorder factor levels of DRILL_TYPE to ensure
# that columns are in the same order as rows (with totals last)
molten[, DRILL_TYPE := forcats::fct_inorder(DRILL_TYPE)]
# reshape
dcast(molten, ... ~ variable + DRILL_TYPE, sum, value.var = "value")
# Global.Company.Key Calendar.Data.Year.and.Quarter Current.Assets.Total DI.Oil.Prod.Quarter_U DI.Oil.Prod.Quarter_D
#1: 1380 2000 2218 23114.96 1301949
# DI.Oil.Prod.Quarter_V DI.Oil.Prod.Quarter_H DI.Oil.Prod.Quarter_Total.Sum DI.Gas.Prod.Quarter_U DI.Gas.Prod.Quarter_D
#1: 235.3111 27261.8 1352561 1628120 4882347
# DI.Gas.Prod.Quarter_V DI.Gas.Prod.Quarter_H DI.Gas.Prod.Quarter_Total.Sum
#1: 2611.602 9634.764 6522713
The result is similar to the one created with OP's merge()
approach (except for column order).
mb <- microbenchmark::microbenchmark(
merge = merge(
x = dcast(
dt,
Global.Company.Key + Calendar.Data.Year.and.Quarter + Current.Assets.Total ~ DRILL_TYPE ,
value.var = c("DI.Oil.Prod.Quarter", "DI.Gas.Prod.Quarter"),
fun = list(sum)
)[, -grepl(glob2rx("DI.Gas.Prod.Quarter_*"), colnames(
dcast(
dt,
Global.Company.Key + Calendar.Data.Year.and.Quarter + Current.Assets.Total ~ DRILL_TYPE ,
value.var = c("DI.Oil.Prod.Quarter", "DI.Gas.Prod.Quarter"),
fun = list(sum)
)
)), with = FALSE][, DI.Oil.Prod.Total.Sum := rowSums(.SD, na.rm = TRUE), by =
.(Global.Company.Key, Calendar.Data.Year.and.Quarter)][]
,
y = dcast(
dt,
Global.Company.Key + Calendar.Data.Year.and.Quarter + Current.Assets.Total ~ DRILL_TYPE ,
value.var = c("DI.Oil.Prod.Quarter", "DI.Gas.Prod.Quarter"),
fun = list(sum)
)[, -grepl(glob2rx("DI.Oil.Prod.Quarter_*"), colnames(
dcast(
dt,
Global.Company.Key + Calendar.Data.Year.and.Quarter + Current.Assets.Total ~ DRILL_TYPE ,
value.var = c("DI.Oil.Prod.Quarter", "DI.Gas.Prod.Quarter"),
fun = list(sum)
)
)), with = FALSE][, DI.Gas.Prod.Total.Sum := rowSums(.SD, na.rm = TRUE), by =
.(Global.Company.Key, Calendar.Data.Year.and.Quarter)][]
,
all.x = TRUE
,
by = c(
"Global.Company.Key",
"Calendar.Data.Year.and.Quarter",
"Current.Assets.Total"
)
),
aggr = {
molten <- melt(dt, measure.vars = patterns("^DI"))
molten[, Total.Sum := sum(value), by = .(Global.Company.Key, Calendar.Data.Year.and.Quarter, Current.Assets.Total, variable)]
dcast(molten, ... ~ variable + DRILL_TYPE, sum, value.var = "value")
molten <- melt(dt, measure.vars = patterns("^DI"))
molten <- rbind(molten, molten[, .(DRILL_TYPE = "Total.Sum", value = sum(value)),
by = .(Global.Company.Key, Calendar.Data.Year.and.Quarter,
Current.Assets.Total, variable)])
molten[, DRILL_TYPE := forcats::fct_inorder(DRILL_TYPE)]
dcast(molten, ... ~ variable + DRILL_TYPE, sum, value.var = "value")
},
times = 100L
)
Note that the merge approach needs about three times as much lines of code. Also performance is twice as slow as the aggregate and rbind approach.
Unit: milliseconds
expr min lq mean median uq max neval
merge 20.298773 21.181559 22.13640 21.77682 22.59126 26.22265 100
aggr 9.393847 9.806165 10.33053 10.07595 10.35460 20.11112 100