UPDATE:
The proposed solution offered by akrun works for me but, my problem is that the value defined in value.var = RATING
is only carried over to the corresponding date column. Note that all months that are defined as the time period spanning between RATING_DATE
and VALID_THRU_DATE
are however not filled.
What I tried so far and failed: Instead of defining the dcast operation like this
dt1 <- dcast(setDT(ratings.dt), ISSUE_ID + RATING_TYPE ~ RATING_DATE,
value.var = 'RATING')
I did try
dt1 <- dcast(setDT(ratings.dt),
ISSUE_ID + RATING_TYPE ~ (VALID_THRU_DATE - RATING_DATE),
value.var = 'RATING')
dt1 <- dcast(setDT(ratings.dt),
ISSUE_ID + RATING_TYPE ~ as.yearmon(seq(
RATING_DATE, VALID_THRU_DATE), frac = 1),
value.var = 'RATING')
dt1 <- dcast(setDT(ratings.dt),
ISSUE_ID + RATING_TYPE ~ (RATING_DATE:VALID_THRU_DATE),
value.var = 'RATING')
I thought that I could just use the 2 columns that define each ratings' validity period as both are date columns in the dcast()
function call, but obviously the logic behind that task is more complicted to conceptualize.
Now I conceptualized this task manually by first building a "skeleton data.table" that is then filled subsequently by looping row-wise through the original ratings data.table in the long format and spread the defined rating between the two dates in the skeleton table. (I renamed RATING to RATING_NUM to differentiate from the "raw" alphanumeric rating)
# (0) Filter only the most recent rating within a given month
ratings_num.dt <- ratings_num.dt[,
.SD[.N],
by = .(ISSUE_ID, RATING_TYPE, RATING_DATE)]
# (1) Defining start and end date for the rating time series
start_date <- as.Date("1990-01-01", "%Y-%m-%d")
end_date <- as.Date("2021-01-31", "%Y-%m-%d")
# (2) Define the dates as new columns for a skeleton data.table
new_cols <- seq(from = start_date,
to = end_date,
by = "month")
new_cols <- date_ymd_to_m_end(new_cols)
new_col_names <- as.character(new_cols, "%Y-%m-%d")
# (3) Determine how many months the rating time series spans
N_months <- elapsed_months_lubri(start_date, end_date) + 1
# some function to do just what the name implies
MONTH_ID <- c(1:N_months)
# (4) Define the layout of the new skeleton table
# Note: The new table should contain the 3 rows per issue ID, namely the rating time series of each issue ID for every considered rating ageny
rating_type.vec <- c("FR", "MR", "SPR")
df_skeleton <- data.frame(rep(issue_IDs.vec, each = 3), rating_type.vec)
someInitialValue <- 0
# Credit to Jonas
to_Add <- setNames(data.frame(matrix(rep(
someInitialValue, nrow(df_skeleton)*length(new_col_names)),
ncol = length(new_col_names),
nrow = NROW(df))),
new_col_names)
ratings_num_ts.df <- cbind(df_skeleton, to_Add)
ratings_num_ts.dt <- setDT(ratings_num_ts.df)
setnames(ratings_num_ts.dt,
c("rep.issue_IDs.vec..each...3.", "rating_type.vec"),
c("ISSUE_ID", "RATING_TYPE"))
# (5) Create a data.table to join on ratings_num.dt to add month IDs to use for assigning ratings
seq_dates.dt <- setDT(data.frame(new_cols, MONTH_ID))
seq_dates.dt <- setnames(seq_dates.dt, c("new_cols"), c("RATING_DATE"))
ratings_num.dt <- ratings_num.dt[seq_dates.dt,
on = .(RATING_DATE = RATING_DATE)]
ratings_num.dt <- ratings_num.dt[seq_dates.dt,
on = .(RATING_VAL_THRU = RATING_DATE)]
# (6) If for the joined MONTH_IDs there is no corresponding RATING_DATE or RATING_VAL_THRU entry, the join will write NA values for these values in the joined table and can be filtered out accordingly
ratings_num.dt <- ratings_num.dt[!is.na(ISSUE_ID)]
# (7) Rename column of second MONTH_ID
setnames(ratings_num.dt,
c("MONTH_ID", "i.MONTH_ID"),
c("MONTH_ID_START", "MONTH_ID_END"))
# (8) Sort table by setting keys
setkey(ratings_num.dt, ISSUE_ID, RATING_TYPE, RATING_DATE)
# (9) Defining logic as loop
tic()
i <- 1
j <- nrow(ratings_num.dt)
id.vec <- ratings_num.dt[, ISSUE_ID]
rating_type.vec <- ratings_num.dt[, RATING_TYPE]
month_ID_start.vec <- (ratings_num.dt[, MONTH_ID_START] + 2)
month_ID_end.vec <- (ratings_num.dt[, MONTH_ID_END] + 2)
rating_num.vec <- ratings_num.dt[, RATING_NUM]
total <- j
pb <- progress_bar$new(format = "[:bar] :current/:total
(:percent) eta: :eta", total = total)
spread_ratings_to_ts <- function(dt_source, dt_ts) {
pb$tick(0)
for (i in 1:j) {
id <- id.vec[i] # alternatively ROW_ID == i
rating_type <- rating_type.vec[i]
month_ID_start <- month_ID_start.vec[i] # change to right value
month_ID_end <- month_ID_end.vec[i]
rating_num <- rating_num.vec[i]
dt_ts[ISSUE_ID == id & RATING_TYPE == rating_type,
(month_ID_start:month_ID_end) := rating_num]
if (i %% 50 == 0) {
pb$tick()
}
i <- i + 1
}
}
spread_ratings_to_ts(ratings_num.dt, ratings_num_ts.dt)
toc()
## ~ 3,600 sec for ~ 250k rows to loop through ##
# (10) Compute rating means
# Substitute all pre-filled zeros in the table with NA as there is simply no
# rating available at this point in time
ratings_num_ts.dt <- ratings_num_ts.dt %>%
na_if(0)
ratings_num_ts.dt <- rbind(ratings_num_ts.dt,
ratings_num_ts.dt[,
c(.(RATING_TYPE = 'Mean'),
lapply(.SD, mean, na.rm=TRUE)),
by = .(ISSUE_ID),
.SDcols = -(1:2)])
setkey(ratings_num_ts.dt, ISSUE_ID, RATING_TYPE)
I tried parallelizing this loop using foreach(...) %dopar% function(...)
like you can see below but it is not working as of now. This is mainly motiavted by the runtime of the very inefficient loop above - albeit working just fine and accomplishing what I want. Working on the foreach function call, I am particulary unsure about how to write a suitable combine function that I can put in the foreach call that would wrap the results as desired.
i <- 1
j <- nrow(ratings_num.dt)
id.vec <- ratings_num.dt[, ISSUE_ID]
rating_type.vec <- ratings_num.dt[, RATING_TYPE]
# col 1+2 not rating but ISSUE_ID and RATING_TYPE
month_ID_start.vec <- (ratings_num.dt[, MONTH_ID_START] + 2)
month_ID_end.vec <- (ratings_num.dt[, MONTH_ID_END] + 2)
rating_num.vec <- ratings_num.dt[, RATING_NUM]
spread_ratings_to_ts <- function(dt_source, dt_ts) {
id <- id.vec[i]
rating_type <- rating_type.vec[i]
month_ID_start <- month_ID_start.vec[i]
month_ID_end <- month_ID_end.vec[i]
rating_num <- rating_num.vec[i]
dt_ts[ISSUE_ID == id & RATING_TYPE == rating_type][,
(month_ID_start:month_ID_end) := rating_num]
}
myCluster <- makeCluster(((detectCores()/2) - 1), type = "PSOCK")
registerDoParallel(myCluster)
clusterEvalQ(cl = myCluster, {
setMKLthreads(1)
})
foreach(i = 1:j, .combine = 'rbind') %dopar%
spread_ratings_to_ts(dt_source = ratings_num.dt,
dt_ts = ratings_num_ts.dt)
stopCluster(myCluster)
Background / Data: In theory this is quite easy, and even a 3-year old could do that task manually, but even after nearly a week tackling this problem, I am no further at a solution.
The problem:
I am working with a large financial data set. It contains bond issues identified by an ISSUE_ID
and its corresponding RATING
that is provided by the 3 rating agencies Fitch, Moody's and S&P defined as RATING_TYPE
. I identified for each rating a published date and a valid-thru date defined as RATING_DATE
and VALID_THRU_DATE
, both of type DATE
. All dates are formatted by yearmonth() as last day in a given month, as their rating is used to determine index-inclusion whose rules are evaluated at the end of the month.
ISSUE_ID
is of type numeric
RATING
is of type character
RATING_TYPE
is of type character
My data is set up as data.table named ratings.dt
and I need to add to it columns for the sequence between a start and an end date. My goal is then to have 3 rows for each issue ID, one for the time series of each rating agencies' respective rating history.
Keys are set for the data.table as ISSUE_ID, RATING_TYPE and RATING_DATE.
The data now looks like the following:
ISSUE_ID RATING_TYPE RATING RATING_DATE VALID_THRU_DATE RATING_DATE_SEQ
123 FR 3.33 2000-01-31 2000-04-31 1
123 FR 4.00 2000-05-31 2000-02-28 2
123 FR 3.66 2001-03-31 2001-04-31 3
123 FR 2.00 2001-05-31 2001-04-30 4
123 FR 2.33 2001-04-30 2003-12-31 5
123 FR 3.00 2004-01-31 2004-06-30 6
123 MR 2.33 1999-04-31 1999-12-31 1
123 MR 2.66 2000-01-31 2000-04-31 2
123 MR 3.00 2001-03-31 2001-04-30 3
123 MR 3.33 2001-05-31 2003-01-31 4
123 MR 3.00 2003-02-28 2003-07-31 5
123 SP 3.33 1999-04-31 2002-03-31 1
123 SP 3.00 2002-04-31 2003-05-31 2
244 ...
Now I want to essentially spread the rating defined in RATING
to be spread across a sequence of dates.
I want to get to sth like this:
ISSUE_ID RATING_TYPE 1999-04-30 1999-05-31 ... 2000-01-31 2000-02-28 ... 2004-06-30
123 FR ... 3.33 2.33 ... 3.00
123 MR 2.33 2.33 ... 2.66 2.66 ...
123 SP 3.33 3.33 ... 3.33 2.66 ...
244 ...
So that I can do:
ISSUE_ID RATING_TYPE 1999-04-30 1999-05-31 ... 2000-01-31 2000-02-28 ... 2004-06-30
123 FR ... 3.33 2.33 ... 3.00
123 MR 2.33 2.33 ... 2.66 2.66 ...
123 SP 3.33 3.33 ... 3.33 2.66 ...
123 Mean 2.83 2.83 ... 3.11 2.55 ...
Then I could compute the rating averages per month per issue ID via data.table syntax like this
ratings.dt[,
lapply(.SD, mean),
.SDcols = x:y, # col indexes of added date sequence columns
by = .(ISSUE_ID)]
Using my mapping table to convert the alphanumeric ratings, such as AAA, B+, C- etc. to a numeric value to allow numeric-based arithmetic calcualations such as mean, I can convert the numeric rating averages back to the alphanumeric ones. That would then mean mission accomplished!
Also, I am not sure anymore at the moment, if this problem could maybe be conceptualized more efficienctly. Would appreciate any pointers!
We convert the wide format with pivot_wider
, do a group by summarise
to create the 'Mean' row by concatenating the other observation with the mean
value. With dplyr
version >=1.0
, summarise
can return more than one row per group
library(dplyr)
library(tidyr)
ratings.dt %>%
select(-VALID_THRU_DATE, -RATING_DATE_SEQ) %>%
pivot_wider(names_from = RATING_DATE, values_from = RATING) %>%
group_by(ISSUE_ID) %>%
summarise(RATING_TYPE = c(RATING_TYPE, "Mean"),
across(where(is.numeric), ~ c(., mean(., na.rm = TRUE))), .groups = 'drop')
-output
# A tibble: 4 x 11
# ISSUE_ID RATING_TYPE `2000-01-31` `2000-05-31` `2001-03-31` `2001-05-31` `2001-04-30` `2004-01-31` `1999-04-31`
# <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 123 FR 3.33 4 3.66 2 2.33 3 NA
#2 123 MR 2.66 NA 3 3.33 NA NA 2.33
#3 123 SP NA NA NA NA NA NA 3.33
#4 123 Mean 3.00 4 3.33 2.66 2.33 3 2.83
# … with 2 more variables: `2003-02-28` <dbl>, `2002-04-31` <dbl>
Or using data.table
library(data.table)
dt1 <- dcast(setDT(ratings.dt), ISSUE_ID + RATING_TYPE ~ RATING_DATE,
value.var = 'RATING')
rbind(dt1, dt1[, c(.(RATING_TYPE = 'Mean'), lapply(.SD, mean, na.rm = TRUE)), .(ISSUE_ID), .SDcols = -(1:2)])
# ISSUE_ID RATING_TYPE 1999-04-31 2000-01-31 2000-05-31 2001-03-31 2001-04-30 2001-05-31 2002-04-31 2003-02-28
#1: 123 FR NA 3.330 4 3.66 2.33 2.000 NA NA
#2: 123 MR 2.33 2.660 NA 3.00 NA 3.330 NA 3
#3: 123 SP 3.33 NA NA NA NA NA 3 NA
#4: 123 Mean 2.83 2.995 4 3.33 2.33 2.665 3 3
# 2004-01-31
#1: 3
#2: NA
#3: NA
#4: 3
ratings.dt <- structure(list(ISSUE_ID = c(123L, 123L, 123L, 123L, 123L, 123L,
123L, 123L, 123L, 123L, 123L, 123L, 123L), RATING_TYPE = c("FR",
"FR", "FR", "FR", "FR", "FR", "MR", "MR", "MR", "MR", "MR", "SP",
"SP"), RATING = c(3.33, 4, 3.66, 2, 2.33, 3, 2.33, 2.66, 3, 3.33,
3, 3.33, 3), RATING_DATE = c("2000-01-31", "2000-05-31", "2001-03-31",
"2001-05-31", "2001-04-30", "2004-01-31", "1999-04-31", "2000-01-31",
"2001-03-31", "2001-05-31", "2003-02-28", "1999-04-31", "2002-04-31"
), VALID_THRU_DATE = c("2000-04-31", "2000-02-28", "2001-04-31",
"2001-04-30", "2003-12-31", "2004-06-30", "1999-12-31", "2000-04-31",
"2001-04-30", "2003-01-31", "2003-07-31", "2002-03-31", "2003-05-31"
), RATING_DATE_SEQ = c(1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L,
5L, 1L, 2L)), class = "data.frame", row.names = c(NA, -13L))