R - Expanding a value between a sequence of dates and add as columns to data.table

UPDATE: The solution proposed by akrun works, but the value defined in value.var = RATING is only carried over to the column of the corresponding RATING_DATE. The remaining months of each rating's validity period, i.e. the months between RATING_DATE and VALID_THRU_DATE, are not filled.

What I have tried so far without success: instead of defining the dcast operation like this

# Reshape long -> wide: one row per issue/agency, one column per RATING_DATE,
# filled with the RATING published on that date.
setDT(ratings.dt)
dt1 <- dcast(
  ratings.dt,
  formula = ISSUE_ID + RATING_TYPE ~ RATING_DATE,
  value.var = "RATING"
)

I did try

  # Attempt 1: the right-hand side of the formula is the difference between
  # the two date columns, so the wide columns are keyed by that difference
  # rather than by calendar month.
  dt1 <- dcast(setDT(ratings.dt), 
                 ISSUE_ID + RATING_TYPE ~ (VALID_THRU_DATE - RATING_DATE), 
                 value.var = 'RATING')      


 # Attempt 2: seq() is handed the two whole columns as its from/to arguments.
 # NOTE(review): base seq() expects length-one from/to, so this cannot expand
 # each row into its own run of months -- confirm.
 # as.yearmon() is presumably zoo::as.yearmon -- verify.
 dt1 <- dcast(setDT(ratings.dt), 
             ISSUE_ID + RATING_TYPE ~ as.yearmon(seq(
             RATING_DATE, VALID_THRU_DATE), frac = 1), 
             value.var = 'RATING')


# Attempt 3: `:` is applied to the two whole columns.
# NOTE(review): `:` works on single values, not row by row, so this does not
# build one month range per rating -- confirm.
dt1 <- dcast(setDT(ratings.dt), 
                 ISSUE_ID + RATING_TYPE ~ (RATING_DATE:VALID_THRU_DATE), 
                 value.var = 'RATING')

I thought that I could simply use the two columns that define each rating's validity period in the dcast() call, since both are date columns, but the logic behind this task is evidently more complicated to conceptualize.

I have now implemented this task manually: I first build a "skeleton data.table", then loop row-wise through the original ratings data.table (in long format) and spread each rating across the skeleton columns between its two dates. (I renamed RATING to RATING_NUM to distinguish it from the "raw" alphanumeric rating.)

# (0) Keep a single row per issue / agency / rating date: the last row of
#     each group in the table's current row order.
ratings_num.dt <- ratings_num.dt[
  ,
  tail(.SD, 1L),
  by = c("ISSUE_ID", "RATING_TYPE", "RATING_DATE")
]

# (1) First and last day covered by the rating time series
start_date    <- as.Date("1990-01-01", format = "%Y-%m-%d")
end_date      <- as.Date("2021-01-31", format = "%Y-%m-%d")

# (2) One entry per month between start_date and end_date; these become the
#     date columns of the skeleton data.table
new_cols      <- seq(from = start_date, to = end_date, by = "month")
# date_ymd_to_m_end() is defined elsewhere; presumably it maps every date to
# the last day of its month -- TODO confirm.
new_cols      <- date_ymd_to_m_end(new_cols)
# format() is the documented way to render dates with a format string;
# as.character(x, "<format>") only works while the method forwards `...`.
new_col_names <- format(new_cols, "%Y-%m-%d")

# (3) Determine how many months the rating time series spans
#     (elapsed_months_lubri() is a helper defined elsewhere; the + 1 makes
#     the count include both the first and the last month)
N_months <- elapsed_months_lubri(start_date, end_date) + 1

# seq_len() instead of 1:N so a zero count gives an empty vector, not c(1, 0)
MONTH_ID <- seq_len(N_months)

# (4) Define the layout of the new skeleton table
# Note: The new table contains 3 rows per issue ID, namely the rating time
# series of each issue ID for every considered rating agency

rating_type.vec <- c("FR", "MR", "SPR")

# Every issue ID is repeated once per agency; data.frame() recycles the
# agency vector down the rows.
df_skeleton <- data.frame(rep(issue_IDs.vec, each = 3), rating_type.vec)

# Value every month cell starts with before ratings are spread into it
someInitialValue <- 0

# Credit to Jonas
# One pre-filled column per month. The row count must come from df_skeleton:
# the previous `NROW(df)` referred to an object that is never defined here.
to_Add <- setNames(
  data.frame(matrix(
    someInitialValue,
    nrow = nrow(df_skeleton),
    ncol = length(new_col_names)
  )),
  new_col_names
)

ratings_num_ts.df <- cbind(df_skeleton, to_Add)
# setDT() converts by reference, so both names refer to the same table
ratings_num_ts.dt <- setDT(ratings_num_ts.df)

# Rename by position: the first two columns come from df_skeleton and carry
# names auto-generated by data.frame(), which are brittle to spell out.
setnames(ratings_num_ts.dt,
         old = 1:2,
         new = c("ISSUE_ID", "RATING_TYPE"))

# (5) Month lookup table (date -> MONTH_ID). It is joined onto ratings_num.dt
#     twice so that both ends of each rating period get a month ID.
seq_dates.dt <- data.frame(new_cols, MONTH_ID)
setDT(seq_dates.dt)
setnames(seq_dates.dt, old = "new_cols", new = "RATING_DATE")

# First pass: match on the rating's own date
ratings_num.dt <- ratings_num.dt[seq_dates.dt, on = "RATING_DATE"]

# Second pass: match the lookup's RATING_DATE against RATING_VAL_THRU. The
# lookup's MONTH_ID arrives as i.MONTH_ID because MONTH_ID already exists.
ratings_num.dt <- ratings_num.dt[
  seq_dates.dt,
  on = c(RATING_VAL_THRU = "RATING_DATE")
]

# (6) Lookup months without a matching RATING_DATE or RATING_VAL_THRU entry
#     come out of the joins with NA in the rating columns; drop those rows
ratings_num.dt <- ratings_num.dt[!is.na(ISSUE_ID), ]

# (7) Give the two month-ID columns descriptive names
setnames(
  ratings_num.dt,
  old = c("MONTH_ID", "i.MONTH_ID"),
  new = c("MONTH_ID_START", "MONTH_ID_END")
)

# (8) Sort the table by setting keys
setkeyv(ratings_num.dt, c("ISSUE_ID", "RATING_TYPE", "RATING_DATE"))

# (9) Defining logic as loop
tic()

i <- 1
j <- nrow(ratings_num.dt)

# Pull the needed columns out once as plain vectors
id.vec             <- ratings_num.dt[["ISSUE_ID"]]
rating_type.vec    <- ratings_num.dt[["RATING_TYPE"]]
# + 2 turns a month ID into a column position: columns 1 and 2 of the
# time-series table are ISSUE_ID and RATING_TYPE, not ratings
month_ID_start.vec <- ratings_num.dt[["MONTH_ID_START"]] + 2
month_ID_end.vec   <- ratings_num.dt[["MONTH_ID_END"]] + 2
rating_num.vec     <- ratings_num.dt[["RATING_NUM"]]

total <- j
# Keep the format on one line: a line break inside the string literal would
# put a newline plus the indentation into the rendered progress bar.
pb <- progress_bar$new(
  format = "[:bar] :current/:total (:percent) eta: :eta",
  total = total
)

  
#' Spread each rating over the month columns it is valid for
#'
#' For every row of `dt_source`, the rating is written into the columns of
#' `dt_ts` running from the row's start month to its end month, in the rows
#' of `dt_ts` whose ISSUE_ID and RATING_TYPE match. `dt_ts` is modified by
#' reference. Rows are processed in order, so where month ranges overlap the
#' later row wins.
#'
#' @param dt_source data.table with the columns ISSUE_ID, RATING_TYPE,
#'   MONTH_ID_START, MONTH_ID_END and RATING_NUM.
#' @param dt_ts data.table whose first two columns are ISSUE_ID and
#'   RATING_TYPE, followed by one column per month.
#' @return `dt_ts`, invisibly.
spread_ratings_to_ts <- function(dt_source, dt_ts) {
  n_rows <- nrow(dt_source)
  if (n_rows == 0L) {
    return(invisible(dt_ts))
  }

  # Month IDs become column positions by skipping the two ID columns
  id_offset <- 2L
  src_id    <- dt_source[["ISSUE_ID"]]
  src_type  <- dt_source[["RATING_TYPE"]]
  col_from  <- dt_source[["MONTH_ID_START"]] + id_offset
  col_to    <- dt_source[["MONTH_ID_END"]] + id_offset
  rating    <- dt_source[["RATING_NUM"]]

  # The ID columns are never written to, so they can be read once up front
  ts_id   <- dt_ts[["ISSUE_ID"]]
  ts_type <- dt_ts[["RATING_TYPE"]]

  # Advance the bar in steps of `tick_every` rows so that the ticks add up to
  # the bar's total and it actually reaches 100 %
  tick_every <- 50L
  progress <- progress_bar$new(
    format = "[:bar] :current/:total (:percent) eta: :eta",
    total = n_rows
  )
  progress$tick(0)

  for (k in seq_len(n_rows)) {
    target_rows <- which(ts_id == src_id[k] & ts_type == src_type[k])
    if (length(target_rows) > 0L) {
      # set() skips the per-call overhead of `[.data.table`, which dominates
      # the runtime when it is invoked once per source row
      set(dt_ts,
          i = target_rows,
          j = col_from[k]:col_to[k],
          value = rating[k])
    }

    if (k %% tick_every == 0L) {
      progress$tick(tick_every)
    }
  }

  leftover <- n_rows %% tick_every
  if (leftover > 0L) {
    progress$tick(leftover)
  }

  invisible(dt_ts)
}

# Fill the skeleton table, then stop the timer started with tic()
spread_ratings_to_ts(ratings_num.dt, ratings_num_ts.dt)

toc()
## ~ 3,600 sec for ~ 250k rows to loop through ##


# (10) Compute rating means
# Substitute all pre-filled zeros in the table with NA as there is simply no
# rating available at this point in time.
# Only the month columns are touched, by reference. dplyr::na_if() on the
# whole table would also compare the ID columns, and dplyr >= 1.1 rejects a
# data-frame `x` combined with a scalar `y`.
month_cols <- setdiff(names(ratings_num_ts.dt), c("ISSUE_ID", "RATING_TYPE"))
for (month_col in month_cols) {
  zero_rows <- which(ratings_num_ts.dt[[month_col]] == 0)
  if (length(zero_rows) > 0L) {
    set(ratings_num_ts.dt, i = zero_rows, j = month_col, value = NA_real_)
  }
}

# Append one "Mean" row per issue: the column-wise average over the agency
# rows, ignoring months in which an agency has no rating
ratings_num_ts.dt <- rbind(
  ratings_num_ts.dt,
  ratings_num_ts.dt[
    ,
    c(list(RATING_TYPE = "Mean"), lapply(.SD, mean, na.rm = TRUE)),
    by = .(ISSUE_ID),
    .SDcols = month_cols
  ]
)

setkey(ratings_num_ts.dt, ISSUE_ID, RATING_TYPE)

I tried parallelizing this loop using foreach(...) %dopar% function(...), as shown below, but it is not working yet. This is mainly motivated by the runtime of the very inefficient loop above, even though that loop works fine and accomplishes what I want. In the foreach call, I am particularly unsure how to write a suitable combine function that assembles the results as desired.

# Parallel variant of step (9).
#
# PSOCK workers are separate R processes: each works on its own copy of what
# it is sent, so by-reference updates made on a worker never reach this
# session. The work is therefore split by ISSUE_ID; every worker fills the
# skeleton rows of its own issues and returns them, and the pieces are
# stacked back together.

#' Fill a copy of `dt_ts` with the ratings in `dt_source`
#'
#' For every row of `dt_source`, the rating is written into the month columns
#' from the row's start month to its end month, in the rows with matching
#' ISSUE_ID and RATING_TYPE. Rows are processed in order, so where month
#' ranges overlap the later row wins.
#'
#' @param dt_source data.table with the columns ISSUE_ID, RATING_TYPE,
#'   MONTH_ID_START, MONTH_ID_END and RATING_NUM.
#' @param dt_ts data.table whose first two columns are ISSUE_ID and
#'   RATING_TYPE, followed by one column per month. It is not modified.
#' @return The filled copy of `dt_ts`.
spread_ratings_to_ts <- function(dt_source, dt_ts) {
  # copy() also gives the worker a properly allocated table to assign into
  filled <- copy(dt_ts)

  # col 1+2 not rating but ISSUE_ID and RATING_TYPE
  id_offset <- 2L
  src_id    <- dt_source[["ISSUE_ID"]]
  src_type  <- dt_source[["RATING_TYPE"]]
  col_from  <- dt_source[["MONTH_ID_START"]] + id_offset
  col_to    <- dt_source[["MONTH_ID_END"]] + id_offset
  rating    <- dt_source[["RATING_NUM"]]

  ts_id   <- filled[["ISSUE_ID"]]
  ts_type <- filled[["RATING_TYPE"]]

  for (k in seq_len(nrow(dt_source))) {
    target_rows <- which(ts_id == src_id[k] & ts_type == src_type[k])
    if (length(target_rows) > 0L) {
      set(filled,
          i = target_rows,
          j = col_from[k]:col_to[k],
          value = rating[k])
    }
  }

  filled
}

# Half the cores minus one, but never fewer than one worker (detectCores()
# may return NA, and small machines would otherwise end up with zero)
n_cores   <- detectCores()
n_workers <- if (is.na(n_cores)) 1L else max(1L, as.integer(n_cores %/% 2L) - 1L)

# Assign every issue to one chunk so that all rows of an issue, in both
# tables, are handled by the same worker
issue_ids      <- unique(ratings_num_ts.dt[["ISSUE_ID"]])
chunk_of_issue <- (seq_along(issue_ids) - 1L) %% n_workers + 1L
ts_chunk       <- chunk_of_issue[match(ratings_num_ts.dt[["ISSUE_ID"]], issue_ids)]
src_chunk      <- chunk_of_issue[match(ratings_num.dt[["ISSUE_ID"]], issue_ids)]
chunk_ids      <- sort(unique(ts_chunk))

# Pre-split here so that each worker is only sent its own piece
ts_parts <- lapply(chunk_ids, function(chunk) {
  ratings_num_ts.dt[which(ts_chunk == chunk)]
})
src_parts <- lapply(chunk_ids, function(chunk) {
  ratings_num.dt[which(src_chunk == chunk)]
})

myCluster <- makeCluster(n_workers, type = "PSOCK")
registerDoParallel(myCluster)

# The filled pieces replace the skeleton table. Rows come back grouped by
# chunk rather than in their previous order; re-sort if the order matters.
ratings_num_ts.dt <- tryCatch(
  {
    # NOTE(review): setMKLthreads() is not base R; presumably provided by the
    # R distribution in use -- confirm it is available on the workers.
    clusterEvalQ(cl = myCluster, {
      setMKLthreads(1)
    })

    foreach(
      src_part = src_parts,
      ts_part = ts_parts,
      .combine = function(...) rbindlist(list(...)),
      .multicombine = TRUE,
      .packages = "data.table"
    ) %dopar%
      spread_ratings_to_ts(dt_source = src_part, dt_ts = ts_part)
  },
  # Shut the workers down even if one of the tasks fails
  finally = stopCluster(myCluster)
)

Background / Data: In theory this is quite easy, and even a 3-year-old could do the task manually, but after nearly a week of tackling this problem I am no closer to a solution.

The problem: I am working with a large financial data set. It contains bond issues identified by an ISSUE_ID and its corresponding RATING that is provided by the 3 rating agencies Fitch, Moody's and S&P defined as RATING_TYPE. I identified for each rating a published date and a valid-thru date defined as RATING_DATE and VALID_THRU_DATE, both of type DATE. All dates are formatted by yearmonth() as last day in a given month, as their rating is used to determine index-inclusion whose rules are evaluated at the end of the month.

ISSUE_ID is of type numeric

RATING is of type character

RATING_TYPE is of type character

My data is set up as a data.table named ratings.dt, and I need to add columns to it for the sequence of months between a start and an end date. My goal is to have 3 rows for each issue ID, one for the time series of each rating agency's rating history.

Keys are set for the data.table as ISSUE_ID, RATING_TYPE and RATING_DATE.

The data now looks like the following:

ISSUE_ID  RATING_TYPE  RATING   RATING_DATE   VALID_THRU_DATE RATING_DATE_SEQ
  123       FR           3.33   2000-01-31    2000-04-31             1
  123       FR           4.00   2000-05-31    2000-02-28             2
  123       FR           3.66   2001-03-31    2001-04-31             3
  123       FR           2.00   2001-05-31    2001-04-30             4
  123       FR           2.33   2001-04-30    2003-12-31             5
  123       FR           3.00   2004-01-31    2004-06-30             6
  123       MR           2.33   1999-04-31    1999-12-31             1
  123       MR           2.66   2000-01-31    2000-04-31             2
  123       MR           3.00   2001-03-31    2001-04-30             3
  123       MR           3.33   2001-05-31    2003-01-31             4
  123       MR           3.00   2003-02-28    2003-07-31             5
  123       SP           3.33   1999-04-31    2002-03-31             1
  123       SP           3.00   2002-04-31    2003-05-31             2 
  244       ...

Now I essentially want to spread the rating defined in RATING across a sequence of dates. I want to get to something like this:

 ISSUE_ID  RATING_TYPE   1999-04-30  1999-05-31  ...   2000-01-31  2000-02-28    ...  2004-06-30 
   123        FR                                 ...      3.33         2.33      ...     3.00
   123        MR            2.33         2.33    ...      2.66         2.66      ...
   123        SP            3.33         3.33    ...      3.33         2.66      ...
   244       ...

So that I can do:

 ISSUE_ID  RATING_TYPE   1999-04-30  1999-05-31  ...   2000-01-31  2000-02-28    ...  2004-06-30 
   123       FR                                  ...      3.33         2.33      ...     3.00
   123       MR            2.33         2.33     ...      2.66         2.66      ...
   123       SP            3.33         3.33     ...      3.33         2.66      ...
   123      Mean           2.83         2.83     ...      3.11         2.55      ...

Then I could compute the rating averages per month per issue ID via data.table syntax like this

# Monthly mean rating per issue across the agency rows. na.rm = TRUE skips
# agencies without a rating in a month; otherwise a single missing agency
# would turn that month's mean into NA.
ratings.dt[, 
           lapply(.SD, mean, na.rm = TRUE),
           .SDcols = x:y,       # col indexes of added date sequence columns
           by = .(ISSUE_ID)]

I use a mapping table to convert the alphanumeric ratings, such as AAA, B+, C- etc., to numeric values so that arithmetic such as the mean can be calculated. With the same table I can convert the numeric rating averages back to alphanumeric ones. That would then mean mission accomplished!

Also, I am no longer sure whether this problem could be conceptualized more efficiently. I would appreciate any pointers!

Solution

We reshape to wide format with pivot_wider, then group by ISSUE_ID and use summarise to append a 'Mean' row by concatenating the existing observations with their mean. With dplyr version >= 1.0, summarise can return more than one row per group.

library(dplyr)
library(tidyr)
# Spread the ratings to one column per RATING_DATE, then append a "Mean" row
# per issue holding the column-wise average of the agencies' ratings.
ratings.dt %>%
  select(!c(VALID_THRU_DATE, RATING_DATE_SEQ)) %>%
  pivot_wider(names_from = RATING_DATE, values_from = RATING) %>%
  group_by(ISSUE_ID) %>%
  summarise(
    RATING_TYPE = c(RATING_TYPE, "Mean"),
    across(
      where(is.numeric),
      function(rating) c(rating, mean(rating, na.rm = TRUE))
    ),
    .groups = "drop"
  )

-output

# A tibble: 4 x 11
#  ISSUE_ID RATING_TYPE `2000-01-31` `2000-05-31` `2001-03-31` `2001-05-31` `2001-04-30` `2004-01-31` `1999-04-31`
#     <int> <chr>              <dbl>        <dbl>        <dbl>        <dbl>        <dbl>        <dbl>        <dbl>
#1      123 FR                  3.33            4         3.66         2            2.33            3        NA   
#2      123 MR                  2.66           NA         3            3.33        NA              NA         2.33
#3      123 SP                 NA              NA        NA           NA           NA              NA         3.33
#4      123 Mean                3.00            4         3.33         2.66         2.33            3         2.83
# … with 2 more variables: `2003-02-28` <dbl>, `2002-04-31` <dbl>

Or using data.table

library(data.table)
# Wide table: one row per issue/agency, one column per RATING_DATE
setDT(ratings.dt)
dt1 <- dcast(
  ratings.dt,
  formula = ISSUE_ID + RATING_TYPE ~ RATING_DATE,
  value.var = "RATING"
)
# Append one "Mean" row per issue: the column-wise average over the agency
# rows (all columns except the first two, which are the ID columns)
rbind(
  dt1,
  dt1[
    ,
    c(list(RATING_TYPE = "Mean"), lapply(.SD, mean, na.rm = TRUE)),
    by = "ISSUE_ID",
    .SDcols = -(1:2)
  ]
)
#   ISSUE_ID RATING_TYPE 1999-04-31 2000-01-31 2000-05-31 2001-03-31 2001-04-30 2001-05-31 2002-04-31 2003-02-28
#1:      123          FR         NA      3.330          4       3.66       2.33      2.000         NA         NA
#2:      123          MR       2.33      2.660         NA       3.00         NA      3.330         NA          3
#3:      123          SP       3.33         NA         NA         NA         NA         NA          3         NA
#4:      123        Mean       2.83      2.995          4       3.33       2.33      2.665          3          3
#   2004-01-31
#1:          3
#2:         NA
#3:         NA
#4:          3

data

# Example data: the rating history of issue 123 from three agencies
# (FR, MR, SP), one row per published rating, with dates kept as strings.
ratings.dt <- data.frame(
  ISSUE_ID = rep(123L, 13L),
  RATING_TYPE = rep(c("FR", "MR", "SP"), times = c(6L, 5L, 2L)),
  RATING = c(
    3.33, 4, 3.66, 2, 2.33, 3,
    2.33, 2.66, 3, 3.33, 3,
    3.33, 3
  ),
  RATING_DATE = c(
    "2000-01-31", "2000-05-31", "2001-03-31",
    "2001-05-31", "2001-04-30", "2004-01-31",
    "1999-04-31", "2000-01-31", "2001-03-31", "2001-05-31", "2003-02-28",
    "1999-04-31", "2002-04-31"
  ),
  VALID_THRU_DATE = c(
    "2000-04-31", "2000-02-28", "2001-04-31",
    "2001-04-30", "2003-12-31", "2004-06-30",
    "1999-12-31", "2000-04-31", "2001-04-30", "2003-01-31", "2003-07-31",
    "2002-03-31", "2003-05-31"
  ),
  RATING_DATE_SEQ = c(1:6, 1:5, 1:2),
  stringsAsFactors = FALSE
)