Please bear with me as this is my first question here. I'm still trying to figure out how to post the data and the code that I already have, so for now I will just try to explain. If this is not the acceptable way of asking a question, please ignore the question and next time I will try to do it right.
I have a data frame that I want to do daily calculations on. For a specific day I already have OpenUnits, BuyUnits, SellUnits, CloseUnits and Interest.These values were calculate by another system. I need to proportion the daily interest base on the number of units sold. I can do the calculations, but I cannot figure out how to get the OpenInterest (previous day's close), without using a for loop on the data frame. The ClosingInterest should be OpenInterest + Interest - SellUnits/OpenUnits * OpenInterest
I tried using mutate(OpenInterest = lag(ClosingInterest), ClosingInterest = OpenInterest + Interest - SellUnits/OpenUnits * OpenInterest), but that dosn't seem to work.
I have the code working with a for loop, but I was hoping that there might be a better, and faster way of doing it.
Regards
library(tidyverse)
library(tibbletime)
library(lubridate)
sample <- list(OpenUnits = c(7500000, 7500000, 7500000, 7500000, 7500000,
3300000, 3300000, 3300000, 3300000, 3300000), ClosingUnits = c(7500000,
7500000, 7500000, 7500000, 3300000, 3300000, 3300000, 3300000,
3300000, 3300000), AccrualDate = 16892:16901, AiaAdjustAmt = c(1844.70359677349,
1845.18465061665, 1845.66582990696, 1846.14713467713, 812.516568582349,
812.728453146696, 812.940392965385, 813.152388052826, 813.364438423431,
813.576544091616), SellUnits = c(NA, NA, NA, NA, 4200000, NA,
NA, NA, NA, NA))
sample <- sample %>%
as_tibble() %>%
mutate(
AccrualDate = lubridate::as_date(AccrualDate),
SellUnits = if_else(is.na(SellUnits), 0, SellUnits)
) %>%
as_tbl_time(index = AccrualDate)
sample <- sample %>%
mutate(
RealInterest = 0,
OpenInterest = cumsum(AiaAdjustAmt) - cumsum(RealInterest) - AiaAdjustAmt - RealInterest,
RealInterest = OpenInterest*SellUnits/OpenUnits
)
This does not produce the correct answer.
# A time tibble: 10 x 7
# Index: AccrualDate
OpenUnits ClosingUnits AccrualDate AiaAdjustAmt SellUnits s24j_real s24j_open
<dbl> <dbl> <date> <dbl> <dbl> <dbl> <dbl>
1 7500000. 7500000. 2016-04-01 1845. 0. 0. 0.
2 7500000. 7500000. 2016-04-02 1845. 0. 0. 1845.
3 7500000. 7500000. 2016-04-03 1846. 0. 0. 3690.
4 7500000. 7500000. 2016-04-04 1846. 0. 0. 5536.
5 7500000. 3300000. 2016-04-05 813. 4200000. 4134. 7382.
6 3300000. 3300000. 2016-04-06 813. 0. 0. 8194.
7 3300000. 3300000. 2016-04-07 813. 0. 0. 9007.
8 3300000. 3300000. 2016-04-08 813. 0. 0. 9820.
9 3300000. 3300000. 2016-04-09 813. 0. 0. 10633.
10 3300000. 3300000. 2016-04-10 814. 0. 0. 11446.
The correct answer should look like this. This I achieved with a for loop, which I'm trying to avoid because it feels slow on the bigger data set that's also nested.
# A time tibble: 10 x 7
# Index: AccrualDate
OpenUnits ClosingUnits AccrualDate AiaAdjustAmt SellUnits s24j_real s24j_open
<dbl> <dbl> <date> <dbl> <dbl> <dbl> <dbl>
1 7500000. 7500000. 2016-04-01 1845. 0. 0. 0.
2 7500000. 7500000. 2016-04-02 1845. 0. 0. 1845.
3 7500000. 7500000. 2016-04-03 1846. 0. 0. 3690.
4 7500000. 7500000. 2016-04-04 1846. 0. 0. 5536.
5 7500000. 3300000. 2016-04-05 813. 4200000. 4134. 7382.
6 3300000. 3300000. 2016-04-06 813. 0. 0. 4060.
7 3300000. 3300000. 2016-04-07 813. 0. 0. 4873.
8 3300000. 3300000. 2016-04-08 813. 0. 0. 5686.
9 3300000. 3300000. 2016-04-09 813. 0. 0. 6499.
10 3300000. 3300000. 2016-04-10 814. 0. 0. 7313.
Code to produce the correct answer.
sample2 <- sample %>%
mutate(
sell_ratio = if_else(!is.na(SellUnits), SellUnits/OpenUnits, 0),
s24j_open = 0,
s24j_close = 0,
s24j_real = 0
)
open <- 0
close <- 0
for (i in seq_along(sample2$AccrualDate)) {
open <- close
sellratio <- sample2[i, ]$sell_ratio
int <- sample2[i, ]$AiaAdjustAmt
real <- sellratio*open
close <- open - real + int
sample2[i, ]$s24j_open <- open
sample2[i, ]$s24j_real <- real
sample2[i, ]$s24j_close <- close
}
sample2 %>%
select(
OpenUnits, ClosingUnits, AccrualDate, AiaAdjustAmt, SellUnits, s24j_real, s24j_open
)
Better late than never:
## defining the data frame
sample <- data.frame(OpenUnits = c(7500000, 7500000, 7500000, 7500000, 7500000,
3300000, 3300000, 3300000, 3300000, 3300000),
ClosingUnits = c(7500000, 7500000, 7500000, 7500000, 3300000, 3300000, 3300000, 3300000, 3300000, 3300000),
AccrualDate = 16892:16901,
AiaAdjustAmt = c(1844.70359677349, 1845.18465061665, 1845.66582990696, 1846.14713467713, 812.516568582349,
812.728453146696, 812.940392965385, 813.152388052826, 813.364438423431, 813.576544091616),
SellUnits = c(NA, NA, NA, NA, 4200000, NA, NA, NA, NA, NA))
## defining a function to deliver the final output
## warning: the function is recursive
myfct <- function(n){
ratio <- (sample$SellUnits/sample$OpenUnits)[n]
ratio <- ifelse(is.na(ratio), 0, ratio)
if(n > 1){
vec <- myfct(n-1)
val <- vec[length(vec)]
newval <- sample$AiaAdjustAmt[n]+val*(1-ratio)
newvec <- c(vec, newval)
return(newvec)
}
if(n == 1){return(sample$AiaAdjustAmt[n]+0*(1-ratio))}
}
## finally applying the function and binding the output
## depending on dim(sample)[1], here one might have to add something like:
## options(expressions=10000) (to avoid "Error: evaluation nested too deeply...")
close <- myfct(dim(sample)[1])
open <- c(0, close[1:(length(close)-1)])
real <- open*ifelse(is.na(sample$SellUnits/sample$OpenUnits),
0,
sample$SellUnits/sample$OpenUnits)
output <- cbind.data.frame(sample, real, open)
Here is the result:
> output
OpenUnits ClosingUnits AccrualDate AiaAdjustAmt SellUnits real open
1 7500000 7500000 16892 1844.7036 NA 0.000 0.000
2 7500000 7500000 16893 1845.1847 NA 0.000 1844.704
3 7500000 7500000 16894 1845.6658 NA 0.000 3689.888
4 7500000 7500000 16895 1846.1471 NA 0.000 5535.554
5 7500000 3300000 16896 812.5166 4200000 4133.753 7381.701
6 3300000 3300000 16897 812.7285 NA 0.000 4060.465
7 3300000 3300000 16898 812.9404 NA 0.000 4873.194
8 3300000 3300000 16899 813.1524 NA 0.000 5686.134
9 3300000 3300000 16900 813.3644 NA 0.000 6499.286
10 3300000 3300000 16901 813.5765 NA 0.000 7312.651
However, recursive functions do not improve performance, quite the opposite (see this post). Although in the above code myfct
is only applied to calculate close
(real
and open
are derived from it). At any rate, I believe the code could be modified in order to lose the recursiveness and/or sapply
- I will give that a try and update the code then.
Edit
The problem with the first version of my code was that using both a recursive function and sapply makes the process very lengthy. Indeed, myfct(n)
already calulates myfct(k)
for all k<n
, thus using sapply
to calculate these values was redundant and inefficient.
Here I left the old function
and sapply
for completeness (new function
already edited in the above code):
myfct.old <- function(n){
ratio <- (sample$SellUnits/sample$OpenUnits)[n]
ratio <- ifelse(is.na(ratio), 0, ratio)
if(n > 1){return(sample$AiaAdjustAmt[n]+myfct.old(n-1)*(1-ratio))}
if(n == 1){return(sample$AiaAdjustAmt[n]+0*(1-ratio))}
}
## finally applying the function and binding the output
close <- sapply(1:dim(sample)[1], myfct)
Finally, here is a performance comparison of the different methods:
## Increasing the size of the data frame
sample <- do.call("rbind", replicate(100, sample, simplify = FALSE))
## (1) New method
start.time <- Sys.time()
close <- myfct(dim(sample)[1])
open <- c(0, close[1:(length(close)-1)])
real <- open*ifelse(is.na(sample$SellUnits/sample$OpenUnits),
0,
sample$SellUnits/sample$OpenUnits)
output <- cbind.data.frame(sample, real, open)
end.time <- Sys.time()
time.taken1 <- end.time - start.time
time.taken1
"Time difference of 0.19402 secs"
## (2) A loop
start.time <- Sys.time()
close <- 0
s24j_open <- c()
s24j_real <- c()
s24j_close <- c()
for(k in 1:dim(sample)[1]){
open <- close
ratio <- (sample$SellUnits/sample$OpenUnits)[k]
ratio <- ifelse(is.na(ratio), 0, ratio)
real <- open*ratio
close <- sample$AiaAdjustAmt[k]+open-real
s24j_open <- c(s24j_open, open)
s24j_real <- c(s24j_real, real)
s24j_close <- c(s24j_close, close)
}
output <- cbind.data.frame(sample, s24j_real, s24j_open)
end.time <- Sys.time()
time.taken2 <- end.time - start.time
time.taken2
"Time difference of 0.3530352 secs"
## (3) Old method
start.time <- Sys.time()
close <- sapply(1:dim(sample)[1], myfct.old)
open <- c(0, close[1:(length(close)-1)])
real <- open*ifelse(is.na(sample$SellUnits/sample$OpenUnits),
0,
sample$SellUnits/sample$OpenUnits)
output <- cbind.data.frame(sample, real, open)
end.time <- Sys.time()
time.taken3 <- end.time - start.time
time.taken3
"Time difference of 48.86089 secs"