Process time series data for each second in R


I have data that looks like this:

# Data Sample
Time                Price    V1           Time2 V2
2016-06-20 05:09:44 2086.50   1 05:09:44.284670 -1
2016-06-20 05:09:45 2086.75   5 05:09:45.212413  1
2016-06-20 05:09:45 2086.75  10 05:09:45.212413  1
2016-06-20 05:09:45 2086.75   1 05:09:45.212413  1
2016-06-20 05:09:46 2086.75   1 05:09:46.745124  1
2016-06-20 05:09:46 2086.75   1 05:09:46.745124  1
2016-06-20 05:09:46 2086.75   1 05:09:46.819954  1
2016-06-20 05:09:49 2086.75   1 05:09:49.279392  1
2016-06-20 05:09:49 2086.75   1 05:09:49.279392  1
2016-06-20 05:09:49 2086.75   1 05:09:49.352346  1
2016-06-20 05:09:49 2086.50   2 05:09:49.964023 -1
2016-06-20 05:09:49 2086.50   1 05:09:49.964023 -1
2016-06-20 05:09:55 2086.50   1 05:09:55.343324 -1
2016-06-20 05:09:57 2086.75   1 05:09:57.551886  1
2016-06-20 05:09:57 2086.75   1 05:09:57.650549  1
2016-06-20 05:09:57 2086.75   1 05:09:57.654352  1
2016-06-20 05:09:57 2086.75   1 05:09:57.654352  1
2016-06-20 05:09:57 2086.75   1 05:09:57.726578  1

I want to clean the data by summing V1 over all rows within each second, with seconds that have no observations filled in as 0. So my desired output would look like:

# Desired Example
Time                V1    
2016-06-20 05:09:44  1 
2016-06-20 05:09:45 16
2016-06-20 05:09:46  3
2016-06-20 05:09:47  0
2016-06-20 05:09:48  0
2016-06-20 05:09:49  6
2016-06-20 05:09:50  0
2016-06-20 05:09:51  0
2016-06-20 05:09:52  0
2016-06-20 05:09:53  0
2016-06-20 05:09:54  0
2016-06-20 05:09:55  1
2016-06-20 05:09:56  0
2016-06-20 05:09:57  5

Currently I convert the "Time" column to character, split the data by it, and process the pieces in a list. However, the data is very big and this takes too long to compute. Is there a faster way to do this, possibly through some function in zoo?
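
For reference, here is a rough sketch of the kind of split-and-loop approach I mean (not my exact code; it assumes the sample above is in a data frame called df, and it does not yet fill in the missing seconds):

# slow approach: coerce Time to character, split V1 by second, then sum each piece
df$Time <- as.character(df$Time)
pieces  <- split(df$V1, df$Time)               # one list element per distinct second
result  <- data.frame(Time = names(pieces),
                      V1   = sapply(pieces, sum))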

Below is a similar data set using dput:

structure(list(V3 = c(2086.5, 2086.75, 2086.75, 2086.75, 2086.75, 2086.75, 2086.75, 2086.75, 2086.75, 2086.75, 2086.75, 2086.75, 2086.75, 2086.75, 2086.75, 2086.75, 2086.75, 2086.75, 2086.75, 2086.5, 2086.5, 2086.5, 2086.5, 2086.5, 2086.75, 2086.75, 2086.75, 2086.75, 2086.75, 2086.75, 2086.75, 2086.5, 2086.5, 2086.5, 2086.5, 2086.5, 2086.5, 2086.5, 2086.5, 2086.5, 2086.5, 2086.5, 2086.5, 2086.5, 2086.75, 2086.75, 2086.75, 2086.75, 2086.75, 2086.75), V4 = c(1L, 5L, 10L, 1L, 6L, 8L, 1L, 4L, 6L, 2L, 8L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 8L, 1L, 1L, 1L, 4L, 2L, 1L, 1L, 1L, 1L, 1L, 1L), V6 = c("05:09:44.284670", "05:09:45.212413", "05:09:45.212413", "05:09:45.212413", "05:09:45.212413", "05:09:45.299104", "05:09:45.299104", "05:09:45.301513", "05:09:45.301513", "05:09:45.389110", "05:09:45.392840", "05:09:45.475688", "05:09:45.543980", "05:09:46.745124", "05:09:46.745124", "05:09:46.819954", "05:09:49.279392", "05:09:49.279392", "05:09:49.352346", "05:09:49.964023", "05:09:49.964023", "05:09:49.964023", "05:09:49.964023", "05:09:55.343324", "05:09:57.551886", "05:09:57.650549", "05:09:57.654352", "05:09:57.654352", "05:09:57.726578", "05:09:57.728848", "05:09:58.286788", "05:10:00.390708", "05:10:00.473617", "05:10:00.494903", "05:10:00.564042", "05:10:08.24907", "05:10:09.633247", "05:10:09.633247", "05:10:09.633247", "05:10:09.633247", "05:10:09.633247", "05:10:09.633247", "05:10:09.633247", "05:10:09.633247", "05:10:09.830544", "05:10:09.924001", "05:10:09.924001", "05:10:09.924001", "05:10:09.924001", "05:10:09.924001"), V7 = c(-1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, -1L, -1L, -1L, -1L, -1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, 1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("V3", "V4", "V6", "V7"), row.names = c(NA, 50L), class = "data.frame")


Solution

  • data.table is very fast. Try:

    library(data.table)
    library(lubridate)
    
    mydata <- data.table(mydata)
    mydata[, Time := ymd_hms(Time)]  # parse the timestamps as POSIXct
    setkey(mydata, Time)
    
    mydata.summed <- mydata[, .(V1 = sum(V1)), by = Time]  # sums V1 within each second
    
    mydata2 <- data.table(Time = seq(min(mydata$Time), max(mydata$Time), by = 1))
    # a new data.table with one row per second, to fill in the seconds you do not have values for
    
    mydata <- mydata.summed[mydata2]
    # join them on the key (Time); see ?data.table for more information on X[Y] joins
    
    mydata[is.na(V1), V1 := 0]
    # change the NAs that were created by the join to 0
    
    head(mydata, 10)
    
                      Time V1
    1: 2016-06-20 05:09:44  1
    2: 2016-06-20 05:09:45 16
    3: 2016-06-20 05:09:46  3
    4: 2016-06-20 05:09:47  0
    5: 2016-06-20 05:09:48  0
    6: 2016-06-20 05:09:49  6
    7: 2016-06-20 05:09:50  0
    8: 2016-06-20 05:09:51  0
    9: 2016-06-20 05:09:52  0
    10: 2016-06-20 05:09:53  0
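
  • Since you asked about zoo: here is a minimal sketch of the same aggregation with zoo's aggregate() and merge(), assuming your data is in a data frame df with a POSIXct Time column and a numeric V1 column (the data.table approach above will generally be faster on large data):

    library(zoo)
    
    z <- zoo(df$V1, df$Time)
    # a warning about duplicate index entries is expected and harmless here,
    # because the duplicates are collapsed by aggregate() below
    
    per.second <- aggregate(z, as.POSIXct(trunc(index(z), units = "secs")), sum)
    # sum V1 within each second
    
    grid <- zoo(, seq(start(per.second), end(per.second), by = 1))
    # an empty zoo series with one index entry per second
    
    filled <- merge(per.second, grid, fill = 0)
    # seconds with no observations become 0
    
    data.frame(Time = index(filled), V1 = coredata(filled))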