I have the following data frames:
a <- structure(list(half.fac = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("2006-08-29 05:00:00",
"2006-08-29 05:30:00", "2006-08-29 06:00:00", "2006-08-29 06:30:00",
"2006-08-29 07:00:00", "2006-08-29 07:30:00", "2006-08-29 08:00:00",
"2006-08-29 08:30:00", "2006-08-29 09:00:00", "2006-08-29 09:30:00",
"2006-08-29 10:00:00", "2006-08-29 10:30:00", "2006-08-29 11:00:00",
"2006-08-29 11:30:00", "2006-08-29 12:00:00", "2006-08-29 12:30:00",
"2006-08-29 13:00:00", "2006-08-29 13:30:00", "2006-08-29 14:00:00",
"2006-08-29 14:30:00", "2006-08-29 15:00:00", "2006-08-29 15:30:00",
"2006-08-29 16:00:00", "2006-08-29 16:30:00", "2006-08-29 17:00:00",
"2006-08-29 17:30:00", "2006-08-29 18:00:00", "2006-08-29 18:30:00",
"2006-08-29 19:00:00", "2006-08-29 19:30:00", "2006-08-29 20:00:00",
"2006-08-29 20:30:00", "2006-08-29 21:00:00", "2006-08-29 21:30:00",
"2006-08-29 22:00:00", "2006-08-29 22:30:00", "2006-08-29 23:00:00",
"2006-08-29 23:30:00", "2006-08-30 00:00:00", "2006-08-30 00:30:00",
"2006-08-30 01:00:00", "2006-08-30 01:30:00", "2006-08-30 02:00:00",
"2006-08-30 02:30:00", "2006-08-30 03:00:00", "2006-08-30 03:30:00",
"2006-08-30 04:00:00", "2006-08-30 04:30:00", "2006-08-30 05:00:00"
), class = "factor"), Ux = c(0.354, 0.38225, 0.48725, 0.43425,
0.3825, 0.406, 0.45125, 0.5975, 0.62525, 0.587, 0.57925, 0.57125,
0.52725, 0.4245, 0.37775, 0.4355, 0.4585, 0.3125, 0.4075, 0.17275
), Ts = c(16.0024, 16.2281, 16.2468, 16.0788, 16.0465, 15.8956,
15.8922, 15.9074, 15.7582, 15.7938, 15.8565, 15.6904, 15.6073,
15.6632, 15.6344, 15.6531, 15.7006, 15.6989, 15.6938, 15.3022
)), .Names = c("half.fac", "Ux", "Ts"), row.names = c(NA, 20L
), class = "data.frame")
and
b <- structure(list(half.fac = structure(1:49, .Label = c("2006-08-29 05:00:00",
"2006-08-29 05:30:00", "2006-08-29 06:00:00", "2006-08-29 06:30:00",
"2006-08-29 07:00:00", "2006-08-29 07:30:00", "2006-08-29 08:00:00",
"2006-08-29 08:30:00", "2006-08-29 09:00:00", "2006-08-29 09:30:00",
"2006-08-29 10:00:00", "2006-08-29 10:30:00", "2006-08-29 11:00:00",
"2006-08-29 11:30:00", "2006-08-29 12:00:00", "2006-08-29 12:30:00",
"2006-08-29 13:00:00", "2006-08-29 13:30:00", "2006-08-29 14:00:00",
"2006-08-29 14:30:00", "2006-08-29 15:00:00", "2006-08-29 15:30:00",
"2006-08-29 16:00:00", "2006-08-29 16:30:00", "2006-08-29 17:00:00",
"2006-08-29 17:30:00", "2006-08-29 18:00:00", "2006-08-29 18:30:00",
"2006-08-29 19:00:00", "2006-08-29 19:30:00", "2006-08-29 20:00:00",
"2006-08-29 20:30:00", "2006-08-29 21:00:00", "2006-08-29 21:30:00",
"2006-08-29 22:00:00", "2006-08-29 22:30:00", "2006-08-29 23:00:00",
"2006-08-29 23:30:00", "2006-08-30 00:00:00", "2006-08-30 00:30:00",
"2006-08-30 01:00:00", "2006-08-30 01:30:00", "2006-08-30 02:00:00",
"2006-08-30 02:30:00", "2006-08-30 03:00:00", "2006-08-30 03:30:00",
"2006-08-30 04:00:00", "2006-08-30 04:30:00", "2006-08-30 05:00:00"
), class = "factor"), Ux = c(0.386129423856881, 0.376585152777778,
0.223084, 0.324716694444444, 0.284751444444444, 0.330879361111111,
0.435775208333333, 0.615431625, 0.0626402222222222, 0.294361277777778,
0.464928125, 0.194410652777778, 0.162782402777778, 0.452202958333333,
0.115627805555556, -0.504861722222222, -0.519877875, -0.472751402777778,
-0.304711708333333, -0.422057305555556, -0.366469180555556, -0.733386694444444,
-0.547648027777778, -0.755199347222222, -0.550659486111111, -0.5423535,
-0.366822472222222, -0.383240513888889, -0.310595805555556, -0.52455025,
-0.362563625, -0.652825569444444, -0.610772416666667, -0.410371666666667,
-0.472121152777778, -0.425157875, -0.402904430555556, -0.500900541666667,
-0.212771525026505, -0.311444125, -0.283801236111111, -0.329137972222222,
0.0119679583333333, -0.0432450416666667, -0.116798097222222,
-0.0186036944444444, 0.18647825, 0.0159295694444444, 0.2725),
Ts = c(15.6259581698983, 15.2544032611111, 15.3477244944444,
16.0545841722222, 16.8355633888889, 17.7415598722222, 18.6617318333333,
19.5413804333333, 20.3955452333333, 21.3235478222222, 22.1279614944444,
23.0201452888889, 23.5360555222222, 24.1431266555556, 24.6397015777778,
24.9368367833333, 24.8767400111111, 25.0295079222222, 24.9217255611111,
24.7817519388889, 25.0392368777778, 24.8607259111111, 24.6273860888889,
23.9305384555556, 23.6246723388889, 23.3412936666667, 22.5086166611111,
21.3677308722222, 20.4862971111111, 19.9382697666667, 19.1141656277778,
18.9857482722222, 18.6177629666667, 18.06932875, 17.7128214,
17.2961282944444, 16.8209205833333, 16.9382630722222, 16.2425613023827,
16.31353845, 16.2208566277778, 16.0621524611111, 15.3475440055556,
14.9305123722222, 14.3440030833333, 13.6119074444444, 13.3696283111111,
13.4342943555556, 13.2741)), .Names = c("half.fac", "Ux.30min",
"Ts.30min"), row.names = c(NA, -49L), class = "data.frame")
where a is my high-frequency measurement for one entire day and b is its average for each half hour.
What I need to do is, for each row, subtract b$Ux.30min from a$Ux IF and only IF the half-hourly fractions are the same (a$half.fac = b$half.fac). And do the same for Ts too.
What I have in mind is something like:
if a$half.fac = b$half.fac then
a$Ts.diff <- a$Ts - b$Ts.30min
a$Ux.diff <- a$Ux - b$Ux.30min
endif
To someone who might be familiar with this, I am trying to perform a Reynolds decomposition on the data.
Regardless of the method/package, ideally it should be as efficient (fast) as possible, because in reality my data frame 'a' is almost one million lines long!
Any ideas?
Many thanks in advance!
Here is a similar approach using dplyr
. The first step is to merge your a
and b
. Then, you do the calculation and create new columns.
library(dplyr)
inner_join(a, b) %>%
mutate(Ux.Ux30 = Ux - Ux.30min,
Ts.Ts30 = Ts - Ts.30min)
# A part of the results
# half.fac Ux Ts Ux.30min Ts.30min Ux.Ux30 Ts.Ts30
#1 2006-08-29 05:00:00 0.35400 16.0024 0.3861294 15.62596 -0.032129424 0.37644183
#2 2006-08-29 05:00:00 0.38225 16.2281 0.3861294 15.62596 -0.003879424 0.60214183
#3 2006-08-29 05:00:00 0.48725 16.2468 0.3861294 15.62596 0.101120576 0.62084183
#4 2006-08-29 05:00:00 0.43425 16.0788 0.3861294 15.62596 0.048120576 0.45284183
#5 2006-08-29 05:00:00 0.38250 16.0465 0.3861294 15.62596 -0.003629424 0.42054183
EXTRA
Here is one way to use data.table. This is my naive attempt.
#Convert data.frame to data.table
setDT(a)
setDT(b)
# Set a key column for merging a and b
setkey(a, "half.fac")
setkey(b, "half.fac")
# Merge a and b, and do the calculation
b[a][, `:=`(Ux.Ux30 = Ux - Ux.30min, Ts.Ts30 = Ts - Ts.30min)]