I apologize in advance for this headache, and in particular, the minimum amount of data I need to give you to reproduce my situation.
I have two data frames that look like:
> dput(df_long[1:60,])
structure(list(id = c("20230420-01", "20230420-01", "20230420-01",
"20230420-01", "20230420-01", "20230420-01", "20230420-01", "20230420-01",
"20230420-01", "20230420-01", "20230420-01", "20230420-01", "20230420-01",
"20230420-01", "20230420-01", "20230420-01", "20230420-01", "20230420-01",
"20230420-01", "20230420-01", "20230420-01", "20230420-01", "20230420-01",
"20230420-01", "20230420-01", "20230420-01", "20230420-01", "20230420-01",
"20230420-01", "20230420-01", "20230420-02", "20230420-02", "20230420-02",
"20230420-02", "20230420-02", "20230420-02", "20230420-02", "20230420-02",
"20230420-02", "20230420-02", "20230420-02", "20230420-02", "20230420-02",
"20230420-02", "20230420-02", "20230420-02", "20230420-02", "20230420-02",
"20230420-02", "20230420-02", "20230420-02", "20230420-02", "20230420-02",
"20230420-02", "20230420-02", "20230420-02", "20230420-02", "20230420-02",
"20230420-02", "20230420-02"), condition = c("control", "control",
"control", "control", "control", "control", "control", "control",
"control", "control", "control", "control", "control", "control",
"control", "control", "control", "control", "control", "control",
"control", "control", "control", "control", "control", "control",
"control", "control", "control", "control", "control", "control",
"control", "control", "control", "control", "control", "control",
"control", "control", "control", "control", "control", "control",
"control", "control", "control", "control", "control", "control",
"control", "control", "control", "control", "control", "control",
"control", "control", "control", "control"), pairscores = c(4.1,
4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1,
4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1,
4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1,
4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1,
4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1), round = c("1", "2", "3",
"4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15",
"16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26",
"27", "28", "29", "30", "1", "2", "3", "4", "5", "6", "7", "8",
"9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19",
"20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30"
), win = c(1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1,
0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1,
1), sound = c(1, 1, 2, 1, 2, 1, 1, 2, 2, 2, 1, 1, 2, 3, 5, 6,
7, 8, 8, 1, 4, 8, 6, 7, 8, 5, 4, 8, 8, 8, 1, 1, 2, 1, 2, 1, 1,
2, 2, 2, 1, 1, 2, 3, 5, 6, 7, 8, 8, 1, 4, 8, 6, 7, 8, 5, 4, 8,
8, 8), ttrs = c(34.8679761886597, 53.7323212623596, 72.872288942337,
95.00337266922, 108.467720985413, 127.519412279129, 140.932771205902,
152.619723320007, 164.774888753891, 189.440443992615, 199.848607540131,
211.736871480942, 228.6589448452, 240.730955839157, 258.806998729706,
272.826892614365, 285.802761793137, 301.402906894684, 314.37718629837,
330.579266309738, 342.812170028686, 353.819888830185, 370.405727863312,
379.25874209404, 386.349495410919, 398.328526258469, 408.155629873276,
420.099550485611, 430.813805341721, 441.291796922684, 34.8679761886597,
53.7323212623596, 72.872288942337, 95.00337266922, 108.467720985413,
127.519412279129, 140.932771205902, 152.619723320007, 164.774888753891,
189.440443992615, 199.848607540131, 211.736871480942, 228.6589448452,
240.730955839157, 258.806998729706, 272.826892614365, 285.802761793137,
301.402906894684, 314.37718629837, 330.579266309738, 342.812170028686,
353.819888830185, 370.405727863312, 379.25874209404, 386.349495410919,
398.328526258469, 408.155629873276, 420.099550485611, 430.813805341721,
441.291796922684), ttbp = c(42.8691244125366, 57.7340142726898,
79.8788452148437, 101.018859148026, 114.473237752914, 130.520850658417,
144.944416761398, 158.633060216904, 173.786515951157, 193.448391675949,
203.859516382217, 220.743315935135, 233.666549682617, 249.745782613754,
265.810203790665, 278.836834669113, 291.811623334885, 302.416656732559,
320.388904809952, 335.58996462822, 347.819845914841, 361.828748226166,
372.408302783966, 380.263983488083, 390.360696315765, 402.329761266708,
411.160075902939, 425.111953496933, 435.816273927689, 450.295118093491,
42.8691244125366, 57.7340142726898, 79.8788452148437, 101.018859148026,
114.473237752914, 130.520850658417, 144.944416761398, 158.633060216904,
173.786515951157, 193.448391675949, 203.859516382217, 220.743315935135,
233.666549682617, 249.745782613754, 265.810203790665, 278.836834669113,
291.811623334885, 302.416656732559, 320.388904809952, 335.58996462822,
347.819845914841, 361.828748226166, 372.408302783966, 380.263983488083,
390.360696315765, 402.329761266708, 411.160075902939, 425.111953496933,
435.816273927689, 450.295118093491), ttbi = c(50.7323212623596,
69.872288942337, 92.00337266922, 105.467720985413, 124.519412279129,
137.932771205902, 149.619723320007, 161.774888753891, 186.440443992615,
196.848607540131, 208.736871480942, 225.6589448452, 237.730955839157,
255.806998729706, 269.826892614365, 282.802761793137, 298.402906894684,
311.37718629837, 327.579266309738, 339.811122179031, 350.818850278854,
367.405727863312, 376.25874209404, 383.348484039307, 395.328526258469,
405.155629873276, 417.099550485611, 427.813805341721, 438.291796922684,
454.063094377518, 50.7323212623596, 69.872288942337, 92.00337266922,
105.467720985413, 124.519412279129, 137.932771205902, 149.619723320007,
161.774888753891, 186.440443992615, 196.848607540131, 208.736871480942,
225.6589448452, 237.730955839157, 255.806998729706, 269.826892614365,
282.802761793137, 298.402906894684, 311.37718629837, 327.579266309738,
339.811122179031, 350.818850278854, 367.405727863312, 376.25874209404,
383.348484039307, 395.328526258469, 405.155629873276, 417.099550485611,
427.813805341721, 438.291796922684, 454.063094377518)), row.names = c(NA,
-60L), class = c("tbl_df", "tbl", "data.frame"))
and the other one I must upload as a .txt file (https://drive.google.com/file/d/1oZ-0y5rAEsEuQ0gA_9bdWH6fr4wT7xkJ/view?usp=sharing)
I would like to get a mean of each emotion in au_slim within au_slim$timestamp ranges equal to range df_long$ttbi - 1 to df_long$ttbi - 1 for each row in df_long. I need to keep track of these by the id in each df, and by the df_long$win associated with each df_long$ttbi. My expected output would be a new df that looks something like:
id | win | ttbi | happiness | sadness | surprise | fear | anger | disgust |
---|---|---|---|---|---|---|---|---|
20230420-01 | 1 | 1 | # | # | # | # | # | # |
20230420-01 | 0 | 2 | # | # | # | # | # | # |
20230420-01 | 1 | 3 | # | # | # | # | # | # |
This would, of course, continue for all ids, where each id would have 30 rows (for the 30 ttbi's). I have been poking at this for a few days with basically no success. Frankly, I am just way in over my head code-wise. I know I need to use a for loop to iterate through each id, and probably a nested for loop to run each ttbi in each id. Any help would be great.
*Note that a lot of the cells in the new df will be 0 (or very close to it). That's fine.
EDIT by request:
df_long had more ids, but I have now corrected that:
> unique(au_slim$id) %in% unique(df_long_vid$id)
[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[20] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[39] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
> unique(df_long_vid$id) %in% unique(au_slim$id)
[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[20] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[39] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
When I run the code with the updated df, I still receive:
> library(dplyr)
> # Add round column to au_slim
> au_slim$round <- NA
> for(i in unique(au_slim$id)) {
+ # Get ttbi values for current id
+ breaks <- df_long_vid[which(df_long_vid$id == i), "ttbi"]
+ for(j in 1:length(breaks)) {
+ # Assign game round value to au_slim$round where timestamp values
+ # are -/+ 1 of breaks[j]
+ au_slim[which(au_slim$id == i &
+ au_slim$timestamp >= breaks[j]-1 &
+ au_slim$timestamp <= breaks[j]+1), "round"] <- j
+ }
+ }
Error in which(au_slim$id == i & au_slim$timestamp >= breaks[j] - 1 & :
dims [product 30] do not match the length of object [637610]
In addition: Warning message:
In au_slim$id == i & au_slim$timestamp >= breaks[j] - 1 :
longer object length is not a multiple of shorter object length
> dput(head(au_slim))
structure(list(id = c("20230420-01", "20230420-01", "20230420-01",
"20230420-01", "20230420-01", "20230420-01"), timestamp = c(0,
0.04, 0.08, 0.12, 0.16, 0.2), happiness = c(0, 0, 0, 0, 0, 0),
sadness = c(0, 0, 0, 0, 0, 0), surprise = c(0, 0, 0, 0, 0,
0), fear = c(0, 0, 0, 0, 0, 0), anger = c(0, 0, 0, 0, 0,
0), disgust = c(0, 0, 0, 0, 0, 0), round = c(NA, NA, NA,
NA, NA, NA)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
EDIT for another problem:
After solving the data.table vs. data.frame issue, the first half of the solution runs without an error. But it does not fill au_slim$round as expected. See:
> for(i in unique(au_slim$id)) {
+ # Get ttbi values for current id
+ breaks <- df_long_vid[which(df_long_vid$id == i), "ttbi"]
+ for(j in 1:length(breaks)) {
+ # Assign game round value to au_slim$round where timestamp values
+ # are -/+ 1 of breaks[j]
+ au_slim[which(au_slim$id == i &
+ au_slim$timestamp >= breaks[j]-1 &
+ au_slim$timestamp <= breaks[j]+1), "round"] <- j
+ }
+ }
> dput(head(au_slim))
structure(list(id = c("20230420-01", "20230420-01", "20230420-01",
"20230420-01", "20230420-01", "20230420-01"), timestamp = c(0,
0.04, 0.08, 0.12, 0.16, 0.2), happiness = c(0, 0, 0, 0, 0, 0),
sadness = c(0, 0, 0, 0, 0, 0), surprise = c(0, 0, 0, 0, 0,
0), fear = c(0, 0, 0, 0, 0, 0), anger = c(0, 0, 0, 0, 0,
0), disgust = c(0, 0, 0, 0, 0, 0), round = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
))
What is NA_integer_
?
Update
This approach has been tested using the full df_long file and the subset sample of au_slim that has 23,020 rows. Given the "ttbi" values vary between "id" values, the nested for loop first loops through each "id", then through the corresponding "ttbi" values and assigns the bin values. Then summarise()
from the dplyr
package calculates the mean values for each emotion:
library(dplyr)
# Add round column to au_slim
au_slim$round <- NA
for(i in unique(au_slim$id)) {
# Get ttbi values for current id
breaks <- df_long[which(df_long$id == i), "ttbi"]
for(j in 1:length(breaks)) {
# Assign game round value to au_slim$round where timestamp values
# are -/+ 1 of breaks[j]
au_slim[which(au_slim$id == i &
au_slim$timestamp >= breaks[j]-1 &
au_slim$timestamp <= breaks[j]+1), "round"] <- j
}
}
# Summarise values
test <- au_slim %>%
# Remove rows not within -/+ 1 of ttbi break value
filter(!is.na(round)) %>%
# Group by id and round and calculate mean for each subgroup and emotion
group_by(id, round) %>%
summarise_at(vars(happiness:disgust), mean, na.rm = TRUE) %>%
ungroup() %>%
# Join df_long data and select desired columns
left_join(df_long, by = c("id", "round")) %>%
select(id, win, ttbi = "round", happiness:disgust)
data.frame(head(test))
id win ttbi happiness sadness surprise fear anger disgust
1 20230420-01 1 1 0.3475 0.13413333 0.0084 0 0 0.16213333
2 20230420-01 0 2 0.1930 0.01593333 0.0000 0 0 0.05020000
3 20230420-01 1 3 1.4270 0.04226667 0.0000 0 0 0.14613333
4 20230420-01 1 4 1.4111 0.02106667 0.1244 0 0 0.03733333
5 20230420-01 1 5 0.5502 0.25593333 0.0000 0 0 0.44266667
6 20230420-01 1 6 0.0000 0.73880000 0.4627 0 0 0.00000000
data.frame(tail(test))
id win ttbi happiness sadness surprise fear anger disgust
1 20230420-02 0 25 1.1532 0.22020000 0.00000 0 0.00000 0.61713333
2 20230420-02 0 26 1.6466 0.00380000 0.00000 0 0.04685 0.08726667
3 20230420-02 1 27 1.2871 0.10366667 0.00000 0 0.00000 0.08820000
4 20230420-02 1 28 0.9057 0.15060000 0.00000 0 0.00000 0.65953333
5 20230420-02 1 29 0.6546 0.12673333 0.02355 0 0.00540 0.75526667
6 20230420-02 1 30 1.4935 0.05133333 0.00000 0 0.02190 0.63586667