Search code examples
rrangenested-for-loop

Complex conditional df subsetting with nested for loops in r


I apologize in advance for this headache, and in particular, the minimum amount of data I need to give you to reproduce my situation.

I have two data frames that look like:

> dput(df_long[1:60,])
structure(list(id = c("20230420-01", "20230420-01", "20230420-01", 
"20230420-01", "20230420-01", "20230420-01", "20230420-01", "20230420-01", 
"20230420-01", "20230420-01", "20230420-01", "20230420-01", "20230420-01", 
"20230420-01", "20230420-01", "20230420-01", "20230420-01", "20230420-01", 
"20230420-01", "20230420-01", "20230420-01", "20230420-01", "20230420-01", 
"20230420-01", "20230420-01", "20230420-01", "20230420-01", "20230420-01", 
"20230420-01", "20230420-01", "20230420-02", "20230420-02", "20230420-02", 
"20230420-02", "20230420-02", "20230420-02", "20230420-02", "20230420-02", 
"20230420-02", "20230420-02", "20230420-02", "20230420-02", "20230420-02", 
"20230420-02", "20230420-02", "20230420-02", "20230420-02", "20230420-02", 
"20230420-02", "20230420-02", "20230420-02", "20230420-02", "20230420-02", 
"20230420-02", "20230420-02", "20230420-02", "20230420-02", "20230420-02", 
"20230420-02", "20230420-02"), condition = c("control", "control", 
"control", "control", "control", "control", "control", "control", 
"control", "control", "control", "control", "control", "control", 
"control", "control", "control", "control", "control", "control", 
"control", "control", "control", "control", "control", "control", 
"control", "control", "control", "control", "control", "control", 
"control", "control", "control", "control", "control", "control", 
"control", "control", "control", "control", "control", "control", 
"control", "control", "control", "control", "control", "control", 
"control", "control", "control", "control", "control", "control", 
"control", "control", "control", "control"), pairscores = c(4.1, 
4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 
4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 
4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 
4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 
4.1, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1), round = c("1", "2", "3", 
"4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", 
"16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", 
"27", "28", "29", "30", "1", "2", "3", "4", "5", "6", "7", "8", 
"9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", 
"20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30"
), win = c(1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 
0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 
0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 
1), sound = c(1, 1, 2, 1, 2, 1, 1, 2, 2, 2, 1, 1, 2, 3, 5, 6, 
7, 8, 8, 1, 4, 8, 6, 7, 8, 5, 4, 8, 8, 8, 1, 1, 2, 1, 2, 1, 1, 
2, 2, 2, 1, 1, 2, 3, 5, 6, 7, 8, 8, 1, 4, 8, 6, 7, 8, 5, 4, 8, 
8, 8), ttrs = c(34.8679761886597, 53.7323212623596, 72.872288942337, 
95.00337266922, 108.467720985413, 127.519412279129, 140.932771205902, 
152.619723320007, 164.774888753891, 189.440443992615, 199.848607540131, 
211.736871480942, 228.6589448452, 240.730955839157, 258.806998729706, 
272.826892614365, 285.802761793137, 301.402906894684, 314.37718629837, 
330.579266309738, 342.812170028686, 353.819888830185, 370.405727863312, 
379.25874209404, 386.349495410919, 398.328526258469, 408.155629873276, 
420.099550485611, 430.813805341721, 441.291796922684, 34.8679761886597, 
53.7323212623596, 72.872288942337, 95.00337266922, 108.467720985413, 
127.519412279129, 140.932771205902, 152.619723320007, 164.774888753891, 
189.440443992615, 199.848607540131, 211.736871480942, 228.6589448452, 
240.730955839157, 258.806998729706, 272.826892614365, 285.802761793137, 
301.402906894684, 314.37718629837, 330.579266309738, 342.812170028686, 
353.819888830185, 370.405727863312, 379.25874209404, 386.349495410919, 
398.328526258469, 408.155629873276, 420.099550485611, 430.813805341721, 
441.291796922684), ttbp = c(42.8691244125366, 57.7340142726898, 
79.8788452148437, 101.018859148026, 114.473237752914, 130.520850658417, 
144.944416761398, 158.633060216904, 173.786515951157, 193.448391675949, 
203.859516382217, 220.743315935135, 233.666549682617, 249.745782613754, 
265.810203790665, 278.836834669113, 291.811623334885, 302.416656732559, 
320.388904809952, 335.58996462822, 347.819845914841, 361.828748226166, 
372.408302783966, 380.263983488083, 390.360696315765, 402.329761266708, 
411.160075902939, 425.111953496933, 435.816273927689, 450.295118093491, 
42.8691244125366, 57.7340142726898, 79.8788452148437, 101.018859148026, 
114.473237752914, 130.520850658417, 144.944416761398, 158.633060216904, 
173.786515951157, 193.448391675949, 203.859516382217, 220.743315935135, 
233.666549682617, 249.745782613754, 265.810203790665, 278.836834669113, 
291.811623334885, 302.416656732559, 320.388904809952, 335.58996462822, 
347.819845914841, 361.828748226166, 372.408302783966, 380.263983488083, 
390.360696315765, 402.329761266708, 411.160075902939, 425.111953496933, 
435.816273927689, 450.295118093491), ttbi = c(50.7323212623596, 
69.872288942337, 92.00337266922, 105.467720985413, 124.519412279129, 
137.932771205902, 149.619723320007, 161.774888753891, 186.440443992615, 
196.848607540131, 208.736871480942, 225.6589448452, 237.730955839157, 
255.806998729706, 269.826892614365, 282.802761793137, 298.402906894684, 
311.37718629837, 327.579266309738, 339.811122179031, 350.818850278854, 
367.405727863312, 376.25874209404, 383.348484039307, 395.328526258469, 
405.155629873276, 417.099550485611, 427.813805341721, 438.291796922684, 
454.063094377518, 50.7323212623596, 69.872288942337, 92.00337266922, 
105.467720985413, 124.519412279129, 137.932771205902, 149.619723320007, 
161.774888753891, 186.440443992615, 196.848607540131, 208.736871480942, 
225.6589448452, 237.730955839157, 255.806998729706, 269.826892614365, 
282.802761793137, 298.402906894684, 311.37718629837, 327.579266309738, 
339.811122179031, 350.818850278854, 367.405727863312, 376.25874209404, 
383.348484039307, 395.328526258469, 405.155629873276, 417.099550485611, 
427.813805341721, 438.291796922684, 454.063094377518)), row.names = c(NA, 
-60L), class = c("tbl_df", "tbl", "data.frame"))

and the other one I must upload as a .txt file (https://drive.google.com/file/d/1oZ-0y5rAEsEuQ0gA_9bdWH6fr4wT7xkJ/view?usp=sharing)

I would like to get a mean of each emotion in au_slim within au_slim$timestamp ranges equal to range df_long$ttbi - 1 to df_long$ttbi - 1 for each row in df_long. I need to keep track of these by the id in each df, and by the df_long$win associated with each df_long$ttbi. My expected output would be a new df that looks something like:

id win ttbi happiness sadness surprise fear anger disgust
20230420-01 1 1 # # # # # #
20230420-01 0 2 # # # # # #
20230420-01 1 3 # # # # # #

This would, of course, continue for all ids, where each id would have 30 rows (for the 30 ttbi's). I have been poking at this for a few days with basically no success. Frankly, I am just way in over my head code-wise. I know I need to use a for loop to iterate through each id, and probably a nested for loop to run each ttbi in each id. Any help would be great.

*Note that a lot of the cells in the new df will be 0 (or very close to it). That's fine.

EDIT by request:

df_long had more ids, but I have now corrected that:

> unique(au_slim$id) %in% unique(df_long_vid$id)
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[20] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[39] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
> unique(df_long_vid$id) %in% unique(au_slim$id)
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[20] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[39] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

When I run the code with the updated df, I still receive:

> library(dplyr)
> # Add round column to au_slim
> au_slim$round <- NA
> for(i in unique(au_slim$id)) {
+   # Get ttbi values for current id
+   breaks <- df_long_vid[which(df_long_vid$id == i), "ttbi"]
+   for(j in 1:length(breaks)) {
+     # Assign game round value to au_slim$round where timestamp values 
+     # are -/+ 1 of breaks[j] 
+     au_slim[which(au_slim$id == i & 
+                     au_slim$timestamp >= breaks[j]-1 & 
+                     au_slim$timestamp <= breaks[j]+1), "round"] <- j
+   }
+ }
Error in which(au_slim$id == i & au_slim$timestamp >= breaks[j] - 1 &  : 
  dims [product 30] do not match the length of object [637610]
In addition: Warning message:
In au_slim$id == i & au_slim$timestamp >= breaks[j] - 1 :
  longer object length is not a multiple of shorter object length
> dput(head(au_slim))
structure(list(id = c("20230420-01", "20230420-01", "20230420-01", 
"20230420-01", "20230420-01", "20230420-01"), timestamp = c(0, 
0.04, 0.08, 0.12, 0.16, 0.2), happiness = c(0, 0, 0, 0, 0, 0), 
    sadness = c(0, 0, 0, 0, 0, 0), surprise = c(0, 0, 0, 0, 0, 
    0), fear = c(0, 0, 0, 0, 0, 0), anger = c(0, 0, 0, 0, 0, 
    0), disgust = c(0, 0, 0, 0, 0, 0), round = c(NA, NA, NA, 
    NA, NA, NA)), row.names = c(NA, -6L), class = c("tbl_df", 
"tbl", "data.frame"))

EDIT for another problem:

After solving the data.table vs. data.frame issue, the first half of the solution runs without an error. But it does not fill au_slim$round as expected. See:

> for(i in unique(au_slim$id)) {
+   # Get ttbi values for current id
+   breaks <- df_long_vid[which(df_long_vid$id == i), "ttbi"]
+   for(j in 1:length(breaks)) {
+     # Assign game round value to au_slim$round where timestamp values 
+     # are -/+ 1 of breaks[j] 
+     au_slim[which(au_slim$id == i & 
+                     au_slim$timestamp >= breaks[j]-1 & 
+                     au_slim$timestamp <= breaks[j]+1), "round"] <- j
+   }
+ }
> dput(head(au_slim))
structure(list(id = c("20230420-01", "20230420-01", "20230420-01", 
"20230420-01", "20230420-01", "20230420-01"), timestamp = c(0, 
0.04, 0.08, 0.12, 0.16, 0.2), happiness = c(0, 0, 0, 0, 0, 0), 
    sadness = c(0, 0, 0, 0, 0, 0), surprise = c(0, 0, 0, 0, 0, 
    0), fear = c(0, 0, 0, 0, 0, 0), anger = c(0, 0, 0, 0, 0, 
    0), disgust = c(0, 0, 0, 0, 0, 0), round = c(NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
    )), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
))

What is NA_integer_?


Solution

  • Update

    This approach has been tested using the full df_long file and the subset sample of au_slim that has 23,020 rows. Given the "ttbi" values vary between "id" values, the nested for loop first loops through each "id", then through the corresponding "ttbi" values and assigns the bin values. Then summarise() from the dplyr package calculates the mean values for each emotion:

    library(dplyr)
    
    # Add round column to au_slim
    au_slim$round <- NA
    
    for(i in unique(au_slim$id)) {
      
      # Get ttbi values for current id
      breaks <- df_long[which(df_long$id == i), "ttbi"]
      
      for(j in 1:length(breaks)) {
        
        # Assign game round value to au_slim$round where timestamp values 
        # are -/+ 1 of breaks[j] 
        au_slim[which(au_slim$id == i & 
                        au_slim$timestamp >= breaks[j]-1 & 
                        au_slim$timestamp <= breaks[j]+1), "round"] <- j
        
      }
      
    }
    
    # Summarise values
    test <- au_slim %>%
      # Remove rows not within -/+ 1 of ttbi break value
      filter(!is.na(round)) %>%
      # Group by id and round and calculate mean for each subgroup and emotion
      group_by(id, round) %>%
      summarise_at(vars(happiness:disgust), mean, na.rm = TRUE) %>%
      ungroup() %>%
      # Join df_long data and select desired columns
      left_join(df_long, by = c("id", "round")) %>%
      select(id, win, ttbi = "round", happiness:disgust)
    
    data.frame(head(test))
               id win ttbi happiness    sadness surprise fear anger    disgust
    1 20230420-01   1    1    0.3475 0.13413333   0.0084    0     0 0.16213333
    2 20230420-01   0    2    0.1930 0.01593333   0.0000    0     0 0.05020000
    3 20230420-01   1    3    1.4270 0.04226667   0.0000    0     0 0.14613333
    4 20230420-01   1    4    1.4111 0.02106667   0.1244    0     0 0.03733333
    5 20230420-01   1    5    0.5502 0.25593333   0.0000    0     0 0.44266667
    6 20230420-01   1    6    0.0000 0.73880000   0.4627    0     0 0.00000000
    
    data.frame(tail(test))
               id win ttbi happiness    sadness surprise fear   anger    disgust
    1 20230420-02   0   25    1.1532 0.22020000  0.00000    0 0.00000 0.61713333
    2 20230420-02   0   26    1.6466 0.00380000  0.00000    0 0.04685 0.08726667
    3 20230420-02   1   27    1.2871 0.10366667  0.00000    0 0.00000 0.08820000
    4 20230420-02   1   28    0.9057 0.15060000  0.00000    0 0.00000 0.65953333
    5 20230420-02   1   29    0.6546 0.12673333  0.02355    0 0.00540 0.75526667
    6 20230420-02   1   30    1.4935 0.05133333  0.00000    0 0.02190 0.63586667