Search code examples
rdataframeif-statementdplyrmutate

Carry forward data from previous row, based on conditions of various columns in a dataframe in r


Dataset

I have a dataframe (issue_termi_episode) sampled like so:

issue_termi_episode <- structure(
  list(
    new_conflictep_id = c(20504, 20505, 20506, 20507, 20508, 20902, 20903, 20904, 22003, 22101, 22102, 22103, 22104, 22105, 22202),
    conflict_id = c(205, 205, 205, 205, 205, 209, 209, 209, 220, 221, 221, 221, 221, 221, 222),
    location = c("Iran", "Iran", "Iran", "Iran", "Iran", "Philippines", "Philippines", "Philippines", "Paraguay", "Myanmar (Burma)", "Myanmar (Burma)", "Myanmar (Burma)", "Myanmar (Burma)", "Myanmar (Burma)", "Myanmar (Burma)"),
    incompatibility = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2),
    conflict = c("Iran: Kurdistan", "Iran: Kurdistan", "Iran: Kurdistan", "Iran: Kurdistan", "Iran: Kurdistan", "Philippines", "Philippines", "Philippines", "Paraguay", "Myanmar (Burma): Karen", "Myanmar (Burma): Karen", "Myanmar (Burma): Karen", "Myanmar (Burma): Karen", "Myanmar (Burma): Karen", "Myanmar (Burma)"),
    conflictepisode = c(4, 5, 6, 7, 8, 2, 3, 4, 3, 1, 2, 3, 4, 5, 2),
    outcome = c(5, 5, 5, 5, 5, 2, 2, NA, 4, 5, 5, 5, 2, 5, 5),
    version = c(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3),
    intensity_level = c(1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1),
    region = c("2", "2", "2", "2", "2", "3", "3", "3", "5", "3", "3", "3", "3", "3", "3"),
    first_year_active = c(1990, 1993, 1996, 2016, 2018, 1989, 1997, 1999, 1989, 1989, 1994, 1997, 2000, 2013, 1990),
    last_year_active = c(1990, 1993, 1996, 2016, 2018, 1995, 1997, 2020, 1989, 1992, 1995, 1998, 2011, 2013, 1992),
    issue_territory = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1),
    issue_statestruc = c(1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
    issue_gov = c(1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1),
    issue_polrights = c(1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
    issue_distrib = c(1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1)
  ),
  row.names = c(NA, -15L),
  class = c("tbl_df", "tbl", "data.frame")
)
issue_termi_episode
# A tibble: 15 × 17
   new_conflictep_id conflict_id location        incompatibility conflict   conflictepisode outcome version intensity_level region first_year_active last_year_active issue_territory issue_statestruc issue_gov issue_polrights issue_distrib
               <dbl>       <dbl> <chr>                     <dbl> <chr>                <dbl>   <dbl>   <dbl>           <dbl> <chr>              <dbl>            <dbl>           <dbl>            <dbl>     <dbl>           <dbl>         <dbl>
 1             20504         205 Iran                          1 Iran: Kur…               4       5       3               1 2                   1990             1990               1                1         1               1             1
 2             20505         205 Iran                          1 Iran: Kur…               5       5       3               1 2                   1993             1993               1                1         0               1             0
 3             20506         205 Iran                          1 Iran: Kur…               6       5       3               1 2                   1996             1996               1                1         1               0             0
 4             20507         205 Iran                          1 Iran: Kur…               7       5       3               1 2                   2016             2016               1                1         1               1             1
 5             20508         205 Iran                          1 Iran: Kur…               8       5       3               1 2                   2018             2018               1                1         1               1             1
 6             20902         209 Philippines                   2 Philippin…               2       2       3               2 3                   1989             1995               0                1         1               1             1
 7             20903         209 Philippines                   2 Philippin…               3       2       3               1 3                   1997             1997               0                1         1               1             1
 8             20904         209 Philippines                   2 Philippin…               4      NA       3               1 3                   1999             2020               0                1         1               1             1
 9             22003         220 Paraguay                      2 Paraguay                 3       4       3               1 5                   1989             1989               0                1         1               1             0
10             22101         221 Myanmar (Burma)               1 Myanmar (…               1       5       3               2 3                   1989             1992               1                1         1               1             1
11             22102         221 Myanmar (Burma)               1 Myanmar (…               2       5       3               1 3                   1994             1995               1                1         0               1             1
12             22103         221 Myanmar (Burma)               1 Myanmar (…               3       5       3               1 3                   1997             1998               1                1         1               1             1
13             22104         221 Myanmar (Burma)               1 Myanmar (…               4       2       3               1 3                   2000             2011               1                1         1               1             1
14             22105         221 Myanmar (Burma)               1 Myanmar (…               5       5       3               1 3                   2013             2013               1                1         0               1             0
15             22202         222 Myanmar (Burma)               2 Myanmar (…               2       5       3               1 3                   1990             1992               1                1         1               1             1

The last 5 variables (of the form issue_*) are binary (0 or 1). The new_conflictep_id is unique to every row, and conflict_id is a grouping variable, often same for several rows but also sometimes just for one row.

Goal

Grouped by conflict_id, I want to "carry forward" the data (1s and 0s) of all the issue_* variables to the next row, but only if

  1. the gap between first_year_active of that next row and last_year_active of the previous row is 5 years or less; and
  2. all issue_* variables of that next row are equal to 0.

My desired output would update row 5 of the sample, where new_conflictep_id = 20508...

  new_conflictep_id conflict_id ... first_year_active last_year_active issue_territory issue_statestruc issue_gov issue_polrights issue_distrib
1             20504         205 ...              1990             1990               1                1         1               1             1
2             20505         205 ...              1993             1993               1                1         0               1             0
3             20506         205 ...              1996             1996               1                1         1               0             0
4             20507         205 ...              2016             2016               1                1         1               1             1
#                           ^^^                                   ^^^^                                                                         
5             20508         205 ...              2018             2018               0                0         0               0             0
#                           ^^^                  ^^^^                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6             20902         209 ...              1989             1995               0                1         1               1             1

...by filling the issue_* variables with all 1s, from the row above.

  new_conflictep_id conflict_id ... first_year_active last_year_active issue_territory issue_statestruc issue_gov issue_polrights issue_distrib
1             20504         205 ...              1990             1990               1                1         1               1             1
2             20505         205 ...              1993             1993               1                1         0               1             0
3             20506         205 ...              1996             1996               1                1         1               0             0
4             20507         205 ...              2016             2016               1                1         1               1             1
#                                                                                    ↓                ↓         ↓               ↓             ↓
5             20508         205 ...              2018             2018               1                1         1               1             1
#                                                                                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6             20902         209 ...              1989             1995               0                1         1               1             1

Attempts

I tried different solutions with dplyr. First:

library(dplyr)
issue_termi_episode <- issue_termi_episode %>%
arrange(conflict_id, first_year_active) %>%
group_by(conflict_id) %>%
mutate(across(starts_with("issue_"), ~ ifelse(first_year_active - lag(last_year_active, default = first_year_active[1] + 10) <= 5 & all(. == 0), lag(.), .)))

From the content of my data, it results that for the first row within a group of conflict_id, it is not necessary to make any changes (which would be not possible anyway, because there is no data to carry forward from above). Therefore I added + 10 to the default-argument in lag(), so the condition never applies for the first row. This code just created NAs in some random places, but neither in those rows that I wanted to target with my conditions, nor did it fill these NAs in all of the 5 issue_* variables of a given row (as I wanted).

I then tried a work around, creating first a new variable that fills in a certain value (333) if my conditions are met (to later use that variable to carry the data forward):

issue_termi_episode <- issue_termi_episode %>%
  arrange(conflict_id, first_year_active) %>%
  group_by(conflict_id) %>%
  mutate(row_fill = ifelse(first_year_active - lag(last_year_active, default = first_year_active[1] + 10) <= 5 &all(c_across(starts_with("issue_")) == 0), 
                                 333, 0)) %>%
  ungroup() %>%
  mutate(across(starts_with("issue_"), ~ ifelse(row_fill == 333, lag(.), .))) 

However, in the newly created row_fill, only 0 is filled in. That leads me to the conclusion that my condition is not recognized. Trying the condition of all issue_* variables = 0, 333 is filled in very few but not all rows that should meet my condition. I can't get behind, why. The other condition (tested the same way) is working fine.

issue_termi_episode <- issue_termi_episode %>%
  arrange(conflict_id, first_year_active) %>%
  group_by(conflict_id) %>%
  mutate(row_fill = if_else(all(c_across(starts_with("issue_")) == 0), 
                                 333, 0)) %>%
  ungroup()

I would greatly appreciate some help, be it a complete solution in another approach, or just debugging my condition.


Solution

  • Solution

    Here is the tidyverse solution, which I suggested and you confirmed in the comments. It converts the relevant 0s into NAs, and fill()s them from the data above.

    library(dplyr)
    library(tidyr)  # For filling.
    
    
    # ...
    # Code to generate 'issue_termi_episode'.
    # ...
    
    
    # Set the "closeness" threshold.
    year_threshold <- 5
    
    
    issue_termi_episode %>%
        
        # Sort groups in chronological order.
        arrange(conflict_id, first_year_active) %>%
        
        # Prepare for filling within groups.
        group_by(conflict_id) %>%
        mutate(
            # Flag rows with only 0s.
            all_zero = if_all(starts_with("issue_"), ~ . == 0),
            
            # Flag rows whose year is "close" to the prior.
            is_close = abs(first_year_active - lag(last_year_active)) <= year_threshold,
            
            # Flag rows that are both, and should thus be overwritten.
            needs_fill = if_else(is.na(is_close), FALSE, all_zero & is_close),
            
            # Replace their 0s with NAs.
            across(starts_with("issue_"), ~ if_else(needs_fill, NA_real_, .))
        ) %>%
        
        # Fill downwards with the prior values.
        fill(starts_with("issue_")) %>%
        
        # Clean up from preparations.
        ungroup() %>%
        select(!c(all_zero, is_close, needs_fill))
    

    Result

    Given an issue_termi_episode dataset like your sample...

    issue_termi_episode <- structure(
      list(
        new_conflictep_id = c(20504, 20505, 20506, 20507, 20508, 20902, 20903, 20904, 22003, 22101, 22102, 22103, 22104, 22105, 22202),
        conflict_id = c(205, 205, 205, 205, 205, 209, 209, 209, 220, 221, 221, 221, 221, 221, 222),
        location = c("Iran", "Iran", "Iran", "Iran", "Iran", "Philippines", "Philippines", "Philippines", "Paraguay", "Myanmar (Burma)", "Myanmar (Burma)", "Myanmar (Burma)", "Myanmar (Burma)", "Myanmar (Burma)", "Myanmar (Burma)"),
        incompatibility = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2),
        conflict = c("Iran: Kurdistan", "Iran: Kurdistan", "Iran: Kurdistan", "Iran: Kurdistan", "Iran: Kurdistan", "Philippines", "Philippines", "Philippines", "Paraguay", "Myanmar (Burma): Karen", "Myanmar (Burma): Karen", "Myanmar (Burma): Karen", "Myanmar (Burma): Karen", "Myanmar (Burma): Karen", "Myanmar (Burma)"),
        conflictepisode = c(4, 5, 6, 7, 8, 2, 3, 4, 3, 1, 2, 3, 4, 5, 2),
        outcome = c(5, 5, 5, 5, 5, 2, 2, NA, 4, 5, 5, 5, 2, 5, 5),
        version = c(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3),
        intensity_level = c(1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1),
        region = c("2", "2", "2", "2", "2", "3", "3", "3", "5", "3", "3", "3", "3", "3", "3"),
        first_year_active = c(1990, 1993, 1996, 2016, 2018, 1989, 1997, 1999, 1989, 1989, 1994, 1997, 2000, 2013, 1990),
        last_year_active = c(1990, 1993, 1996, 2016, 2018, 1995, 1997, 2020, 1989, 1992, 1995, 1998, 2011, 2013, 1992),
        issue_territory = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1),
        issue_statestruc = c(1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
        issue_gov = c(1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1),
        issue_polrights = c(1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
        issue_distrib = c(1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1)
      ),
      row.names = c(NA, -15L),
      class = c("tbl_df", "tbl", "data.frame")
    )
    

    ...this should yield your desired result.

    # A tibble: 15 × 17
       new_conflictep_id conflict_id location        incompatibility conflict   conflictepisode outcome version intensity_level region first_year_active last_year_active issue_territory issue_statestruc issue_gov issue_polrights issue_distrib
                   <dbl>       <dbl> <chr>                     <dbl> <chr>                <dbl>   <dbl>   <dbl>           <dbl> <chr>              <dbl>            <dbl>           <dbl>            <dbl>     <dbl>           <dbl>         <dbl>
     1             20504         205 Iran                          1 Iran: Kur…               4       5       3               1 2                   1990             1990               1                1         1               1             1
     2             20505         205 Iran                          1 Iran: Kur…               5       5       3               1 2                   1993             1993               1                1         0               1             0
     3             20506         205 Iran                          1 Iran: Kur…               6       5       3               1 2                   1996             1996               1                1         1               0             0
     4             20507         205 Iran                          1 Iran: Kur…               7       5       3               1 2                   2016             2016               1                1         1               1             1
     5             20508         205 Iran                          1 Iran: Kur…               8       5       3               1 2                   2018             2018               1                1         1               1             1
     6             20902         209 Philippines                   2 Philippin…               2       2       3               2 3                   1989             1995               0                1         1               1             1
     7             20903         209 Philippines                   2 Philippin…               3       2       3               1 3                   1997             1997               0                1         1               1             1
     8             20904         209 Philippines                   2 Philippin…               4      NA       3               1 3                   1999             2020               0                1         1               1             1
     9             22003         220 Paraguay                      2 Paraguay                 3       4       3               1 5                   1989             1989               0                1         1               1             0
    10             22101         221 Myanmar (Burma)               1 Myanmar (…               1       5       3               2 3                   1989             1992               1                1         1               1             1
    11             22102         221 Myanmar (Burma)               1 Myanmar (…               2       5       3               1 3                   1994             1995               1                1         0               1             1
    12             22103         221 Myanmar (Burma)               1 Myanmar (…               3       5       3               1 3                   1997             1998               1                1         1               1             1
    13             22104         221 Myanmar (Burma)               1 Myanmar (…               4       2       3               1 3                   2000             2011               1                1         1               1             1
    14             22105         221 Myanmar (Burma)               1 Myanmar (…               5       5       3               1 3                   2013             2013               1                1         0               1             0
    15             22202         222 Myanmar (Burma)               2 Myanmar (…               2       5       3               1 3                   1990             1992               1                1         1               1             1
    

    Notice how the 5th row has been filled from the 4th, across all issue_* columns:

      # ... %>%
      slice(4:5) %>%
      select(ends_with("_id"), starts_with("issue_"))
    
    # A tibble: 2 × 7
      new_conflictep_id conflict_id issue_territory issue_statestruc issue_gov issue_polrights issue_distrib
                  <dbl>       <dbl>           <dbl>            <dbl>     <dbl>           <dbl>         <dbl>
    1             20507         205               1                1         1               1             1
    2             20508         205               1                1         1               1             1
    

    Note

    If a row of 0s is the topmost in its group, it is left untouched. This is conceptually desirable, because there is no "prior" data we can use to fill it.

    Such behavior is facilitated by lag(), which "pads...a missing value" at the top: namely NA. This yields an NA for is_close...

            # ...
            
            # Flag rows whose year is "close" to the prior.
            is_close = abs(first_year_active - lag(last_year_active)) <= year_threshold,
            #                                  ^^^^^^^^^^^^^^^^^^^^^
            
            # ...
    

    ...and thus a FALSE for needs_fill:

            # ...
            
            # Flag rows that are both, and should thus be overwritten.
            needs_fill = if_else(is.na(is_close), FALSE, all_zero & is_close)
            #                    ^^^^^^^^^^^^^^^
            
            # ...