Search code examples
rsubsetdata-analysis

Identifying episodes with R


I'm working with an unbalanced panel that contains information across various countries for several years. One of the variables I have is dist, which represents the log-deviations from HP-filtered GDP. If dist>1, then dummy=1.

structure(list(country = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 9L, 9L, 
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 
10L, 10L, 10L, 10L), .Label = c("Bangladesh", "Barbados", "Benin", 
"Burundi", "Cameroon", "Chile", "Cyprus", "Ecuador", "Equatorial Guinea", 
"Gabon", "Ghana", "Guatemala", "Guinea", "Guyana", "Haiti", "India", 
"Jamaica", "Jordan", "Lebanon", "Liberia", "Madagascar", "Mali", 
"Mexico", "Morocco", "Mozambique", "Nepal", "Nicaragua", "Niger", 
"Oman", "Pakistan", "Panama", "Papua New Guinea", "Peru", "Rwanda", 
"Senegal", "Seychelles", "Sierra Leone", "Singapore", "Sri Lanka", 
"Sudan", "Togo", "Trinidad and Tobago", "Tunisia", "Turkey", 
"Uganda", "Zambia", "Zimbabwe"), class = c("pseries", "factor"
)), date = structure(c(12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 
20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 40L, 
41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 
54L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("1965", "1966", "1967", 
"1968", "1969", "1970", "1971", "1972", "1973", "1974", "1975", 
"1976", "1977", "1978", "1979", "1980", "1981", "1982", "1983", 
"1984", "1985", "1986", "1987", "1988", "1989", "1990", "1991", 
"1992", "1993", "1994", "1995", "1996", "1997", "1998", "1999", 
"2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", 
"2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015", 
"2016", "2017", "2018"), class = c("pseries", "factor")), dist = c(-2.44153863355447, 
-0.565497304455015, 1.4222058591902, 1.18583114700364, 0.936095936859405, 
-0.40582369098349, 2.08296049774769, -1.59377116576285, 0.316969269882462, 
-0.562808458278515, -0.438155688763343, 0.824980940313303, -0.60293744113302, 
0.397182279339227, 1.34777053122572, 0.00539983501313633, 0.0528824118570846, 
-0.838507946421168, 0.220969370384196, -1.06945294612116, 0.157324834050942, 
0.244080457191975, 0.215931770884978, 0.226176571912671, 0.197663973781873, 
0.183399695771442, 0.334605869075331, 0.0683789169881243, -0.0432432418415798, 
-0.191834311017179, -0.368292801016647, -0.572222256415503, -0.838601784768107, 
-1.15213567684887, -1.49844386483361, 2.39265920977342, 1.90573467650352, 
1.3362714819111, 0.158391029649629, 1.12699318386612, 2.38996532864524
), dummy = c(0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 
1, 0, 1, 1)), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 
10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 400L, 
401L, 402L, 403L, 404L, 405L, 406L, 407L, 408L, 409L, 410L, 411L, 
412L, 413L, 414L, 415L, 416L, 417L, 418L, 419L, 420L), class = "data.frame")

I would like to identify episodes in the following way based on the values of dist and dummy. An episode is defined as the time spam for which values of dist>0.5, conditioned on dist>= 1 at some point. In the example above, the episode from Bangladesh lasts for 3 years from 1978-1980 and the one for Guatemala lasts for another 3 years from 2006 to 2008 Any help would be much appreciated!


Solution

  • I think this does what you want. I use dplyr for the data manipulation, but also the utility function rleid from the data.table package.

    x %>%
      group_by(country) %>%
      mutate(
        potential_episode_grp = (dist > 0.5) * data.table::rleid(dist > 0.5)
      ) %>%
      group_by(country, potential_episode_grp) %>%
      mutate(episode = as.integer(potential_episode_grp > 0 & any(dist > 1)))
    # # A tibble: 11 × 6
    # # Groups:   country, potential_episode_grp [4]
    #    country     date   dist dummy episode potential_episode_grp
    #    <chr>      <int>  <dbl> <int>   <int>                 <int>
    #  1 Bangladesh  1976 -2.44      0       0                     0
    #  2 Bangladesh  1977 -0.565     0       0                     0
    #  3 Bangladesh  1978  1.42      1       1                     2
    #  4 Bangladesh  1979  1.19      1       1                     2
    #  5 Bangladesh  1980  0.936     0       1                     2
    #  6 Guatemala   2004 -0.390     0       0                     0
    #  7 Guatemala   2005 -0.267     0       0                     0
    #  8 Guatemala   2006  1.46      1       1                     2
    #  9 Guatemala   2007  0.763     0       1                     2
    # 10 Guatemala   2008  1.11      1       1                     2
    # 11 Guatemala   2009  0.289     0       0                     0
    

    Using this data:

    x = read.table(text = ' country       date  dist       dummy episode
    1  Bangladesh 1976 -2.4415386     0   0
    2  Bangladesh 1977 -0.5654973     0   0
    3  Bangladesh 1978  1.4222059     1   1
    4  Bangladesh 1979  1.1858311     1   1
    5  Bangladesh 1980  0.9360959     0   1
    560  Guatemala 2004 -0.3899023    0   0
    561  Guatemala 2005 -0.2672316    0   0
    562  Guatemala 2006  1.4632532    1   1
    563  Guatemala 2007  0.7633748    0   1
    564  Guatemala 2008  1.1143216    1   1
    565  Guatemala 2009  0.2891909    0   0', header =T)