Search code examples
rsubsetsapply

time mismatch in calculating the slope of multiple subsets of a data frame


I am trying to subset my data based on the year and country and calculate the regression coefficient for each one.

a subset of my data structure is :

pdata <- structure(list(movie_odid = c(10100L, 10100L, 520100L, 520100L, 
650100L, 650100L, 10100L, 10100L, 520100L, 780100L, 780100L, 
950100L, 950100L, 540100L, 540100L, 780100L, 780100L, 880100L, 
880100L, 450100L, 450100L, 540100L, 540100L, 640100L, 640100L, 
800100L, 800100L, 420100L, 420100L, 450100L, 450100L, 490100L, 
490100L, 640100L, 640100L, 430100L, 430100L, 490100L, 490100L, 
590100L, 590100L, 1620100L, 1620100L, 390100L, 390100L, 8810100L, 
8810100L, 9480100L, 9480100L, 570100L, 570100L, 590100L, 590100L
), chart_date = structure(c(5L, 6L, 3L, 4L, 1L, 2L, 7L, 8L, 7L, 
11L, 12L, 9L, 10L, 17L, 18L, 13L, 14L, 15L, 16L, 23L, 24L, 19L, 
20L, 25L, 26L, 21L, 22L, 29L, 30L, 27L, 28L, 31L, 32L, 27L, 28L, 
37L, 38L, 33L, 34L, 35L, 36L, 39L, 40L, 41L, 42L, 47L, 48L, 45L, 
46L, 43L, 44L, 39L, 40L), .Label = c("1997-05-23", "1997-05-30", 
"1997-07-04", "1997-07-11", "1997-12-19", "1997-12-26", "1998-01-02", 
"1998-01-09", "1998-06-26", "1998-07-03", "1998-07-24", "1998-07-31", 
"1999-02-05", "1999-02-12", "1999-06-04", "1999-06-11", "1999-11-19", 
"1999-11-26", "2000-01-07", "2000-01-14", "2000-05-26", "2000-06-02", 
"2000-11-17", "2000-11-24", "2000-12-22", "2000-12-29", "2001-01-05", 
"2001-01-12", "2001-05-18", "2001-05-25", "2001-11-02", "2001-11-09", 
"2002-01-04", "2002-01-11", "2002-04-19", "2002-04-26", "2002-11-15", 
"2002-11-22", "2003-01-03", "2003-01-10", "2003-05-09", "2003-05-16", 
"2003-05-23", "2003-05-30", "2003-09-12", "2003-09-19", "2003-11-07", 
"2003-11-14"), class = "factor"), chart_year = c(1997L, 1997L, 
1997L, 1997L, 1997L, 1997L, 1998L, 1998L, 1998L, 1998L, 1998L, 
1998L, 1998L, 1999L, 1999L, 1999L, 1999L, 1999L, 1999L, 2000L, 
2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2001L, 2001L, 
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2002L, 2002L, 2002L, 
2002L, 2002L, 2002L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 
2003L, 2003L, 2003L, 2003L, 2003L, 2003L), revenue = c(52969336L, 
71183357L, 76457208L, 43593212L, 81172327L, 45111185L, 45012810L, 
37568867L, 48261L, 49760360L, 36612617L, 1962627L, 1441774L, 
23093123L, 65927993L, 5107876L, 4771193L, 3000000L, 82216507L, 
84977355L, 59922105L, 8431650L, 7296370L, 78711571L, 40769776L, 
83347490L, 37133438L, 56498192L, 63650772L, 2968580L, 788895L, 
76599345L, 57025088L, 28499878L, 22837762L, 106131568L, 61909948L, 
4502006L, 3272808L, 822068L, 1078673L, 3843873L, 2101748L, 42508303L, 
121361422L, 10163670L, 11628760L, 29944555L, 14018616L, 100066590L, 
49010220L, 3536766L, 3321470L), theaters = c(2674L, 2711L, 3020L, 
3020L, 3281L, 3282L, 2727L, 2746L, 58L, 2453L, 2540L, 214L, 214L, 
3236L, 3236L, 1027L, 1140L, 0L, 3312L, 3127L, 3134L, 2752L, 2326L, 
2774L, 2929L, 3653L, 3653L, 3587L, 3623L, 2594L, 912L, 3237L, 
3269L, 2948L, 3048L, 3682L, 3682L, 1425L, 1313L, 108L, 141L, 
1808L, 1180L, 3603L, 3603L, 576L, 1177L, 3282L, 3289L, 3483L, 
3492L, 1194L, 1212L), running_time = c(194L, 194L, 98L, 98L, 
134L, 134L, 194L, 194L, 98L, 169L, 169L, 220L, 220L, 92L, 92L, 
169L, 169L, 95L, 95L, 105L, 105L, 92L, 92L, 143L, 143L, 126L, 
126L, 90L, 90L, 105L, 105L, 92L, 92L, 143L, 143L, 161L, 161L, 
92L, 92L, 95L, 95L, 133L, 133L, 138L, 138L, 135L, 135L, 0L, 0L, 
102L, 102L, 95L, 95L), ifUS = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 
1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L)), class = "data.frame", row.names = c(NA, 
-53L))

and my code is :

coefLM <- function(x) {
  coef(lm(log(revenue) ~ running_time, data = x))[2]
}

spl <- with(pdata, split(pdata, list(chart_year = chart_year, ifUS = ifUS)))
out <- unique(pdata[, c("chart_year", "ifUS")])
out <- transform(out, slope = sapply(spl, coefLM))

out

However, there is a time mismatch in the result. for example for "1999.0.running_time" the chart_year is "2012". would you please guide me on what could be the possible cause?

enter image description here


Solution

  • In your call to out <- transform(...), the order of chart_year and ifUS does not match the spl list.

    spl <- with(pdata, split(pdata, list(chart_year = chart_year, ifUS = ifUS)))
    out <- unique(pdata[, c("chart_year", "ifUS")])
    out
    #    chart_year ifUS
    # 1        1997    1
    # 7        1998    1
    # 14       1999    1
    # 20       2000    1
    # 28       2001    1
    # 36       2002    0
    # 38       2002    1
    # 42       2003    0
    # 44       2003    1
    nrow(out)
    # [1] 9
    

    But when you try to transform it,

    out <- transform(out, slope = sapply(spl, coefLM))
    # Error in lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) : 
    #   0 (non-NA) cases
    

    I suspect that you are not getting this error because you have a different (larger) spl. However, if you look at out and names(spl), even if the lengths were the same, it appears that they do not line up. (I'll show in a frame just to show

    nrow(out)
    # [1] 9
    
    length(spl)
    # [1] 14
    
    sapply(spl, nrow)
    # 1997.0 1998.0 1999.0 2000.0 2001.0 2002.0 2003.0 1997.1 1998.1 1999.1 2000.1 2001.1 2002.1 2003.1 
    #      0      0      0      0      0      2      6      6      7      6      8      8      4      6 
    

    Even if we filter out the 0-row frames from spl (which does give us length 9), the order between spl and out is not the same:

    names(Filter(c, sapply(spl, nrow)))
    # [1] "2002.0" "2003.0" "1997.1" "1998.1" "1999.1" "2000.1" "2001.1" "2002.1" "2003.1"
    do.call(paste, c(out, sep = "."))
    # [1] "1997.1" "1998.1" "1999.1" "2000.1" "2001.1" "2002.0" "2002.1" "2003.0" "2003.1"
    

    (Don't focus on the code I used, instead on what I am showing.)

    (Honestly, you should be happy that you caught this problem. Otherwise, it is highly likely that you would have had models with significance identified, true, but assigned to the incorrect groups. Corrupted results.)

    How to fix? Here's one suggestion, shifting over to a dplyr method:

    library(dplyr)
    pdata %>%
      group_by(chart_year, ifUS) %>%
      summarize(coef = coefLM(cur_data())) %>%
      ungroup()
    # `summarise()` regrouping output by 'chart_year' (override with `.groups` argument)
    # # A tibble: 9 x 3
    #   chart_year  ifUS      coef
    #        <int> <int>     <dbl>
    # 1       1997     1  0.000602
    # 2       1998     1  0.0288  
    # 3       1999     1 -0.0217  
    # 4       2000     1  0.0299  
    # 5       2001     1 -0.0125  
    # 6       2002     0 NA       
    # 7       2002     1 -0.468   
    # 8       2003     0 -0.00962 
    # 9       2003     1  0.0479  
    

    One advantage to using this methodology is that the data used and the coefficients produced are always associated with the same categories/factors.

    Quick walk-through, though a more detailed tutorial on dplyr will bring out many more learning points:

    • %>% is just a "pipe operator", where it helps visually break out what is happening.

      These two are functionally identical:

      pdata %>% somefunc(.)
      somefunc(pdata)
      

      Its strength comes in longer pipes:

      pdata %>% somefunc(.) %>% anotherfunc(., arg = 7) %>% finalfunc(., n = 1)
      finalfunc(anotherfunc(somefunc(pdata), arg = 7), n = 1)
      
    • group_by ensures that all data-creating/changing operations only operate on one group of data at a time. (Note: the "grouped" thing sticks with data until actively removed with ungroup(), and some calculations are different based on the data size. I make it a habit to always ungroup as soon as I know I don't need it, so that I don't accidentally mis-calculate later on.)

    • mutate (not used here) adds or replaces columns in a frame, keeping the same number of rows; summarize typically reduces a group to one row (though newer versions of dplyr are supposed to be able to summarize to more than 1 row ... the key point is that it makes a significant change in the shape of the data). With summarize, all other columns are dropped (because one cannot keep a normal n-row column when summarizing to a 1-row result).


    Alternative that does not require you to learn dplyr: split on a single name that can more easily be joined back in to the rest of the data. One can use paste or such, though I prefer the communicated-intend (declarative) flow of the interaction function (which ... calls paste).

    pdata$groupname <- interaction(pdata$chart_year, pdata$ifUS)
    head(pdata)
    #   movie_odid chart_date chart_year  revenue theaters running_time ifUS groupname
    # 1      10100 1997-12-19       1997 52969336     2674          194    1    1997.1
    # 2      10100 1997-12-26       1997 71183357     2711          194    1    1997.1
    # 3     520100 1997-07-04       1997 76457208     3020           98    1    1997.1
    # 4     520100 1997-07-11       1997 43593212     3020           98    1    1997.1
    # 5     650100 1997-05-23       1997 81172327     3281          134    1    1997.1
    # 6     650100 1997-05-30       1997 45111185     3282          134    1    1997.1
    

    This has not done much, but we're going to split on this single field (same effect of your 2-field split) and run your function.

    Instead of split, I'm going to use its cousin function by, which does the same thing and runs a function on each frame. The output if by is really just a list, even if it looks very different.

    out <- by(pdata, pdata$groupname, FUN = coefLM)
    out
    # pdata$groupname: 1997.0
    # [1] NA
    # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
    # pdata$groupname: 1998.0
    # [1] NA
    # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
    # pdata$groupname: 1999.0
    # [1] NA
    # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
    # pdata$groupname: 2000.0
    # [1] NA
    # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
    # pdata$groupname: 2001.0
    # [1] NA
    # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
    # pdata$groupname: 2002.0
    # [1] NA
    # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
    # pdata$groupname: 2003.0
    # [1] -0.009621244
    # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
    # pdata$groupname: 1997.1
    # [1] 0.0006017306
    # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
    # pdata$groupname: 1998.1
    # [1] 0.02882223
    # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
    # pdata$groupname: 1999.1
    # [1] -0.02168886
    # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
    # pdata$groupname: 2000.1
    # [1] 0.02991097
    # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
    # pdata$groupname: 2001.1
    # [1] -0.01250767
    # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
    # pdata$groupname: 2002.1
    # [1] -0.4683953
    # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
    # pdata$groupname: 2003.1
    # [1] 0.04785696