I am trying to subset my data based on the year and country and calculate the regression coefficient for each one.
a subset of my data structure is :
pdata <- structure(list(movie_odid = c(10100L, 10100L, 520100L, 520100L,
650100L, 650100L, 10100L, 10100L, 520100L, 780100L, 780100L,
950100L, 950100L, 540100L, 540100L, 780100L, 780100L, 880100L,
880100L, 450100L, 450100L, 540100L, 540100L, 640100L, 640100L,
800100L, 800100L, 420100L, 420100L, 450100L, 450100L, 490100L,
490100L, 640100L, 640100L, 430100L, 430100L, 490100L, 490100L,
590100L, 590100L, 1620100L, 1620100L, 390100L, 390100L, 8810100L,
8810100L, 9480100L, 9480100L, 570100L, 570100L, 590100L, 590100L
), chart_date = structure(c(5L, 6L, 3L, 4L, 1L, 2L, 7L, 8L, 7L,
11L, 12L, 9L, 10L, 17L, 18L, 13L, 14L, 15L, 16L, 23L, 24L, 19L,
20L, 25L, 26L, 21L, 22L, 29L, 30L, 27L, 28L, 31L, 32L, 27L, 28L,
37L, 38L, 33L, 34L, 35L, 36L, 39L, 40L, 41L, 42L, 47L, 48L, 45L,
46L, 43L, 44L, 39L, 40L), .Label = c("1997-05-23", "1997-05-30",
"1997-07-04", "1997-07-11", "1997-12-19", "1997-12-26", "1998-01-02",
"1998-01-09", "1998-06-26", "1998-07-03", "1998-07-24", "1998-07-31",
"1999-02-05", "1999-02-12", "1999-06-04", "1999-06-11", "1999-11-19",
"1999-11-26", "2000-01-07", "2000-01-14", "2000-05-26", "2000-06-02",
"2000-11-17", "2000-11-24", "2000-12-22", "2000-12-29", "2001-01-05",
"2001-01-12", "2001-05-18", "2001-05-25", "2001-11-02", "2001-11-09",
"2002-01-04", "2002-01-11", "2002-04-19", "2002-04-26", "2002-11-15",
"2002-11-22", "2003-01-03", "2003-01-10", "2003-05-09", "2003-05-16",
"2003-05-23", "2003-05-30", "2003-09-12", "2003-09-19", "2003-11-07",
"2003-11-14"), class = "factor"), chart_year = c(1997L, 1997L,
1997L, 1997L, 1997L, 1997L, 1998L, 1998L, 1998L, 1998L, 1998L,
1998L, 1998L, 1999L, 1999L, 1999L, 1999L, 1999L, 1999L, 2000L,
2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2001L, 2001L,
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2002L, 2002L, 2002L,
2002L, 2002L, 2002L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L,
2003L, 2003L, 2003L, 2003L, 2003L, 2003L), revenue = c(52969336L,
71183357L, 76457208L, 43593212L, 81172327L, 45111185L, 45012810L,
37568867L, 48261L, 49760360L, 36612617L, 1962627L, 1441774L,
23093123L, 65927993L, 5107876L, 4771193L, 3000000L, 82216507L,
84977355L, 59922105L, 8431650L, 7296370L, 78711571L, 40769776L,
83347490L, 37133438L, 56498192L, 63650772L, 2968580L, 788895L,
76599345L, 57025088L, 28499878L, 22837762L, 106131568L, 61909948L,
4502006L, 3272808L, 822068L, 1078673L, 3843873L, 2101748L, 42508303L,
121361422L, 10163670L, 11628760L, 29944555L, 14018616L, 100066590L,
49010220L, 3536766L, 3321470L), theaters = c(2674L, 2711L, 3020L,
3020L, 3281L, 3282L, 2727L, 2746L, 58L, 2453L, 2540L, 214L, 214L,
3236L, 3236L, 1027L, 1140L, 0L, 3312L, 3127L, 3134L, 2752L, 2326L,
2774L, 2929L, 3653L, 3653L, 3587L, 3623L, 2594L, 912L, 3237L,
3269L, 2948L, 3048L, 3682L, 3682L, 1425L, 1313L, 108L, 141L,
1808L, 1180L, 3603L, 3603L, 576L, 1177L, 3282L, 3289L, 3483L,
3492L, 1194L, 1212L), running_time = c(194L, 194L, 98L, 98L,
134L, 134L, 194L, 194L, 98L, 169L, 169L, 220L, 220L, 92L, 92L,
169L, 169L, 95L, 95L, 105L, 105L, 92L, 92L, 143L, 143L, 126L,
126L, 90L, 90L, 105L, 105L, 92L, 92L, 143L, 143L, 161L, 161L,
92L, 92L, 95L, 95L, 133L, 133L, 138L, 138L, 135L, 135L, 0L, 0L,
102L, 102L, 95L, 95L), ifUS = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L,
1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L)), class = "data.frame", row.names = c(NA,
-53L))
and my code is :
coefLM <- function(x) {
coef(lm(log(revenue) ~ running_time, data = x))[2]
}
spl <- with(pdata, split(pdata, list(chart_year = chart_year, ifUS = ifUS)))
out <- unique(pdata[, c("chart_year", "ifUS")])
out <- transform(out, slope = sapply(spl, coefLM))
out
However, there is a time mismatch in the result. for example for "1999.0.running_time" the chart_year is "2012". would you please guide me on what could be the possible cause?
In your call to out <- transform(...)
, the order of chart_year
and ifUS
does not match the spl
list.
spl <- with(pdata, split(pdata, list(chart_year = chart_year, ifUS = ifUS)))
out <- unique(pdata[, c("chart_year", "ifUS")])
out
# chart_year ifUS
# 1 1997 1
# 7 1998 1
# 14 1999 1
# 20 2000 1
# 28 2001 1
# 36 2002 0
# 38 2002 1
# 42 2003 0
# 44 2003 1
nrow(out)
# [1] 9
But when you try to transform
it,
out <- transform(out, slope = sapply(spl, coefLM))
# Error in lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
# 0 (non-NA) cases
I suspect that you are not getting this error because you have a different (larger) spl
. However, if you look at out
and names(spl)
, even if the lengths were the same, it appears that they do not line up. (I'll show in a frame just to show
nrow(out)
# [1] 9
length(spl)
# [1] 14
sapply(spl, nrow)
# 1997.0 1998.0 1999.0 2000.0 2001.0 2002.0 2003.0 1997.1 1998.1 1999.1 2000.1 2001.1 2002.1 2003.1
# 0 0 0 0 0 2 6 6 7 6 8 8 4 6
Even if we filter out the 0-row frames from spl
(which does give us length 9), the order between spl
and out
is not the same:
names(Filter(c, sapply(spl, nrow)))
# [1] "2002.0" "2003.0" "1997.1" "1998.1" "1999.1" "2000.1" "2001.1" "2002.1" "2003.1"
do.call(paste, c(out, sep = "."))
# [1] "1997.1" "1998.1" "1999.1" "2000.1" "2001.1" "2002.0" "2002.1" "2003.0" "2003.1"
(Don't focus on the code I used, instead on what I am showing.)
(Honestly, you should be happy that you caught this problem. Otherwise, it is highly likely that you would have had models with significance identified, true, but assigned to the incorrect groups. Corrupted results.)
How to fix? Here's one suggestion, shifting over to a dplyr
method:
library(dplyr)
pdata %>%
group_by(chart_year, ifUS) %>%
summarize(coef = coefLM(cur_data())) %>%
ungroup()
# `summarise()` regrouping output by 'chart_year' (override with `.groups` argument)
# # A tibble: 9 x 3
# chart_year ifUS coef
# <int> <int> <dbl>
# 1 1997 1 0.000602
# 2 1998 1 0.0288
# 3 1999 1 -0.0217
# 4 2000 1 0.0299
# 5 2001 1 -0.0125
# 6 2002 0 NA
# 7 2002 1 -0.468
# 8 2003 0 -0.00962
# 9 2003 1 0.0479
One advantage to using this methodology is that the data used and the coefficients produced are always associated with the same categories/factors.
Quick walk-through, though a more detailed tutorial on dplyr
will bring out many more learning points:
%>%
is just a "pipe operator", where it helps visually break out what is happening.
These two are functionally identical:
pdata %>% somefunc(.)
somefunc(pdata)
Its strength comes in longer pipes:
pdata %>% somefunc(.) %>% anotherfunc(., arg = 7) %>% finalfunc(., n = 1)
finalfunc(anotherfunc(somefunc(pdata), arg = 7), n = 1)
group_by
ensures that all data-creating/changing operations only operate on one group of data at a time. (Note: the "grouped" thing sticks with data until actively removed with ungroup()
, and some calculations are different based on the data size. I make it a habit to always ungroup
as soon as I know I don't need it, so that I don't accidentally mis-calculate later on.)
mutate
(not used here) adds or replaces columns in a frame, keeping the same number of rows; summarize
typically reduces a group to one row (though newer versions of dplyr
are supposed to be able to summarize to more than 1 row ... the key point is that it makes a significant change in the shape of the data). With summarize
, all other columns are dropped (because one cannot keep a normal n
-row column when summarizing to a 1-row result).
Alternative that does not require you to learn dplyr
: split on a single name that can more easily be joined back in to the rest of the data. One can use paste
or such, though I prefer the communicated-intend (declarative) flow of the interaction
function (which ... calls paste
).
pdata$groupname <- interaction(pdata$chart_year, pdata$ifUS)
head(pdata)
# movie_odid chart_date chart_year revenue theaters running_time ifUS groupname
# 1 10100 1997-12-19 1997 52969336 2674 194 1 1997.1
# 2 10100 1997-12-26 1997 71183357 2711 194 1 1997.1
# 3 520100 1997-07-04 1997 76457208 3020 98 1 1997.1
# 4 520100 1997-07-11 1997 43593212 3020 98 1 1997.1
# 5 650100 1997-05-23 1997 81172327 3281 134 1 1997.1
# 6 650100 1997-05-30 1997 45111185 3282 134 1 1997.1
This has not done much, but we're going to split on this single field (same effect of your 2-field split) and run your function.
Instead of split
, I'm going to use its cousin function by
, which does the same thing and runs a function on each frame. The output if by
is really just a list
, even if it looks very different.
out <- by(pdata, pdata$groupname, FUN = coefLM)
out
# pdata$groupname: 1997.0
# [1] NA
# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# pdata$groupname: 1998.0
# [1] NA
# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# pdata$groupname: 1999.0
# [1] NA
# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# pdata$groupname: 2000.0
# [1] NA
# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# pdata$groupname: 2001.0
# [1] NA
# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# pdata$groupname: 2002.0
# [1] NA
# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# pdata$groupname: 2003.0
# [1] -0.009621244
# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# pdata$groupname: 1997.1
# [1] 0.0006017306
# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# pdata$groupname: 1998.1
# [1] 0.02882223
# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# pdata$groupname: 1999.1
# [1] -0.02168886
# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# pdata$groupname: 2000.1
# [1] 0.02991097
# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# pdata$groupname: 2001.1
# [1] -0.01250767
# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# pdata$groupname: 2002.1
# [1] -0.4683953
# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# pdata$groupname: 2003.1
# [1] 0.04785696