I have a dataset that looks like this:
working_df %>%
select(user_type, ride_length, day_of_week) %>%
as.tibble() %>%
print(n = 3)
# A tibble: 4,324,766 × 3
user_type ride_length day_of_week
<chr> <dbl> <ord>
1 Casual 34355 Saturday
2 Member 32035 Monday
3 Casual 29271 Saturday
I want to create a table that provides the average ride length of the users per group per day. I used SQL's CASE WHEN to divide the days of the week in separate columns. See below:
SELECT
COALESCE(user_type,'combined') AS user_type,
AVG(CASE WHEN DAYNAME(started_at) = 'Monday' THEN TIMESTAMPDIFF(MINUTE,started_at, ended_at) ELSE NULL END) AS avg_ride_length_monday,
AVG(CASE WHEN DAYNAME(started_at) = 'Tuesday' THEN TIMESTAMPDIFF(MINUTE,started_at, ended_at) ELSE NULL END) AS avg_ride_length_tuesday,
AVG(CASE WHEN DAYNAME(started_at) = 'Wednesday' THEN TIMESTAMPDIFF(MINUTE,started_at, ended_at) ELSE NULL END) AS avg_ride_length_wednesday,
AVG(CASE WHEN DAYNAME(started_at) = 'Thursday' THEN TIMESTAMPDIFF(MINUTE,started_at, ended_at) ELSE NULL END) AS avg_ride_length_thursday,
AVG(CASE WHEN DAYNAME(started_at) = 'Friday' THEN TIMESTAMPDIFF(MINUTE,started_at, ended_at) ELSE NULL END) AS avg_ride_length_friday,
AVG(CASE WHEN DAYNAME(started_at) = 'Saturday' THEN TIMESTAMPDIFF(MINUTE,started_at, ended_at) ELSE NULL END) AS avg_ride_length_saturday,
AVG(CASE WHEN DAYNAME(started_at) = 'Sunday' THEN TIMESTAMPDIFF(MINUTE,started_at, ended_at) ELSE NULL END) AS avg_ride_length_sunday,
AVG(TIMESTAMPDIFF(MINUTE,started_at, ended_at)) AS grand_total
FROM
bikes.work
GROUP BY
user_type WITH ROLLUP;
In my research, I found dplyr::case_when(), but I have only seen examples where values are counted in one column, and I want each day of the week to be spread out. Does R have the ability to push results over several columns?
Edit: Thanks to your suggestions, I found this potential answer:
working_df %>%
mutate_at(vars(c(user_type, day_of_week)), funs(as.character(.))) %>%
bind_rows(mutate(., user_type = "Combined")) %>%
bind_rows(mutate(., day_of_week = "Grand_Total")) %>%
group_by(user_type, day_of_week) %>%
summarize(avg_ride_length_min = mean(ride_length)) %>%
pivot_wider(names_from = day_of_week,
values_from = avg_ride_length_min,
names_prefix = "avg_ride_length_")
user_type avg_ride_length_Friday avg_ride_length_Grand_Total avg_ride_length_Monday avg_ride_length_Saturday avg_ride_length_Sunday
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Casual 22.6 24.2 25.0 27.0 27.5
2 Combined 16.5 17.3 16.8 20.9 20.8
3 Member 12.4 12.6 12.2 14.2 14.0
avg_ride_length_Thursday avg_ride_length_Tuesday avg_ride_length_Wednesday
<dbl> <dbl> <dbl>
1 21.6 21.6 20.9
2 15.5 15.1 14.9
3 12.2 11.9 12.0
Here is the answer with tidier results:
working_df %>%
mutate(across(c(user_type, day_of_week), as.character)) %>%
bind_rows(mutate(., user_type = "United")) %>%
bind_rows(mutate(., day_of_week = "gt")) %>%
group_by(user_type, day_of_week) %>%
summarize(avg_ride_length_min = mean(ride_length)) %>%
pivot_wider(names_from = day_of_week,
values_from = avg_ride_length_min,
names_prefix = "avg_ride_length_") %>%
select("user_type", "avg_ride_length_Monday", "avg_ride_length_Tuesday", "avg_ride_length_Wednesday", "avg_ride_length_Thursday", "avg_ride_length_Friday", "avg_ride_length_Saturday", "avg_ride_length_Sunday", "avg_ride_length_gt")
user_type avg_ride_length_Monday avg_ride_length_Tuesday avg_ride_length_Wednesday avg_ride_length_Thursday avg_ride_length_Friday
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Casual 25.0 21.6 20.9 21.6 22.6
2 Member 12.2 11.9 12.0 12.2 12.4
3 United 16.8 15.1 14.9 15.5 16.5
avg_ride_length_Saturday avg_ride_length_Sunday avg_ride_length_gt
<dbl> <dbl> <dbl>
1 27.0 27.5 24.2
2 14.2 14.0 12.6
3 20.9 20.8 17.3
For some reason, bind_rows
changes the row order alphabetically. I wasn't able to change that, so I changed the last row from "Combined" to "United"