I have some football code that uses data sets that are updated every week, now they have become really large data frames and I needed to convert them into data tables.
The problem is with this part of the code, I don't know how to replicate the result in data tables. The idea is that columns whose name appears in the non_percent vector are summarized in a certain way, while those that appear in the percent vector are summarized differently.
player_matchLogs %>%
group_by(Player, Tier, Sex) %>%
summarize(
Min = round(sum(Min, na.rm = TRUE) / 90),
across(non_percent, ~ round(sum(.x, na.rm = TRUE) / Min, 2)),
across(percent, ~ round(mean(.x, na.rm = TRUE), 2))
)
Here's a glimpse of the dataset:
> glimpse(player_matchLogs)
Rows: 41
Columns: 130
$ League <chr> "Premier League", "Pr…
$ Match_Date <chr> "2023-08-11", "2023-0…
$ Matchweek <dbl> 1, 2, 3, 4, 5, 6, 7, …
$ Home_Team <chr> "Burnley", "Mancheste…
$ Home_Formation <chr> "5-4-1", "4-2-3-1", "…
$ Home_Score <dbl> 0, 1, 1, 5, 1, 2, 2, …
$ Home_xG <dbl> 0.3, 1.0, 0.7, 2.2, 0…
$ Home_Goals <chr> "Anass Zaroury · 90+4…
$ Home_Yellow_Cards <chr> "0", "1", "4", "2", "…
$ Home_Red_Cards <chr> "1", "0", "0", "0", "…
$ Away_Team <chr> "Manchester City", "N…
$ Away_Formation <chr> "4-2-3-1", "4-3-3", "…
$ Away_Score <dbl> 3, 0, 2, 1, 3, 0, 1, …
$ Away_xG <dbl> 1.9, 0.3, 3.5, 1.4, 3…
$ Away_Goals <chr> "Erling Haaland · 4&r…
$ Away_Yellow_Cards <chr> "0", "5", "0", "4", "…
$ Away_Red_Cards <chr> "0", "0", "0", "0", "…
$ Game_URL <chr> "https://fbref.com/en…
$ Team <chr> "Manchester City", "M…
$ Home_Away <chr> "Away", "Home", "Away…
$ Player <chr> "Julián Álvarez", "Ju…
$ Player_Href <chr> "/en/players/15ab5a2b…
$ Player_Num <dbl> 19, 19, 19, 19, 19, 1…
$ Nation <chr> "ARG", "ARG", "ARG", …
$ Pos <chr> "LM,AM,FW", "AM", "AM…
$ Age <chr> "23-192", "23-200", "…
$ Min <dbl> 90, 90, 90, 89, 90, 5…
$ Cmp_Total <dbl> 36, 24, 34, 28, 52, 1…
$ Att_Total <dbl> 44, 31, 52, 37, 63, 2…
$ Cmp_percent_Total <dbl> 81.8, 77.4, 65.4, 75.…
$ TotDist_Total <dbl> 490, 283, 542, 355, 7…
$ PrgDist_Total <dbl> 113, 83, 102, 100, 18…
$ Cmp_Short <dbl> 22, 18, 20, 15, 30, 1…
$ Att_Short <dbl> 25, 20, 24, 19, 34, 1…
$ Cmp_percent_Short <dbl> 88.0, 90.0, 83.3, 78.…
$ Cmp_Medium <dbl> 7, 5, 11, 6, 13, 4, 5…
$ Att_Medium <dbl> 9, 9, 17, 7, 14, 4, 8…
$ Cmp_percent_Medium <dbl> 77.8, 55.6, 64.7, 85.…
$ Cmp_Long <dbl> 3, 0, 3, 2, 5, 2, 2, …
$ Att_Long <dbl> 5, 1, 8, 4, 11, 5, 6,…
$ Cmp_percent_Long <dbl> 60.0, 0.0, 37.5, 50.0…
$ Ast <dbl> 1, 0, 0, 0, 2, 0, 0, …
$ xAG <dbl> 0.3, 0.1, 0.2, 0.0, 0…
$ xA <dbl> 0.4, 0.0, 0.2, 0.0, 0…
$ KP <dbl> 6, 2, 4, 0, 4, 0, 3, …
$ Final_Third <dbl> 0, 0, 1, 2, 5, 0, 2, …
$ PPA <dbl> 2, 1, 1, 1, 3, 0, 1, …
$ CrsPA <dbl> 0, 0, 0, 0, 1, 0, 0, …
$ PrgP <dbl> 2, 4, 4, 3, 4, 1, 4, …
$ Sex <chr> "M", "M", "M", "M", "…
$ Tier <chr> "1st", "1st", "1st", …
$ Att <dbl> 44, 31, 52, 37, 63, 2…
$ Live_Pass_Types <dbl> 38, 29, 40, 34, 55, 2…
$ Dead_Pass_Types <dbl> 6, 2, 11, 2, 8, 3, 8,…
$ FK_Pass_Types <dbl> 3, 1, 1, 1, 1, 0, 6, …
$ TB_Pass_Types <dbl> 2, 1, 0, 0, 1, 0, 0, …
$ Sw_Pass_Types <dbl> 0, 0, 0, 0, 0, 0, 0, …
$ Crs_Pass_Types <dbl> 5, 3, 12, 1, 9, 3, 5,…
$ TI_Pass_Types <dbl> 1, 1, 0, 0, 1, 0, 0, …
$ CK_Pass_Types <dbl> 2, 0, 9, 1, 5, 2, 2, …
$ In_Corner_Kicks <dbl> 0, 0, 5, 0, 0, 1, 0, …
$ Out_Corner_Kicks <dbl> 2, 0, 2, 1, 4, 1, 1, …
$ Str_Corner_Kicks <dbl> 0, 0, 0, 0, 0, 0, 0, …
$ Cmp_Outcomes <dbl> 36, 24, 34, 28, 52, 1…
$ Off_Outcomes <dbl> 0, 0, 1, 1, 0, 0, 0, …
$ Blocks_Outcomes <dbl> 0, 0, 2, 1, 1, 1, 0, …
$ Tkl_Tackles <dbl> 1, 1, 0, 1, 0, 1, 2, …
$ TklW_Tackles <dbl> 1, 1, 0, 1, 0, 1, 0, …
$ `Def 3rd_Tackles` <dbl> 1, 0, 0, 0, 0, 1, 0, …
$ `Mid 3rd_Tackles` <dbl> 0, 0, 0, 1, 0, 0, 0, …
$ `Att 3rd_Tackles` <dbl> 0, 1, 0, 0, 0, 0, 2, …
$ Tkl_Challenges <dbl> 0, 0, 0, 0, 0, 0, 2, …
$ Att_Challenges <dbl> 1, 1, 2, 1, 0, 0, 2, …
$ Tkl_percent_Challenges <dbl> 0, 0, 0, 0, NA, NA, 1…
$ Lost_Challenges <dbl> 1, 1, 2, 1, 0, 0, 0, …
$ Blocks_Blocks <dbl> 1, 1, 3, 1, 1, 0, 0, …
$ Sh_Blocks <dbl> 0, 0, 0, 0, 0, 0, 0, …
$ Pass_Blocks <dbl> 1, 1, 3, 1, 1, 0, 0, …
$ Int <dbl> 0, 0, 1, 0, 0, 0, 0, …
$ `Tkl+Int` <dbl> 1, 1, 1, 1, 0, 1, 2, …
$ Clr <dbl> 0, 0, 0, 0, 0, 1, 0, …
$ Err <dbl> 0, 0, 0, 0, 0, 0, 0, …
$ Touches_Touches <dbl> 53, 40, 67, 41, 67, 3…
$ `Def Pen_Touches` <dbl> 0, 0, 0, 0, 0, 2, 0, …
$ `Def 3rd_Touches` <dbl> 2, 5, 2, 0, 5, 3, 3, …
$ `Mid 3rd_Touches` <dbl> 22, 19, 17, 23, 18, 8…
$ `Att 3rd_Touches` <dbl> 29, 16, 49, 19, 44, 2…
$ `Att Pen_Touches` <dbl> 4, 3, 9, 2, 4, 2, 2, …
$ Live_Touches <dbl> 53, 40, 67, 41, 67, 3…
$ Att_Take_Ons <dbl> 0, 3, 0, 1, 0, 2, 1, …
$ Succ_Take_Ons <dbl> 0, 2, 0, 1, 0, 0, 1, …
$ Succ_percent_Take_Ons <dbl> NA, 66.7, NA, 100.0, …
$ Tkld_Take_Ons <dbl> 0, 1, 0, 0, 0, 2, 0, …
$ Tkld_percent_Take_Ons <dbl> NA, 33.3, NA, 0.0, NA…
$ Carries_Carries <dbl> 25, 25, 44, 26, 45, 1…
$ TotDist_Carries <dbl> 104, 130, 156, 159, 2…
$ PrgDist_Carries <dbl> 50, 62, 91, 114, 158,…
$ PrgC_Carries <dbl> 2, 3, 1, 3, 6, 0, 0, …
$ Final_Third_Carries <dbl> 2, 1, 6, 3, 4, 1, 0, …
$ CPA_Carries <dbl> 1, 1, 0, 1, 1, 0, 0, …
$ Mis_Carries <dbl> 2, 3, 2, 0, 0, 0, 0, …
$ Dis_Carries <dbl> 1, 0, 1, 2, 0, 0, 1, …
$ Rec_Receiving <dbl> 40, 32, 46, 35, 47, 2…
$ PrgR_Receiving <dbl> 6, 5, 10, 5, 7, 2, 4,…
$ CrdY <dbl> 0, 1, 0, 0, 0, 0, 1, …
$ CrdR <dbl> 0, 0, 0, 0, 0, 0, 0, …
$ `2CrdY` <dbl> 0, 0, 0, 0, 0, 0, 0, …
$ Fls <dbl> 1, 3, 0, 3, 0, 0, 1, …
$ Fld <dbl> 1, 1, 0, 1, 0, 0, 1, …
$ Off <dbl> 0, 0, 0, 0, 0, 0, 0, …
$ Crs <dbl> 5, 3, 12, 1, 9, 3, 5,…
$ TklW <dbl> 1, 1, 0, 1, 0, 1, 0, …
$ PKwon <dbl> 0, 0, 0, 1, 0, 0, 0, …
$ PKcon <dbl> 0, 0, 0, 0, 0, 0, 0, …
$ OG <dbl> 0, 0, 0, 0, 0, 0, 0, …
$ Recov <dbl> 2, 3, 2, 3, 2, 3, 2, …
$ Won_Aerial_Duels <dbl> 1, 0, 0, 0, 0, 0, 0, …
$ Lost_Aerial_Duels <dbl> 3, 0, 0, 0, 3, 0, 1, …
$ Won_percent_Aerial_Duels <dbl> 25.0, NA, NA, NA, 0.0…
$ Id <chr> "20230811ManchesterCi…
$ Team_Formation <chr> "4-2-3-1", "4-2-3-1",…
$ Rival_Formation <chr> "5-4-1", "4-3-3", "3-…
$ Team_xG <dbl> 1.9, 1.0, 3.5, 2.2, 3…
$ Rival_xG <dbl> 0.3, 0.3, 0.7, 1.4, 0…
$ Team_YCards <chr> "0", "1", "0", "2", "…
$ Rival_YCards <chr> "0", "5", "4", "4", "…
$ Team_RCards <chr> "0", "0", "0", "0", "…
$ Rival_RCards <chr> "1", "0", "0", "0", "…
$ Team_Goals <dbl> 3, 1, 2, 5, 3, 2, 1, …
$ Rival_Goals <dbl> 0, 0, 1, 1, 1, 0, 2, …
Thank you and sorry if this is a stupid question.
You could try dtplyr, which lets you use dplyr code with data.table as a backend:
library(dplyr)
library(dtplyr)
player_matchLogs %>%
lazy_dt() %>%
group_by(Player, Tier, Sex) %>%
summarize(
Min = round(sum(Min, na.rm = TRUE) / 90),
across(all_of(non_percent), ~ round(sum(.x, na.rm = TRUE) / Min, 2)),
across(all_of(percent), ~ round(mean(.x, na.rm = TRUE), 2))
) %>%
as_tibble()