Search code examples
rdplyrsummarizegt

How to create a calculated column in R


Below is the sample data set and the desired manipulations. As of yet, all works fine. Attempting to create a new calculated column. Some context, the smb stands for small business. 1,2,3,4 represent differing thresholds of what would be considered small. The desired column would what percentage of total employment is taken up by smb = 1 for a given area, for example. For area 001, this would be 46/1927 for example. I can figure out how to have it to be appear once but not as a complete column. How would I go about doing this? The desired result is at the bottom.

library(readxl)
library(dplyr)
library(data.table)
library(DBI)
library(stringr)
library(tidyverse)
library(gt)


 employment <- c(1,45,125,130,165,260,600,601,2,46,127,132,167,265,601,602,50,61,110,121,170,305,55,603,52,66,112,123,172,310,604,605)
 small <- c(1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA)
 area <-c(001,001,001,001,001,001,001,001,001,001,001,001,001,001,001,001,003,003,003,003,003,003,003,003,003,003,003,003,003,003,003,003)
 year<-c(2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020)
 qtr <-c(1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2)

 smbtest <- data.frame(employment,small,area,year,qtr)

 smbtest$smb <-0

 smbtest <- smbtest %>% mutate(smb = case_when(employment >=0 & employment <100 ~ "1",employment >=0 & employment <150 ~ "2",employment >=0 & employment <250 ~ "3", employment >=0 & employment <500 ~ "4"))


 smbsummary2<-smbtest %>% 
 mutate(period = paste0(year,"q",qtr)) %>%
 group_by(area,period,smb) %>%
 summarise(employment = sum(employment), worksites = n(), 
        .groups = 'drop_last') %>% 
 mutate(employment = cumsum(employment),
     worksites = cumsum(worksites))

 smbsummary2<- smbsummary2%>%
 group_by(area,smb)%>%
 mutate(empprevyear=lag(employment),
     empprevyearpp=employment-empprevyear,
     empprevyearpct=((employment/empprevyear)-1), 
 empprevyearpct=scales::percent(empprevyearpct,accuracy = 0.01)
 )




smblonger2<-smbsummary2 %>%
dplyr::select(area,period,employment,worksites,smb) %>%
ungroup() %>%
pivot_longer(cols = employment:worksites, names_to = "measure", values_to = "value") %>%
group_by(area,measure) %>%
pivot_wider(names_from = period, values_from = value)%>%filter(smb %in% 
c("1","2","3","4"))%>%gt()%>%cols_label(
smb = md("**Category**"))


smblonger2

area    period   smb    employment    worksites    pcttotal
 1      2020q1    1         46           2          46/1927 (total employment)
 2      2020q2    2        301           4          301/1927
 3      2020q3    3        466           5          466/1927
 4      2020q4    4        726           6          726/1927

 schema
 smb      employment range
  1         0 to 100
  2         0 to 150
  3         0 to 250
  4         0 to 500

Solution

  • OK so here is my solution (someone will now come along with a 1 line function!)

    library(dplyr)
    library(tidyr)
    
    employment <- c(1,45,125,130,165,260,600,601,2,46,127,132,167,265,601,602,50,61,110,121,170,305,55,603,52,66,112,123,172,310,604,605)
    small <- c(1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA)
    area <-c(001,001,001,001,001,001,001,001,001,001,001,001,001,001,001,001,003,003,003,003,003,003,003,003,003,003,003,003,003,003,003,003)
    year<-c(2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020)
    qtr <-c(1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2)
    
    smbtest <- data.frame(employment,small,area,year,qtr)
    
    smbtest$smb <-0  # I think this line is redundent
    
    smbtest <- smbtest %>% mutate(smb = case_when(employment >=0 & employment <100 ~ "1",employment >=0 & employment <150 ~ "2",employment >=0 & employment <250 ~ "3", employment >=0 & employment <500 ~ "4"))
    
    smbsummary2<-smbtest %>% 
        mutate(period = paste0(year,"q",qtr)) %>%
        group_by(area,period,smb) %>%
        summarise(employment = sum(employment), worksites = n(), 
                  .groups = 'drop_last') %>% 
        mutate(employment = cumsum(employment),
               worksites = cumsum(worksites))
    
    
    
    smbsummary2 %>%
        # Make the data wider (a column for each smb)
        pivot_wider(
            id_cols=c("area", "period"), 
            names_from = "smb", 
            values_from = c("employment", "worksites"),
            names_prefix = "SMB"
            ) %>%
        # calculate the %
        mutate(across(starts_with("employment_SMB"), 
                      ~(100*(.x/employment_SMBNA)),
                      .names = "pcttotal_{.col}")) %>%
    
        # Now make the data longer
        pivot_longer(
            cols = contains("SMB")
        ) %>%
        # rework the data names so the smb is a value
        separate(name, into=c("name", "smb"), sep="_SMB") %>%
        # Make the date wider again to match the shape requested
        pivot_wider(
            id_cols=c("area", "period", "smb"), 
            names_from = "name", 
            values_from = "value"
        ) -> smbsummary3