Search code examples
rdataframefunctiondplyrtidyverse

Avoiding NAs in the output of dplyr


I have tried the following to turn my DATA into my Desired_output (reproducible code below). But as you can see, I get a bunch of NA in my output.

Is there a way to obtain my Desired_output?

Tried:

library(tidyverse)
DATA %>%
  mutate(
    category = case_when(
      str_detect(Discription, "CTE") ~ "CTE",
      str_detect(Discription, "AP/IB") ~ "AP_IB",
      TRUE ~ "Other"
    ),
    group = case_when(
      str_detect(Discription, "ELs") ~ "ELs",
      str_detect(Discription, "Former ELs") ~ "Former ELs",
      str_detect(Discription, "Monitor ELs") ~ "Monitor ELs",
      str_detect(Discription, "Never ELs") ~ "Never ELs",  # Explicitly include Never ELs
      TRUE ~ "Other"
    ),
    value_type = case_when(
      str_detect(Discription, "^Total") ~ "Total",  # Identify rows with Total values
      str_detect(Discription, "Percent") ~ "Percent",  # Identify rows with Percent values
      TRUE ~ "Value"  # Identify rows with regular values
    )
  ) %>%
  # Separate the Total rows and use them in the "Total" column for each group
  filter(value_type != "Percent" & value_type != "Other") %>%
  pivot_wider(
    names_from = value_type, 
    values_from = value, 
    values_fn = list(value = sum), 
    values_fill = list(value = NA)
  ) %>%
  # Pivot the Total values separately
  mutate(
    Total = case_when(
      str_detect(Discription, "Total") ~ Value,  # Take "Total" values for each group
      TRUE ~ NA_real_
    )
  ) %>%
  # Now calculate Percent based on (Value / Total) * 100 for each group
  group_by(group) %>%
  mutate(
    Percent = ifelse(!is.na(Value) & !is.na(Total), (Value / Total) * 100, NA)
  ) %>%
  ungroup() %>%
  # Remove rows where Description contains 'Percent' because we already have a Percent column
  filter(!str_detect(Discription, "Percent")) %>%
  # Select the columns for the final output
  select(ResdDistInstID, InstNm, category, group, Discription, Value, Total, Percent) %>%
  arrange(InstNm, category, group)
DATA <- structure(list(
  ResdDistInstID = c(1894, 1894, 1894, 1894, 1894, 
                     1894, 1894, 1894, 1894, 1894, 1894, 1894, 1894, 1894, 1894, 1894, 
                     1894, 1894, 1894, 1894), 
  InstNm = c("Baker SD 5J", "Baker SD 5J", 
             "Baker SD 5J", "Baker SD 5J", "Baker SD 5J", "Baker SD 5J", "Baker SD 5J", 
             "Baker SD 5J", "Baker SD 5J", "Baker SD 5J", "Baker SD 5J", "Baker SD 5J", 
             "Baker SD 5J", "Baker SD 5J", "Baker SD 5J", "Baker SD 5J", "Baker SD 5J", 
             "Baker SD 5J", "Baker SD 5J", "Baker SD 5J"), 
  Discription = c("ELs in an AP/IB Class", 
                  "ELs CTE Class", "Total ELs", "Percent ELs in AP/IB Class", "Percent ELs in CTE Class", 
                  "Never ELs in an AP/IB Class", "Never ELs in a CTE Class", "Total Never ELs", 
                  "Percent Never ELs in AP/IB", "Percent Never ELs in CTE", "Former ELs CTE Class", 
                  "Former ELs in an AP/IB Class", "Total Former ELs", "Percent Former ELs in AP/IB", 
                  "Percent Former ELs in CTE", "Monitor ELs CTE Class", "Monitor ELs in an AP/IB Class", 
                  "Total Monitor ELs", "Percent Monitor ELs in AP/IB", "Percent Monitor ELs in CTE"
  ), 
  value = c(1, 6, 83, 1.2, 7.2, 95, 329, 4845, 1.9, 6.7, 12, 
            5, 129, 3.8, 9.3, 2, 0, 29, 0, 6.8)), 
  row.names = c(NA, -20L), 
  class = "data.frame"
)

Desired_Output <- structure(list(
  ResdDistInstID = c(1894, 1894, 1894, 1894, 1894, 1894, 1894, 1894), 
  InstNm = c("Baker SD 5J", "Baker SD 5J", "Baker SD 5J", "Baker SD 5J", 
             "Baker SD 5J", "Baker SD 5J", "Baker SD 5J", "Baker SD 5J"), 
  category = c("AP_IB", "CTE", "AP_IB", "CTE", "AP_IB", "CTE", "AP_IB", "CTE"), 
  group = c("ELs", "ELs", "Former ELs", "Former ELs", "Never ELs", "Never ELs", 
            "Monitor ELs", "Monitor ELs"), 
  Discription = c("ELs in an AP/IB Class", "ELs CTE Class", "Former ELs in an AP/IB Class", 
                  "Former ELs CTE Class", "Never ELs in an AP/IB Class", "Never ELs in a CTE Class", 
                  "Monitor ELs in an AP/IB Class", "Monitor ELs CTE Class"), 
  Value = c(1, 6, 5, 12, 95, 329, 2, 0), 
  Total = c(83, 83, 129, 129, 4845, 4845, 29, 29), 
  Percent = c(1.2, 7.2, 3.9, 9.3, 1.9, 6.8, 6.9, 0)
), row.names = c(NA, -8L), class = "data.frame")

Solution

  • This should do what you want. I used
    Gregor Thomas' cool str_extract(Discription, pattern = "((Former )|(Monitor )|(Never ))?ELs")) and some left_joins.

    DO <- DATA %>%
      mutate(
        category = if_else(grepl("AP/IB", Discription), "AP_IB", "CTE"),
        group = str_extract(Discription, pattern = "((Former )|(Monitor )|(Never ))?ELs")
      ) %>%
      # Get totals
      left_join(DATA %>% filter(grepl("Total", Discription)) %>% 
          mutate(
            group = gsub("Total ", "", Discription),
            Total = value
          ) %>% select(group, Total), by = "group") %>%
      # Get percentages
      left_join(DATA %>% filter(grepl("Percent", Discription)) %>%
          mutate(
            group = str_extract(Discription, pattern = "((Former )|(Monitor )|(Never ))?ELs"),
            category = if_else(grepl("AP/IB", Discription), "AP_IB", "CTE")
          ) %>% select(group, category, Percent = value),  by = c("group", "category")
      ) %>%
      filter(!grepl("Total|Percent", Discription)) %>%
      select(ResdDistInstID,InstNm, category, group,Discription,Value = value,Total,Percent) %>% arrange(InstNm, category, group)