When working with data all roads for me lead to "stratification tables" so one can get a feel for the dispersion of the data. Visualization is both by numeric table and plot.
Can someone please recommend a flexible way to generate a stratification table; by "flexible" I mean where the user can input stratification parameters? In the below code I present a sample data frame, and the ways I'd like the user to be eventually able to cut (stratify) the data.
I'm pretty new to R and have always run stratifications in Excel. In the image at the bottom you can see you how I normally stratify in Excel, with the end product highlighted in yellow. I also include a 2nd image that shows the formulas used to generate the stratification table in the first image.
I've been trying to limit the use of packages (other than shiny and the amazing dplyr, DT) but I imagine there are some nice packages too for running stratifications.
Note that my stratifications are run as of a specific point-in-time (in my data there 2 ways to measure time, via Period_1
and Period_2
). So only those rows meeting that time criteria are included in the stratification.
Does anyone have suggestions for doing this?
Code:
library(shiny)
library(tidyverse)
library(shinyWidgets)
ui <-
fluidPage(
h5(strong("Raw data:")),
tableOutput("data"),
h5(strong("Grouped data:")),
radioButtons(
inputId = "grouping",
label = NULL,
choiceNames = c("By period 1", "By period 2"),
choiceValues = c("Period_1", "Period_2"),
selected = "Period_1",
inline = TRUE
),
tableOutput("summed_data"),
h5(strong("Point-in-time stratification table:")),
selectInput(inputId = "time",
label = "Choose a point-in-time:",
list(`By Period_1:` = list("2020-01", "2020-02", "2020-03", "2020-04"),
`By Period_2:` = list(1, 2, 3, 4)),
selected = "2020-04"),
numericInput(label = "Stratify by range of values:", 'strat_gap','',value=5,step=1,width = '100%'),
panel(
checkboxGroupInput(
inputId = "vars",
label = "Select characteristics to filter data by:",
choices = c("Category"),
selected = c("Category"),
inline = TRUE
),
selectizeGroupUI(
id = "my-filters",
params = list(
Category = list(inputId = "Category", title = "Category:")
)
),
status = "primary"
),
)
server <- function(input, output, session) {
data <- reactive({
data.frame(
ID = c(1,1,2,2,2,2,3,3,3),
Period_1 = c("2020-03", "2020-04", "2020-01", "2020-02", "2020-03", "2020-04", "2020-02", "2020-03", "2020-04"),
Period_2 = c(1, 2, 1, 2, 3, 4, 1, 2, 3),
Category = c("Toad", "Toad", "Stool", "Stool", "Stool", "Stool","Toad","Toad","Toad"),
Values = c(15, 25, 35, 45, 55, 87, 10, 20, 30)
)
})
choice <- reactive(input$grouping)
summed_data <- reactive({
data() %>%
group_by(across(choice())) %>%
select("Values") %>%
summarise(across(everything(), sum, na.rm = TRUE)) %>%
filter(across(1,.fns = ~ .x %>% negate(is.na)() ))
})
output$data <- renderTable(data())
output$summed_data <- renderTable(summed_data())
}
shinyApp(ui, server)
Excel example (2nd image shows stratification formulas):
In the interest of making this a more generalizable effort, here's how I would do it. In the UI, you can upload a CSV file and it grabs the names of the variables to use from the names in the file. There is one caveat here - the grouping variables have to have "Period" in their names somewhere. Otherwise, from there, you can choose the values to be summed from a list of the names of variables. The point in time values are taken from the observed values of the stratifying variable. You can also choose to filter on single variable and the values you can filter on are taken from the observed values of the filtering variable. Here's what it looks like:
and here is the code:
library(shiny)
library(tidyverse)
ui <-
fluidPage(
fluidRow(column(3, h5(strong("File Upload:"))),
column(3, h5(strong("Grouping:"))),
column(3, h5(strong("Point-in-time stratification table:"))),
column(3, h5(strong("Filtering:")))),
fluidRow(
column(3,
#actionButton("browser", "Browser"),
fileInput("file1", "Choose CSV File",
multiple = TRUE,
accept = c("text/csv",
"text/comma-separated-values,text/plain",
".csv")),
tags$hr(),
# Input: Checkbox if file has header ----
checkboxInput("header", "Header", TRUE),
# Input: Select separator ----
radioButtons("sep", "Separator",
choices = c(Comma = ",",
Semicolon = ";",
Tab = "\t"),
selected = ","),
# Input: Select quotes ----
radioButtons("quote", "Quote",
choices = c(None = "",
"Double Quote" = '"',
"Single Quote" = "'"),
selected = '"')),
column(3,
uiOutput("values"),
uiOutput("period")),
column(3,
uiOutput("time"),
numericInput(label = "Stratify by range of values:", 'strat_gap','',value=5,step=1,width = '100%'),
),
column(3,
uiOutput("filter_var"),
uiOutput("filter_val")
)),
fluidRow(
column(6,
h5(strong("Raw data:")),
tableOutput("data"),
),
column(6,
h5(strong("Grouped data:")),
tableOutput("summed_data"),
)
)
)
server <- function(input, output, session) {
dat <- reactive({
req(input$file1)
read.csv(input$file1$datapath,
header = input$header,
sep = input$sep,
quote = input$quote)
})
output$period <- renderUI({
req(dat())
pds <- dat() %>% select(contains("Period")) %>% names
chc_pd <- pds
names(chc_pd) <- paste0("By ", gsub("_", "", pds))
selectInput(inputId = "period",
label = NULL,
choices = chc_pd,
selected = pds[1]
)
})
output$time <- renderUI({
req(dat())
req(input$period)
chc <- unique(na.omit(dat()[[input$period]]))
selectInput(inputId = "time",
label = "Choose a point-in-time:",
choices = chc,
selected = chc[1])
})
output$filter_var <- renderUI({
req(dat())
chc_filt <- names(dat())
selectizeInput("filter_var",
label = "Filtering Variable",
choices = c("", names(dat())),
selected="")
})
output$filter_val <- renderUI({
req(dat())
if(input$filter_var != ""){
chc_fv <- sort(unique(na.omit(dat()[[input$filter_var]])))
selectizeInput("filter_vals",
label="Filter Values",
choices = c("", chc_fv),
selected="",
multiple=TRUE)
}
})
output$values <- renderUI({
req(dat())
selectInput("vals",
"Variable to be Summarised",
choices = names(dat()),
selected = names(dat())[ncol(dat())])
})
output$data <- renderTable(dat())
output$summed_data <- renderTable({
breaks <- seq(min(dat()[[input$vals]], na.rm=TRUE),
max(dat()[[input$vals]], na.rm=TRUE),
by=input$strat_gap)
if(max(breaks) < max(dat()[[input$vals]], na.rm=TRUE)){
breaks <- c(breaks, max(breaks) + input$strat_gap)
}
qs <- ifelse(is.character(dat()[[input$period]]), "'", "")
filter_exp1 <- parse(text=paste0(input$period, "==", qs,input$time, qs))
tmp <- dat() %>%
filter(eval(filter_exp1))
if(input$filter_var != ""){
if(is.character(dat()[[input$filter_var]])){
fv <- paste("c(", paste("'", input$filter_vals, "'", collapse=",", sep=""), ")", sep="")
}else{
fv <- paste("c(", paste(input$filter_vals, collapse=",", sep=""), ")", sep="")
}
filter_exp2 <- parse(text=paste0(input$filter_var, "%in%", fv))
tmp <- tmp %>% filter(eval(filter_exp2))
}
tmp <- tmp %>%
mutate(sumvar = cut(!!sym(input$vals), breaks=breaks, include.lowest=TRUE)) %>%
group_by(sumvar) %>%
summarise(Count = n(),
Values = sum(!!sym(input$vals))) %>%
complete(sumvar, fill = list(Count = 0,
Values = 0)) %>%
ungroup %>%
mutate(Count_pct = sprintf("%.1f%%", (Count/sum(Count))*100),
Values_pct = sprintf("%.1f%%", (Values/sum(Values))*100)) %>%
dplyr::select(everything(), Count, Count_pct, Values, Values_pct)
names(tmp)[1] <- "Range"
tmp
})
# observeEvent(input$browser, {
# browser()
# })
}
shinyApp(ui, server)