Search code examples
rsqldf

Using sqldf filter and group by in r dynamically


I am using SQLDF in r to select, filter and group by the data frame dynamically based on user inputs.

I have done the variable selection and done the aggregation part dynamically but not able to add the where clause/having clause in sqldf dynamically. User may select multiple combination of variable and their values as we well.

What I want is

  1. since Texas in state and M in gender is selected so it needs to be shown only in the output on the fly. Had user selected any other combination then those combination will be shown in the output

Below is the code with the data

library(sqldf)

table_a <- data.frame(id =c(1,1,1,2,2,2,3,4,4),
                  state = c("Texas","Texas","Texas","Alaska","Alaska","Alaska","Nevada","Texas","Texas"),
                  gender = c("M","M","M","F","F","F","F","F","F"),
                  channel = c("Y","Y","H","Y","D","D","H","Y","Y"),
                  spend = c(10,10,5,20,20,5,7,1,2))



# user selects variable and their values  
# channel is not selected by user here
input_states <- 'Texas'
input_genders <- 'M'
input_channels <- NULL

a <- c('state','gender','channel')[c(!is.null(input_states),!is.null(input_genders),!is.null(input_channels))]
b <- paste(a, collapse = ",")

sqldf(sprintf("select %s,sum(spend) as tot_spend from table_a group by %s",b,b))

If-else may not be an option since there are multiple filter in my final application


Solution

  • You could build the where clause the same way as you build the select.

    library(sqldf)
    
    table_a <- data.frame(id =c(1,1,1,2,2,2,3,4,4),
                      state = c("Texas","Texas","Texas","Alaska","Alaska","Alaska","Nevada","Texas","Texas"),
                      gender = c("M","M","M","F","F","F","F","F","F"),
                      channel = c("Y","Y","H","Y","D","D","H","Y","Y"),
                      spend = c(10,10,5,20,20,5,7,1,2))
    
    
    
    # user selects variable and their values  
    # channel is not selected by user here
    input_states <- 'Texas'
    input_genders <- 'M'
    input_channels <- NULL
    
    a <- c('state','gender','channel')[c(!is.null(input_states),!is.null(input_genders),!is.null(input_channels))]
    b <- paste(a, collapse = ",")
    
    # Create the where conditions
    where_conditions <- c(
        sprintf("state in ('%s')", paste(input_states, collapse = "', '")),
        sprintf("gender in ('%s')", paste(input_genders, collapse = "', '")),
        sprintf("channel in ('%s')", paste(input_channels, collapse = "', '"))
    )[c(!is.null(input_states),!is.null(input_genders),!is.null(input_channels))]
    
    # combine into where clause
    where_clause <- paste(where_conditions, collapse = " and ")
    
    sqldf(sprintf("select %s,sum(spend) as tot_spend from table_a where %s group by %s", b, where_clause, b))
    #>   state gender tot_spend
    #> 1 Texas      M        25
    

    which I think is your requirement?

    You can then control the output by changing the input_... variables.

    # Try with different combinations of input_...
    input_states <- c('Texas', 'Nevada')
    input_genders <- NULL
    input_channels <- 'H'
    
    a <- c('state','gender','channel')[c(!is.null(input_states),!is.null(input_genders),!is.null(input_channels))]
    b <- paste(a, collapse = ",")
    
    # Create the where conditions
    where_conditions <- c(
        sprintf("state in ('%s')", paste(input_states, collapse = "', '")),
        sprintf("gender in ('%s')", paste(input_genders, collapse = "', '")),
        sprintf("channel in ('%s')", paste(input_channels, collapse = "', '"))
    )[c(!is.null(input_states),!is.null(input_genders),!is.null(input_channels))]
    
    # combine into where clause
    where_clause <- paste(where_conditions, collapse = " and ")
    
    sqldf(sprintf("select %s,sum(spend) as tot_spend from table_a where %s group by %s", b, where_clause, b))
    #>    state channel tot_spend
    #> 1 Nevada       H         7
    #> 2  Texas       H         5
    

    Created on 2024-07-01 with reprex v2.1.0