Search code examples
sqlrdplyrcommon-table-expressiondbplyr

How to retrieve Common Table Expressions (CTEs) as a R list?


Context

I am connected to a PostgreSQL database and I use dplyr + dbplyr to query it. One of the query is quite long and complex and uses multiple CTEs.

Problem

I need to use these CTEs to make some plots and analysis within R but I do not know how to get them as "separated" dataframes within R.

I tried to break down the long query into smaller ones but this obviously fails when one CTE needs a previous one.

Question

How to retrieve the CTEs of a query as a list of dataframes in R?

Example

Please find below a small example illustrating the question. The query is very dumb but I feel it is enough for illustration purpose.

# Packages ----
if(!require("dbplyr")){install.packages("dbplyr")}; library(dbplyr)
if(!require("tidyverse")){install.packages("tidyverse")}; library(tidyverse)

# Set up the example database using the iris dataset ----
con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
copy_to(con, iris)
# Query using the "WITH" command ----
query = sql(
  "WITH
  tbl_set AS (SELECT * FROM iris WHERE Species = 'setosa'),
  tbl_ver AS (SELECT * FROM iris WHERE Species = 'versicolor'),
  tbl_all AS (
    SELECT *
    FROM tbl_set 
    UNION ALL SELECT * FROM tbl_ver)
SELECT * FROM tbl_all"
)

Expected output

The expected output is an object in the R environment, for example a list containing each CTE as dataframes, see below the expected structure:

# Target output ----
list(
  tbl_set = filter(iris, Species == "setosa"),
  tbl_ver = filter(iris, Species == "versicolor"),
  tbl_all = collect(tbl(con, query))
)
#> $tbl_set
#>    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1           5.1         3.5          1.4         0.2  setosa
#> 2           4.9         3.0          1.4         0.2  setosa
#> 3           4.7         3.2          1.3         0.2  setosa
#> 4           4.6         3.1          1.5         0.2  setosa
#> 
#> $tbl_ver
#>    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
#> 1           7.0         3.2          4.7         1.4 versicolor
#> 2           6.4         3.2          4.5         1.5 versicolor
#> 3           6.9         3.1          4.9         1.5 versicolor
#> 4           5.5         2.3          4.0         1.3 versicolor
#> 
#> $tbl_all
#> # A tibble: 100 × 5
#>    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#>           <dbl>       <dbl>        <dbl>       <dbl> <chr>  
#>  1          5.1         3.5          1.4         0.2 setosa 
#>  2          4.9         3            1.4         0.2 setosa 
#>  3          4.7         3.2          1.3         0.2 setosa 
#>  4          4.6         3.1          1.5         0.2 setosa 
#> # ℹ 90 more rows

Created on 2024-07-04 with reprex v2.1.0


Solution

  • 1) Instead of using CTEs create temporary tables. Below we have changed the example slightly so that the creation of tbl_ver depends on tbl_set.

    library(RSQLite)
    
    con <- dbConnect(SQLite())
    dbWriteTable(con, "iris", iris)
    
    res <- dbExecute(con, "CREATE TEMPORARY TABLE tbl_set AS
      SELECT * FROM iris WHERE Species = 'setosa' ")
    res <- dbExecute(con, "CREATE TEMPORARY TABLE tbl_ver AS
      SELECT * FROM iris WHERE Species = 'versicolor' AND
        \"Sepal.Length\" > (SELECT avg(\"Sepal.Length\") FROM tbl_set)")
    
    L <- list(
      tbl_set = dbGetQuery(con, "SELECT * FROM tbl_set"),
      tbl_ver = dbGetQuery(con, "SELECT * FROM tbl_ver"),
      tbl_all = dbGetQuery(con, "SELECT * FROM tbl_set
        UNION ALL SELECT * FROM tbl_ver")
    )
    
    dbDisconnect(con)
    
    str(L)
    

    giving

    List of 3
     $ tbl_set:'data.frame':        50 obs. of  5 variables:
      ..$ Sepal.Length: num [1:50] 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
      ..$ Sepal.Width : num [1:50] 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
      ..$ Petal.Length: num [1:50] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
      ..$ Petal.Width : num [1:50] 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
      ..$ Species     : chr [1:50] "setosa" "setosa" "setosa" "setosa" ...
     $ tbl_ver:'data.frame':        47 obs. of  5 variables:
      ..$ Sepal.Length: num [1:47] 7 6.4 6.9 5.5 6.5 5.7 6.3 6.6 5.2 5.9 ...
      ..$ Sepal.Width : num [1:47] 3.2 3.2 3.1 2.3 2.8 2.8 3.3 2.9 2.7 3 ...
      ..$ Petal.Length: num [1:47] 4.7 4.5 4.9 4 4.6 4.5 4.7 4.6 3.9 4.2 ...
      ..$ Petal.Width : num [1:47] 1.4 1.5 1.5 1.3 1.5 1.3 1.6 1.3 1.4 1.5 ...
      ..$ Species     : chr [1:47] "versicolor" "versicolor" "versicolor" "versicolor" ...
     $ tbl_all:'data.frame':        97 obs. of  5 variables:
      ..$ Sepal.Length: num [1:97] 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
      ..$ Sepal.Width : num [1:97] 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
      ..$ Petal.Length: num [1:97] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
      ..$ Petal.Width : num [1:97] 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
      ..$ Species     : chr [1:97] "setosa" "setosa" "setosa" "setosa" ...
    

    2) If the problem is to extract the CTE's then this inputs an SQL statement with CTE's, extracts them, generates and runs create temoprary table statements and then reads them and the original SQL statement. (To extract the CTE's we count left and right parentheses and add a @ after the outer ones so that we can later extract the content of the outer parentheses coming after a name and AS.)

    library(gsubfn)
    library(RSQLite)
    
    sql <- "WITH
      tbl_set AS (SELECT * FROM iris WHERE Species = 'setosa'),
      tbl_ver AS (SELECT * FROM iris WHERE Species = 'versicolor' AND 
      \"Sepal.Length\" > (SELECT avg(\"Sepal.Length\") FROM tbl_set))
    SELECT * FROM tbl_set
        UNION ALL SELECT * FROM tbl_ver
    "
    
    # insert @ after outer parentheses
    p <- proto(
      pre = function(this) this$depth <- -1,
      fun = function(this, x) {
        if (x == "(") this$depth <- this$depth + 1
        ret <- if (this$depth == 0) paste0(x, "@") else x
        if (x == ")") this$depth <- this$depth - 1
        ret
      }
    )
    
    # extract CTEs
    cte <- sql |>
      chartr("\n", " ", x = _) |>
      gsubfn("[\\(\\)]", p, x = _) |>
      strapply("(([a-z_]+) +AS \\(@.*?\\)@)", perl = TRUE) |>
      unlist() |>
      trimws() |> 
      gsub("@", "", x = _)
    
    # cte names
    nms <- sub(" .*", "", cte)
    
    # form SQL statements
    sqls <- cte |>
      sub("\\((.*)\\)", "\\1", x = _) |>
      paste("CREATE TEMPORARY TABLE", ... = _) |>
      gsub(".@", "", x = _) |>
      setNames(nms)
    
    # run statements to create temporary tables from cte's
    library(RSQLite)
    con <- dbConnect(SQLite())
    dbWriteTable(con, "iris", iris)
    for(s in sqls) dbExecute(con, s)
    
    # get data
    L <- c(lapply(nms, dbReadTable, conn = con), 
      list(tbl_all = dbGetQuery(con, sql)))
    
    dbDisconnect(con)