Search code examples
rdplyrr-dbidbplyrmonetdblite

Mutate variables in database tables directly using dplyr


Here is mtcars data in the MonetDBLite database file.

library(MonetDBLite)
library(tidyverse)
library(DBI)

dbdir <- getwd()
con <- dbConnect(MonetDBLite::MonetDBLite(), dbdir)

dbWriteTable(conn = con, name = "mtcars_1", value = mtcars)

data_mt <- con %>% tbl("mtcars_1")

I want to use dplyr mutate to create new variables and add (commit!) that to the database table? Something like

data_mt %>% select(mpg, cyl) %>% mutate(var = mpg/cyl) %>% dbCommit(con)

The desired output should be same when we do:

dbSendQuery(con, "ALTER TABLE mtcars_1 ADD COLUMN var DOUBLE PRECISION")
dbSendQuery(con, "UPDATE mtcars_1 SET var=mpg/cyl") 

How can do that?


Solution

  • Here's a couple of functions, create and update.tbl_lazy.

    They respectively implement CREATE TABLE, which was straightforward, and the ALTER TABLE/UPDATE pair which is much less so:

    CREATE

    create <- function(data,name){
      DBI::dbSendQuery(data$src$con,
                       paste("CREATE TABLE", name,"AS", dbplyr::sql_render(data)))
      dplyr::tbl(data$src$con,name)
    }
    

    example:

    library(dbplyr)
    library(DBI)
    con <- DBI::dbConnect(RSQLite::SQLite(), path = ":memory:")
    copy_to(con, head(iris,3),"iris")
    
    tbl(con,"iris") %>% mutate(Sepal.Area= Sepal.Length * Sepal.Width) %>% create("iris_2")
    
    # # Source:   table<iris_2> [?? x 6]
    # # Database: sqlite 3.22.0 []
    #   Sepal.Length Sepal.Width Petal.Length Petal.Width Species Sepal.Area
    #          <dbl>       <dbl>        <dbl>       <dbl> <chr>        <dbl>
    # 1          5.1         3.5          1.4         0.2 setosa        17.8
    # 2          4.9         3            1.4         0.2 setosa        14.7
    # 3          4.7         3.2          1.3         0.2 setosa        15.0
    

    UPDATE

    update.tbl_lazy <- function(.data,...,new_type="DOUBLE PRECISION"){
      quos <- rlang::quos(...)
      dots <- rlang::exprs_auto_name(quos, printer = tidy_text)
    
      # extract key parameters from query
      sql <- dbplyr::sql_render(.data)
      con  <- .data$src$con
      table_name <-gsub(".*?(FROM (`|\")(.+?)(`|\")).*","\\3",sql)
      if(grepl("\nWHERE ",sql)) where <-  regmatches(sql, regexpr("WHERE .*",sql))
      else where <- ""
      new_cols <- setdiff(names(dots),colnames(.data))
    
      # Add empty columns to base table
      if(length(new_cols)){
        alter_queries <- paste("ALTER TABLE",table_name,"ADD COLUMN",new_cols,new_type)
        purrr::walk(alter_queries, ~{
          rs <- DBI::dbSendStatement(con, .)
          DBI::dbClearResult(rs)})}
    
      # translate unevaluated dot arguments to SQL instructions as character
      translations  <- purrr::map_chr(dots, ~ translate_sql(!!! .))
      # messy hack to make translations work
      translations <- gsub("OVER \\(\\)","",translations) 
    
      # 2 possibilities: called group_by or (called filter or called nothing)
      if(identical(.data$ops$name,"group_by")){
        # ERROR if `filter` and `group_by` both used
        if(where != "") stop("Using both `filter` and `group by` is not supported")
    
        # Build aggregated table
        gb_cols   <- paste0('"',.data$ops$dots,'"',collapse=", ")
        gb_query0 <- paste(translations,"AS", names(dots),collapse=", ")
        gb_query  <- paste("CREATE TABLE TEMP_GB_TABLE AS SELECT",
                           gb_cols,", ",gb_query0,
                           "FROM", table_name,"GROUP BY", gb_cols)
        rs <- DBI::dbSendStatement(con, gb_query)
        DBI::dbClearResult(rs)
    
        # Delete temp table on exit
        on.exit({
          rs <- DBI::dbSendStatement(con,"DROP TABLE TEMP_GB_TABLE")
          DBI::dbClearResult(rs)
        })
    
        # Build update query
        gb_on <- paste0(table_name,'."',.data$ops$dots,'" = TEMP_GB_TABLE."', .data$ops$dots,'"',collapse=" AND ")
        update_query0 <- paste0(names(dots)," = (SELECT ", names(dots), " FROM TEMP_GB_TABLE WHERE ",gb_on,")",
                                collapse=", ")
        update_query <- paste("UPDATE", table_name, "SET", update_query0)
        rs <- DBI::dbSendStatement(con, update_query)
        DBI::dbClearResult(rs)
    
      } else {
    
        # Build update query in case of no group_by and optional where
        update_query0 <- paste(names(dots),'=',translations,collapse=", ")
        update_query  <- paste("UPDATE", table_name,"SET", update_query0,where)
        rs <- DBI::dbSendStatement(con, update_query)
        DBI::dbClearResult(rs)
      }
      tbl(con,table_name)
    }
    

    example 1, define 2 new numeric columns :

    tbl(con,"iris") %>% update(x=pmax(Sepal.Length,Sepal.Width),
                               y=pmin(Sepal.Length,Sepal.Width))
    
    # # Source:   table<iris> [?? x 7]
    # # Database: sqlite 3.22.0 []
    #   Sepal.Length Sepal.Width Petal.Length Petal.Width Species     x     y
    #          <dbl>       <dbl>        <dbl>       <dbl> <chr>   <dbl> <dbl>
    # 1          5.1         3.5          1.4         0.2 setosa    5.1   3.5
    # 2          4.9         3            1.4         0.2 setosa    4.9   3  
    # 3          4.7         3.2          1.3         0.2 setosa    4.7   3.2
    

    example 2, modify an existing column, create 2 new columns of different types :

    tbl(con,"iris") %>%
      update(x= Sepal.Length*Sepal.Width,
             z= 2*y,
             a= Species %||% Species,               
             new_type = c("DOUBLE","VARCHAR(255)"))
    
    # # Source:   table<iris> [?? x 9]
    # # Database: sqlite 3.22.0 []
    #   Sepal.Length Sepal.Width Petal.Length Petal.Width Species     x     y     z a           
    #          <dbl>       <dbl>        <dbl>       <dbl> <chr>   <dbl> <dbl> <dbl> <chr>       
    # 1          5.1         3.5          1.4         0.2 setosa   17.8   3.5   7   setosasetosa
    # 2          4.9         3            1.4         0.2 setosa   14.7   3     6   setosasetosa
    # 3          4.7         3.2          1.3         0.2 setosa   15.0   3.2   6.4 setosasetosa
    

    example 3, update where:

    tbl(con,"iris") %>% filter(Sepal.Width > 3) %>% update(a="foo")
    
    # # Source:   table<iris> [?? x 9]
    # # Database: sqlite 3.22.0 []
    #   Sepal.Length Sepal.Width Petal.Length Petal.Width Species     x     y     z a           
    #          <dbl>       <dbl>        <dbl>       <dbl> <chr>   <dbl> <dbl> <dbl> <chr>       
    # 1          5.1         3.5          1.4         0.2 setosa   17.8   3.5   7   foo         
    # 2          4.9         3            1.4         0.2 setosa   14.7   3     6   setosasetosa
    # 3          4.7         3.2          1.3         0.2 setosa   15.0   3.2   6.4 foo
    

    example 4 : update by group

    tbl(con,"iris") %>%
      group_by(Species, Petal.Width) %>%
      update(new_col1 = sum(Sepal.Width,na.rm=TRUE), # using a R function
             new_col2 = MAX(Sepal.Length))           # using native SQL
    
    # # Source:   SQL [?? x 11]
    # # Database: sqlite 3.22.0 []
    #   Sepal.Length Sepal.Width Petal.Length Petal.Width Species        x     y     z a            new_col1 new_col2
    #          <dbl>       <dbl>        <dbl>       <dbl> <chr>      <dbl> <dbl> <dbl> <chr>           <dbl>    <dbl>
    # 1          5.1         3.5          1.4         0.2 setosa         1     2   7   foo               6.5      5.1
    # 2          4.9         3            1.4         0.2 setosa         1     2   6   setosasetosa      6.5      5.1
    # 3          7           3.2          4.7         1.4 versicolor     1     2   6.4 foo               3.2      7 
    

    GENERAL NOTES

    • The code uses uses dbplyr::translate_sql so we can use R functions or native ones alike just like in good old mutate calls.

    • update can only be used after one filter call OR one group_by call OR zero of each, anything else and you'll get an error or unexpected results.

    • The group_by implementation is VERY hacky, so no room for defining columns on the fly or grouping by an operation, stick to the basics.

    • update and create both return tbl(con, table_name), which means you can chain as many create or update calls as you wish, with the appropriate amount of group_by and filter in between. In fact all of my 4 examples can be chained.

    • To hammer the nail, create doesn't suffer from the same restrictions, you can have as much dbplyr fun as desired before calling it.

    • I didn't implement type detection, so I needed the new_type parameter, it is recycled in the paste call of the alter_queries definition in my code so it can be a single value or a vector.

    One way to solve the latter would be to extract the variables from the translations variable, find their types in dbGetQuery(con,"PRAGMA table_info(iris)"). Then we need coercion rules between all existing types, and we're set. But as different DBMS have different types I can't think of a general way to do it, and I don't know MonetDBLite.