Search code examples
r sqlite r-dbi rsqlite dbplyr

Update a table using subquery in SQLite


I want to add a column to my table using ALTER TABLE and UPDATE statements, so that I do not have to recreate the full table.

When using a subquery in my UPDATE statement I don't get the output I expect.

build reproducible data

library(dplyr)
library(dbplyr)
library(DBI)
# Open an in-memory SQLite database. RSQLite's dbConnect() takes the database
# location as `dbname`; `path` is not one of its arguments.
con <- DBI::dbConnect(RSQLite::SQLite(), dbname = ":memory:")
# Upload three iris rows (two setosa, one versicolor) as the remote table "iris".
copy_to(con, iris[c(1, 2, 51), ], "iris")

# Preview the remote "iris" table through dplyr.
con %>% tbl("iris")
# # Source:   table<iris> [?? x 5]
# # Database: sqlite 3.19.3 []
#   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
#          <dbl>       <dbl>        <dbl>       <dbl>      <chr>
# 1          5.1         3.5          1.4         0.2     setosa
# 2          4.9         3.0          1.4         0.2     setosa
# 3          7.0         3.2          4.7         1.4 versicolor

create the new column in a separate table

# CREATE TABLE returns no rows, so run it with dbExecute(): it executes the
# statement and releases the result set immediately. dbSendQuery() would leave
# an open result that has to be cleared by hand with dbClearResult().
DBI::dbExecute(con, "CREATE TABLE new_table AS SELECT t2.new_col from
                 iris t1 inner join 
                 (SELECT Species, sum(`Sepal.Width`) as new_col FROM iris GROUP BY Species) t2
                 on t1.Species = t2.Species")

tbl(con,"new_table")
# # Source:   table<new_table> [?? x 1]
# # Database: sqlite 3.19.3 []
#   new_col
#     <dbl>
# 1     6.5
# 2     6.5
# 3     3.2

create the new column in the old table

# ALTER TABLE is a statement, not a query: dbExecute() runs it without leaving
# an uncleared result set behind (which dbSendQuery() would).
DBI::dbExecute(con, "ALTER TABLE iris ADD COLUMN new_col DOUBLE")

try to plug the new column from new_table there

# NOTE: the subquery below is not correlated with the row being updated, so it
# yields one scalar value that is written to every row -- this is the behaviour
# the question is about. The SQL is intentionally left as asked.
# dbExecute() is used so that no open result set is left on the connection.
DBI::dbExecute(con, "UPDATE iris SET new_col = (SELECT new_col FROM new_table)")

tbl(con,"iris")
# # Source:   table<iris> [?? x 6]
# # Database: sqlite 3.19.3 []
#   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species new_col
#          <dbl>       <dbl>        <dbl>       <dbl>      <chr>   <dbl>
# 1          5.1         3.5          1.4         0.2     setosa     6.5
# 2          4.9         3.0          1.4         0.2     setosa     6.5
# 3          7.0         3.2          4.7         1.4 versicolor     6.5

# Close the connection once the example is finished.
DBI::dbDisconnect(con)

As you can see, my new_col contains only the value 6.5, whereas I expected to have 3.2 on the last row. How can I fix this?


Solution

  • The rows in a table in a SQL database have no inherent order, so you cannot assign a "vector" of values as you would in R. However, you can modify your query slightly:

    library(dplyr)
    library(DBI)
    # Open an in-memory SQLite database. RSQLite's dbConnect() takes the database
    # location as `dbname`; `path` is not one of its arguments.
    con <- DBI::dbConnect(RSQLite::SQLite(), dbname = ":memory:")
    # Upload three iris rows as the remote table "iris".
    copy_to(con, iris[c(1, 2, 51), ], "iris")
    

    Create a separate table with aggregated data

    # One aggregated row per Species; the Species key is kept so the values can
    # be matched back to the original rows later.
    # dbExecute() runs the statement and frees its result set in one step.
    DBI::dbExecute(con, "CREATE TABLE new_table AS 
                         SELECT Species, sum(`Sepal.Width`) as new_col FROM iris GROUP BY Species")
    
    tbl(con,"new_table")
    #> # Source:   table<new_table> [?? x 2]
    #> # Database: sqlite 3.22.0 []
    #>   Species    new_col
    #>   <chr>        <dbl>
    #> 1 setosa         6.5
    #> 2 versicolor     3.2
    

    Create the new column in the old table

    # ALTER TABLE is a statement, not a query: dbExecute() runs it without
    # leaving an uncleared result set behind (which dbSendQuery() would).
    DBI::dbExecute(con, "ALTER TABLE iris ADD COLUMN new_col DOUBLE")
    

    Move data to original table with correlated sub-query

    # Correlated subquery: for each iris row, pick the new_table value whose
    # Species matches that row. dbExecute() runs the statement and frees its
    # result set in one step.
    DBI::dbExecute(con, "UPDATE iris SET new_col = (SELECT new_col FROM new_table t2
                                   WHERE iris.Species = t2.Species)")
    
    tbl(con,"iris")
    #> # Source:   table<iris> [?? x 6]
    #> # Database: sqlite 3.22.0 []
    #>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species    new_col
    #>          <dbl>       <dbl>        <dbl>       <dbl> <chr>        <dbl>
    #> 1          5.1         3.5          1.4         0.2 setosa         6.5
    #> 2          4.9         3            1.4         0.2 setosa         6.5
    #> 3          7           3.2          4.7         1.4 versicolor     3.2
    
    # Close the connection once the example is finished.
    DBI::dbDisconnect(con)
    

    If you have multiple computed columns, you can use UPDATE ... SET (c1, c2, ...) = (...) like this:

    library(dplyr)
    library(dbplyr)
    library(DBI)
    # Open an in-memory SQLite database. RSQLite's dbConnect() takes the database
    # location as `dbname`; `path` is not one of its arguments.
    con <- DBI::dbConnect(RSQLite::SQLite(), dbname = ":memory:")
    copy_to(con, iris[c(1, 2, 51), ], "iris")
    
    # Per-Species aggregates in a helper table. dbExecute() is used for every
    # statement below: it runs the SQL and frees the result set in one step,
    # whereas dbSendQuery() would leave an open result that is never cleared.
    DBI::dbExecute(con, "CREATE TABLE aggs AS 
                         SELECT Species, 
                                SUM(`Sepal.Width`) AS sw_sum,
                                AVG(`Sepal.Width`) AS sw_avg 
                         FROM iris GROUP BY Species")
    tbl(con,"aggs")
    #> # Source:   table<aggs> [?? x 3]
    #> # Database: sqlite 3.22.0 [:memory:]
    #>   Species    sw_sum sw_avg
    #>   <chr>       <dbl>  <dbl>
    #> 1 setosa        6.5   3.25
    #> 2 versicolor    3.2   3.2
    
    DBI::dbExecute(con, "ALTER TABLE iris ADD COLUMN sw_sum DOUBLE")
    DBI::dbExecute(con, "ALTER TABLE iris ADD COLUMN sw_avg DOUBLE")
    
    # Row-value assignment: both columns are filled from one correlated subquery.
    DBI::dbExecute(con, "UPDATE iris 
                         SET (sw_sum, sw_avg) = (SELECT sw_sum, sw_avg 
                             FROM aggs WHERE iris.Species = aggs.Species)")
    
    tbl(con,"iris")
    #> # Source:   table<iris> [?? x 7]
    #> # Database: sqlite 3.22.0 [:memory:]
    #>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species  sw_sum sw_avg
    #>          <dbl>       <dbl>        <dbl>       <dbl> <chr>     <dbl>  <dbl>
    #> 1          5.1         3.5          1.4         0.2 setosa      6.5   3.25
    #> 2          4.9         3            1.4         0.2 setosa      6.5   3.25
    #> 3          7           3.2          4.7         1.4 versico…    3.2   3.2
    
    # Close the connection once the example is finished.
    DBI::dbDisconnect(con)
    

    This should also work on Postgres, but probably not with SQL Server.

    Actually, one does not need the intermediate table in this case:

    library(dplyr)
    library(dbplyr)
    library(DBI)
    # Open an in-memory SQLite database. RSQLite's dbConnect() takes the database
    # location as `dbname`; `path` is not one of its arguments.
    con <- DBI::dbConnect(RSQLite::SQLite(), dbname = ":memory:")
    copy_to(con, iris[c(1, 2, 51), ], "iris")
    
    # dbExecute() runs each statement and frees its result set in one step;
    # dbSendQuery() would leave an open result that is never cleared.
    DBI::dbExecute(con, "ALTER TABLE iris ADD COLUMN sw_sum DOUBLE")
    DBI::dbExecute(con, "ALTER TABLE iris ADD COLUMN sw_avg DOUBLE")
    
    # The aggregation is done in an inline derived table (aggs) instead of a
    # separate helper table; the outer WHERE correlates it with the updated row.
    DBI::dbExecute(con, "UPDATE iris 
                         SET (sw_sum, sw_avg) = 
                                (SELECT sw_sum, sw_avg FROM 
                                      (SELECT Species, 
                                              SUM(`Sepal.Width`) AS sw_sum, 
                                              AVG(`Sepal.Width`) AS sw_avg 
                                       FROM iris GROUP BY Species) aggs 
                                 WHERE iris.Species = aggs.Species)")
    
    tbl(con,"iris")
    #> # Source:   table<iris> [?? x 7]
    #> # Database: sqlite 3.22.0 [:memory:]
    #>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species  sw_sum sw_avg
    #>          <dbl>       <dbl>        <dbl>       <dbl> <chr>     <dbl>  <dbl>
    #> 1          5.1         3.5          1.4         0.2 setosa      6.5   3.25
    #> 2          4.9         3            1.4         0.2 setosa      6.5   3.25
    #> 3          7           3.2          4.7         1.4 versico…    3.2   3.2
    
    # Close the connection once the example is finished.
    DBI::dbDisconnect(con)
    

    The intermediate table might be helpful in other cases, though. For example, when it is created using R as in the linked question.