Search code examples
rdataframeodbcrodbcsqldf

create sql expression in R for certain condition


I get the data from the sql server to perform regression analysis, and then the regression results i return back to another sql table.

library("RODBC")
library(sqldf)

dbHandle <- odbcDriverConnect("driver={SQL Server};server=MYSERVER;database=MYBASE;trusted_connection=true")

sql <- 
  "select
Dt
,CustomerName
,ItemRelation
,SaleCount
,DocumentNum
,DocumentYear
,IsPromo

from dbo.mytable"

df <- sqlQuery(dbHandle, sql)

After this query i must perform regression analysis separately for groups

my_lm <- function(df) {
  lm(SaleCount~IsPromo, data = df)
}

reg=df %>% 
  group_by(CustomerName,ItemRelation,DocumentNum,DocumentYear) %>% 
  nest() %>%
  mutate(fit = map(data, my_lm),
         tidy = map(fit, tidy)) %>%
  select(-fit, - data) %>%
  unnest()
View(reg)

#save to sql table
sqlSave(dbHandle, as.data.frame(reg), "dbo.mytableforecast", verbose = TRUE)  # use "append = TRUE" to add rows to an existing table

odbcClose(dbHandle)

The question:

The script works automatically, i.e. in the scheduler there is task that script in certain time was launched. For example, today was loaded 100 observations.

From 01.01.2017-10.04.2017

Script performed regression and returned data to sql table. Tomorrow will loaded new 100 observations.

11.04.2017-20.07.2017

I.E. when tomorrow the data will loaded and the script will start at 10 pm, it must work only with data from 11.04.2017-20.07.2017, and not from 01.01.2017-20.07.2017

the situation is complicated by the fact that after the regression the column Dt is dropped, so the solution given me here does not work Automatic transfer data from the sql to R because Dt is absent.

How can i set the condition for schedule select Dt ,CustomerName ,ItemRelation ,SaleCount ,DocumentNum ,DocumentYear ,IsPromo from dbo.mytable "where Dt>the last date when the script was launched"

is it possible to create this expression?

data example from sql

df=structure(list(Dt = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 
8L, 9L, 9L, 10L, 10L, 11L, 11L, 12L, 12L, 13L, 13L, 14L, 14L, 
15L, 15L, 16L, 16L, 16L, 16L, 17L, 17L, 17L, 17L, 18L, 18L, 18L, 
18L, 19L), .Label = c("2017-10-12 00:00:00.000", "2017-10-13 00:00:00.000", 
"2017-10-14 00:00:00.000", "2017-10-15 00:00:00.000", "2017-10-16 00:00:00.000", 
"2017-10-17 00:00:00.000", "2017-10-18 00:00:00.000", "2017-10-19 00:00:00.000", 
"2017-10-20 00:00:00.000", "2017-10-21 00:00:00.000", "2017-10-22 00:00:00.000", 
"2017-10-23 00:00:00.000", "2017-10-24 00:00:00.000", "2017-10-25 00:00:00.000", 
"2017-10-26 00:00:00.000", "2017-10-27 00:00:00.000", "2017-10-28 00:00:00.000", 
"2017-10-29 00:00:00.000", "2017-10-30 00:00:00.000"), class = "factor"), 
    CustomerName = structure(c(1L, 11L, 12L, 13L, 14L, 15L, 16L, 
    17L, 18L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 11L, 12L, 
    13L, 14L, 15L, 16L, 17L, 18L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 
    9L, 10L), .Label = c("x1", "x10", "x11", "x12", "x13", "x14", 
    "x15", "x16", "x17", "x18", "x2", "x3", "x4", "x5", "x6", 
    "x7", "x8", "x9"), class = "factor"), ItemRelation = c(13322L, 
    13322L, 13322L, 13322L, 13322L, 13322L, 13322L, 11706L, 13322L, 
    11706L, 13322L, 11706L, 13322L, 11706L, 13322L, 11706L, 13322L, 
    11706L, 13322L, 11706L, 13322L, 11706L, 13322L, 11706L, 13163L, 
    13322L, 158010L, 11706L, 13163L, 13322L, 158010L, 11706L, 
    13163L, 13322L, 158010L, 11706L), SaleCount = c(10L, 3L, 
    1L, 0L, 9L, 5L, 5L, 11L, 7L, 0L, 5L, 11L, 1L, 0L, 0L, 19L, 
    10L, 0L, 1L, 12L, 1L, 11L, 6L, 0L, 167L, 7L, 0L, 16L, 165L, 
    1L, 0L, 0L, 29L, 0L, 0L, 11L), DocumentNum = c(36L, 36L, 
    36L, 36L, 36L, 36L, 36L, 51L, 36L, 51L, 36L, 51L, 36L, 51L, 
    36L, 51L, 36L, 51L, 36L, 51L, 36L, 51L, 36L, 51L, 131L, 36L, 
    89L, 51L, 131L, 36L, 89L, 51L, 131L, 36L, 89L, 51L), DocumentYear = c(2017L, 
    2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 
    2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 
    2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 
    2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L), 
    IsPromo = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("Dt", "CustomerName", 
"ItemRelation", "SaleCount", "DocumentNum", "DocumentYear", "IsPromo"
), class = "data.frame", row.names = c(NA, -36L))

Solution

  • Consider saving the max DT (retrieved before regression that drops field) in a log file at the end of your scheduled script, then add a log read-in at beginning of script for the last logged date to include in WHERE clause:

    # READ DATE FROM LOG FILE
    log_dt <- readLines("/path/to/SQL_MaxDate.txt", warn=FALSE)
    
    # QUERY WITH WHERE CLAUSE
    sql <- paste0("SELECT Dt, CustomerName, ItemRelation, SaleCount, 
                          DocumentNum, DocumentYear, IsPromo
                   FROM dbo.mytable WHERE Dt > '", log_dt, "'")
    
    df <- sqlQuery(dbHandle, sql)
    
    # RETRIEVE MAX DATE VALUE
    max_DT <- as.character(max(df$Dt))
    
    # ... regression
    
    # WRITE DATE TO LOG FILE
    cat(max_DT, file="/path/to/SQL_MaxDate.txt")
    

    Better yet, use parameterization with RODBCext to avoid string concatenation and quoting:

    library(RODBC)
    library(RODBCext)
    
    # READ DATE FROM LOG FILE
    log_dt <- readLines("/path/to/SQL_MaxDate.txt", warn=FALSE)
    
    dbHandle <- odbcDriverConnect(...)
    
    # PREPARED STATEMENT WITH PLACEHOLDER
    sql <- "SELECT Dt, CustomerName, ItemRelation, SaleCount, 
                   DocumentNum, DocumentYear, IsPromo
            FROM dbo.mytable WHERE Dt > ?")
    
    # EXECUTE QUERY BINDING PARAM VALUE
    df <- sqlExecute(dbHandle, sql, log_dt, fetch=TRUE)
    
    # RETRIEVE MAX DATE VALUE
    max_DT <- as.character(max(df$Dt))
    
    # ... regression
    
    # WRITE DATE TO LOG FILE
    cat(max_DT, file="/path/to/SQL_MaxDate.txt")