I pull data from SQL Server to perform a regression analysis, and then I write the regression results back to another SQL table.
library("RODBC")
library(sqldf)
# Open an ODBC connection to SQL Server; the connection string is assembled
# from its individual key=value parts.
dbHandle <- odbcDriverConnect(
  paste0(
    "driver={SQL Server};",
    "server=MYSERVER;",
    "database=MYBASE;",
    "trusted_connection=true"
  )
)

# Query text: one SELECT-list column per line, joined with newlines.
sql <- paste(
  c(
    "select",
    "Dt",
    ",CustomerName",
    ",ItemRelation",
    ",SaleCount",
    ",DocumentNum",
    ",DocumentYear",
    ",IsPromo",
    "from dbo.mytable"
  ),
  collapse = "\n"
)

# Run the query and keep the result in `df`.
df <- sqlQuery(dbHandle, sql)
After this query, I must perform the regression analysis separately for each group:
#' Fit a linear model of SaleCount on IsPromo
#'
#' @param df A data frame containing the columns `SaleCount` and `IsPromo`.
#' @return The fitted model object returned by `lm()`.
my_lm <- function(df) {
  lm(SaleCount ~ IsPromo, data = df)
}
# Fit one regression per (CustomerName, ItemRelation, DocumentNum,
# DocumentYear) group and collect the tidied coefficient tables.
reg <- df %>%
  group_by(CustomerName, ItemRelation, DocumentNum, DocumentYear) %>%
  nest() %>%
  mutate(
    fit = map(data, my_lm),
    tidy = map(fit, tidy)
  ) %>%
  select(-fit, -data) %>%
  # Name the list-column explicitly: a bare unnest() is deprecated in
  # tidyr >= 1.0 and emits a warning on every run.
  unnest(tidy) %>%
  ungroup()

# View() needs an interactive session; skip it when the script is run by
# the scheduler.
if (interactive()) {
  View(reg)
}

# Save to the SQL table. append = TRUE is required for a recurring job:
# without it sqlSave() stops with an error once the table already exists,
# i.e. on every run after the first. A missing table is still created.
sqlSave(
  dbHandle,
  as.data.frame(reg),
  "dbo.mytableforecast",
  append = TRUE,
  verbose = TRUE
)
odbcClose(dbHandle)
The question:
The script runs automatically: a scheduler task launches it at a set time. For example, today 100 observations were loaded,
from 01.01.2017 to 10.04.2017.
The script performed the regression and returned the results to the SQL table. Tomorrow a new batch of 100 observations will be loaded,
from 11.04.2017 to 20.07.2017.
That is, when tomorrow's data has been loaded and the script starts at 10 pm, it must work only with the data from 11.04.2017-20.07.2017, and not with everything from 01.01.2017-20.07.2017.
The situation is complicated by the fact that the Dt column is dropped after the regression, so the solution I was given in "Automatic transfer data from the sql to R" does not work, because Dt is absent.
How can I add such a condition to the scheduled query, i.e. select Dt ,CustomerName ,ItemRelation ,SaleCount ,DocumentNum ,DocumentYear ,IsPromo from dbo.mytable "where Dt > the last date on which the script was launched"?
Is it possible to create this expression?
Example of the data from SQL:
# Sample of the queried data in dput() form: a data.frame with 36 rows and
# 7 columns (Dt, CustomerName, ItemRelation, SaleCount, DocumentNum,
# DocumentYear, IsPromo). In this sample Dt and CustomerName are factors and
# the remaining columns are integer vectors; IsPromo is 0 in every row.
df=structure(list(Dt = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
8L, 9L, 9L, 10L, 10L, 11L, 11L, 12L, 12L, 13L, 13L, 14L, 14L,
15L, 15L, 16L, 16L, 16L, 16L, 17L, 17L, 17L, 17L, 18L, 18L, 18L,
18L, 19L), .Label = c("2017-10-12 00:00:00.000", "2017-10-13 00:00:00.000",
"2017-10-14 00:00:00.000", "2017-10-15 00:00:00.000", "2017-10-16 00:00:00.000",
"2017-10-17 00:00:00.000", "2017-10-18 00:00:00.000", "2017-10-19 00:00:00.000",
"2017-10-20 00:00:00.000", "2017-10-21 00:00:00.000", "2017-10-22 00:00:00.000",
"2017-10-23 00:00:00.000", "2017-10-24 00:00:00.000", "2017-10-25 00:00:00.000",
"2017-10-26 00:00:00.000", "2017-10-27 00:00:00.000", "2017-10-28 00:00:00.000",
"2017-10-29 00:00:00.000", "2017-10-30 00:00:00.000"), class = "factor"),
CustomerName = structure(c(1L, 11L, 12L, 13L, 14L, 15L, 16L,
17L, 18L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 11L, 12L,
13L, 14L, 15L, 16L, 17L, 18L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L), .Label = c("x1", "x10", "x11", "x12", "x13", "x14",
"x15", "x16", "x17", "x18", "x2", "x3", "x4", "x5", "x6",
"x7", "x8", "x9"), class = "factor"), ItemRelation = c(13322L,
13322L, 13322L, 13322L, 13322L, 13322L, 13322L, 11706L, 13322L,
11706L, 13322L, 11706L, 13322L, 11706L, 13322L, 11706L, 13322L,
11706L, 13322L, 11706L, 13322L, 11706L, 13322L, 11706L, 13163L,
13322L, 158010L, 11706L, 13163L, 13322L, 158010L, 11706L,
13163L, 13322L, 158010L, 11706L), SaleCount = c(10L, 3L,
1L, 0L, 9L, 5L, 5L, 11L, 7L, 0L, 5L, 11L, 1L, 0L, 0L, 19L,
10L, 0L, 1L, 12L, 1L, 11L, 6L, 0L, 167L, 7L, 0L, 16L, 165L,
1L, 0L, 0L, 29L, 0L, 0L, 11L), DocumentNum = c(36L, 36L,
36L, 36L, 36L, 36L, 36L, 51L, 36L, 51L, 36L, 51L, 36L, 51L,
36L, 51L, 36L, 51L, 36L, 51L, 36L, 51L, 36L, 51L, 131L, 36L,
89L, 51L, 131L, 36L, 89L, 51L, 131L, 36L, 89L, 51L), DocumentYear = c(2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L),
IsPromo = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("Dt", "CustomerName",
"ItemRelation", "SaleCount", "DocumentNum", "DocumentYear", "IsPromo"
), class = "data.frame", row.names = c(NA, -36L))
Consider saving the maximum Dt (retrieved before the regression, which drops that field) to a log file at the end of your scheduled script; then, at the beginning of the script, read the last logged date back in and include it in the WHERE
clause:
# READ DATE FROM LOG FILE
# On the very first run the log file does not exist yet (or is empty);
# fall back to an early date so that every row is selected.
log_path <- "/path/to/SQL_MaxDate.txt"
log_dt <- if (file.exists(log_path)) {
  readLines(log_path, n = 1L, warn = FALSE)
} else {
  character(0)
}
if (length(log_dt) == 0L || !nzchar(log_dt)) {
  log_dt <- "1900-01-01"
}
# QUERY WITH WHERE CLAUSE
sql <- paste0("SELECT Dt, CustomerName, ItemRelation, SaleCount,
DocumentNum, DocumentYear, IsPromo
FROM dbo.mytable WHERE Dt > '", log_dt, "'")
df <- sqlQuery(dbHandle, sql)
# sqlQuery() returns a character vector of error messages instead of a
# data frame when the query fails; stop rather than carry on with that.
if (!is.data.frame(df)) {
  stop("Query failed: ", paste(df, collapse = " | "), call. = FALSE)
}
# RETRIEVE MAX DATE VALUE
# With no new rows, max() would yield -Inf/NA; keep the previously logged
# date instead so the log file is never corrupted.
max_DT <- if (nrow(df) > 0) as.character(max(df$Dt)) else log_dt
# ... regression (skip when nrow(df) == 0)
# WRITE DATE TO LOG FILE
cat(max_DT, file = log_path)
Better yet, use parameterization with RODBCext to avoid string concatenation and quoting:
library(RODBC)
library(RODBCext)
# READ DATE FROM LOG FILE
# On the very first run the log file does not exist yet (or is empty);
# fall back to an early date so that every row is selected.
log_path <- "/path/to/SQL_MaxDate.txt"
log_dt <- if (file.exists(log_path)) {
  readLines(log_path, n = 1L, warn = FALSE)
} else {
  character(0)
}
if (length(log_dt) == 0L || !nzchar(log_dt)) {
  log_dt <- "1900-01-01"
}
dbHandle <- odbcDriverConnect(...)
# PREPARED STATEMENT WITH PLACEHOLDER
# (the stray closing parenthesis after the string was a syntax error)
sql <- "SELECT Dt, CustomerName, ItemRelation, SaleCount,
DocumentNum, DocumentYear, IsPromo
FROM dbo.mytable WHERE Dt > ?"
# EXECUTE QUERY BINDING PARAM VALUE
df <- sqlExecute(dbHandle, sql, log_dt, fetch = TRUE)
# RETRIEVE MAX DATE VALUE
# With no new rows, max() would yield -Inf/NA; keep the previously logged
# date instead so the log file is never corrupted.
max_DT <- if (nrow(df) > 0) as.character(max(df$Dt)) else log_dt
# ... regression (skip when nrow(df) == 0)
# WRITE DATE TO LOG FILE
cat(max_DT, file = log_path)