Example of my data
mydata=structure(list(generated_id = c(1003477323030100, 1003477323030100,
1003477323030100, 1003477323030100, 1003477323030100, 1003477323030100,
1003477323030100, 1003477323030100, 1003477323030100, 1003477323030100,
1003477323030100, 1003477323030100, 1003477323030100, 1003477323030100,
1003477323030100, 1003477323030100, 1003477323030100), campaign_id.x = c(23843069854050700,
23843069854050700, 23843069854050700, 23843069854050700, 23843069854050700,
23843069854050700, 23843069854050700, 23843069854050700, 23843069854050700,
23843069854050700, 23843069854050700, 23843069854050700, 23843069854050700,
23843069854050700, 23843069854050700, 23843069854050700, 23843069854050700
), campaign_id.y = c(23843069854050700, 23843069854050700, 23843069854050700,
23843069854050700, 23843069854050700, 23843069854050700, 23843069854050700,
23843069854050700, 23843069854050700, 23843069854050700, 23843069854050700,
23843069854050700, 23843069854050700, 23843069854050700, 23843069854050700,
23843069854050700, 23843069854050700), spent = c(73.5, 73.5,
73.5, 73.5, 73.5, 73.5, 73.5, 73.5, 73.5, 73.5, 73.5, 29.74,
29.74, 29.74, 29.74, 29.74, 29.74), date = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("04.10.2018",
"26.09.2018"), class = "factor"), realpurchase_cash = c(1.49,
1.49, 1.49, 1.49, 1.49, 1.49, 1.49, 1.49, 1.49, 1.49, 1.49, 1.49,
1.49, 1.49, 1.49, 1.49, 1.49), utc_time.y = structure(c(5L, 8L,
2L, 1L, 4L, 4L, 9L, 10L, 6L, 3L, 7L, 5L, 8L, 2L, 1L, 4L, 4L), .Label = c("01.10.2018 22:26",
"05.10.2018 22:34", "05.10.2018 22:35", "06.10.2018 13:43", "07.10.2018 15:55",
"30.09.2018 11:22", "30.09.2018 11:23", "30.09.2018 12:00", "30.09.2018 12:23",
"30.09.2018 18:12"), class = "factor")), .Names = c("generated_id",
"campaign_id.x", "campaign_id.y", "spent", "date", "realpurchase_cash",
"utc_time.y"), class = "data.frame", row.names = c(NA, -17L))
I need to restructure as follows:
if for the group
generated_id +capmaing_id.x+campaing_id.y
the aggregated up to 90 days value ofrealpurchase_cash
is greater than the aggregated up to 90 days value of spent, then the whole group is assign to 1, otherwise 0. To aggregate spent by sum by months , it is column date, but to aggregaterealpurchase_cash
by sum by months , it is columnutc_time.y
so aggregated sum for spent 984 for 2 months, and aggregated sum for realpurchase_cash=25, so flag=0
each group have data up to 90 days no more.
I.E.output
i decided use sqldf solution, cause i work with sql i do so
a1s <- sqldf("
select
generated_id,
[capmaing_id.x],
[campaign_id.y],
spent,
[date],
[utc_time.y],
realpurchase_cash,
--SUM(spent) over (partition by generated_id,[capmaing_id.x],[campaign_id.y]) as sum_spent,
--SUM(realpurchase_cash) over (partition by generated_id,[capmaing_id.x],[campaign_id.y]) as sum_realpurchase_cash
case when SUM(realpurchase_cash) over (partition by generated_id,[capmaing_id.x],[campaign_id.y])>SUM(spent) over (partition by generated_id,[capmaing_id.x],[campaign_id.y]) then 1 else 0 end as flag
from newest3
")
and get the error
Error in result_create(conn@ptr, statement) : near "over": syntax error
How to do correct?
I assume the question is why an error is being given.
Until RSQLite upgrades to the latest verison of the SQLite database the windowing will not work. Instead use the RPostgreSQL backend. With that backend use "..."
rather than [...]
and also fix the spelling and other errors in the sql statement shown in the question.
The gives no syntax errors (assuming the PostgreSQL server is installed and running).
library(sqldf)
library(RPostgreSQL)
a1s <- sqldf('
SELECT
"generated_id",
"campaign_id.x",
"campaign_id.y",
"spent",
"date",
"utc_time.y",
"realpurchase_cash",
--SUM(spent) over (partition by generated_id,[campaign_id.x],[campaign_id.y]) as sum_spent,
--SUM(realpurchase_cash) over (partition by generated_id,[campaign_id.x],[campaign_id.y]) as sum_realpurchase_cash
CASE WHEN SUM("realpurchase_cash") OVER
(PARTITION BY "generated_id", "campaign_id.x", "campaign_id.y") >
SUM(spent) OVER (PARTITION BY "generated_id", "campaign_id.x", "campaign_id.y")
THEN 1 ELSE 0
END AS "flag"
FROM "mydata"')
giving:
> a1s
generated_id campaign_id.x campaign_id.y spent date utc_time.y realpurchase_cash flag
1 1.003477e+15 2.384307e+16 2.384307e+16 73.50 04.10.2018 07.10.2018 15:55 1.49 0
2 1.003477e+15 2.384307e+16 2.384307e+16 73.50 04.10.2018 30.09.2018 12:00 1.49 0
3 1.003477e+15 2.384307e+16 2.384307e+16 73.50 04.10.2018 05.10.2018 22:34 1.49 0
4 1.003477e+15 2.384307e+16 2.384307e+16 73.50 04.10.2018 01.10.2018 22:26 1.49 0
5 1.003477e+15 2.384307e+16 2.384307e+16 73.50 04.10.2018 06.10.2018 13:43 1.49 0
6 1.003477e+15 2.384307e+16 2.384307e+16 73.50 04.10.2018 06.10.2018 13:43 1.49 0
7 1.003477e+15 2.384307e+16 2.384307e+16 73.50 04.10.2018 30.09.2018 12:23 1.49 0
8 1.003477e+15 2.384307e+16 2.384307e+16 73.50 04.10.2018 30.09.2018 18:12 1.49 0
9 1.003477e+15 2.384307e+16 2.384307e+16 73.50 04.10.2018 30.09.2018 11:22 1.49 0
10 1.003477e+15 2.384307e+16 2.384307e+16 73.50 04.10.2018 05.10.2018 22:35 1.49 0
11 1.003477e+15 2.384307e+16 2.384307e+16 73.50 04.10.2018 30.09.2018 11:23 1.49 0
12 1.003477e+15 2.384307e+16 2.384307e+16 29.74 26.09.2018 07.10.2018 15:55 1.49 0
13 1.003477e+15 2.384307e+16 2.384307e+16 29.74 26.09.2018 30.09.2018 12:00 1.49 0
14 1.003477e+15 2.384307e+16 2.384307e+16 29.74 26.09.2018 05.10.2018 22:34 1.49 0
15 1.003477e+15 2.384307e+16 2.384307e+16 29.74 26.09.2018 01.10.2018 22:26 1.49 0
16 1.003477e+15 2.384307e+16 2.384307e+16 29.74 26.09.2018 06.10.2018 13:43 1.49 0
17 1.003477e+15 2.384307e+16 2.384307e+16 29.74 26.09.2018 06.10.2018 13:43 1.49 0