I have a dataset that has multiple temperature readings per day. I am looking to return the hottest 7-day period by average temperature.
DROP TABLE IF EXISTS oshkosh;
CREATE EXTERNAL TABLE IF NOT EXISTS oshkosh(year STRING, month STRING, day STRING, time STRING, temp FLOAT, dewpoint FLOAT, humidity INT, sealevel FLOAT, visibility FLOAT, winddir STRING, windspeed FLOAT, gustspeed FLOAT, precip FLOAT, events STRING, condition STRING, winddirdegrees INT) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '/user/maria_dev/final/Oshkosh' tblproperties ("skip.header.line.count"="1");
SELECT o.theDate
,AVG(o.temp) over (order by o.theDate range INTERVAL 6 DAY preceding) AS Average
FROM
(SELECT CAST(to_date(from_unixtime(UNIX_TIMESTAMP(CONCAT(year,'-',IF(LENGTH(month)=1,CONCAT(0,month),month),'-',IF(LENGTH(day)=1,CONCAT(0,day),day)),'yyyy-MM-dd'))) as timestamp) as theDate
,temp AS temp
FROM oshkosh
WHERE temp != -9999) as o
this returns the error:
Error while compiling statement: FAILED: ParseException line 2:38 cannot recognize input near 'range' 'INTERVAL' '6' in window_value_expression
I'm not sure if I want a timestamp as o.theDate because it seems that the INTERVAL 6 DAY call may not find a new day because there are 28 temperature readings for the first day of the dataset (and 44 readings for day 2, it's variable for each day).
Try:
SELECT
o.theDate,
AVG(o.temp) over (order by unix_timestamp(o.theDate) range between 604800
preceding and current row) AS Average
FROM
(
SELECT
CAST(to_date(CONCAT(year,'-',IF(LENGTH(month)=1,CONCAT(0,month),month),
'-',IF(LENGTH(day)=1,CONCAT(0,day),day))) AS TIMESTAMP) as theDate,
temp AS temp
FROM oshkosh
WHERE temp != -9999
) as o