Search code examples
sqlsegmentation-faultsnowflake-cloud-data-platformpolynomials

How to use the polynomial function equation to extract coefficients from all segments in all channels using SQL query


I am looking to query the data for a list of channels from Snowflake using SQL query and then used the polynomial function equation to each segment in each channel. The number of rows for each segment is 100 rows. I tried the SQL query below, but I have received this error:

Error: invalid identifier 'engine_temperature_1' (line 32) -->

AVG(engine_temperature_1) - (SUM(x * CAST(TIME AS FLOAT) * CAST(TIME AS FLOAT)) / SUM(CAST(TIME AS FLOAT) * CAST(TIME AS FLOAT))) - (SUM(x * CAST(TIME AS FLOAT)) / SUM(CAST(TIME AS FLOAT) * CAST(TIME AS FLOAT))) AS c

This is the query:

WITH C1 AS (
SELECT
    SERIAL_NUMBER,
    EVENT_DATE,
    EVENT_TS AS TIME,
    CAST (VIMS:_engine_speed AS FLOAT) AS ENGSPD,
    CAST (VIMS:_engine_load AS FLOAT) AS Engine_load,
    CAST (VIMS:_engine_temperature_1 AS FLOAT) AS engine_temperature_1,
    CAST (VIMS:_engine_temperature_2) AS engine_temperature_2,
    CAST (VIMS:_engine_temperature_3 AS FLOAT) AS engine_temperature_3,
    CAST (VIMS:_engine_temperature_4 AS FLOAT) AS engine_temperature_4,
    FLOOR((ROW_NUMBER() OVER (ORDER BY EVENT_TS) - 1) / 100) + 1 AS 
segment_id
FROM "HELIOS_TDH_VIMS_PROD_DB"."BASE"."DATA_LOGGER"
WHERE
    SERIAL_NUMBER in ('KSN00205')
    AND EVENT_DATE = '2022-11-18'
ORDER BY
    SERIAL_NUMBER, EVENT_TS
)
SELECT C1.TIME,
    C1.segment_id,
    AVG(C1.engine_temperature_1) AS avg_temp,
    AVG(C1.engine_temperature_1) - (a * POWER(AVG(C1.TIME), 2) + b * 
AVG(C1.TIME) + c) AS residual
FROM C1
CROSS JOIN (
    SELECT segment_id,
    SUM(x) / COUNT(*) AS a,
    SUM(x * TIME) / SUM(TIME * TIME) AS b,
    AVG(engine_temperature_1) - (SUM(x * CAST(TIME AS FLOAT) * CAST(TIME AS 
FLOAT)) / SUM(CAST(TIME AS FLOAT) * CAST(TIME AS FLOAT))) - (SUM(x * 
CAST(TIME AS FLOAT)) / SUM(CAST(TIME AS FLOAT) * CAST(TIME AS FLOAT))) AS c
FROM (SELECT
    segment_id,
    engine_temperature_1 - AVG(engine_temperature_1) AS x,
    TIME
    FROM C1)
GROUP BY
    segment_id) 
GROUP BY C1.TIME, C1.segment_id 

What should change in that query to fix the error above.


Solution

  • You are getting the error because when you pull things from C1 in your CROSS JOIN SUB-SELECT you don't pull it in:

    thus:

    WITH c1 AS (
        SELECT
            serial_number,
            event_date,
            event_ts AS time,
            vims:_engine_speed::float AS ENGSPD,
            vims:_engine_load::float AS Engine_load,
            vims:_engine_temperature_1::float AS engine_temperature_1,
            vims:_engine_temperature_2::float AS engine_temperature_2,
            vims:_engine_temperature_3::float) AS engine_temperature_3,
            vims:_engine_temperature_4::float) AS engine_temperature_4,
            FLOOR((ROW_NUMBER() OVER (ORDER BY event_ts) - 1) / 100) + 1 AS segment_id
        FROM helios_tdh_vims_prod_db.base.data_logger
        WHERE serial_number in ('KSN00205')
            AND event_date = '2022-11-18'
        ORDER BY serial_number, event_ts
    )
    SELECT 
        c1.time,
        c1.segment_id,
        AVG(C1.engine_temperature_1) AS avg_temp,
        AVG(C1.engine_temperature_1) - (a * POWER(AVG(C1.TIME), 2) + b * AVG(C1.TIME) + c) AS residual
    FROM C1
    CROSS JOIN (
        SELECT 
            segment_id,
            SUM(x) / COUNT(*) AS a,
            SUM(x * TIME) / SUM(TIME * TIME) AS b,
            AVG(engine_temperature_1) - (SUM(x * TIME::float * TIME::float) / SUM(TIME::float * TIME::float)) - (SUM(x * TIME::float) / SUM(TIME::float * TIME::float)) AS c
        FROM (
            SELECT
                segment_id,
                engine_temperature_1 - AVG(engine_temperature_1) AS x,
                engine_temperature_1,
                TIME
            FROM C1
        )
        GROUP BY segment_id
    ) 
    GROUP BY c1.time, c1.segment_id 
    

    should work. I would be inclined to to the float and squaring once, to make the code more readable. As the optimizer should do this anyways.

    WITH c1 AS (
        SELECT
            serial_number,
            event_date,
            event_ts AS time,
            vims:_engine_speed::float AS ENGSPD,
            vims:_engine_load::float AS Engine_load,
            vims:_engine_temperature_1::float AS engine_temperature_1,
            vims:_engine_temperature_2::float AS engine_temperature_2,
            vims:_engine_temperature_3::float) AS engine_temperature_3,
            vims:_engine_temperature_4::float) AS engine_temperature_4,
            FLOOR((ROW_NUMBER() OVER (ORDER BY event_ts) - 1) / 100) + 1 AS segment_id
        FROM helios_tdh_vims_prod_db.base.data_logger
        WHERE serial_number in ('KSN00205')
            AND event_date = '2022-11-18'
        ORDER BY serial_number, event_ts
    )
    SELECT 
        c1.time,
        c1.segment_id,
        AVG(C1.engine_temperature_1) AS avg_temp,
        AVG(C1.engine_temperature_1) - (a * POWER(AVG(c1.time), 2) + b * AVG(c1.time) + c) AS residual
    FROM C1
    CROSS JOIN (
        SELECT 
            segment_id,
            SUM(x) / COUNT(*) AS a,
            SUM(x * time) / SUM(time * time) AS b,
            AVG(engine_temperature_1) - (SUM(x * time_f_sq) / SUM(time_f_sq)) - (SUM(x * time_f) / SUM(time_f_sq)) AS c
        FROM (
            SELECT
                segment_id,
                engine_temperature_1 - AVG(engine_temperature_1) AS x,
                engine_temperature_1,
                time,
                time * time as time_sq,
                time::float as time_f,
                time_f * time_f as time_f_sq
            FROM C1
        )
        GROUP BY segment_id
    ) 
    GROUP BY c1.time, c1.segment_id 
    

    now that I run it:

    WITH fake_data(event_ts, _engine_temperature_1) as (
        select * from values
        (current_timestamp, 50.0),
        (current_timestamp, 100.0),
        (current_timestamp, 150.0)
    ),c1 AS (
        SELECT
            event_ts AS time,
            _engine_temperature_1 AS engine_temperature_1,
            FLOOR((ROW_NUMBER() OVER (ORDER BY event_ts) - 1) / 100) + 1 AS segment_id
        FROM fake_data
    ), c2 as (
        SELECT
            segment_id,
            engine_temperature_1 - AVG(engine_temperature_1) AS x,
            engine_temperature_1,
            time,
            time * time as time_sq,
            time::float as time_f,
            time_f * time_f as time_f_sq
        FROM C1
    ), c3 as (
        SELECT 
            segment_id,
            SUM(x) / COUNT(*) AS a,
            SUM(x * time) / SUM(time_sq) AS b,
            AVG(engine_temperature_1) - (SUM(x * time_f_sq) / SUM(time_f_sq)) - (SUM(x * time_f) / SUM(time_f_sq)) AS c
        FROM c2
        GROUP BY segment_id
    )
    SELECT 
        c1.time,
        c1.segment_id,
        AVG(C1.engine_temperature_1) AS avg_temp,
        AVG(C1.engine_temperature_1) - (a * POWER(AVG(c1.time), 2) + b * AVG(c1.time) + c) AS residual
    FROM C1
    CROSS JOIN c3 
    GROUP BY c1.time, c1.segment_id 
    

    it begs the question what is time_ts unit? because time does not normally be squared.

    ok, so if I change to just int's for time, and just make more data that makes sense, and comment out that final maths part, because Snowflake is correct "it just makes no sense"

    WITH fake_data(time, engine_temperature_1, segment_id) as (
        select *  from values
        (10, 50.0, 0),
        (13, 100.0, 0),
        (15, 150.0, 0),
        (111, 51.0, 1),
        (114, 101.0, 1),
        (116, 151.0, 1)
    ),c1 AS (
        select * from fake_data
        --SELECT
        --    event_ts AS time,
        --    _engine_temperature_1 AS engine_temperature_1,
        --    FLOOR((ROW_NUMBER() OVER (ORDER BY event_ts) - 1) / 100) + 1 AS segment_id
        --FROM fake_data
    ), c2 as (
        SELECT
            segment_id,
            engine_temperature_1 - AVG(engine_temperature_1) over(partition by segment_id) AS x,
            engine_temperature_1,
            time,
            time * time as time_sq,
            time::float as time_f,
            time_f * time_f as time_f_sq
        FROM C1
    ), c3 as (
        SELECT 
            segment_id,
            SUM(x) / COUNT(*) AS a,
            SUM(x * time) / SUM(time_sq) AS b,
            AVG(engine_temperature_1) - (SUM(x * time_f_sq) / SUM(time_f_sq)) - (SUM(x * time_f) / SUM(time_f_sq)) AS c
        FROM c2
        GROUP BY segment_id
    )
    SELECT 
        c1.time,
        c1.segment_id,
        *
        --AVG(C1.engine_temperature_1) AS avg_temp,
       --avg_temp - (c3.a * POWER(AVG(c1.time), 2) + c3.b * AVG(c1.time) + c3.c) AS residual
    FROM C1
    CROSS JOIN c3 
    --GROUP BY c1.time, c1.segment_id 
    

    gives:

    enter image description here

    which makes me think you shouldn't be doing a cross join, but still what you really are wanting/expecting is missing, and guess the gaps is too hard.