Search code examples
mysqlgroup-bymedian

Is there a way to optimize a group by median query?


I wrote a query that finds the median of values for each month. Doing this was difficult enough because MySQL doesn't have a built in median function so I really had to think outside the box with my intermediate SQL skills. But now the problem is that it takes really long (1 or 2 mins sometimes) to run the query. Is there a way to optimize this query? Or perhaps I should write a python script that finds the median and pushes it to the database using a connector?

Here is the query:

SET @row_num_pos := 0;
SET @median_group_pos := '';
SET @row_num_neg := 0;
SET @median_group_neg := '';

SELECT 
    p.month_num AS 'month_num',
    CASE
        WHEN p.month_num = 1 THEN 'Jan'
        WHEN p.month_num = 2 THEN 'Feb'
        WHEN p.month_num = 3 THEN 'Mar'
        WHEN p.month_num = 4 THEN 'Apr'
        WHEN p.month_num = 5 THEN 'May'
        WHEN p.month_num = 6 THEN 'Jun'
        WHEN p.month_num = 7 THEN 'Jul'
        WHEN p.month_num = 8 THEN 'Aug'
        WHEN p.month_num = 9 THEN 'Sep'
        WHEN p.month_num = 10 THEN 'Oct'
        WHEN p.month_num = 11 THEN 'Nov'
        WHEN p.month_num = 12 THEN 'Dec'
    END AS 'Timeline',
    p.ck_pos_median AS 'CK+ Median',
    n.ck_neg_median AS 'CK- Median'
FROM
    (SELECT 
        s.median_month_pos AS 'month_num',
            ROUND(AVG(ck_pos), 1) AS 'ck_pos_median'
    FROM
        (SELECT 
        @row_num_pos:=CASE
                WHEN @median_group_pos = q.month_num THEN @row_num_pos + 1
                ELSE 1
            END AS 'count_of_group',
            @median_group_pos:=q.month_num AS 'median_month_pos',
            q.month_num,
            q.ck_pos,
            (SELECT 
                    COUNT(*)
                FROM
                    Biocept_DB.result_management_report
                WHERE
                    ck_pos IS NOT NULL
                        AND MONTH(order_date) = q.month_num) AS total_month
    FROM
        (SELECT 
        MONTH(order_date) AS 'month_num', ck_pos
    FROM
        Biocept_DB.result_management_report
    WHERE
        ck_pos IS NOT NULL
    ORDER BY MONTH(order_date) , ck_pos ASC) AS q) AS s
    WHERE
        s.count_of_group BETWEEN (s.total_month / 2.0) AND (s.total_month / 2.0 + 1)
    GROUP BY s.median_month_pos) AS p
        JOIN
    (SELECT 
        s.median_month_neg AS 'month_num',
            ROUND(AVG(ck_neg), 1) AS 'ck_neg_median'
    FROM
        (SELECT 
        @row_num_neg:=CASE
                WHEN @median_group_neg = q.month_num THEN @row_num_neg + 1
                ELSE 1
            END AS 'count_of_group',
            @median_group_neg:=q.month_num AS 'median_month_neg',
            q.month_num,
            q.ck_neg,
            (SELECT 
                    COUNT(*)
                FROM
                    Biocept_DB.result_management_report
                WHERE
                    ck_neg IS NOT NULL
                        AND MONTH(order_date) = q.month_num) AS total_month
    FROM
        (SELECT 
        MONTH(order_date) AS 'month_num', ck_neg
    FROM
        Biocept_DB.result_management_report
    WHERE
        ck_neg IS NOT NULL
    ORDER BY MONTH(order_date) , ck_neg ASC) AS q) AS s
    WHERE
        s.count_of_group BETWEEN (s.total_month / 2.0) AND (s.total_month / 2.0 + 1)
    GROUP BY s.median_month_neg) AS n ON p.month_num = n.month_num
ORDER BY p.month_num;

SET @row_num_pos := NULL;
SET @median_group_pos := NULL;
SET @row_num_neg := NULL;
SET @median_group_neg := NULL;

Here is the table it generates: enter image description here


Solution

  • I have modified your query a little bit. I hope that the calculation is correct. With you sample date the results are the same.

    I my environment your query take 6.22 seconds and my only 20 ms. so it looks 300 times faster.

    please test my query and let me know if it works for you. is the speed not good we can optimize a little more with virtual columns.

    Please don't forget to set group_concat_max_len to a good value for you.

    SET SESSION group_concat_max_len = 1000000;
    

    Query

    SELECT  r.Timeline AS `month_number`
            , SUBSTRING_INDEX(SUBSTRING_INDEX( 'Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec', ',' , r.Timeline ) , ',' , -1)
                AS Timeline
            , ( SUBSTRING_INDEX( SUBSTRING_INDEX(r.grp_ck_pos, ',' , (r.cnt_ck_pos>>1)+1 ) , ',' , -1) +
                SUBSTRING_INDEX( SUBSTRING_INDEX(r.grp_ck_pos, ',' , (r.cnt_ck_pos>>1) + 1 - ((r.cnt_ck_pos&1) XOR 1) ) , ',' , -1) 
                ) / 2 AS 'ck_pos_median'
            , ( SUBSTRING_INDEX( SUBSTRING_INDEX(r.grp_ck_neg, ',' , (r.cnt_ck_neg>>1)+1 ) , ',' , -1) +
                SUBSTRING_INDEX( SUBSTRING_INDEX(r.grp_ck_neg, ',' , (r.cnt_ck_neg>>1) + 1 - ((r.cnt_ck_neg&1) XOR 1) ) , ',' , -1) 
                ) / 2 AS 'ck_neg_median'
    FROM (
            SELECT  MONTH(`order_date`) AS 'Timeline'
                , SUM(IF(ck_pos is NULL,0,1)) AS cnt_ck_pos
                , GROUP_CONCAT(ck_pos ORDER BY ck_pos) as grp_ck_pos
                , SUM(IF(ck_neg is NULL,0,1)) AS cnt_ck_neg
                , GROUP_CONCAT(ck_neg ORDER BY ck_neg) as grp_ck_neg
            FROM result_management_report
            where (ck_pos is not null or ck_neg is not null) = 1
            GROUP BY Timeline
    ) r;
    

    Table definition

    Sample new query

    mysql> SELECT r.Timeline AS `month_number`
        -> , SUBSTRING_INDEX(SUBSTRING_INDEX( 'Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec', ',' , r.Timeline ) , ',' , -1)
        -> AS Timeline
        -> , ( SUBSTRING_INDEX( SUBSTRING_INDEX(r.grp_ck_pos, ',' , (r.cnt_ck_pos>>1)+1 ) , ',' , -1) +
        -> SUBSTRING_INDEX( SUBSTRING_INDEX(r.grp_ck_pos, ',' , (r.cnt_ck_pos>>1) + 1 - ((r.cnt_ck_pos&1) XOR 1) ) , ',' , -1) 
        -> ) / 2 AS 'ck_pos_median'
        -> , ( SUBSTRING_INDEX( SUBSTRING_INDEX(r.grp_ck_neg, ',' , (r.cnt_ck_neg>>1)+1 ) , ',' , -1) +
        -> SUBSTRING_INDEX( SUBSTRING_INDEX(r.grp_ck_neg, ',' , (r.cnt_ck_neg>>1) + 1 - ((r.cnt_ck_neg&1) XOR 1) ) , ',' , -1) 
        -> ) / 2 AS 'ck_neg_median'
        -> FROM (
        -> SELECT MONTH(`order_date`) AS 'Timeline'
        -> , SUM(IF(ck_pos is NULL,0,1)) AS cnt_ck_pos
        -> , GROUP_CONCAT(ck_pos ORDER BY ck_pos) as grp_ck_pos
        -> , SUM(IF(ck_neg is NULL,0,1)) AS cnt_ck_neg
        -> , GROUP_CONCAT(ck_neg ORDER BY ck_neg) as grp_ck_neg
        -> FROM result_management_report
        -> where (ck_pos is not null or ck_neg is not null) = 1
        -> GROUP BY Timeline
        -> ) r;
    +--------------+----------+---------------+---------------+
    | month_number | Timeline | ck_pos_median | ck_neg_median |
    +--------------+----------+---------------+---------------+
    |            1 | Jan      |             2 |             2 |
    |            2 | Feb      |             2 |             3 |
    |            3 | Mar      |             2 |             3 |
    |            4 | Apr      |             4 |             4 |
    |            5 | May      |             2 |             3 |
    |            6 | Jun      |             3 |             3 |
    |            7 | Jul      |             4 |             4 |
    |            8 | Aug      |             3 |             7 |
    |            9 | Sep      |             4 |            12 |
    |           10 | Oct      |             5 |             8 |
    |           11 | Nov      |             4 |             9 |
    |           12 | Dec      |             2 |            12 |
    +--------------+----------+---------------+---------------+
    12 rows in set (0.02 sec)
    

    Sample old query

    mysql> 
    mysql> SET @row_num_pos := 0;
    Query OK, 0 rows affected (0.00 sec)
    
    mysql> SET @median_group_pos := '';
    Query OK, 0 rows affected (0.00 sec)
    
    mysql> SET @row_num_neg := 0;
    Query OK, 0 rows affected (0.00 sec)
    
    mysql> SET @median_group_neg := '';
    Query OK, 0 rows affected (0.00 sec)
    
    mysql> 
    mysql> SELECT 
        ->     p.month_num AS 'month_num',
        ->     CASE
        ->         WHEN p.month_num = 1 THEN 'Jan'
        ->         WHEN p.month_num = 2 THEN 'Feb'
        ->         WHEN p.month_num = 3 THEN 'Mar'
        ->         WHEN p.month_num = 4 THEN 'Apr'
        ->         WHEN p.month_num = 5 THEN 'May'
        ->         WHEN p.month_num = 6 THEN 'Jun'
        ->         WHEN p.month_num = 7 THEN 'Jul'
        ->         WHEN p.month_num = 8 THEN 'Aug'
        ->         WHEN p.month_num = 9 THEN 'Sep'
        ->         WHEN p.month_num = 10 THEN 'Oct'
        ->         WHEN p.month_num = 11 THEN 'Nov'
        ->         WHEN p.month_num = 12 THEN 'Dec'
        ->     END AS 'Timeline',
        ->     p.ck_pos_median AS 'CK+ Median',
        ->     n.ck_neg_median AS 'CK- Median'
        -> FROM
        ->     (SELECT 
        ->         s.median_month_pos AS 'month_num',
        ->             ROUND(AVG(ck_pos), 1) AS 'ck_pos_median'
        ->     FROM
        ->         (SELECT 
        ->         @row_num_pos:=CASE
        ->                 WHEN @median_group_pos = q.month_num THEN @row_num_pos + 1
        ->                 ELSE 1
        ->             END AS 'count_of_group',
        ->             @median_group_pos:=q.month_num AS 'median_month_pos',
        ->             q.month_num,
        ->             q.ck_pos,
        ->             (SELECT 
        ->                     COUNT(*)
        ->                 FROM
        ->                     result_management_report
        ->                 WHERE
        ->                     ck_pos IS NOT NULL
        ->                         AND MONTH(order_date) = q.month_num) AS total_month
        ->     FROM
        ->         (SELECT 
        ->         MONTH(order_date) AS 'month_num', ck_pos
        ->     FROM
        ->         result_management_report
        ->     WHERE
        ->         ck_pos IS NOT NULL
        ->     ORDER BY MONTH(order_date) , ck_pos ASC) AS q) AS s
        ->     WHERE
        ->         s.count_of_group BETWEEN (s.total_month / 2.0) AND (s.total_month / 2.0 + 1)
        ->     GROUP BY s.median_month_pos) AS p
        ->         JOIN
        ->     (SELECT 
        ->         s.median_month_neg AS 'month_num',
        ->             ROUND(AVG(ck_neg), 1) AS 'ck_neg_median'
        ->     FROM
        ->         (SELECT 
        ->         @row_num_neg:=CASE
        ->                 WHEN @median_group_neg = q.month_num THEN @row_num_neg + 1
        ->                 ELSE 1
        ->             END AS 'count_of_group',
        ->             @median_group_neg:=q.month_num AS 'median_month_neg',
        ->             q.month_num,
        ->             q.ck_neg,
        ->             (SELECT 
        ->                     COUNT(*)
        ->                 FROM
        ->                     result_management_report
        ->                 WHERE
        ->                     ck_neg IS NOT NULL
        ->                         AND MONTH(order_date) = q.month_num) AS total_month
        ->     FROM
        ->         (SELECT 
        ->         MONTH(order_date) AS 'month_num', ck_neg
        ->     FROM
        ->         result_management_report
        ->     WHERE
        ->         ck_neg IS NOT NULL
        ->     ORDER BY MONTH(order_date) , ck_neg ASC) AS q) AS s
        ->     WHERE
        ->         s.count_of_group BETWEEN (s.total_month / 2.0) AND (s.total_month / 2.0 + 1)
        ->     GROUP BY s.median_month_neg) AS n ON p.month_num = n.month_num
        -> ORDER BY p.month_num;
    +-----------+----------+------------+------------+
    | month_num | Timeline | CK+ Median | CK- Median |
    +-----------+----------+------------+------------+
    |         1 | Jan      |        2.0 |        2.0 |
    |         2 | Feb      |        2.0 |        3.0 |
    |         3 | Mar      |        2.0 |        3.0 |
    |         4 | Apr      |        4.0 |        4.0 |
    |         5 | May      |        2.0 |        3.0 |
    |         6 | Jun      |        3.0 |        3.0 |
    |         7 | Jul      |        4.0 |        4.0 |
    |         8 | Aug      |        3.0 |        7.0 |
    |         9 | Sep      |        4.0 |       12.0 |
    |        10 | Oct      |        5.0 |        8.0 |
    |        11 | Nov      |        4.0 |        9.0 |
    |        12 | Dec      |        2.0 |       12.0 |
    +-----------+----------+------------+------------+
    12 rows in set (6.22 sec)
    
    mysql>