Search code examples
mysql, group-by, median

MySQL median query that retrieves the median for each group of IDs


The median query returns a result similar to the following:

vendor_id | median invoice_total
97        | 418

I'd like the result of the median query to look similar to the following avg function query:

-- Mean invoice_total per vendor: one output row for each distinct vendor_id.
SELECT
    vendor_id,
    avg(invoice_total)
FROM
    invoices
GROUP BY
    vendor_id;

Median query:

-- Median invoice_total for one hard-coded vendor (vendor_id = 97).
-- The vendor's invoices are numbered, the middle row(s) are kept, and those
-- values are averaged.
-- NOTE(review): vendor_id is selected next to AVG() without a GROUP BY; confirm
-- the server's sql_mode accepts this.
SELECT
    mid.vendor_id,
    AVG(mid.middle_values) AS `median`
FROM (
    SELECT
        ranked.invoice_total AS middle_values,
        ranked.vendor_id
    FROM (
        -- Number this vendor's invoices 1..N using a session variable that the
        -- cross-joined one-row subquery resets to 0.
        SELECT
            @row := @row + 1 AS row_num,
            iv.invoice_total,
            iv.vendor_id
        FROM invoices AS iv
        CROSS JOIN (SELECT @row := 0) AS r
        WHERE iv.vendor_id = 97
        ORDER BY iv.invoice_total
    ) AS ranked
    CROSS JOIN (
        -- N: how many invoices this vendor has.
        SELECT COUNT(*) AS total_rows
        FROM invoices AS iv
        WHERE iv.vendor_id = 97
    ) AS totals
    -- Keeps 1 row when N is odd and 2 rows when N is even.
    WHERE ranked.row_num BETWEEN totals.total_rows / 2
                             AND (totals.total_rows / 2) + 1
) AS mid

I believe the crucial part is the third (innermost) SELECT statement.

-- Experimental numbering query. @row is incremented in the select list of the
-- inner derived table and again in the select list of the outer query, which
-- joins that derived table back to invoices on vendor_id.
-- Nothing in this statement initialises @row or resets it when vendor_id changes.
SELECT @row:=@row+1 as `row`
FROM (SELECT @row:=@row+1 as `row`, vendor_id, invoice_total
      FROM invoices
      ORDER BY vendor_id, invoice_total) t, invoices inv
WHERE inv.vendor_id = t.vendor_id;

If the @row counter could be reset each time the query moves on to a different vendor_id, that would be a huge step.

Table:

-- Table queried by every statement on this page. Rows are keyed by an
-- auto-increment invoice_id and reference the vendors and terms tables.
CREATE TABLE IF NOT EXISTS `invoices` (
  `invoice_id` int(11) NOT NULL AUTO_INCREMENT,
  -- Grouping column for the per-vendor avg/median queries.
  `vendor_id` int(11) NOT NULL,
  `invoice_number` varchar(50) NOT NULL,
  `invoice_date` date NOT NULL,
  -- Value whose average / median is being computed.
  `invoice_total` decimal(9,2) NOT NULL,
  `payment_total` decimal(9,2) NOT NULL DEFAULT '0.00',
  `credit_total` decimal(9,2) NOT NULL DEFAULT '0.00',
  `terms_id` int(11) NOT NULL,
  `invoice_due_date` date NOT NULL,
  -- The only nullable column in the table.
  `payment_date` date DEFAULT NULL,
  PRIMARY KEY (`invoice_id`),
  KEY `invoices_fk_vendors` (`vendor_id`),
  KEY `invoices_fk_terms` (`terms_id`),
  KEY `invoices_invoice_date_ix` (`invoice_date`),
  -- Both referenced tables (terms, vendors) must exist before this statement runs.
  CONSTRAINT `invoices_fk_terms` FOREIGN KEY (`terms_id`) REFERENCES `terms` (`terms_id`),
  CONSTRAINT `invoices_fk_vendors` FOREIGN KEY (`vendor_id`) REFERENCES `vendors` (`vendor_id`)
) ENGINE=InnoDB AUTO_INCREMENT=119 DEFAULT CHARSET=latin1;

Inserts:

-- Sample data: four invoices for vendor 97, two for vendor 123, and one each
-- for vendors 37 and 110.
-- Columns are listed explicitly so the rows do not depend on the table's
-- physical column order. All rows are loaded by a single statement.
INSERT INTO `invoices`
    (`invoice_id`, `vendor_id`, `invoice_number`, `invoice_date`, `invoice_total`,
     `payment_total`, `credit_total`, `terms_id`, `invoice_due_date`, `payment_date`)
VALUES
    (118,  97, '456792',    '2011-08-03',   565.60,   0.00, 0.00, 2, '2011-09-02', NULL),
    (117,  97, '456791',    '2011-08-03',  4390.00,   0.00, 0.00, 2, '2011-09-02', NULL),
    (116,  97, '456701',    '2011-08-02',   270.50,   0.00, 0.00, 2, '2011-09-01', NULL),
    (115,  97, '456789',    '2011-08-01',  8344.50,   0.00, 0.00, 2, '2011-08-31', NULL),
    (114, 123, '963253249', '2011-08-02',   127.75, 127.75, 0.00, 3, '2011-09-01', '2011-09-04'),
    (113,  37, '547480102', '2011-08-01',   224.00,   0.00, 0.00, 3, '2011-08-31', NULL),
    (112, 110, '0-2436',    '2011-07-31', 10976.06,   0.00, 0.00, 3, '2011-08-30', NULL),
    (111, 123, '263253257', '2011-07-30',    22.57,  22.57, 0.00, 3, '2011-08-29', '2011-09-03');

Solution

  • Try this to assign a row number within each vendor_id:

    -- Returns every invoices column plus rn, a counter that restarts at 1 for
    -- each vendor_id and is meant to follow ascending invoice_total.
    SELECT
        t.*,
        -- Same vendor as the previous row: increment the counter.
        -- New vendor: remember it in @prev_vid and restart at 1. Both branches
        -- of the inner IF yield 1; it exists only to perform the assignment.
        @rn := IF(vendor_id = @prev_vid,
                  @rn + 1,
                  IF(@prev_vid := vendor_id, 1, 1)) AS rn
    FROM
        (SELECT
            *
        FROM
            invoices
        ORDER BY vendor_id, invoice_total
        -- A derived table with LIMIT cannot be merged into the outer query, so
        -- the ORDER BY above is kept. Without it MySQL 5.7+ may ignore the
        -- ordering, because the outer FROM has a second source. The value is
        -- the largest row count LIMIT accepts, so no rows are dropped.
        LIMIT 18446744073709551615) AS t
            CROSS JOIN
        -- One-row subquery that resets the counters each time the statement runs.
        (SELECT @rn := 0, @prev_vid := -1) AS init;
    

    Which makes your final query this:

    -- Median invoice_total per vendor_id.
    --   t1: each vendor's invoices numbered 1..N in ascending invoice_total.
    --   t2: N, the invoice count per vendor.
    -- The join keeps the middle row (odd N) or the two middle rows (even N) of
    -- each vendor, and AVG() collapses them into the median.
    -- NOTE(review): assigning user variables inside expressions is deprecated in
    -- MySQL 8.0; window functions (ROW_NUMBER() OVER (PARTITION BY ...)) are
    -- worth considering where the server supports them.
    SELECT
        t1.vendor_id,
        AVG(t1.invoice_total) AS median
    FROM
        (SELECT
            t.vendor_id,
            t.invoice_total,
            -- The counter alias is rn rather than row, because ROW is a
            -- reserved word in MySQL 8.0 and cannot be used unquoted.
            @rn := IF(t.vendor_id = @prev_vid,
                      @rn + 1,
                      IF(@prev_vid := t.vendor_id, 1, 1)) AS rn
        FROM
            (SELECT
                vendor_id,
                invoice_total
            FROM
                invoices
            ORDER BY vendor_id, invoice_total
            -- LIMIT prevents this derived table from being merged into the
            -- outer block, where its ORDER BY could be ignored. The value is
            -- the largest row count LIMIT accepts, so no rows are dropped.
            LIMIT 18446744073709551615) AS t
        -- One-row subquery that resets the counters each time the statement runs.
        CROSS JOIN (SELECT @rn := 0, @prev_vid := -1) AS init) AS t1
            INNER JOIN
        (SELECT
            vendor_id,
            COUNT(*) AS cnt
        FROM
            invoices
        GROUP BY vendor_id) AS t2 ON t1.vendor_id = t2.vendor_id
            -- cnt / 2 is not truncated to an integer, so an odd cnt matches
            -- exactly one rn and an even cnt matches two.
            AND t1.rn >= t2.cnt / 2
            AND t1.rn <= (t2.cnt / 2) + 1
    GROUP BY t1.vendor_id;