Search code examples
mysqlgreatest-n-per-group

MySQL: Insert top N rows from each group into a table. Assume millions of groups


Doing this for one group is simple:

INSERT INTO top_dancers_by_group
    SELECT group_id, dancer_id
    FROM dancers
    WHERE group_id = 1
    ORDER BY how_good_they_do_a_half_turn DESC
    LIMIT 1000

Now let's assume there's a groups table which contains thousands of group IDs. How do I perform this insert for every group ID in pure MySQL?


Solution

  • I would use a stored procedure:

    delimiter $$
    create procedure insert_top_dancers_by_group()
    begin
        declare gId int;
        declare done tinyint default 0;
        declare curGroup cursor for
            select distinct group_id from dancers;
        declare continue handler for not found
            set done = 1;
    
        open curGroup;
        group_insert: while done=0 do
            fetch curGroup into gId;
            if done = 0 then
                -- If you want to remove the previous stored dancers for this group:
                delete from top_dancers_by_group where group_id = gId;
                -- Insert the top dancers for this group:
                insert into top_dancers_by_group
                    select group_id, dancer_id
                    from dancers
                    where group_id = gId
                    order by how_good_they_do_a_half_turn DESC
                    limit 1000;
            end if;
        end while;
        close curGroup;
    end $$
    delimiter ;
    

    Hope this helps.


    You can also use a parameter in this procedure to define how many rows are inserted:

    delimiter $$
    create procedure insert_top_n_dancers_by_group(n int)
    begin
        declare gId int;
        declare done tinyint default 0;
        declare curGroup cursor for
            select distinct group_id from dancers;
        declare continue handler for not found
            set done = 1;
    
        open curGroup;
        group_insert: while done=0 do
            fetch curGroup into gId;
            if done = 0 then
                -- If you want to remove the previous stored dancers for this group:
                delete from top_dancers_by_group where group_id = gId;
                -- Insert the top dancers for this group:
                insert into top_dancers_by_group
                    select group_id, dancer_id
                    from dancers
                    where group_id = gId
                    order by how_good_they_do_a_half_turn DESC
                    limit n;
            end if;
        end while;
        close curGroup;
    end $$
    delimiter ;
    

    Once you've created the procedure(s), you can call them like this:

    call insert_top_dancers_by_group();