Search code examples
sqldb2

Tracking student performance over time with SQL


I have this table in SQL of student grades:

CREATE TABLE myt 
(
    id INT PRIMARY KEY,
    name VARCHAR(50),
    letter CHAR(1),
    date DATE
);

INSERT INTO myt (id, name, letter, date) 
VALUES
(1, 'Alice', 'C', '2023-01-28'),
(2, 'Alice', 'B', '2023-02-28'),
(3, 'Alice', 'A', '2023-03-28'),
(4, 'Bob', 'B', '2023-01-09'),
(5, 'Bob', 'C', '2023-02-09'),
(6, 'Bob', 'B', '2023-03-09'),
(7, 'Charlie', 'B', '2023-01-19'),
(8, 'Charlie', 'A', '2023-02-19'),
(9, 'Charlie', 'A', '2023-03-19'),
(10, 'Charlie', 'A', '2023-04-19'),
(11, 'Charlie', 'A', '2023-05-19'),
(12, 'David', 'B', '2023-01-05'),
(13, 'David', 'C', '2023-02-05'),
(14, 'David', 'A', '2023-03-05'),
(15, 'David', 'B', '2023-04-05'),
(16, 'David', 'B', '2023-05-05'),
(17, 'David', 'A', '2023-06-05'),
(18, 'Emma', 'B', '2023-01-15'),
(19, 'Emma', 'A', '2023-02-15'),
(20, 'Emma', 'A', '2023-03-15'),
(21, 'Emma', 'A', '2023-04-15'),
(22, 'Emma', 'A', '2023-05-15'),
(23, 'Frank', 'B', '2023-01-06'),
(24, 'Frank', 'A', '2023-02-06'),
(25, 'Frank', 'A', '2023-03-06'),
(26, 'Frank', 'A', '2023-04-06'),
(27, 'Grace', 'A', '2023-01-27'),
(28, 'Grace', 'B', '2023-02-27'),
(29, 'Grace', 'B', '2023-03-27'),
(30, 'Henry', 'B', '2023-01-31'),
(31, 'Henry', 'A', '2023-03-03'),
(32, 'Henry', 'A', '2023-03-31'),
(33, 'Henry', 'A', '2023-05-01'),
(34, 'Henry', 'A', '2023-05-31'),
(35, 'Isabel', 'B', '2023-01-20'),
(36, 'Isabel', 'C', '2023-02-20'),
(37, 'Isabel', 'A', '2023-03-20'),
(38, 'Isabel', 'A', '2023-04-20'),
(39, 'Isabel', 'C', '2023-05-20'),
(40, 'Jack', 'B', '2023-01-30'),
(41, 'Jack', 'C', '2023-03-02'),
(42, 'Jack', 'A', '2023-03-30'),
(43, 'Jack', 'B', '2023-04-30'),
(44, 'Jack', 'B', '2023-05-30'),
(45, 'Jack', 'C', '2023-06-30');

Consider the following problem: I want to see that for someone who begins with a B, once they get the first A: how many of them remain as purely A students for their remaining rows, and how many do not.

I wrote this SQL code which first identifies students that have their first grade as B and then sees how many of them only get A's after their first A:

WITH RankedGrades AS 
(
    SELECT 
        name,
        letter,
        date,
        ROW_NUMBER() OVER (PARTITION BY name ORDER BY date) as grade_sequence
    FROM 
        myt
),
BStarters AS 
(
    SELECT DISTINCT name
    FROM RankedGrades
    WHERE grade_sequence = 1 AND letter = 'B'
),
FirstASequence AS 
(
    SELECT 
        r.name,
        MIN(r.grade_sequence) as first_a_sequence
    FROM 
        RankedGrades r
    JOIN
        BStarters b ON r.name = b.name
    WHERE 
        r.letter = 'A'
    GROUP BY 
        r.name
),
FirstADetails AS 
(
    SELECT 
        r.name,
        r.date as first_a_date,
        fas.first_a_sequence as a_sequence_number
    FROM 
        RankedGrades r
    JOIN 
        FirstASequence fas ON r.name = fas.name
    WHERE 
        r.grade_sequence = fas.first_a_sequence
),
SubsequentPerformance AS 
(
    SELECT 
        f.name,
        CASE WHEN MAX(CASE WHEN r.letter != 'A' THEN 1 ELSE 0 END) = 0 
             THEN 'Pure A'
             ELSE 'Mixed Grades'
        END as grade_pattern
    FROM 
        FirstADetails f
    JOIN 
        RankedGrades r ON f.name = r.name
    WHERE 
        r.grade_sequence > f.a_sequence_number
    GROUP BY 
        f.name
),
Summary AS 
(
    SELECT 
        grade_pattern,
        COUNT(*) as student_count,
        SUM(COUNT(*)) OVER () as total_students
    FROM 
        SubsequentPerformance
    GROUP BY 
        grade_pattern
)
SELECT 
    grade_pattern,
    student_count,
    ROUND(CAST(student_count AS FLOAT) / total_students * 100, 2) as percentage
FROM 
    Summary
ORDER BY 
    student_count DESC;

The final answer looks like this:

  grade_pattern student_count percentage
  --------------------------------------
        Pure A             4      57.14
  Mixed Grades             3      42.86

I am trying to extend this code for all combinations. E.g. Students that start with B, after the first C how many have pure C's ... Students that start with B, after the first A how many have pure A's ... Students that start with A, after the first C how many have pure C's etc etc.

Is it possible to do this in the same query? Or does a new query have to be written each time?


Solution

  • You did the hard work, there only needed to replace the hardcoded A and Bs with a list of all previously found values for the given user.

    Below is the diff from the code you posted,
    you'll find the full solution on an SQLFiddle (in PostgreSQL but this should be mostly compatible)
    (you may want to uncomment the 'Koala' student, to introduce diversity, because as your example was B-to-A focused, all other categories fall 100 % into the "Mixed" category).

    Note that to minimize the diff I kept the names as is,
    but if you adopt the solution, first_a_sequence and first_a_letter should be rewritten first_other_letter_sequence and first_other_letter (in fact the "other letter" may be the same as the starter one, in case you want to know how many people started at A and kept an A the full time along).

    --- /tmp/1.sql  2025-01-14 23:50:06.645054000 +0100
    +++ /tmp/2.sql  2025-01-14 23:49:40.995233000 +0100
    @@ -10,28 +10,28 @@
     ),
     BStarters AS
     (
    -    SELECT DISTINCT name
    +    SELECT DISTINCT name, letter first_letter
         FROM RankedGrades
    -    WHERE grade_sequence = 1 AND letter = 'B'
    +    WHERE grade_sequence = 1
     ),
     FirstASequence AS
     (
         SELECT
    -        r.name,
    +        r.name, b.first_letter, r.letter,
             MIN(r.grade_sequence) as first_a_sequence
         FROM
             RankedGrades r
         JOIN
             BStarters b ON r.name = b.name
    -    WHERE
    -        r.letter = 'A'
         GROUP BY
    -        r.name
    +        r.name, b.first_letter, r.letter
     ),
     FirstADetails AS
     (
         SELECT
             r.name,
    +       fas.first_letter,
    +       r.letter as first_a_letter,
             r.date as first_a_date,
             fas.first_a_sequence as a_sequence_number
         FROM
    @@ -44,9 +44,9 @@
     SubsequentPerformance AS
     (
         SELECT
    -        f.name,
    -        CASE WHEN MAX(CASE WHEN r.letter != 'A' THEN 1 ELSE 0 END) = 0
    -             THEN 'Pure A'
    +        f.name, f.first_letter, f.first_a_letter,
    +        CASE WHEN MAX(CASE WHEN r.letter != f.first_a_letter THEN 1 ELSE 0 END) = 0
    +             THEN 'Pure '||f.first_a_letter
                  ELSE 'Mixed Grades'
             END as grade_pattern
         FROM
    @@ -56,24 +56,24 @@
         WHERE
             r.grade_sequence > f.a_sequence_number
         GROUP BY
    -        f.name
    +        f.name, f.first_letter, f.first_a_letter
     ),
     Summary AS
     (
         SELECT
    -        grade_pattern,
    +        first_letter, first_a_letter, grade_pattern,
             COUNT(*) as student_count,
    -        SUM(COUNT(*)) OVER () as total_students
    +        SUM(COUNT(*)) OVER (partition by first_letter, first_a_letter) as total_students
         FROM
             SubsequentPerformance
         GROUP BY
    -        grade_pattern
    +        first_letter, first_a_letter, grade_pattern
     )
     SELECT
    -    grade_pattern,
    +    first_letter "Starting with", first_a_letter "entering a sequence of", grade_pattern "finished as",
         student_count,
    -    ROUND(CAST(student_count AS FLOAT) / total_students * 100, 2) as percentage
    +    ROUND(CAST(student_count AS FLOAT) / total_students * 10000) / 100 as percentage -- Adapt to PostgreSQL which doesn't have round(x, precision).
     FROM
         Summary
     ORDER BY
    -    student_count DESC;
    +    first_letter desc, first_a_letter, student_count DESC;