Search code examples
sql sql-server t-sql window-functions

Partitioning in SQL


I've been trying to find the median number of times an account (person) is seen (appt_id) by a provider (provider_code) in a given period. The attached SQL doesn't seem to be capturing all of the provider_codes and I can't figure out why. The desired outcome is that every provider_code is listed with a median number.

*I don't have access to MS SQL Server 2012 or Newer - yes we are way behind the times and yes it does make life much more difficult.

Sample_data

-- Attempt at a per-provider median, reported per (office_location, provider_code).
-- What this statement actually returns is COUNT(account) over the rows that
-- survive the RowAsc/RowDesc filter below (the rows around the middle of each
-- partition when ordered by account) -- not the median of per-account visit counts.
SELECT
   provider_code, office_location,
  CONVERT(INT, count(account)) AS Median

FROM
(
   -- One output row per qualifying appointment row, numbered twice inside each
   -- (office_location, provider_code) partition: once ascending and once
   -- descending by account.
   SELECT
      office_location,provider_code,
   account,appt_date,dept_code,appt_status,appt_class,
      ROW_NUMBER( ) OVER (
         PARTITION BY office_location,provider_code
         ORDER BY account ASC) as RowAsc, 

      ROW_NUMBER( ) OVER (
         PARTITION BY office_location,provider_code
         ORDER BY account DESC) as RowDesc

   -- NOLOCK allows dirty reads of uncommitted data from the view.
   FROM appointments_view WITH(NOLOCK)


-- NOTE(review): '1/1/17' and '1/1/19' are parsed according to the session's
-- language/DATEFORMAT settings, and BETWEEN includes both end points -- confirm
-- this matches the intended 24-month window and the data type of appt_date.
WHERE account IS NOT NULL AND appt_date BETWEEN '1/1/17' /*24 month prior*/ AND '1/1/19' 


 ) X


-- Keeps only the "middle" rows: those whose ascending and descending row numbers
-- are equal or differ by one.
-- NOTE(review): ROW_NUMBER() breaks ties arbitrarily. If an account can occur on
-- more than one row of a partition (likely, since repeat visits are what is being
-- measured), RowAsc and RowDesc are not guaranteed to be mirror images of each
-- other, so a partition can end up with no row passing this filter and its
-- provider_code then disappears from the result -- confirm against the data.
WHERE 
   RowAsc IN (RowDesc, RowDesc - 1, RowDesc + 1)


GROUP BY office_location,provider_code
ORDER BY office_location,provider_code

Solution

  • For a median you could use the window function PERCENTILE_CONT or PERCENTILE_DISC
    (MS Sql Server 2012+)

    Example snippet:

    -- Demo (SQL Server 2012+): median number of appointments per account,
    -- reported for every (provider_code, office_location) pair.
    DECLARE @Appointments TABLE (
        appt_id         INT         PRIMARY KEY IDENTITY(4046100, 1),
        appt_date       DATE        NOT NULL DEFAULT GETDATE(),
        account         INT         NOT NULL,
        provider_code   VARCHAR(10) NOT NULL,
        office_location CHAR(3)     NOT NULL DEFAULT 'REN',
        appt_class      CHAR(3)     NOT NULL
    );

    -- Sample rows: account 100001 has 1 appointment, 100002 has 4, 100003 has 3.
    INSERT INTO @Appointments (appt_date, account, provider_code, appt_class)
    VALUES
        ('2019-02-01', 100001, 'FOO1', 'IND'),
        ('2019-02-01', 100002, 'FOO1', 'IND'),
        ('2019-02-01', 100002, 'FOO1', 'PSY'),
        ('2019-02-01', 100002, 'FOO1', 'IND'),
        ('2019-02-01', 100002, 'FOO1', 'IND'),
        ('2019-02-01', 100003, 'FOO1', 'IND'),
        ('2019-02-01', 100003, 'FOO1', 'IND'),
        ('2019-02-01', 100003, 'FOO1', 'IND');

    WITH appointments_per_account AS (
        -- Step 1: how many appointments each account had with each provider/office
        -- inside the reporting window.
        SELECT
            provider_code,
            office_location,
            account,
            COUNT(appt_id) AS TotalAppointments
        FROM @Appointments
        WHERE account IS NOT NULL
          AND appt_date BETWEEN CAST('2017-02-01' AS DATE) AND CAST('2019-02-01' AS DATE)
        GROUP BY
            provider_code,
            office_location,
            account
    ),
    median_per_row AS (
        -- Step 2: attach the partition-wide median of those counts to every row.
        -- PERCENTILE_DISC(0.5) could be used here instead to get a value that
        -- actually occurs in the data rather than an interpolated one.
        SELECT
            provider_code,
            office_location,
            PERCENTILE_CONT(0.5)
                WITHIN GROUP (ORDER BY TotalAppointments)
                OVER (PARTITION BY provider_code, office_location) AS MedianContTotalAppointments
        FROM appointments_per_account
    )
    -- Step 3: the median is identical on every row of a partition, so MAX simply
    -- collapses each partition to a single output row.
    SELECT
        provider_code,
        office_location,
        MAX(MedianContTotalAppointments) AS MedianApt
    FROM median_per_row
    GROUP BY
        provider_code,
        office_location
    ORDER BY
        provider_code,
        office_location;
    

    Returns:

    provider_code   office_location     MedianApt
    FOO1            REN                 3
    

    For an MS SQL Server version before 2012, this example snippet might work:

    -- Demo (SQL Server before 2012, i.e. without PERCENTILE_CONT): median number
    -- of appointments per account for every (provider_code, office_location) pair.
    DECLARE @Appointments TABLE (
     appt_id int PRIMARY KEY IDENTITY(4046100,1),
     appt_date date NOT NULL DEFAULT GETDATE(),
     account int NOT NULL,
     provider_code varchar(10) NOT NULL,
     office_location char(3) NOT NULL DEFAULT 'REN',
     appt_class char(3) NOT NULL
     );

    -- Sample rows: account 100001 has 1 appointment, 100002 has 4, 100003 has 3.
    INSERT INTO @Appointments (appt_date, account, provider_code, appt_class) VALUES
     ('2019-02-01',100001,'FOO1','IND'),('2019-02-01',100002,'FOO1','IND'),('2019-02-01',100002,'FOO1','PSY'),('2019-02-01',100002,'FOO1','IND')
    ,('2019-02-01',100002,'FOO1','IND'),('2019-02-01',100003,'FOO1','IND'),('2019-02-01',100003,'FOO1','IND'),('2019-02-01',100003,'FOO1','IND')
    -- Uncomment the next line to add a fourth account (2 appointments) and exercise
    -- the even-count case: counts 1,2,3,4 -> median 2.5.
    --,('2019-02-01',100004,'FOO1','IND'),('2019-02-01',100004,'FOO1','IND')
    ;

    SELECT
        provider_code,
        office_location,
        -- COUNT() yields INT and AVG over INT truncates (AVG of 2 and 3 would be 2),
        -- so cast to FLOAT first; this also matches the FLOAT that PERCENTILE_CONT
        -- returns on SQL Server 2012+.
        AVG(CAST(TotalAppointments AS float)) AS MedianApt
    FROM
    (
        -- One row per account within a provider/office, carrying:
        --   TotalAppointments : appointments for that account in the date window
        --   rn                : position of the account when ordered by that count
        --   cnt               : number of accounts in the partition
        SELECT
            provider_code,
            office_location,
            account,
            COUNT(appt_id) AS TotalAppointments,
            ROW_NUMBER() OVER (PARTITION BY provider_code, office_location ORDER BY COUNT(appt_id) ASC) AS rn,
            COUNT(*) OVER (PARTITION BY provider_code, office_location) AS cnt
        FROM @Appointments
        WHERE account IS NOT NULL
          AND appt_date BETWEEN CAST('2017-02-01' AS date) AND CAST('2019-02-01' AS date)
        GROUP BY provider_code, office_location, account
    ) q
    -- Middle position(s): a single row when cnt is odd (FLOOR = CEILING), the two
    -- central rows when cnt is even; the AVG above then combines them.
    WHERE rn IN (FLOOR((cnt + 1) * 0.5), CEILING((cnt + 1) * 0.5))
    GROUP BY provider_code, office_location
    ORDER BY provider_code, office_location;