Search code examples
sqlperformancegroup-bycountsas

SAS Code for Counts by Group for a Large Dataset


I'm looking for an efficient way to produce sums by week ('weekstart') as a variable ('weekcount') within a dataset of tens of millions of rows. I need it as a variable for further analysis later on, before reducing the dataset. The code below takes too long to run and had to terminate, so I'm wondering if there's a more efficient way of producing these group sums for later use.

proc sql;
select weekstart, count(*) as Weekcount
from pharm
group by weekstart
order by weekstart;
quit;

Thank you


Solution

  • You can use a hash object to compute bin frequency when a very large table is not being processed efficiently by Proc FREQ or Proc SQL.

    data have;
      call streaminit(20230420);
    
      do _n_ = 1 to 2e6;
        pid = rand('integer', 1, 2.5e5);
        weekstart = intnx ('week', '01jan2015'd + rand('integer', 1, 2500), 1);
        output;
      end;
      format weekstart yymmdd10.;
    run;
    
    data weekstart_freq;
      weekstart = 0;
    
      declare hash bins(ordered:'a', suminc:'_n_');
      bins.defineKey('weekstart');
      bins.defineDone();
    
      do while (not lastrow);
        set have(keep=weekstart) end=lastrow;
        bins.ref(); /* perform bin counting via suminc var */
      end;
    
      declare hiter foreachbin('bins');
    
      do while (0=foreachbin.next());
        bins.sum(sum:freq);  /* retrieve bin count */
        output;
      end;
    
      stop;
    run;
    

    Here is a similar alternative that uses the HASH keysum: argument to make the freq variable automatically part of the .output () data set

    data _null_ ;
      weekstart = 0;
    
      declare hash bins(ordered:'a', suminc:'_n_', keysum:'freq');
      bins.defineKey('weekstart');
      bins.defineDone();
      retain freq 0 ;
    
      do while (not lastrow);
        set have(keep=weekstart) end=lastrow;
        bins.ref(); /* perform bin counting (freq accumulation) via suminc var */
      end ;
    
      bins.output(dataset:'weekstart_freq2') ;
      stop ;
    run ;