Search code examples
loopssasprocpercentileweighted-average

SAS calculate CDF as the percentage of values <= any defined values in a weighted variable


The idea is, for a weighted variable "leadb" in a dataset, starts with a value B, and then finds the CDF as the percentage of values <= B, together with a confidence interval for that percentage.

I can do this using PROC SURVEYMEAN for any single value specified, but I don't know how to let SAS to give me multiple percentages at the same time. If I would like to calculate the percentage for values between 0 and max+1, with the interval of (max+1)/100, how should I revise my code?

Thanks!

data Test;
  set data2012;
  if leadb <= 5 then lead5 = 1;
  else if leadb ne . then lead5 = 0;
  else lead5=.;
  if (gender = 2 and age >= 16 and age <= 49) then wocba = 1;
  else wocba = 0;
run;

proc surveymeans data=Test;
  strata stratum;
  cluster psu;
  weight weight2;
  var lead5;
  domain wocba; 
  ods output domain=mystats;
run; 

data mystats;
  set mystats;
  where wocba = 1;
  lower = max(lowerclmean,0); /* since lower bound might be zero, but proportion is >= 0 */
  upper = max(upperclmean,0); 
run;

proc print data=mystats;
  title "Proportion of blood lead values >= 5 for women of child-bearing age (16-49)";
  title2 "Weighted by rates of giving birth by age and race";
  title3 "With a 95% confidence interval";
run;

Solution

  • To calculate a cumulative distribution function taking weighted survey design into account, you need to do several steps.

    1. Bin the data into the intervals you want.
    2. Run the binned data through PROC SURVEYFREQ to get weighted percents for each interval
    3. Use the weighted percents to calculate the cumulative weighted percent
    4. Calculate the 95% confidence interval

    (Note: I'm not sure how legitimate it is to use the StdErr estimated for percent on the cumulative percent. You have to decide that for yourself. But if it were me, I would use it.)

    See the code below. I hope this helps!

    *** GENERATE TEST DATA ***;
    data have;
        do i=1 to 200;
    
            leadb = ranexp(123321) * 5;
    
            *** SURVEY VARIABLES ***;
            stratum = mod(i, 12) + 1;
            if ranuni(456654) > 0.5 then psu = 1;
            else psu = 2; 
            weight2 = ranuni(1991) * 1000;
    
            *** DOMAIN VARIABLE ***;
            if ranuni(789987) > 0.7 then wocba = 1;
            else wocba = 0;
    
            output;
        end;
    run;
    
    
    *** GET MIN/MAX ***;
    proc summary data=have;
        var leadb;
        output out=stats min=min  max=max ;
    run;
    
    
    *** USE MIN/MAX TO CREATE INTERVALS TO BIN THE DATA BY APPLYING A FORMAT ***;
    *** CREATE A CONTROL DATASET THAT WILL BE CONVERTED INTO A FORMAT ***;
    data control_dset;
        set stats (drop=_type_ _freq_);
        min=floor(min);
        max=ceil(max);
        *** CALCULATE INTERVAL BASED ON MIN AND MAX OF DATA ***;
        interval = round( (max - min + 1)/100 , 0.1);
    
        fmtname = 'leadfmt';
        type = 'n';
        eexcl = 'Y';    *** END VALUE IS EXCLUDED FROM RANCE ***;
        do i = min to max by interval;
            start = i;
            end = i + interval;
    
            label = start;
            output;
        end;
    run;
    
    *** CONVERT CONTROL DATASET TO A FORMAT ***;
    proc format cntlin=control_dset;
    run;
    
    *** APPLY FORMAT TO BIN THE DATA INTO INTERVALS ***;
    data start;
        set have;
        lead_interval = put(leadb, leadfmt.) + 0;
    run;
    
    
    ODS TRACE ON / LISTING;
    
    *** USE SURVEMEANS TO GET CUMULATIVE FREQUENCIES FOR BINNED CATEGORIES ***;
    *** NOTE: SURVEYFREQ DOES -NOT- HAVE A DOMAIN STATEMENT ***;
    *** INSTEAD, PUT DOMAIN VARIABLE IN TABLE STATEMENT AND THEN GET APPROPRIATE ROW OR COL PERCENT FROM OUTPUT ***;
    proc surveyfreq data=start;
        ods output summary=summary; 
        ods output crosstabs=crosstabs; 
        strata stratum;
        cluster psu;
        weight weight2;
        *** USE THE DOMAIN / SUBPOPULATION VARIABLE IN THE TABLE STATEMENT ***; 
        tables wocba * lead_interval / row ;
    run; 
    
    ods trace off;
    
    
    *** CALCULATE CUMULATIVE PERCENT ***;
    data really_close;
        set crosstabs;
        retain CumRowPercent 0;
    
        *** SUBSET ROW PERCENTS FOR DOMAIN, ALSO DELETE IF COUNT = 0 ***;
        if wocba = 1 and strip(F_lead_interval) not= 'Total' and frequency > 0;
    
        *** CALCULATE CUMULATIVE PERCENT ***;
        CumRowPercent = sum(RowPercent, CumRowPercent);
    
        drop Percent StdErr StdDev;
    run;
    
    
    *** I AM NOT SURE HOW LEGITIMATE IT IS TO USE USE THE RowStdErr WITH THE CUMULATIVE ROW PERCENTS ***;
    *** CONSULT YOUR FAVORITE STATISTICIAN FOR A FIRM OPINION!!! ***;
    
    *** GET T-STATISTIC TO CALCULATE 95% CONFIDENCE INTERVAL ***;
    data tstat;
        *** SUMMARY STATISTICS FROM PROC SURVEYFREQ ***;
        set summary end=lastrec;
        retain nclus nstrat;
    
        if index( upcase(Label1), 'STRATA') then nstrat = nvalue1;
        else if index( upcase(Label1), 'CLUSTER') then nclus = nvalue1;
    
        *** DEGREES OF FREEDOM = NUMBER OF CLUSTERS - NUMBER OF STRATA ***;
        df = nclus - nstrat;
    
        *** GET T-STATISTIC FOR 95% CONFIDENCE INTERVAL ***;
        tstat = abs( quantile('T', 0.05/2, df) );
    
        if lastrec;
        drop label1 cvalue1 nvalue1;
    run;
    
    
    *** CALCULATE 95% CI ***;
    data want;
        set really_close ;
        if _N_ =1 then set tstat;
    
        CumRowPct_Lower = CumRowPercent - tstat * RowStdErr;
        CumRowPct_Upper = CumRowPercent + tstat * RowStdErr;
    
        if CumRowPct_Lower < 0 then CumRowPct_Lower = 0;
        if CumRowPct_Upper > 100 then CumRowPct_Upper = 100;
    
        keep lead_interval CumRowPercent CumRowPct_Lower CumRowPct_Upper;
    run;