Search code examples
sasdatastep

SAS removing duplicates by the reappearance of chunks of records


I have a dataset with the reappearance of chunks of data in identical length groups of observations, such as:

    data have;
    input name $    identifier ;
    cards;
    mary     1
    mary     2
    mary     2
    mary     4
    mary     5
    mary     7
    mary     6
    adam     2
    adam     3
    adam     3
    adam     7
    /*remove*/
    mary     1
    mary     2
    mary     2
    mary     4
    mary     5
    mary     7
    mary     6
    /*remove*/
    adam     8
    mary     1
    mary     2
    mary     3
    mary     4
    mary     5
    mary     7
    mary     6
    adam    9
    mary     1
    mary     2
    mary     3

    ;

I'm hoping to remove the chunk of reappearance of mary marked by /remove/ with ordered identifier. The outcome should be like the following:

mary     1
mary     2
mary     4
mary     5
mary     6
mary     7
adam    2 
adam    3 
adam    7
adam    8
mary     1
mary     2
mary     3
mary     4
mary     5
mary     6
mary     7
adam     9
mary     1
mary     2
mary     3

Thank you for any help! Someone suggested a method by Hash table, but I suspect that I may not have enough memory to process the code. Could this be done by datasteps or proc sql?


Solution

  • If the maximum number of records per group is small enough then here is method you could use that builds a string with the list of identifiers in the group and uses it as one of the keys in the HASH.

    data want ;
    do until (last.name);
      set have ;
      by name notsorted ;
      length taglist $200 ;
      taglist=catx('|',taglist,identifier);
    end;
    if _n_=1 then do;
      dcl hash h();
      h.defineKey('name','taglist');
      h.defineDone();
    end;
    found = 0 ne h.add();
    do until (last.name);
      set have ;
      by name notsorted ;
      if not found then output;
    end;
    drop found taglist;
    run;
    

    If the keys are too large to fit into a hash object then you will need to do multiple pass. First find the groups. Then find the first occurrence of each type of group. Then generate the data for those groups.

    data pass1 ;
      group + 1;
      first_obs=row+1;
      do until (last.name);
        set have ;
        by name notsorted ;
        length taglist $200 ;
        taglist=catx('|',taglist,identifier);
        row+1;
      end;
      last_obs=row;
      output;
      keep group name taglist first_obs last_obs;
    run;
    
    proc sql ;
      create table pass2 as
        select group,first_obs,last_obs
        from pass1
        group by name,taglist
        having min(group)=group
        order by group
      ;
    quit;
    
    data want;
      set pass2;
      do obs=first_obs to last_obs;
        set have point=obs;
        output;
      end;
      drop /*group*/ first_obs last_obs ;
    run;
    

    Result:

    Obs    group    name    identifier
    
      1      1      mary         1
      2      1      mary         2
      3      1      mary         2
      4      1      mary         4
      5      1      mary         5
      6      1      mary         7
      7      1      mary         6
      8      2      adam         2
      9      2      adam         3
     10      2      adam         3
     11      2      adam         7
     12      4      adam         8
     13      5      mary         1
     14      5      mary         2
     15      5      mary         3
     16      5      mary         4
     17      5      mary         5
     18      5      mary         7
     19      5      mary         6
     20      6      adam         9
     21      7      mary         1
     22      7      mary         2
     23      7      mary         3