Search code examples
sasduplicates4gl

Removing similar rows in SAS


I've got two tables with similar structure:
- First table: id and col1,col2,col3 - all numerics.
- Second table: id and col4,col5,col6 - all numerics.

I want to remove from the first one all rows which are similar to any of the rows from the second tagble. I consider a row to be similiar to other row when any column from the group col1-col3 is equal to any of the columns from the group col4-col6. Now I'm doing it in 9 consecutive data steps (first checks whether col1=col4, second col1=col5 , ..., ninth col3=col6), which probably is not the optimal solution.

Any ideas how to improve this?


Solution

  • This is my solution:

    data vec1;
      set ds2;
      array cvar{*} col4 col5 col6;
      do ijk=1 to dim(cvar);
        compvar=cvar(ijk);
        output;
      end;
    run;
    
    proc sql noprint;
      select distinct compvar into :cvars separated by ' '
      from vec1;
    quit;
    %let numcvar=&sqlobs;
    
    data ds1(drop=i);
      set ds1;
      array myvar(i) col:;
      do over myvar;
        if myvar in (&cvars.) then delete;
      end;
    run;
    

    If you run into trouble with the length of the CVARS macro variable you could use this instead:

    data vec1;
      set ds2;
      array cvar{*} col:;
      do ijk=1 to dim(cvar);
        compvar=cvar(ijk);
        output;
      end;
    run;
    
    proc sort data=vec1 out=vec2(keep=compvar) nodupkey;
      by compvar;
    run;
    
    proc transpose data=vec2 out=flat prefix=x;
    run;
    
    data ds1(keep=id col:);
      set ds1b;
      if _n_=1 then set flat;
      array myvar(i) col:;
      array xvar(j) x:;
      do over myvar;
        do over xvar;
          if myvar=xvar then delete;
        end;
      end;
    run;
    

    The PROC SORT could be eliminated but it makes it more efficient for big data sets.

    Or you could generate a format on the fly:

    data vec1;
      set ds2;
      array cvar{*} col4 col5 col6;
      do ijk=1 to dim(cvar);
        compvar=cvar(ijk);
        output;
      end;
    run;
    
    proc sort data=vec1 out=vec2 nodupkey;
      by compvar;
    run;
    
    data fmt1;
      set vec2;
      length start $20;
      fmtname="remobs";
      start=compress(put(compvar,best.));
      label="remove";
    run;
    
    proc format lib=work cntlin=fmt1;
    run;
    
    data ds1(drop=i);
      set ds1;
      array myvar(i) col:;
      do over myvar;
        if put(myvar,remobs.)="remove" then delete;
      end;
    run;
    

    I suspect this last method would be faster than the two preceding solutions.

    UPDATE

    Using hash objects

    data vec1;
      set ds2;
      array cvar{*} col4 col5 col6;
      do ijk=1 to dim(cvar);
        compvar=cvar(ijk);
        output;
      end;
    run;
    
    proc sort data=vec1 out=vec2 nodupkey;
      by compvar;
    run;
    
    data ds1_new(keep=id col1 col2 col3);
      if _n_ = 0 then set work.vec2;
      declare hash myhash(DATASET:'work.vec2') ; 
      rc=myhash.defineKey('compvar'); 
      myhash.defineDone();
      set ds1;
      array rcarr{*} rc1-rc3;
      array lookup{*} col1 col2 col3;
      do i=1 to dim(lookup);
        rcarr(i)=myhash.find(key: lookup(i));
        if rcarr(i)=0 then delete;
      end;
    run;