Search code examples
textsasmacrosanalyticsmatching

How to implement the Jacccard macro in a data step for a matching problem?


I tried implementing a solution into my sas code but with no luck. I'm trying to add a jaccard distance column. to my dataset. I keep getting errors : variable name & is not valid invalid value for the keep option The idea is to solve a matching problem between two datasets and to take into consideration the typing errors.

data table_test;
    input nom1 $3. nom2 $3.;
cards;
abcade
vdenfr
azfefs
;
run;

%macro kshingling
(string
,k=5
,out=&sysmacroname.
)
;

data &out.;
   string = strip(prxchange('s#\s# #',-1,symget('string')));
   do _n_ = 1 to lengthn(string)-&k.+1;
      ngram = substr(string,_n_,&k.);
      output;
   end;
run;

%mend;



%macro jaccard
(string1
,string2
)
;

%kshingling(&string1.,k=2,out=s1)
%kshingling(&string2.,k=2,out=s2)

proc append base=s1 data=s2; run;

proc freq data=s1 noprint;
   tables string*ngram / out=s2;
run;

proc transpose data=s2 out=s1(drop=_name_ _label_); 
by string notsorted;
var count;
id ngram;
run;

proc stdize data=s1 out=s2 missing=0 reponly;
var _numeric_;
run;

proc distance data=s2 method=jaccard absent=0 out=s1; 
var anominal(_numeric_);
id string;
run;

data t(keep=&string1.);
set s1(firstobs=2);
run;

data _null_;
set t;
call symput('Jaccard',&string1.);
%put Distance de Jaccard = &Jaccard;
run;

%mend;

data test;
set table_test;
call symput('n1',nom1);
call symput('n2',nom2);
%jaccard(&n1,&n2);
run;

data Jacc;

Dist_Jacc=&Jaccard;
run;

data Final; merge table_test Jacc; run;





Solution

  • Looks to me like the OUTPUT of your macro is the dataset T. You can use PROC APPEND to aggregate the results of multiple macro calls into a single dataset. You can then combine that data with your input dataset of ngrams.

    data _null_;
      set table_test;
      call execute(cats('%nrstr(%jaccard)(',nom1,',',nom2,');'));
      call execute('proc append base=result data=t; run;');
    run;
    
    data want;
       set table_test;
       set result;
    run;
    

    BUT you will need to make sure the generated T dataset has THE EXACT SAME STRUCTURE each time.

    So change the ending steps of the macro to this single step so that the dataset T always consists of ONE observation and ONE variable and the variable is named Jaccard. You can also use the %GLOBAL statement to make sure that the value of JACCARD macro variable is available after the macro finishes.

    %if not %symexist(jaccard) %then %global jaccard;
    data t ;
      set s1(keep=&string1. rename=(&string1.=Jaccard) obs=2 firstobs=2);
      call symputx('Jaccard',Jaccard);
    run;
    %put Distance de Jaccard = &Jaccard;