What I need to do is compare two strings and mark the differences with begining/ending marks for changes. Example:
this is string number one.
this string is string number two.
output would be
this [is|string is] string number [one|two].
I've been trying to figure this out for some time now. And I found something I blieved would help me do this, but I am unable to make this happen.
http://www.angusj.com/delphi/textdiff.html
I have it about 80% working here, but I've got no idea how to get it to do exactly what I want it to do. Any help would be appreciated.
uses
Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
Dialogs, Diff, StdCtrls;
type
TForm2 = class(TForm)
Edit1: TEdit;
Edit2: TEdit;
Button1: TButton;
Memo1: TMemo;
Diff: TDiff;
procedure Button1Click(Sender: TObject);
private
{ Private declarations }
public
{ Public declarations }
end;
var
Form2: TForm2;
s1,s2:string;
implementation
{$R *.dfm}
procedure TForm2.Button1Click(Sender: TObject);
var
i: Integer;
lastKind: TChangeKind;
procedure AddCharToStr(var s: string; c: char; kind, lastkind: TChangeKind);
begin
if (kind lastkind) AND (lastkind = ckNone) and (kind ckNone) then s:=s+'[';
if (kind lastkind) AND (lastkind ckNone) and (kind = ckNone) then s:=s+']';
case kind of
ckNone: s := s + c;
ckAdd: s := s + c;
ckDelete: s := s + c;
ckModify: s := s + '|' + c;
end;
end;
begin
Diff.Execute(pchar(edit1.text), pchar(edit2.text), length(edit1.text), length(edit2.text));
//now, display the diffs ...
lastKind := ckNone;
s1 := ''; s2 := '';
form2.caption:= inttostr(diff.Count);
for i := 0 to Diff.count-1 do
begin
with Diff.Compares[i] do
begin
//show changes to first string (with spaces for adds to align with second string)
if Kind = ckAdd then
begin
AddCharToStr(s1,' ',Kind, lastKind);
end
else
AddCharToStr(s1,chr1,Kind,lastKind);
if Kind = ckDelete then
begin
AddCharToStr(s2,' ',Kind, lastKind)
end
else AddCharToStr(s2,chr2,Kind,lastKind);
lastKind := Kind;
end;
end;
memo1.Lines.Add(s1);
memo1.Lines.Add(s2);
end;
end.
I took the basicdemo1 from angusj.com and modified it to get this far.
To solve the problem you describe, you'd essentially have to do something like what is done in biological sequence alignment of DNA or protein data. If you only have two strings (or one constant reference string), it can approached by dynamic programming-based pairwise alignment algorithms such as the Needleman Wunsch algorithm* and related algorithms. (Multiple sequence alignment gets much more complicated.)
[* Edit: link should be: http://en.wikipedia.org/wiki/Needleman–Wunsch_algorithm ]
Edit 2:
Since you seem to be interested in comparing at the level of words rather than characters, you could (1) split the input strings into arrays of strings, where each array element represents a word and then (2) carry out the alignment at the level of these words. This has the benefit of the search space for the alignment becoming smaller and thus you'd expect it to be faster overall. I have adapted and 'Delphified' the pseudo-code example from the wikipedia-article accordingly:
program AlignWords;
{$APPTYPE CONSOLE}
function MaxChoice (C1, C2, C3: integer): integer;
begin
Result:= C1;
if C2 > Result then Result:= C2;
if C3 > Result then Result:= C3;
end;
function WordSim (const S1, S2: String): integer; overload;
//Case-sensitive!
var i, l1, l2, minL: integer;
begin
l1:= length(S1);
l2:= length(S2);
Result:= l1-l2;
if Result > 0 then Result:= -Result;
if (S1='') or (S2='') then exit;
minL:= l1;
if l2 < l1 then minL:= l2;
for i := 1 to minL do if S1[i]<>S2[i] then dec(Result);
end;
procedure AlignWordsNW (const A, B: Array of String; GapChar: Char; const Delimiter: ShortString; GapPenalty: integer; out AlignmentA, AlignmentB: string);
// Needleman-Wunsch alignment
// GapPenalty should be a negative value!
var
F: array of array of integer;
i, j,
Choice1, Choice2, Choice3,
Score, ScoreDiag, ScoreUp, ScoreLeft :integer;
function GapChars (const S: String): String;
var i: integer;
begin
assert (length(S)>0);
Result:='';
for i := 0 to length(S) - 1 do Result:=Result + GapChar;
end;
begin
SetLength (F, length(A)+1, length(B)+1);
for i := 0 to length(A) do F[i,0]:= GapPenaltyi;
for j := 0 to length(B) do F[0,j]:= GapPenaltyj;
for i:=1 to length(A) do begin
for j:= 1 to length(B) do begin
Choice1:= F[i-1,j-1] + WordSim(A[i-1], B[j-1]);
Choice2:= F[i-1, j] + GapPenalty;
Choice3:= F[i, j-1] + GapPenalty;
F[i,j]:= maxChoice (Choice1, Choice2, Choice3);
end;
end;
AlignmentA:= '';
AlignmentB:= '';
i:= length(A);
j:= length(B);
while (i > 0) and (j > 0) do begin
Score := F[i,j];
ScoreDiag:= F[i-1,j-1];
ScoreUp:= F[i,j-1];
ScoreLeft:= F[i-1,j];
if Score = ScoreDiag + WordSim(A[i-1], B[j-1]) then begin
AlignmentA:= A[i-1] + Delimiter + AlignmentA;
AlignmentB:= B[j-1] + Delimiter + AlignmentB;
dec (i);
dec (j);
end else if Score = ScoreLeft + GapPenalty then begin
AlignmentA:= A[i-1] + Delimiter + AlignmentA;
AlignmentB:= GapChars (A[i-1]) + Delimiter + AlignmentB;
dec(i);
end else begin
assert (Score = ScoreUp + GapPenalty);
AlignmentA:= GapChars(B[j-1]) + Delimiter + AlignmentA;
AlignmentB:= B[j-1] + Delimiter + AlignmentB;
dec (j);
end;
end;
while (i > 0) do begin
AlignmentA:= A[i-1] + Delimiter + AlignmentA;
AlignmentB:= GapChars(A[i-1]) + Delimiter + AlignmentB;
dec(i);
end;
while (j > 0) do begin
AlignmentA:= GapChars(B[j-1]) + Delimiter + AlignmentA;
AlignmentB:= B[j-1] + Delimiter + AlignmentB;
dec(j);
end;
end;
Type
TStringArray = Array Of String;
Var
as1, as2: TStringArray;
s1, s2: string;
BEGIN
as1:= TStringArray.create ('this','is','string','number','one.');
as2:= TStringArray.Create ('this','string','is','string','number','two.');
AlignWordsNW (as1, as2, '-',' ',-1, s1,s2);
writeln (s1);
writeln (s2);
END.
The output on this example is
this ------ is string number ---- one. this string is string number two. ----
It's not perfect, but you get the idea. From this sort of output, you should be able to do what you want. Note that you might want to tweak the GapPenalty
and the Similarity Scoring Function WordSim
to fit your needs.