Search code examples
sqlalgorithmt-sqljaro-winkler

An implementation of the Jaro Winkler distance algorithm in Transact SQL


I've been wondering for months about how to implement this algorithm in Transact SQL, https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance

How can it be done?


Solution

  • Today I finally stumbled upon this Stack Overflow-answer by leebickmtu showing an implementation in C#, originally ported from Java. I took the liberty to port it to a Transact SQL function, enjoy!

    IF OBJECT_ID (N'dbo.InlineMax', N'FN') IS NOT NULL
        DROP FUNCTION dbo.InlineMax;
    GO
    
    CREATE FUNCTION dbo.InlineMax(@valueOne int, @valueTwo int)
        RETURNS FLOAT
    AS
    BEGIN
        IF @valueOne > @valueTwo
        BEGIN
            RETURN @valueOne
        END
    
        RETURN ISNULL(@valueTwo, @valueOne)
    END;
    GO
    
    IF OBJECT_ID (N'dbo.InlineMin', N'FN') IS NOT NULL
        DROP FUNCTION dbo.InlineMin;
    GO
    
    CREATE FUNCTION dbo.InlineMin(@valueOne int, @valueTwo int)
        RETURNS FLOAT
    AS
    BEGIN
        IF @valueOne < @valueTwo
            RETURN @valueOne
    
        RETURN @valueTwo
    END;
    GO
    
    IF OBJECT_ID (N'dbo.JaroWinklerDistance', N'FN') IS NOT NULL
        DROP FUNCTION dbo.JaroWinklerDistance;
    GO
    
    CREATE FUNCTION dbo.JaroWinklerDistance(@stringOne varchar(MAX), @stringTwo varchar(MAX))
    RETURNS FLOAT
    WITH EXECUTE AS CALLER
    AS
    BEGIN
        DECLARE @mWeightThreshold FLOAT; SET @mWeightThreshold = 0.7;
        DECLARE @mNuMChars INT; SET @mNumChars = 4;
        DECLARE @lLen1 int; SET @lLen1 = LEN(@stringOne)
        DECLARE @lLen2 int; SET @lLen2 = LEN(@stringTwo)
    
        IF @lLen1 = 0
            RETURN CASE WHEN @lLen2 = 0 THEN 1 ELSE 0 END
    
        DECLARE @lSearchRange int; SET @lSearchRange = dbo.InlineMax(0,dbo.InlineMax(@lLen1, @lLen2)/2 - 1);
    
        DECLARE @lMatched1 TABLE (position int not null, [status] bit not null)
        DECLARE @lMatched2 TABLE (position int not null, [status] bit not null)
    
        DECLARE @lNumCommon int; SET @lNumCommon = 0
        DECLARE @i int; SET @i = 1; WHILE(@i <= @lLen1)
        BEGIN
            DECLARE @lStart int; SET @lStart = dbo.InlineMax(1, @i - @lSearchRange)
            DECLARE @lEnd int; SET @lEnd = dbo.InlineMin(@i + @lSearchRange + 1, @lLen2)
    
            DECLARE @j int; SET @j = @lStart; WHILE(@j <= @lEnd)
            BEGIN
                IF((SELECT [status] FROM @lMatched2 WHERE position = @j) = 1)
                BEGIN
                    SET @j = @j + 1
                    CONTINUE
                END
    
                IF (SELECT SUBSTRING(@stringOne, @i, 1)) <> (SELECT SUBSTRING(@stringTwo, @j, 1))
                BEGIN
                    SET @j = @j + 1
                    CONTINUE
                END
    
                INSERT INTO @lMatched1 (position, [status]) VALUES(@i, 1)
                INSERT INTO @lMatched2 (position, [status]) VALUES(@j, 1)
    
                SET @lNumCommon = @lNumCommon + 1
                BREAK
            END
    
            SET @i = @i + 1
        END
    
        IF @lNumCommon = 0
        BEGIN
            RETURN 0.0;
        END
    
        DECLARE @lNumHalfTransposed int; SET @lNumHalfTransposed = 0
        DECLARE @k INT; SET @k = 1;
        DECLARE @stopLoop bit; SET @stopLoop = 0;
        SET @i = 1; WHILE(@i <= @lLen1)
        BEGIN
            IF ((SELECT [status] FROM @lMatched1 WHERE position = @i) = 1)
            BEGIN
                SET @i = @i + 1
                CONTINUE;
            END
    
            WHILE(@stopLoop = 0)
            BEGIN
                IF((SELECT [status] FROM @lMatched2 WHERE position = @k) = 0)
                    SET @k = @k + 1
                ELSE
                    BREAK
    
                IF((SELECT SUBSTRING(@stringOne, @i, 1)) <> (SELECT SUBSTRING(@stringTwo, @k, 1)))
                    SET @lNumHalfTransposed = @lNumHalfTransposed + 1
    
                SET @k = @k + 1
            END
    
            SET @i = @i + 1
        END
    
        DECLARE @lNumTransposed INT; SET @lNumTransposed = @lNumHalfTransposed/2;
    
        DECLARE @lNumCommonD FLOAT; SET @lNumCommonD = @lNumCommon
        DECLARE @lWeight FLOAT; SET @lWeight = (@lNumCommonD / @lLen1 + @lNumCommonD / @lLen2 + (@lNumCommon - @lNumTransposed) / @lNumCommonD) / 3.0;
    
        IF(@lWeight <= @mWeightThreshold)
            RETURN @lWeight
        DECLARE @lMax INT; SET @lMax = dbo.InlineMin(@mNumChars, dbo.InlineMin(@lLen1, @lLen2))
        DECLARE @lPos INT; SET @lPos = 0
        WHILE(@lPos < @lMax AND (SELECT SUBSTRING(@stringOne, @lPos, 1)) = (SELECT SUBSTRING(@stringTwo, @lPos, 1)))
        BEGIN
            SET @lPos = @lPos + 1
        END
    
        IF @lPos = 0
            RETURN @lWeight
    
        RETURN @lWeight + 0.1 * @lPos * (1.0 - @lWeight)
    END;
    GO