Search code examples
c#sql-serversql-server-2008-r2collationsqlclr

How can I get a SQL String's collation within a CLR function?


I'm writing a Levenshtein Distance function in C# to calculate the edit distance between two strings. The problem is that I'd like to call the method multiple times with different collations but only one collation ever makes it across the SQL to CLR interface - and that is the default collation of the database.

Here is the code for the CLR Function:

[SqlFunction(IsDeterministic = true, Name = "LevenshteinDistance")]
public static SqlInt64 Distance(SqlString textA, SqlString textB)
{
    // get a collation-aware comparer so string/character comparisons 
    // will match the inputs' specified collation
    var aCompareInfo = textA.CompareInfo;
    var compareOptions = ConvertCompareOptions(textA.SqlCompareOptions);
    var aLength = textA.Value.Length;
    var bLength = textB.Value.Length;

    // degenerate cases
    if (aCompareInfo.Compare(textA.Value, 0, aLength, textB.Value, 0, bLength, compareOptions) == 0) { return 0; }
    if (aLength == 0) { return bLength; }
    if (bLength == 0) { return aLength; }

    // create two work vectors of integer distances
    var previousDistances = new SqlInt64[Maximum(aLength, bLength) + 1];
    var currentDistances = new SqlInt64[Maximum(aLength, bLength) + 1];

    // initialize previousDistances (the previous row of distances)
    // this row is A[0][i]: edit distance for an empty textA
    // the distance is just the number of characters to delete from textB
    for (var i = 0; i < previousDistances.Length; i++)
    {
        previousDistances[i] = i;
    }

    for (var i = 0; i < aLength; i++)
    {
        // calculate currentDistances from the previous row previousDistances

        // first element of currentDistances is A[i+1][0]
        //   edit distance is delete (i+1) chars from textA to match empty textB
        currentDistances[0] = i + 1;

        // use formula to fill in the rest of the row
        for (var j = 0; j < bLength; j++)
        {
            var cost = (aCompareInfo.Compare(textA.Value, i, 1, textB.Value, j, 1, compareOptions) == 0) ? 0 : 1;
            currentDistances[j + 1] = Minimum(currentDistances[j] + 1, previousDistances[j + 1] + 1, previousDistances[j] + cost);
        }

        // copy currentDistances to previousDistances for next iteration
        for (var j = 0; j < previousDistances.Length; j++)
        {
            previousDistances[j] = currentDistances[j];
        }
    }

    return currentDistances[bLength];
}

After deploying the CLR assembly to SQL Server (2008 R2) and calling it like this:

print dbo.LevenshteinDistance('abc' collate Latin1_General_CI_AI, 'ABC' collate Latin1_General_CI_AI)
print dbo.LevenshteinDistance('abc' collate Latin1_General_CS_AS_KS_WS, N'ABC' collate Latin1_General_CS_AS_KS_WS)

Both calls return zero (0). Because I specified a case-sensitive collation for the second call, I expected that second call to return three (3).

Using CLR functions in SQL Server, is it possible to specify collations other than the database default and have them used within a CLR function? If so, how?


Solution

  • Not seeing any alternatives on the Internet or responses to this question, I decided to specify the desired collation attributes as function parameters and select a CultureInfo object and CompareOptions based upon the inputs or the default collation passed in from the database.

    [SqlFunction(IsDeterministic = true, Name = "LevenshteinDistance")]
    public static SqlInt64 Distance(SqlString textA, SqlString textB, int? lcid, bool? caseInsensitive, bool? accentInsensitive, bool? kanaInsensitive, bool? widthInsensitive)
    {
        // get a collation-aware comparer so string/character comparisons 
        // will match the inputs' specified collation
        //var aCompareInfo = textA.CompareInfo;
        var aCompareInfo = CultureInfo.GetCultureInfo(lcid ?? textA.LCID).CompareInfo;
        //var compareOptions = ConvertCompareOptions(textA.SqlCompareOptions);
        var compareOptions = GetCompareOptions(caseInsensitive, accentInsensitive, kanaInsensitive, widthInsensitive);
    
        // ...  more code ...
    
        // first comparison
        if (aCompareInfo.Compare(textA.Value, 0, aLength, textB.Value, 0, bLength, compareOptions) == 0) { return 0; }
    
        // ...  more code ...
    
        var cost = (aCompareInfo.Compare(textA.Value, i, 1, textB.Value, j, 1, compareOptions) == 0) ? 0 : 1;
    
        // ...  more code ...
    }
    
    private static CompareOptions GetCompareOptions(bool? caseInsensitive, bool? accentInsensitive, bool? kanaInsensitive, bool? widthInsensitive)
    {
        var compareOptions = CompareOptions.None;
    
        compareOptions |= (caseInsensitive ?? false) ? CompareOptions.IgnoreCase : CompareOptions.None;
        compareOptions |= (accentInsensitive ?? false) ? CompareOptions.IgnoreNonSpace : CompareOptions.None;
        compareOptions |= (kanaInsensitive ?? false) ? CompareOptions.IgnoreKanaType : CompareOptions.None;
        compareOptions |= (widthInsensitive ?? false) ? CompareOptions.IgnoreWidth : CompareOptions.None;
    
        return compareOptions;
    }
    

    After updating my assembly and UDF declarations, I can call the function like so:

    print dbo.LevenshteinDistance('abc', 'ABC', null, 1, 1, 1, 1)
    print dbo.LevenshteinDistance('abc', 'ABC', null, 0, 0, 0, 0)
    

    And now the first call returns 0 (database default culture, everything Insensitive) while the second call returns 3 (database default culture, everything Sensitive).