Search code examples
sql-serversql-server-2012

Is there any way to do HTML decode in SQL Server?


I have the following records in one of my table

CD&M Communications 
auburndale oil & propane inc  
C F La Fountaine #7561  
Laramie County Fire District # 2  
AmeriGas Propane LP #2250  

Is there a way to remove the characters like &, #7561, #2250 etc.

"&" should be replaced with "&" as per C# HTMLDECODE function


Solution

  • The following SQL function would work in your case or it would be a good starting point for you to extend it. However, please note the String manipulations in the Database [SQL Server] would be slower compared to the string manipulations in application layer.

    GO
    
    IF OBJECT_ID('dbo.MyHTMLDecode') IS NOT NULL BEGIN DROP FUNCTION dbo.MyHTMLDecode END
    
    GO
    CREATE FUNCTION dbo.MyHTMLDecode (@vcWhat VARCHAR(MAX))
    RETURNS VARCHAR(MAX)
    AS
    BEGIN
        DECLARE @vcResult VARCHAR(MAX)
        DECLARE @siPos INT
            ,@vcEncoded VARCHAR(7)
            ,@siChar INT
    
        SET @vcResult = RTRIM(LTRIM(CAST(REPLACE(@vcWhat COLLATE Latin1_General_BIN, CHAR(0), '') AS VARCHAR(MAX))))
    
        SELECT @vcResult = REPLACE(REPLACE(@vcResult, ' ', ' '), ' ', ' ')
    
        IF @vcResult = ''
            RETURN @vcResult
    
        SELECT @siPos = PATINDEX('%&#[0-9][0-9][0-9];%', @vcResult)
    
        WHILE @siPos > 0
        BEGIN
            SELECT @vcEncoded = SUBSTRING(@vcResult, @siPos, 6)
                ,@siChar = CAST(SUBSTRING(@vcEncoded, 3, 3) AS INT)
                ,@vcResult = REPLACE(@vcResult, @vcEncoded, NCHAR(@siChar))
                ,@siPos = PATINDEX('%&#[0-9][0-9][0-9];%', @vcResult)
        END
    
        SELECT @siPos = PATINDEX('%&#[0-9][0-9][0-9][0-9];%', @vcResult)
    
        WHILE @siPos > 0
        BEGIN
            SELECT @vcEncoded = SUBSTRING(@vcResult, @siPos, 7)
                ,@siChar = CAST(SUBSTRING(@vcEncoded, 3, 4) AS INT)
                ,@vcResult = REPLACE(@vcResult, @vcEncoded, NCHAR(@siChar))
                ,@siPos = PATINDEX('%&#[0-9][0-9][0-9][0-9];%', @vcResult)
        END
    
        SELECT @siPos = PATINDEX('%#[0-9][0-9][0-9][0-9]%', @vcResult)
    
        WHILE @siPos > 0
        BEGIN
            SELECT @vcEncoded = SUBSTRING(@vcResult, @siPos, 5)
                ,@vcResult = REPLACE(@vcResult, @vcEncoded, '')
                ,@siPos = PATINDEX('%#[0-9][0-9][0-9][0-9]%', @vcResult)
        END
    
        SELECT @vcResult = REPLACE(REPLACE(@vcResult, NCHAR(160), ' '), CHAR(160), ' ')
    
        SELECT @vcResult = REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(@vcResult, '&amp;', '&'), '&quot;', '"'), '&lt;', '<'), '&gt;', '>'), '&amp;amp;', '&')
    
        RETURN @vcResult
    END
    
    GO
    

    Illustration:

      DECLARE @S VARCHAR(MAX)='CD&amp;amp;amp;M Communications 
        auburndale oil &amp;amp;amp; propane inc  
        C F La Fountaine #7561  
        Laramie County Fire District # 2  
        AmeriGas Propane LP #2250'
    
        SELECT dbo.MyHTMLDecode (@s)
    

    OUTPUT:

    CD&M Communications 
    auburndale oil & propane inc  
    C F La Fountaine   
    Laramie County Fire District # 2  
    AmeriGas Propane LP