Search code examples
c# html string-comparison

Compare two strings containing html tags with same attributes but different orders using C#


I'd like to compare two strings containing HTML tags whose attribute values (for example, the properties inside a style attribute) appear in a different order.

Example:

string str1="""<p><strong style="font-size: 36px; color: rgb(153, 51, 255);">Hello </strong><em><u>World</u></em></p>""";

string str2="""<p><strong style="color: rgb(153, 51, 255); font-size: 36px;">Hello </strong><em><u>World</u></em></p>""";

I care about the text as well as the style, so I want the comparison to report the two strings as identical: they contain the same text and render the same HTML. However, a plain string comparison reports the two strings as different.

How can I compare these two strings using C#, not JavaScript?


Solution

  • To compare the two strings, you need to take a few steps:

    1. Reorder the attributes of the HTML tags in both strings into the same order.
    2. Sort the values inside attributes that take multiple values, such as style.
    3. Trim the text values and collapse runs of whitespace.

    To do that, first add the HtmlAgilityPack NuGet package. Here is sample code:

    /// <summary>
    /// Compares two HTML fragments for equivalence by rewriting each one into a
    /// canonical string (attributes sorted by name, <c>style</c> declarations sorted
    /// by property name, whitespace collapsed) and comparing the results.
    /// </summary>
    class Program
    {
        // Matches any run of whitespace; shared by every NormalizeWhitespace call.
        static readonly Regex WhitespaceRun = new Regex(@"\s+");

        static void Main()
        {
            string str1 = "<p><strong style=\"font-size: 36px; color: rgb(153, 51, 255);\">Hello </strong><em><u>World</u></em></p>";
            string str2 = "<p><strong style=\"color: rgb(153, 51, 255); font-size: 36px;\">Hello </strong><em><u>World</u></em></p>";

            bool areIdentical = AreHtmlStringsIdentical(str1, str2);
            Console.WriteLine($"The HTML strings are identical: {areIdentical}");
        }

        /// <summary>
        /// Parses both strings as HTML and reports whether their canonical forms match.
        /// </summary>
        /// <param name="html1">First HTML fragment.</param>
        /// <param name="html2">Second HTML fragment.</param>
        /// <returns><c>true</c> when both fragments canonicalize to the same string.</returns>
        static bool AreHtmlStringsIdentical(string html1, string html2)
        {
            var doc1 = new HtmlDocument();
            doc1.LoadHtml(html1);
            var doc2 = new HtmlDocument();
            doc2.LoadHtml(html2);

            string canonicalHtml1 = GetCanonicalHtml(doc1.DocumentNode);
            string canonicalHtml2 = GetCanonicalHtml(doc2.DocumentNode);

            return canonicalHtml1 == canonicalHtml2;
        }

        /// <summary>
        /// Recursively serializes <paramref name="node"/> into a canonical string.
        /// Text nodes become their whitespace-normalized text; every other node is
        /// written as an opening tag with name-sorted attributes, its children in
        /// document order, and a closing tag.
        /// </summary>
        /// <param name="node">The node to serialize.</param>
        /// <returns>The canonical representation of the node and its descendants.</returns>
        static string GetCanonicalHtml(HtmlNode node)
        {
            if (node.NodeType == HtmlNodeType.Text)
            {
                return NormalizeWhitespace(node.InnerText);
            }

            var builder = new StringBuilder();
            builder.Append('<').Append(node.Name);

            // Ordinal ordering keeps the canonical form independent of the current culture.
            var sortedAttributes = node.Attributes.OrderBy(a => a.Name, StringComparer.Ordinal);
            foreach (var attribute in sortedAttributes)
            {
                string value = attribute.Value;
                if (attribute.Name == "style")
                {
                    // Declaration order inside style does not matter, so sort it too.
                    value = NormalizeStyleAttribute(value);
                }
                builder.Append(' ')
                       .Append(attribute.Name)
                       .Append("=\"")
                       .Append(NormalizeWhitespace(value))
                       .Append('"');
            }
            builder.Append('>');

            foreach (var child in node.ChildNodes)
            {
                builder.Append(GetCanonicalHtml(child));
            }

            builder.Append("</").Append(node.Name).Append('>');

            return builder.ToString();
        }

        /// <summary>
        /// Trims <paramref name="input"/> and collapses every run of whitespace into a single space.
        /// </summary>
        /// <param name="input">The text to normalize.</param>
        /// <returns>The trimmed text with internal whitespace runs replaced by one space.</returns>
        static string NormalizeWhitespace(string input)
        {
            return WhitespaceRun.Replace(input.Trim(), " ");
        }

        /// <summary>
        /// Rewrites a <c>style</c> attribute value as <c>name: value; name: value;</c>
        /// with declarations sorted by property name. Empty declarations are dropped.
        /// </summary>
        /// <param name="style">The raw value of a <c>style</c> attribute.</param>
        /// <returns>
        /// The normalized declaration list, or an empty string when there are no declarations.
        /// </returns>
        static string NormalizeStyleAttribute(string style)
        {
            var declarations = style.Split(';')
                                    .Select(s => s.Trim())
                                    .Where(s => !string.IsNullOrEmpty(s))
                                    .Select(s =>
                                    {
                                        // Split on the FIRST colon only: values such as
                                        // url(http://...) contain colons of their own and
                                        // must not be truncated.
                                        int colon = s.IndexOf(':');
                                        return colon < 0
                                            ? new { Name = s, Value = "" }
                                            : new
                                            {
                                                Name = s.Substring(0, colon).Trim(),
                                                Value = NormalizeWhitespace(s.Substring(colon + 1))
                                            };
                                    })
                                    // OrderBy is stable, so repeated properties keep their
                                    // relative order (the last one wins in CSS).
                                    .OrderBy(d => d.Name, StringComparer.Ordinal)
                                    .Select(d => $"{d.Name}: {d.Value}")
                                    .ToList(); // materialize once; the result is inspected twice below

            return declarations.Count == 0
                ? ""
                : string.Join("; ", declarations) + ";";
        }
    }