I'm looking for an API that will visually show HTML differences in structure, characters/words, and style. The tool must also support double-byte characters and be flexible enough for me to add it to my existing website so that the results of the comparison can be shown easily. I'm currently using the Component Software COM implementation, which doesn't support double-byte characters and hasn't been updated in about six years.
This is what I used:
I had to write my own methods to do the comparison, but after a little work it looks fine. This implementation compares the text exactly as passed in, so it works fine if you are just comparing two text strings. My diff_prettyHtml method was changed to:
/// <summary>
/// Renders a list of diffs as an HTML fragment. Inserted text is wrapped in
/// &lt;ins class='diff'&gt;, deleted text in &lt;del class='diff'&gt;.
/// </summary>
/// <param name="diffs">The diffs to render, in document order.</param>
/// <returns>The HTML markup for the whole diff.</returns>
public string diff_prettyHtml(List<Diff> diffs)
{
    StringBuilder html = new StringBuilder();
    foreach (Diff aDiff in diffs)
    {
        // The diff text is plain text, so it must be HTML-escaped before it is
        // placed inside markup. '&' is escaped first so that the entities
        // produced for '<' and '>' are not escaped a second time.
        string text = aDiff.text.Replace("&", "&amp;").Replace("<", "&lt;")
            .Replace(">", "&gt;").Replace("\n", "<br>");
        switch (aDiff.operation)
        {
            case Operation.INSERT:
                html.Append("<ins class='diff'>").Append(text)
                    .Append("</ins>");
                break;
            case Operation.DELETE:
                html.Append("<del class='diff'>").Append(text)
                    .Append("</del>");
                break;
            case Operation.EQUAL:
                // NOTE(review): unchanged text is wrapped in <span>, as the
                // upstream diff_match_patch.diff_prettyHtml does - confirm this
                // matches the markup the page's stylesheet expects.
                html.Append("<span>").Append(text).Append("</span>");
                break;
        }
    }
    return html.ToString();
}
Now, if you want to show a comparison preview of two HTML strings, things are a little different. This is what I did:
// Caller-side usage: create the diff engine, diff the HTML, and render the
// resulting diff list as preview markup with diff_previewHtml.
DiffMatchPatch.diff_match_patch diff = new DiffMatchPatch.diff_match_patch();
// NOTE(review): the remaining argument(s) of this diff_main call are not visible
// in this excerpt - the statement is truncated after oldHtml.
List<DiffMatchPatch.Diff> differences = diff.diff_main(oldHtml,
return diff.diff_previewHtml(differences);
// Renders a list of diffs as HTML for the HTML-vs-HTML preview case.
// Unlike diff_prettyHtml, the diff text is appended as-is: no HTML escaping
// and no newline replacement is applied to aDiff.text here.
// Returns the accumulated markup as a string.
public string diff_previewHtml(List<Diff> diffs) {
StringBuilder html = new StringBuilder();
foreach (Diff aDiff in diffs) {
// Raw diff text, used unmodified.
string text = aDiff.text;
switch (aDiff.operation) {
case Operation.INSERT:
// NOTE(review): this Append chain is truncated in this excerpt; presumably
// the closing </ins> tag and a break follow - confirm.
html.Append("<ins class='diff'>").Append(text)
case Operation.DELETE:
// NOTE(review): truncated as above; presumably the closing </del> tag and a
// break follow - confirm.
html.Append("<del class='diff'>").Append(text)
case Operation.EQUAL:
// NOTE(review): the body of the EQUAL case is not visible in this excerpt.
return html.ToString();
The Unicoder class is as follows:
using System.Collections;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Linq;
namespace HtmlCompare
class Unicoder
// Lookup from a matched tag string to the placeholder string that pushHash
// stored for it.
private Hashtable _htmlHash = new Hashtable();
// Pattern html2plain passes to Regex.Replace to find the markup it swaps for
// placeholders.
// NOTE(review): "(S*?)" matches zero or more literal 'S' characters; possibly
// "\S" was intended and the backslash was lost - confirm.
private const string _htmlPattern = @"<(S*?)[^>]*>.*?|<.*?\/>";
// Tag types ("img", "br") that GoThroughDiffParts tests for via Contains.
private List<string> _blockElements = "img,br".Split(',').ToList<string>();
// Code point that pushHash converts into a placeholder string; 44032 is 0xAC00.
private int _currentHash = 44032;
/// <summary>
/// Returns the placeholder string registered for <paramref name="tag"/>,
/// registering a new one the first time the tag is seen.
/// </summary>
/// <param name="tag">The exact tag text to look up or register.</param>
/// <returns>The placeholder string stored for this tag.</returns>
public string pushHash(string tag)
{
    if (_htmlHash[tag] == null)
    {
        _htmlHash[tag] = char.ConvertFromUtf32(_currentHash);
        // Advance the code point so the next distinct tag receives a different
        // placeholder; otherwise every tag would share one placeholder and the
        // placeholders could not be mapped back to their tags.
        _currentHash++;
    }
    return _htmlHash[tag].ToString();
}
/// <summary>
/// Match evaluator used by html2plain: yields the placeholder that pushHash
/// returns for the matched text.
/// </summary>
/// <param name="tag">The regex match whose value is looked up.</param>
/// <returns>The placeholder string for the matched text.</returns>
private string tagMatch(Match tag)
{
    string matchedText = tag.Value;
    return pushHash(matchedText);
}
/// <summary>
/// Replaces every match of _htmlPattern in <paramref name="html"/> with the
/// placeholder produced by tagMatch.
/// </summary>
/// <param name="html">The markup to convert.</param>
/// <returns>The input with each pattern match replaced by its placeholder.</returns>
public string html2plain(string html)
{
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Multiline;
    return Regex.Replace(html, _htmlPattern, tagMatch, options);
}
// Breaks the contents of a diff (ins/del) element into TagDefinition parts -
// text runs (Tag = false) and tags (Tag = true) - registers each one through
// AddTagDefinition, and hands the list to GoThroughDiffParts.
// tagStart / tagEnd: the opening and closing diff tag text.
// contents: the text found between those tags.
private string ProcessDiffTag(string tagStart, string tagEnd, string contents)
ArrayList diffTagParts = new ArrayList();
// NOTE(review): the pattern argument of this Regex.Matches call is not visible
// in this excerpt - the statement is truncated after "contents,".
MatchCollection matches = Regex.Matches(contents,
RegexOptions.IgnoreCase | RegexOptions.Multiline);
if (matches.Count > 0)
// Start and end offsets (within contents) of the match being processed.
int contentsStringIndex = 0;
int contentsStringEndIndex = 0;
// End offset of the previously processed match.
int lastContentStringIndex = 0;
bool lastTag = false;
TagDefinition definition;
foreach (Match currentMatch in matches)
contentsStringIndex = currentMatch.Index;
contentsStringEndIndex = contentsStringIndex + currentMatch.Length;
// True when this is the final match in the collection.
lastTag = (currentMatch == matches[matches.Count - 1]);
// did we miss text that isn't a tag?
if (contentsStringIndex > lastContentStringIndex)
// Text between the previous match and this one becomes a non-tag part.
definition = new TagDefinition();
definition.Tag = false;
definition.Text = contents.Substring(lastContentStringIndex, contentsStringIndex - lastContentStringIndex);
AddTagDefinition(diffTagParts, definition);
// NOTE(review): as an "else if" of the check above, trailing text appears to
// be captured only when no text preceded the last tag, and it is added before
// the tag itself - verify the original brace structure and intent.
else if (lastTag && contents.Length > contentsStringEndIndex) // something after the last tag?
definition = new TagDefinition();
definition.Tag = false;
definition.Text = contents.Substring(contentsStringEndIndex, contents.Length - contentsStringEndIndex);
AddTagDefinition(diffTagParts, definition);
// work on current tag
definition = new TagDefinition();
definition.Tag = true;
definition.OpeningTag = !IsClosingTag(currentMatch.Value);
definition.TagType = GetTagType(currentMatch.Value);
definition.Text = currentMatch.Value;
AddTagDefinition(diffTagParts, definition);
// Remember where this match ended so the next iteration can detect skipped text.
lastContentStringIndex = contentsStringEndIndex;
// NOTE(review): the remaining arguments of this call are not visible in this
// excerpt; GoThroughDiffParts takes (parts, startTag, endTag).
return GoThroughDiffParts(diffTagParts,
// Presumably the fallback when no matches were found (braces are not visible
// here): the contents are returned wrapped in the diff tags unchanged.
return string.Concat(tagStart, contents, tagEnd);
/// <summary>
/// Extracts the element name from a tag string, e.g. "div" from
/// "&lt;div class='x'&gt;" or from "&lt;/div&gt;".
/// </summary>
/// <param name="tag">The tag text, starting with "&lt;" or "&lt;/".</param>
/// <returns>
/// The characters between the leading "&lt;" (or "&lt;/") and the first
/// whitespace, '/' or '&gt;'; the rest of the string when no such terminator
/// exists; an empty string for null, empty or too-short input.
/// </returns>
private string GetTagType(string tag)
{
    if (string.IsNullOrEmpty(tag))
    {
        return string.Empty;
    }

    // Skip "<", or "</" for a closing tag.
    bool startsWithClosingMarker = tag.Length >= 2 && tag[0] == '<' && tag[1] == '/';
    int startIndex = startsWithClosingMarker ? 2 : 1;
    if (startIndex >= tag.Length)
    {
        return string.Empty;
    }

    // The name ends at the first whitespace, at '/' (so "<br/>" yields "br"
    // rather than "br/") or at '>'. Searching from startIndex keeps the '/' of
    // a leading "</" from being treated as a terminator.
    char[] nameTerminators = { ' ', '\t', '\r', '\n', '/', '>' };
    int endIndex = tag.IndexOfAny(nameTerminators, startIndex);
    if (endIndex == -1)
    {
        // Unterminated tag text: take everything that is left instead of
        // calling Substring with a negative length.
        endIndex = tag.Length;
    }

    return tag.Substring(startIndex, endIndex - startIndex);
}
// Assembles the final markup for one diff element from its TagDefinition
// parts. Output is built from three buffers - before, middle and after - and
// the result has the shape [before]startTag[middle]endTag[after].
// parts: the TagDefinition list built by ProcessDiffTag.
// startTag / endTag: the opening and closing diff tag text.
// NOTE(review): several branch bodies in this method are not visible in this
// excerpt (brace lines and some statements are missing), so how each part is
// routed into before/middle/after cannot be confirmed from here.
private string GoThroughDiffParts(ArrayList parts, string startTag, string endTag)
IEnumerator enumerator = parts.GetEnumerator();
StringBuilder before = new StringBuilder(string.Empty);
StringBuilder middle = new StringBuilder(string.Empty);
StringBuilder after = new StringBuilder(string.Empty);
TagDefinition definition;
while (enumerator.MoveNext())
definition = (TagDefinition)enumerator.Current;
if (!definition.Used) // have we already used this part?
definition.Used = true;
// NOTE(review): the body for the img/br case is not visible here.
if (_blockElements.Contains(definition.TagType))
else if (definition.MatchingIndex == -1) // no matching tag
if (definition.Tag) // html tag?
// NOTE(review): the bodies of the opening/closing branches are not visible here.
if (definition.OpeningTag)
if (!definition.Tag) // text and has a matching tag
// The text part is merged into the Text of the definition it is matched with.
TagDefinition matchingTag = (TagDefinition)parts[definition.MatchingIndex];
if (matchingTag.OpeningTag)
matchingTag.Text += definition.Text;
// Presumably the else branch (text placed in front of the matched tag's
// text) - the "else" line itself is not visible here.
matchingTag.Text = string.Concat(definition.Text, matchingTag.Text);
definition.Used = true;
bool includeDiffTag = true;
if (string.IsNullOrEmpty(middle.ToString()))
includeDiffTag = false; // we don't want the ins/del tag around nothing
else if (string.IsNullOrWhiteSpace(middle.ToString())) // spacing should be kept
// middle.Replace edits the existing builder in place and returns it; the
// concatenation then copies its text into a new builder.
middle = new StringBuilder(" " + middle.Replace("\n", "<br />"));
// NOTE(review): as visible here startTag is inserted even when includeDiffTag
// is false; a guarding condition may have been lost - confirm.
middle.Insert(0, startTag); // <ins>[middle]
middle.Insert(0, before); // [before]<ins>[middle]
if (includeDiffTag)
middle.Append(endTag); // [before]<ins>[middle]</ins>
middle.Append(after); // [before]<ins>[middle]</ins>[end]
return middle.ToString();
/// <summary>
/// Match evaluator for CleanUpMisplacedDiffTags: rewrites a single ins/del
/// element found in the markup.
/// </summary>
/// <param name="tag">
/// A match of the ins/del pattern; group 1 is the opening tag, group 4 the
/// contents and group 5 the closing tag.
/// </param>
/// <returns>
/// An empty string when the element has no contents; the element with
/// newlines turned into "&lt;br /&gt;" when the contents are only whitespace;
/// otherwise the result of ProcessDiffTag.
/// </returns>
private string DiffTagMatch(Match tag)
{
    string tagStart = tag.Groups[1].Value;
    string tagEnd = tag.Groups[5].Value;
    string contents = tag.Groups[4].Value;

    if (string.IsNullOrEmpty(contents))
    {
        return string.Empty; // we don't want the ins/del tag around nothing
    }

    if (string.IsNullOrWhiteSpace(contents))
    {
        // spacing should be kept
        return string.Concat(tagStart, " ", contents.Replace("\n", "<br />"), tagEnd);
    }

    // Arguments follow ProcessDiffTag's (tagStart, tagEnd, contents) signature.
    return ProcessDiffTag(tagStart, tagEnd, contents);
}
/// <summary>
/// Reports whether a tag string is a closing tag.
/// </summary>
/// <param name="tag">The tag text to inspect.</param>
/// <returns>
/// True when the text contains "&lt;/" and contains neither "&lt;img" nor
/// "&lt;br" (compared case-insensitively); otherwise false.
/// </returns>
private bool IsClosingTag(string tag)
{
    // Lower-case once with the invariant culture: ToLower() depends on the
    // current culture (under Turkish rules "IMG" does not lower-case to "img"),
    // which would make the img/br checks miss upper-case tags.
    string lowered = tag.ToLowerInvariant();
    return lowered.Contains("</") && !lowered.Contains("<img") && !lowered.Contains("<br");
}
/// <summary>
/// Finds every ins/del element in <paramref name="html"/> and replaces it
/// with the text DiffTagMatch returns for it.
/// </summary>
/// <param name="html">Markup containing ins/del elements.</param>
/// <returns>The markup with each matched ins/del element rewritten.</returns>
public string CleanUpMisplacedDiffTags(string html)
{
    // Group 1 = opening tag, group 4 = contents, group 5 = closing tag.
    const string diffTagPattern = @"(\<((ins|del).*?)\>)(.*?)(\<\/((ins|del).*?)\>)";
    MatchEvaluator rewriteDiffTag = new MatchEvaluator(DiffTagMatch);
    RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Multiline;
    return Regex.Replace(html, diffTagPattern, rewriteDiffTag, options);
}
// Walks every entry of _htmlHash, applies a Regex.Replace to the text for each
// one, and finally passes the result through CleanUpMisplacedDiffTags.
// plain: the text to convert.
// Returns the value produced by CleanUpMisplacedDiffTags.
public string plain2html(string plain)
IDictionaryEnumerator enumerator = _htmlHash.GetEnumerator();
while (enumerator.MoveNext())
// NOTE(review): the pattern and replacement arguments of this call are not
// visible in this excerpt; presumably each placeholder is swapped back for the
// tag it stands for - confirm. If a stored string is used as a regex pattern,
// check whether it needs Regex.Escape.
plain = Regex.Replace(plain,
RegexOptions.IgnoreCase | RegexOptions.Multiline);
return CleanUpMisplacedDiffTags(plain);
// Scans the definitions already in the list and, when one satisfies the
// condition below, links it and the new definition to each other through
// their MatchingIndex values.
// list: the TagDefinition parts collected so far.
// tag: the new definition being registered.
// NOTE(review): no statement that appends tag to list, and none that advances
// index, is visible in this excerpt - presumably they were lost; confirm.
private void AddTagDefinition(ArrayList list, TagDefinition tag)
IEnumerator enumerator = list.GetEnumerator();
TagDefinition currentDefinition;
// Position of currentDefinition within the list.
int index = 0;
// Index the new definition would have once appended to the end of the list.
int insertingIndex = list.Count;
while (enumerator.MoveNext())
currentDefinition = (TagDefinition)enumerator.Current;
//if (!tag.OpeningTag && currentDefinition.MatchingIndex == -1)
// currentDefinition.MatchingIndex = insertingIndex;
if (tag.MatchingIndex == -1 && // matching tag not found yet
(currentDefinition.OpeningTag && !tag.OpeningTag) && // opening & closing
// NOTE(review): this compares currentDefinition.TagType with itself, so it is
// always true for non-null values; "tag.TagType" may have been intended. As
// written, text parts (OpeningTag == false) also link to an earlier opening
// tag, which GoThroughDiffParts may depend on - verify before changing.
currentDefinition.TagType == currentDefinition.TagType) // same tag type
tag.MatchingIndex = index;
currentDefinition.MatchingIndex = insertingIndex;
/// <summary>
/// One piece of a diff element's contents: either a tag or a run of text.
/// </summary>
private class TagDefinition
{
    /// <summary>
    /// Creates a definition with the defaults: a non-tag, non-opening, unused
    /// part with empty Text and TagType, and no match (MatchingIndex = -1).
    /// </summary>
    public TagDefinition()
    {
        Tag = false;
        OpeningTag = false;
        Used = false;
        MatchingIndex = -1;
        Text = string.Empty;
        TagType = string.Empty;
    }

    /// <summary>True when this part is a tag, false when it is text.</summary>
    public bool Tag { get; set; }

    /// <summary>The tag's type name; empty by default.</summary>
    public string TagType { get; set; }

    /// <summary>The text of this part.</summary>
    public string Text { get; set; }

    /// <summary>List index of the definition this one is matched with, or -1 when there is none.</summary>
    public int MatchingIndex { get; set; }

    /// <summary>True when this part is an opening tag.</summary>
    public bool OpeningTag { get; set; }

    /// <summary>True once this part has been processed.</summary>
    public bool Used { get; set; }
}