I would like to convert Unicode text
Text with accent é
to
Text with accent &eacute;
'System.Web.HttpUtility.HtmlEncode' and 'System.Net.WebUtility.HtmlEncode' will produce
Text with accent &#233;
Using a System.Xml.XmlWriter with ASCII encoding results in
Text with accent &#xE9;
I need the name not any hex or decimal code.
I found many entity names here: https://unicode-table.com/en/html-entities/
Is there maybe some kind of library available?
Edit:
My target is to have an ASCII encoded file (XML) that uses entity NAMES instead of CODES.
If some code points do not have an entity NAME, then the result for them should be null (or the CODE).
This is a requirement for a project.
I would use a more official reference: https://html.spec.whatwg.org/multipage/named-characters.html
The HTML entity list will probably keep growing in the future so you need a dynamic solution. Here is how I would do it (using HtmlAgilityPack):
/// <summary>
/// Replaces every character of <paramref name="text"/> that has a named HTML character
/// reference with that reference (for example "é" becomes "&amp;eacute;").
/// </summary>
/// <remarks>
/// The name table is downloaded from the WHATWG HTML specification on every call, so this
/// method needs network access; cache the result upstream when converting many strings.
/// Any character with a table entry is replaced, not only non-ASCII ones. Characters
/// without a table entry are copied to the output unchanged.
/// </remarks>
/// <param name="text">The text to convert; <c>null</c> is treated like an empty string.</param>
/// <returns>The converted text, or an empty string when <paramref name="text"/> is null or empty.</returns>
/// <exception cref="InvalidOperationException">
/// The entity table could not be located in the downloaded page.
/// </exception>
private string ConvertCharsToHTMLEntities(string text)
{
    // Nothing to convert: skip the download entirely.
    if (string.IsNullOrEmpty(text))
    {
        return string.Empty;
    }

    Dictionary<string, string> entities = LoadNamedCharacterReferences();
    System.Text.StringBuilder res = new System.Text.StringBuilder(text.Length);

    // Walk the string by Unicode code point, not by UTF-16 code unit, so that characters
    // outside the Basic Multilingual Plane (stored as surrogate pairs) can be matched too.
    for (int i = 0; i < text.Length; i++)
    {
        bool isPair = char.IsSurrogatePair(text, i);
        int codePoint = isPair ? char.ConvertToUtf32(text, i) : text[i];

        // The table spells code points as "U+" followed by at least five hex digits.
        string key = "U+" + codePoint.ToString("X5");

        string entity;
        if (entities.TryGetValue(key, out entity))
        {
            res.Append(entity);
        }
        else
        {
            res.Append(text, i, isPair ? 2 : 1);
        }

        if (isPair)
        {
            i++; // the low surrogate has been consumed together with the high one
        }
    }

    return res.ToString();
}

/// <summary>
/// Downloads the WHATWG named character reference table and returns a map from
/// code point (formatted like "U+000E9") to entity reference (formatted like "&amp;eacute;").
/// </summary>
/// <exception cref="InvalidOperationException">
/// The entity table could not be located in the downloaded page.
/// </exception>
private static Dictionary<string, string> LoadNamedCharacterReferences()
{
    // Getting the HTML entities table
    string page;
    using (WebClient client = new WebClient())
    {
        page = client.DownloadString("https://html.spec.whatwg.org/multipage/named-characters.html");
    }

    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(page);

    HtmlAgilityPack.HtmlNode tableNode =
        doc.DocumentNode.SelectSingleNode("//div[@id='named-character-references-table']//table");
    if (tableNode == null)
    {
        throw new InvalidOperationException(
            "The named character references table was not found in the downloaded page.");
    }

    // Converting the table to pairs of code point and HTML entity
    Dictionary<string, string> entities = new Dictionary<string, string>();
    foreach (HtmlAgilityPack.HtmlNode row in tableNode.Descendants("tr").Skip(1))
    {
        List<string> cells = row.Elements("td").Select(td => td.InnerText.Trim()).ToList();
        if (cells.Count < 2)
        {
            continue;
        }

        string name = cells[0];
        string[] codePoints = cells[1].Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

        // An entity that stands for a sequence of code points must not be used for any
        // single one of them (otherwise a plain letter could be rewritten as a ligature).
        if (codePoints.Length != 1)
        {
            continue;
        }

        // Legacy names without the terminating ';' are not valid references in XML;
        // only the ';'-terminated form of a name is kept.
        if (!name.EndsWith(";", StringComparison.Ordinal))
        {
            continue;
        }

        // Make sure the stored value is a complete reference, whether or not the
        // scraped cell text already carries the leading ampersand.
        if (!name.StartsWith("&", StringComparison.Ordinal))
        {
            name = "&" + name;
        }

        // When several names share a code point, the last one listed wins.
        entities[codePoints[0]] = name;
    }

    return entities;
}