Search code examples
c#xmlserializationxmlserializerxmlreader

How to deserialize an XML file containing an \u0000 using XmlSerializer?


I have a problem with XmlSerializer. My huge XML file contains some null characters (\u0000), so XmlSerializer throws an error when deserializing it. I found out that I need to set Normalization to false (via: https://msdn.microsoft.com/en-us/library/aa302290.aspx), so I tried this:

XmlSerializer deserializer = new XmlSerializer(typeof(T));
// Dispose the reader even when Deserialize throws, so the underlying file is not left open.
using (XmlTextReader reader = new XmlTextReader(filename))
{
    reader.Normalization = false;
    return (T)deserializer.Deserialize(reader);
}

I also tried a second approach using XmlReader, which MSDN suggests as well, setting CheckCharacters to false like this:

 // Build the reader settings first: character checking is switched off.
 var readerSettings = new XmlReaderSettings();
 readerSettings.CheckCharacters = false;

 var serializer = new XmlSerializer(typeof(T));
 using (var xml = XmlReader.Create(filename, readerSettings))
 {
     object result = serializer.Deserialize(xml);
     return (T)result;
 }

but both solutions give me the same result: an InvalidOperationException pointing at the line and column in the XML where the null character is.

Could you please give me some advice on this? I need to load the XML structure into my defined class. Without the lines containing these characters, it works fine.

Thanks! :)

Edit: I forgot to say that I've tried loading the content into a string and cleaning up the string, but the inserted content is too big, so I get a System.OutOfMemoryException, and if I try to parse the file line by line, it's too slow. :(


Solution

  • You can go down to the reader level instead: subclass TextReader to perform the cleanup and feed it to the XmlSerializer.

    var deserializer = new XmlSerializer(typeof(T));
    T instance;
    // `reader` is the TextReader over the source XML; CleanupTextReader wraps it.
    using (var cleanupTextReader = new CleanupTextReader(reader))
    {
        // Deserialize returns object, so an explicit cast to T is required.
        instance = (T)deserializer.Deserialize(cleanupTextReader);
    }
    

    Where CleanupTextReader is something like:

    /// <summary>
    /// A <see cref="TextReader"/> wrapper that removes every NUL character (U+0000)
    /// from the text produced by the wrapped reader. All read operations are
    /// forwarded to the inner reader and filtered before being returned.
    /// </summary>
    internal sealed class CleanupTextReader : TextReader
    {
        /// <summary>The character that is stripped from the stream.</summary>
        private const char Nul = '\u0000';

        private readonly TextReader _in;

        /// <summary>Wraps <paramref name="t"/>; this instance closes/disposes it.</summary>
        /// <param name="t">The reader to filter.</param>
        /// <exception cref="ArgumentNullException"><paramref name="t"/> is null.</exception>
        internal CleanupTextReader(TextReader t)
        {
            if (t == null)
            {
                throw new ArgumentNullException("t");
            }
            _in = t;
        }

        /// <summary>Closes the wrapped reader.</summary>
        public override void Close()
        {
            _in.Close();
        }

        /// <summary>Disposes the wrapped reader when called from Dispose().</summary>
        protected override void Dispose(bool disposing)
        {
            if (disposing)
            {
                _in.Dispose();
            }
            base.Dispose(disposing);
        }

        /// <summary>
        /// Returns the next non-NUL character without consuming it, or the inner
        /// reader's Peek result when no character is available.
        /// </summary>
        public override int Peek()
        {
            // Consume leading NULs from the inner reader so that Peek agrees with
            // what the next Read() call will return.
            int next = _in.Peek();
            while (next == Nul)
            {
                _in.Read();
                next = _in.Peek();
            }
            return next;
        }

        /// <summary>Reads the next non-NUL character, or -1 at end of input.</summary>
        public override int Read()
        {
            int result;
            do
            {
                result = _in.Read();
            }
            while (result == Nul);
            return result;
        }

        /// <summary>
        /// Returns <paramref name="value"/> with all NUL characters removed.
        /// Null and empty strings are returned unchanged.
        /// </summary>
        private static string CleanupString(string value)
        {
            // Note: `new char['\u0000']` would allocate an EMPTY array (size 0),
            // so the search must use the character itself.
            if (string.IsNullOrEmpty(value) || value.IndexOf(Nul) < 0)
            {
                return value;
            }
            var builder = new StringBuilder(value.Length);
            foreach (var ch in value)
            {
                if (ch != Nul)
                {
                    builder.Append(ch);
                }
            }
            return builder.ToString();
        }

        /// <summary>
        /// Compacts <paramref name="buffer"/>[index .. index+count) in place by
        /// removing NUL characters.
        /// </summary>
        /// <returns>The number of characters that remain after the cleanup.</returns>
        private static int CleanupBuffer(char[] buffer, int index, int count)
        {
            int writeIndex = index;
            int end = index + count;
            for (int readIndex = index; readIndex < end; readIndex++)
            {
                char ch = buffer[readIndex];
                if (ch != Nul)
                {
                    buffer[writeIndex] = ch;
                    writeIndex++;
                }
            }
            return writeIndex - index;
        }

        /// <summary>
        /// Reads up to <paramref name="count"/> characters into
        /// <paramref name="buffer"/> starting at <paramref name="index"/>, with
        /// NUL characters removed.
        /// </summary>
        /// <returns>
        /// The number of cleaned characters stored, or the inner reader's result
        /// (zero or less) when it has no more data.
        /// </returns>
        public override int Read(char[] buffer, int index, int count)
        {
            while (true)
            {
                int reallyRead = _in.Read(buffer, index, count);
                if (reallyRead <= 0)
                {
                    return reallyRead;
                }

                // A chunk made only of NULs must not be reported as "0 read",
                // so keep reading until real data or the end shows up.
                int cleanRead = CleanupBuffer(buffer, index, reallyRead);
                if (cleanRead != 0)
                {
                    return cleanRead;
                }
            }
        }

        /// <summary>
        /// Reads cleaned characters into <paramref name="buffer"/> until
        /// <paramref name="count"/> characters are stored or the inner reader
        /// has no more data.
        /// </summary>
        /// <returns>The number of cleaned characters stored.</returns>
        public override int ReadBlock(char[] buffer, int index, int count)
        {
            int total = 0;
            while (true)
            {
                // The first call always happens, so the inner reader still
                // validates the arguments.
                int reallyRead = _in.ReadBlock(buffer, index + total, count - total);
                if (reallyRead <= 0)
                {
                    return total;
                }

                // Stripping NULs frees space; refill it so the block is as full
                // as the remaining input allows.
                total += CleanupBuffer(buffer, index + total, reallyRead);
                if (total >= count)
                {
                    return total;
                }
            }
        }

        /// <summary>Reads a line from the inner reader with NULs removed.</summary>
        public override string ReadLine()
        {
            return CleanupString(_in.ReadLine());
        }

        /// <summary>Reads the remaining text from the inner reader with NULs removed.</summary>
        public override string ReadToEnd()
        {
            return CleanupString(_in.ReadToEnd());
        }
    }