Search code examples
c#excelxmlxlsxxmlreader

Update XLSX file changes whilst reading the file with XmlReader


We had a code which was loading the Excel XLSX document into the memory, doing some modifications with it and saving it back.

XmlDocument doc = new XmlDocument();
doc.Load(pp.GetStream());
XmlNode rootNode = doc.DocumentElement;

if (rootNode == null) return;
ProcessNode(rootNode);

if (this.fileModified)
{
    doc.Save(pp.GetStream(FileMode.Create, FileAccess.Write));
}

This was working good with small files, but throwing OutOfMemory exceptions with some large Excel files. So we decided to change the approach and use XmlReader class to not load the file into the memory at once.

PackagePartCollection ppc = this.Package.GetParts();
foreach (PackagePart pp in ppc)
{
     if (!this.xmlContentTypesXlsx.Contains(pp.ContentType)) continue;

     using (XmlReader reader = XmlReader.Create(pp.GetStream()))
     {
          reader.MoveToContent();
          while (reader.EOF == false)
          {
             XmlDocument doc;
             XmlNode rootNode;
             if (reader.NodeType == XmlNodeType.Element && reader.Name == "hyperlinks")
             {
                   doc = new XmlDocument();
                   rootNode = doc.ReadNode(reader);
                   if (rootNode != null)
                   {
                        doc.AppendChild(rootNode);
                        ProcessNode(rootNode);  // how can I save updated changes back to the file?
                   }
              }
              else if (reader.NodeType == XmlNodeType.Element && reader.Name == "row")
              {
                    doc = new XmlDocument();
                    rootNode = doc.ReadNode(reader);

                    if (rootNode != null)
                    {
                        doc.AppendChild(rootNode);
                        ProcessNode(rootNode); // how can I save updated changes back to the file?
                    }
              }
              else
              {
                    reader.Read();
              }
          }
     }
}

This reads the file node by node and processes nodes we need (and changes some values there). However, I'm not sure how we can update those values back to the original Excel file. I tried to use XmlWriter together with the XmlReader, but was not able to make it work. Any ideas?

UPDATE:

I tried to use @dbc's suggestions from the comments section, but it seems too slow to me. It probably will not throw OutOfMemory exceptions for huge files, but processing will take forever.

PackagePartCollection ppc = this.Package.GetParts();
foreach (PackagePart pp in ppc)
{
     if (!this.xmlContentTypesXlsx.Contains(pp.ContentType)) continue;

     StringBuilder strBuilder = new StringBuilder();
     
     using (XmlReader reader = XmlReader.Create(pp.GetStream()))
     {
        using (XmlWriter writer = this.Package.FileOpenAccess == FileAccess.ReadWrite ? XmlWriter.Create(strBuilder) : null)
        {
          reader.MoveToContent();
          while (reader.EOF == false)
          {
             XmlDocument doc;
             XmlNode rootNode;
             if (reader.NodeType == XmlNodeType.Element && reader.Name == "hyperlinks")
             {
                   doc = new XmlDocument();
                   rootNode = doc.ReadNode(reader);
                   if (rootNode != null)
                   {
                        doc.AppendChild(rootNode);
                        ProcessNode(rootNode);
                        writer?.WriteRaw(rootNode.OuterXml);
                   }
              }
              else if (reader.NodeType == XmlNodeType.Element && reader.Name == "row")
              {
                    doc = new XmlDocument();
                    rootNode = doc.ReadNode(reader);

                    if (rootNode != null)
                    {
                        doc.AppendChild(rootNode);
                        ProcessNode(rootNode);
                        writer?.WriteRaw(rootNode.OuterXml);
                    }
              }
              else
              {
                    WriteShallowNode(writer, reader); // Used from the @dbc's suggested stackoverflow answers
                    reader.Read();
              }
            }

            writer?.Flush();
         }
      }
}

NOTE 1: I'm using StringBuilder for the test, but was planning to switch to a temp file in the end. NOTE 2: I tried flushing the XmlWriter after every 100 elements, but it's still slow.

Any ideas?


Solution

  • I did some more modifications with @dbc's help and now it works as I wanted.

    PackagePartCollection ppc = this.Package.GetParts();
    foreach (PackagePart pp in ppc)
    {
      try
      {
         if (!this.xmlContentTypesXlsx.Contains(pp.ContentType)) continue;
    
         string tempFilePath = GetTempFilePath();
         
         using (XmlReader reader = XmlReader.Create(pp.GetStream()))
         {
            using (XmlWriter writer = this.Package.FileOpenAccess == FileAccess.ReadWrite ? XmlWriter.Create(tempFilePath) : null)
            {
              while (reader.EOF == false)
              {
                 if (reader.NodeType == XmlNodeType.Element && reader.Name == "hyperlinks")
                 {
                       XmlDocument doc = new XmlDocument();
                       XmlNode rootNode = doc.ReadNode(reader);
                       if (rootNode != null)
                       {
                            ProcessNode(rootNode);
                            if (writer != null)
                            {
                                rootNode.WriteTo(writer);
                            }
                       }
                  }
                  else if (reader.NodeType == XmlNodeType.Element && reader.Name == "row")
                  {
                        XmlDocument doc = new XmlDocument();
                        XmlNode rootNode = doc.ReadNode(reader);
    
                        if (rootNode != null)
                        {
                            ProcessNode(rootNode);
                            if (writer != null)
                            {
                                rootNode.WriteTo(writer);
                            }
                        }
                  }
                  else
                  {
                        WriteShallowNode(writer, reader); // Used from the @dbc's suggested StackOverflow answers
                        reader.Read();
                  }
                }
             }
          }
    
    
          if (this.packageChanged) // is being set in ProcessNode method
          {
              this.packageChanged = false;
    
              using (var tempFile = File.OpenRead(tempFilePath))
              {
                   tempFile.CopyTo(pp.GetStream(FileMode.Create, FileAccess.Write));
              }
           }
       }
       catch (OutOfMemoryException)
       {
            throw;
       }
       catch (Exception ex)
       {
          Log.Exception(ex, @"Failed to process a file."); // our inner log method
       }
       finally
       {
           if (!string.IsNullOrWhiteSpace(tempFilePath))
           {
                // Delete temp file
           }
       }
    }