Search code examples
python xml svg beautifulsoup

BeautifulSoup prettify changes content, not just layout


BeautifulSoup prettify() modifies significant whitespace even if the attribute xml:space is set to "preserve".

Example xml file with significant whitespace:

<svg viewBox="0 0 160 50" xmlns="http://www.w3.org/2000/svg">
  <text y="20" xml:space="default">    Default    spacing</text>
  <text y="40" xml:space="preserve">    <tspan>reserved    spacing</tspan></text>
</svg>

Code:

from bs4 import BeautifulSoup

# Sample SVG: the first <text> uses xml:space="default", the second uses
# xml:space="preserve", so the runs of spaces in the second are significant.
xml_string_with_significant_whitespace ='''
<svg viewBox="0 0 160 50" xmlns="http://www.w3.org/2000/svg">
  <text y="20" xml:space="default">    Default    spacing</text>
  <text y="40" xml:space="preserve">    <tspan>reserved    spacing</tspan></text>
</svg>
'''
# Parse with the "xml" parser.
soup = BeautifulSoup(xml_string_with_significant_whitespace, "xml")

# The parsed tree is serialized as-is; no modifications are made to it here.

print(soup.prettify())  # modifies significant whitespace
# print(str(soup)) # doesn't modify significant whitespace

Output:

<svg viewBox="0 0 160 50" xmlns="http://www.w3.org/2000/svg">
 <text xml:space="default" y="20">
  Default    spacing
 </text>
 <text xml:space="preserve" y="40">
  <tspan>
   reserved    spacing
  </tspan>
 </text>
</svg>

Text will be moved due to modified whitespace.

How do I prevent prettify() from changing the meaning of the xml file, instead of just changing the layout?


Solution

  • As MendelG explained in his answer, BeautifulSoup's prettify() does change the meaning of documents; it is only meant as an aid for readability.

    I wanted a solution that would reformat the document without changing its meaning and with the least amount of changes.

    The following code prettifies xml without modifying significant whitespace (xml:space="preserve"), CDATA and XML declarations:

    import re
    import sys
    import xml.sax
    import xml.sax.handler
    import xml.sax.saxutils
    from io import StringIO
    
    class Prettifier(xml.sax.ContentHandler, xml.sax.handler.LexicalHandler):
        def __init__(self, print_method=None):
            self.level = -1
            self.preserve_space_stack=[False]
            self.last_was_opening_tag = False #tags without content or empty content don't need closing tag on next line
            self.indent = " "*4
            self.first_tag = True
            self.external_print_method = print_method
            self.string=""
            self.CDATA = False
    
        def get_string(self):
            return self.string
    
        def print_method(self, text="", end="\n"):
            if self.external_print_method:
                self.external_print_method(text, end)
            self.string += text + end
    
        # Call when an element starts
        def startElement(self, tag, attributes):
            self.level += 1       
    
            if 'xml:space' in attributes:
                self.preserve_space_stack.append(attributes['xml:space'] == 'preserve')
            else:
                self.preserve_space_stack.append(self.preserve_space_stack[-1])
    
            attributes_string = " ".join([f'{key}="{value}"' for key,value in attributes.items()])
      
    
            if self.preserve_space_stack[-1]:
                if self.preserve_space_stack[-2] == False:
                    self.print_method()
                    self.print_method(self.indent*self.level, end="")
    
                self.print_method(f"<{tag}", end="")
                self.print_method(" "*(len(attributes_string)!=0), end="")
                self.print_method(attributes_string, end="")
                self.print_method(">", end="")
            else:
                if not self.first_tag:
                    self.print_method()
    
                if len(attributes_string) > 60:
                    attributes_string = f"\n{self.indent*self.level + ' '*(len(tag)+2)}".join([f'{key}="{value}"' for key,value in attributes.items()])
            
                self.print_method(self.indent*self.level, end="")
                self.print_method(f"<{tag}", end="")
                self.print_method(" "*(len(attributes_string)!=0), end="")
                self.print_method(attributes_string, end="")
                self.print_method(">", end="")
    
            self.last_was_opening_tag = True   
            self.first_tag = False 
    
    
        # Call when an elements ends
        def endElement(self, tag):        
            if self.preserve_space_stack[-1]:
                self.print_method(f"</{tag}", end="")
                self.print_method(">", end="")
            else:
                if not self.last_was_opening_tag:
                    self.print_method()
                    self.print_method(self.indent*self.level, end="")
                self.print_method(f"</{tag}", end="")
                self.print_method(">",end="")
    
            self.level -= 1
            self.preserve_space_stack.pop()
    
            self.last_was_opening_tag = False
        
        # Call when a character is read
        def characters(self, content):    
            if self.CDATA:
                self.print_method(content, end="")
            else:
                empty_content = False
                if self.preserve_space_stack[-1]:
                    self.print_method(content, end="")
                    empty_content = content == ""
                else:          
                    empty_content = content.strip() == ""
                    if not empty_content:
                        self.print_method()
                        self.print_method(self.indent*(self.level+1), end="")
                        self.print_method(content.strip(), end="")
    
                self.last_was_opening_tag = self.last_was_opening_tag and empty_content
    
        # lexical handler methods:
        def comment(self, content):
            if not self.preserve_space_stack[-1]:
                self.print_method()
                self.print_method(self.indent*(self.level+1), end="")
         
            self.print_method(f"<!--{content}-->", end="")
    
        def startCDATA(self):
            #The contents of the CDATA marked section will be reported through the characters handler.
            if not self.preserve_space_stack[-1]:
                self.print_method()
                self.print_method(self.indent*(self.level+1), end="")
        
            self.print_method("<![CDATA[", end="")
            self.CDATA = True
    
        def endCDATA(self):
            self.print_method("]]>", end="")
            self.CDATA = False
    
            
    def process_xml_declaration(xml_string):
        """Return the XML declaration at the start of *xml_string*.

        Args:
            xml_string: Document text to inspect.

        Returns:
            The declaration (``<?xml ... ?>``) followed by a newline, or an
            empty string when the text does not start with one.  A leading
            byte-order mark and leading whitespace are skipped.
        """
        # "xml" must be followed by whitespace (any kind, not only a space);
        # that also keeps targets such as "xml-stylesheet" from matching.
        match = re.match(r"\ufeff?\s*(<\?xml\s[^?>]*\?>)", xml_string)
        if match is None:
            return ""
        return match.group(1) + "\n"
    
    def prettify_file(file_name):
        """Parse the XML file *file_name* and return it re-indented.

        Args:
            file_name: Path of the file to read; the file is not modified.

        Returns:
            The file's XML declaration (if it has one) followed by the
            output collected by a ``Prettifier`` while parsing the file.
        """
        handler = Prettifier()
        parser = xml.sax.make_parser()
        parser.setFeature(xml.sax.handler.feature_namespaces, 0)  # turn off namespaces
        parser.setContentHandler(handler)
        parser.setProperty(xml.sax.handler.property_lexical_handler, handler)

        parser.parse(file_name)

        # The parser does not report the XML declaration, so it is recovered
        # from the raw text.  "utf-8-sig" drops a leading byte-order mark,
        # which would otherwise sit in front of the declaration and hide it.
        # NOTE(review): the text is decoded as UTF-8 whatever encoding the
        # declaration names - confirm non-UTF-8 inputs are out of scope.
        with open(file_name, "r", encoding="utf-8-sig") as f:
            declaration = process_xml_declaration(f.read())

        return declaration + handler.get_string()
    
    def prettify_string(xml_string):
        """Parse *xml_string* as XML and return it re-indented.

        Returns the string's XML declaration (if it has one) followed by the
        output collected by a ``Prettifier`` while parsing the string.
        """
        formatter = Prettifier()

        reader = xml.sax.make_parser()
        # turn off namespaces
        reader.setFeature(xml.sax.handler.feature_namespaces, 0)
        reader.setContentHandler(formatter)
        reader.setProperty(xml.sax.handler.property_lexical_handler, formatter)
        reader.parse(StringIO(xml_string))

        # The XML declaration is taken from the raw text and put back in
        # front of the formatted output.
        body = formatter.get_string()
        return process_xml_declaration(xml_string) + body
    
    if __name__ == "__main__":
        # Rewrite every file named on the command line in place; with no
        # arguments the slice is empty and nothing happens.
        for target_path in sys.argv[1:]:
            prettified = prettify_file(target_path)

            with open(target_path, 'w', encoding='utf8', newline='\n') as out_file:
                out_file.write(prettified)