Search code examples
odoo, odoo-10, language-translation

What is the use of the sanitize attribute on an Html field in Odoo?


Fields of type Html accept a `sanitize` attribute, to which we can pass True or False.

body_html = fields.Html('Body', translate=True, sanitize=False, help="Rich-text/HTML version of the message (placeholders may be used here)")

body_html = fields.Html('Body', translate=True, sanitize=True, help="Rich-text/HTML version of the message (placeholders may be used here)")

Whether we set it to True or False, we get the same result.

What is the difference between setting True and False on this field?


Solution

  • It just tells Odoo whether to clean the HTML code, e.g. by deleting scripts and other unwanted tags/nodes. For more information, look at the code:

    def html_sanitize(src, silent=True, sanitize_tags=True, sanitize_attributes=False, sanitize_style=False, strip_style=False, strip_classes=False):
        """Clean an HTML fragment by running it through ``_Cleaner``.

        The source is pre-processed (encoding attributes removed, e-mail-like
        tags and mako delimiters HTML-escaped), cleaned, then post-processed so
        that mako expressions survive the cleaning.

        :param src: HTML source; a falsy value is returned unchanged.
        :param silent: when True, errors raised while cleaning are logged and a
            placeholder ``<p>`` message is returned; when False they are
            re-raised. A ParserError whose message contains 'empty' always
            yields an empty string.
        :param sanitize_tags: when True, restrict tags using ``allowed_tags``,
            ``tags_to_kill`` and ``tags_to_remove``.
        :param sanitize_attributes: when True (and lxml >= 3.1.0), keep only
            attributes listed in ``safe_attrs``.
        :param sanitize_style: forwarded to the cleaner as ``sanitize_style``.
        :param strip_style: forwarded to the cleaner as ``style``
            (True = remove style tags/attrs).
        :param strip_classes: drop ``class`` attributes.
        :return: the cleaned HTML, without the wrapping ``<div>`` if the
            result starts with ``<div>`` and ends with ``</div>``.
        """
        if not src:
            return src
        src = ustr(src, errors='replace')
        # html: remove encoding attribute inside tags
        doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
        src = doctype.sub(r"", src)

        logger = logging.getLogger(__name__ + '.html_sanitize')

        def _escape_email_tag(match):
            # Leave results containing cite="mid:email_like@address" (ex: blockquote
            # cite) or an alt attribute untouched; escape every other match.
            tag = match.group(1)
            if 'cite=' in tag or 'alt=' in tag:
                return tag
            return cgi.escape(tag)

        # html encode email tags
        part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
        src = part.sub(_escape_email_tag, src)
        # html encode mako tags <% ... %> to decode them later and keep them alive, otherwise they are stripped by the cleaner
        src = src.replace('<%', cgi.escape('<%'))
        src = src.replace('%>', cgi.escape('%>'))

        kwargs = {
            'page_structure': True,
            'style': strip_style,              # True = remove style tags/attrs
            'sanitize_style': sanitize_style,  # True = sanitize styling
            'forms': True,                     # True = remove form tags
            'remove_unknown_tags': False,
            'comments': False,
            'processing_instructions': False
        }
        if sanitize_tags:
            kwargs['allow_tags'] = allowed_tags
            if etree.LXML_VERSION >= (2, 3, 1):
                # kill_tags attribute has been added in version 2.3.1
                kwargs.update({
                    'kill_tags': tags_to_kill,
                    'remove_tags': tags_to_remove,
                })
            else:
                kwargs['remove_tags'] = tags_to_kill + tags_to_remove

        if sanitize_attributes and etree.LXML_VERSION >= (3, 1, 0):  # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all attributes in order to keep "style"
            if strip_classes:
                current_safe_attrs = safe_attrs - frozenset(['class'])
            else:
                current_safe_attrs = safe_attrs
            kwargs.update({
                'safe_attrs_only': True,
                'safe_attrs': current_safe_attrs,
            })
        else:
            kwargs.update({
                'safe_attrs_only': False,  # keep oe-data attributes + style
                'strip_classes': strip_classes,  # remove classes, even when keeping other attributes
            })

        try:
            # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
            cleaner = _Cleaner(**kwargs)
            cleaned = cleaner.clean_html(src)
            # MAKO compatibility: $, { and } inside quotes are escaped, preventing correct mako execution.
            # The last two pairs restore the mako delimiters escaped before cleaning.
            # Order is kept identical to the historical chain of replace() calls.
            for encoded, decoded in (
                ('%24', '$'),
                ('%7B', '{'),
                ('%7D', '}'),
                ('%20', ' '),
                ('%5B', '['),
                ('%5D', ']'),
                ('%7C', '|'),
                ('&lt;%', '<%'),
                ('%&gt;', '%>'),
            ):
                cleaned = cleaned.replace(encoded, decoded)
            # html considerations so real html content match database value.
            # str.replace returns a new string: the result must be assigned back,
            # otherwise the non-breaking spaces are left as-is.
            cleaned = cleaned.replace(u'\xa0', u'&nbsp;')
        except etree.ParserError as e:
            if 'empty' in str(e):
                return ""
            if not silent:
                raise
            logger.warning('ParserError obtained when sanitizing %r', src, exc_info=True)
            cleaned = '<p>ParserError when sanitizing</p>'
        except Exception:
            if not silent:
                raise
            logger.warning('unknown error obtained when sanitizing %r', src, exc_info=True)
            cleaned = '<p>Unknown error when sanitizing</p>'

        # this is ugly, but lxml/etree tostring want to put everything in a 'div' that breaks the editor -> remove that
        if cleaned.startswith('<div>') and cleaned.endswith('</div>'):
            cleaned = cleaned[5:-6]

        return cleaned