# cython: language_level=3str """A cleanup tool for HTML. Removes unwanted tags and content. See the `Cleaner` class for details. """ import copy import re from urllib.parse import urlsplit, unquote_plus from lxml import etree from lxml.html import defs from lxml.html import fromstring, XHTML_NAMESPACE from lxml.html import xhtml_to_html, _transform_result __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html', 'word_break', 'word_break_html'] # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl # Particularly the CSS cleaning; most of the tag cleaning is integrated now # I have multiple kinds of schemes searched; but should schemes be # whitelisted instead? # max height? # remove images? Also in CSS? background attribute? # Some way to whitelist object, iframe, etc (e.g., if you want to # allow *just* embedded YouTube movies) # Log what was deleted and why? # style="behavior: ..." might be bad in IE? # Should we have something for just ? That's the worst of the # metas. # UTF-7 detections? Example: #
+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4- # you don't always have to have the charset set, if the page has no charset # and there's UTF7-like code in it. # Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php # This is an IE-specific construct you can have in a stylesheet to # run some Javascript: _replace_css_javascript = re.compile( r'expression\s*\(.*?\)', re.S|re.I).sub # Do I have to worry about @\nimport? _replace_css_import = re.compile( r'@\s*import', re.I).sub _looks_like_tag_content = re.compile( r'?[a-zA-Z]+|\son[a-zA-Z]+\s*=', (re.ASCII)).search # All kinds of schemes besides just javascript: that can cause # execution: _find_image_dataurls = re.compile( r'data:image/(.+);base64,', re.I).findall _possibly_malicious_schemes = re.compile( r'(javascript|jscript|livescript|vbscript|data|about|mocha):', re.I).findall # SVG images can contain script content _is_unsafe_image_type = re.compile(r"(xml|svg)", re.I).search def _has_javascript_scheme(s): safe_image_urls = 0 for image_type in _find_image_dataurls(s): if _is_unsafe_image_type(image_type): return True safe_image_urls += 1 return len(_possibly_malicious_schemes(s)) > safe_image_urls _substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub # FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx _conditional_comment_re = re.compile( r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S) _find_styled_elements = etree.XPath( "descendant-or-self::*[@style]") _find_external_links = etree.XPath( ("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |" "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"), namespaces={'x':XHTML_NAMESPACE}) class Cleaner: """ Instances cleans the document of each of the possible offending elements. The cleaning is controlled by attributes; you can override attributes in a subclass, or set them in the constructor. ``scripts``: Removes any ``