# flake8: noqa
"""
Shim module between Bleach and html5lib. This makes it easier to upgrade the
html5lib library without having to change a lot of code.
"""

import re
import string
import warnings

# ignore html5lib deprecation warnings to use bleach; we are bleach
# apply before we import submodules that import html5lib
warnings.filterwarnings(
    "ignore",
    message="html5lib's sanitizer is deprecated",
    category=DeprecationWarning,
    module="bleach._vendor.html5lib",
)
from bleach._vendor.html5lib import (  # noqa: E402 module level import not at top of file
    HTMLParser,
    getTreeWalker,
)
from bleach._vendor.html5lib import (
    constants,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.constants import (  # noqa: E402 module level import not at top of file
    namespaces,
    prefixes,
)
from bleach._vendor.html5lib.constants import (
    _ReparseException as ReparseException,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.base import (
    Filter,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.sanitizer import (
    allowed_protocols,
    allowed_css_properties,
    allowed_svg_properties,
    attr_val_is_uri,
    svg_attr_val_allows_ref,
    svg_allow_local_href,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.sanitizer import (
    Filter as SanitizerFilter,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._inputstream import (
    HTMLInputStream,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.serializer import (
    escape,
    HTMLSerializer,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._tokenizer import (
    attributeMap,
    HTMLTokenizer,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._trie import (
    Trie,
)  # noqa: E402 module level import not at top of file

#: Map of entity name to expanded entity
ENTITIES = constants.entities

#: Trie of html entity string -> character representation
ENTITIES_TRIE = Trie(ENTITIES)
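
# A quick illustration of what these hold (doctest-style, illustrative only):
#
#   >>> ENTITIES["amp;"]
#   '&'
#   >>> ENTITIES_TRIE.has_keys_with_prefix("am")
#   True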

#: Token type constants--these never change
TAG_TOKEN_TYPES = {
    constants.tokenTypes["StartTag"],
    constants.tokenTypes["EndTag"],
    constants.tokenTypes["EmptyTag"],
}
TAG_TOKEN_TYPE_START = constants.tokenTypes["StartTag"]
TAG_TOKEN_TYPE_END = constants.tokenTypes["EndTag"]
TAG_TOKEN_TYPE_CHARACTERS = constants.tokenTypes["Characters"]
TAG_TOKEN_TYPE_PARSEERROR = constants.tokenTypes["ParseError"]

#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
HTML_TAGS = frozenset(
    (
        "a",
        "abbr",
        "address",
        "area",
        "article",
        "aside",
        "audio",
        "b",
        "base",
        "bdi",
        "bdo",
        "blockquote",
        "body",
        "br",
        "button",
        "canvas",
        "caption",
        "cite",
        "code",
        "col",
        "colgroup",
        "data",
        "datalist",
        "dd",
        "del",
        "details",
        "dfn",
        "dialog",
        "div",
        "dl",
        "dt",
        "em",
        "embed",
        "fieldset",
        "figcaption",
        "figure",
        "footer",
        "form",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "head",
        "header",
        "hgroup",
        "hr",
        "html",
        "i",
        "iframe",
        "img",
        "input",
        "ins",
        "kbd",
        "keygen",
        "label",
        "legend",
        "li",
        "link",
        "map",
        "mark",
        "menu",
        "meta",
        "meter",
        "nav",
        "noscript",
        "object",
        "ol",
        "optgroup",
        "option",
        "output",
        "p",
        "param",
        "picture",
        "pre",
        "progress",
        "q",
        "rp",
        "rt",
        "ruby",
        "s",
        "samp",
        "script",
        "section",
        "select",
        "slot",
        "small",
        "source",
        "span",
        "strong",
        "style",
        "sub",
        "summary",
        "sup",
        "table",
        "tbody",
        "td",
        "template",
        "textarea",
        "tfoot",
        "th",
        "thead",
        "time",
        "title",
        "tr",
        "track",
        "u",
        "ul",
        "var",
        "video",
        "wbr",
    )
)

#: List of block level HTML tags, as per https://github.com/mozilla/bleach/issues/369
#: from mozilla on 2019.07.11
#: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements
HTML_TAGS_BLOCK_LEVEL = frozenset(
    (
        "address",
        "article",
        "aside",
        "blockquote",
        "details",
        "dialog",
        "dd",
        "div",
        "dl",
        "dt",
        "fieldset",
        "figcaption",
        "figure",
        "footer",
        "form",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "header",
        "hgroup",
        "hr",
        "li",
        "main",
        "nav",
        "ol",
        "p",
        "pre",
        "section",
        "table",
        "ul",
    )
)


class InputStreamWithMemory:
    """Wraps an HTMLInputStream to remember characters since last <

    This wraps existing HTMLInputStream classes to keep track of the stream
    since the last < which marked an open tag state.
    """

    def __init__(self, inner_stream):
        self._inner_stream = inner_stream
        self.reset = self._inner_stream.reset
        self.position = self._inner_stream.position
        self._buffer = []

    # These delegate to the wrapped stream; they are properties because
    # html5lib reads them as attributes, not methods.
    @property
    def errors(self):
        return self._inner_stream.errors

    @property
    def charEncoding(self):
        return self._inner_stream.charEncoding

    @property
    def changeEncoding(self):
        return self._inner_stream.changeEncoding

    def char(self):
        c = self._inner_stream.char()
        # char() can return None if EOF, so ignore that
        if c:
            self._buffer.append(c)
        return c

    def charsUntil(self, characters, opposite=False):
        chars = self._inner_stream.charsUntil(characters, opposite=opposite)
        self._buffer.extend(list(chars))
        return chars

    def unget(self, char):
        if self._buffer:
            self._buffer.pop(-1)
        return self._inner_stream.unget(char)

    def get_tag(self):
        """Returns the stream history since last '<'

        Since the buffer starts at the last '<' as seen by tagOpenState(),
        we know that everything from that point to when this method is called
        is the "tag" that is being tokenized.
        """
        return "".join(self._buffer)

    def start_tag(self):
        """Resets stream history to just '<'

        This gets called by tagOpenState() which marks a '<' that denotes an
        open tag. Any time we see that, we reset the buffer.
        """
        self._buffer = ["<"]


class BleachHTMLTokenizer(HTMLTokenizer):
    """Tokenizer that doesn't consume character entities"""

    def __init__(self, consume_entities=False, **kwargs):
        super().__init__(**kwargs)

        self.consume_entities = consume_entities

        # Wrap the stream with one that remembers the history
        self.stream = InputStreamWithMemory(self.stream)

        # Remember the last token emitted; needed for block element spacing
        self.emitted_last_token = None

    def __iter__(self):
        last_error_token = None

        for token in super().__iter__():
            if last_error_token is not None:
                if (
                    last_error_token["data"] == "invalid-character-in-attribute-name"
                    and token["type"] in TAG_TOKEN_TYPES
                    and token.get("data")
                ):
# token["data"] is an html5lib attributeMap | |
# (OrderedDict 3.7+ and dict otherwise) | |
# of attr name to attr value | |
                    #
                    # Remove attribute names that have ', " or < in them
                    # because those characters are invalid for attribute names.
                    token["data"] = attributeMap(
                        (attr_name, attr_value)
                        for attr_name, attr_value in token["data"].items()
                        if (
                            '"' not in attr_name
                            and "'" not in attr_name
                            and "<" not in attr_name
                        )
                    )
                    last_error_token = None
                    yield token

                elif (
                    last_error_token["data"] == "expected-closing-tag-but-got-char"
                    and self.parser.tags is not None
                    and token["data"].lower().strip() not in self.parser.tags
                ):
                    # We've got either a malformed tag or a pseudo-tag or
                    # something that html5lib wants to turn into a malformed
                    # comment which Bleach clean() will drop so we interfere
                    # with the token stream to handle it more correctly.
                    #
                    # If this is an allowed tag, it's malformed and we just let
                    # the html5lib parser deal with it--we don't enter into this
                    # block.
                    #
                    # If this is not an allowed tag, then we convert it to
                    # characters and it'll get escaped in the sanitizer.
                    token["data"] = self.stream.get_tag()
                    token["type"] = TAG_TOKEN_TYPE_CHARACTERS

                    last_error_token = None
                    yield token

                elif token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
                    # If the token is a parse error, then let the last_error_token
                    # go, and make token the new last_error_token
                    yield last_error_token
                    last_error_token = token

                else:
                    yield last_error_token
                    yield token
                    last_error_token = None

                continue

            # If the token is a ParseError, we hold on to it so we can get the
            # next token and potentially fix it.
            if token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
                last_error_token = token
                continue

            yield token

        if last_error_token:
            if last_error_token["data"] == "eof-in-tag-name":
                # Handle the case where the text being parsed ends with <
                # followed by a series of characters. It's treated as a tag
                # name that abruptly ends, but we should treat that like
                # character data
                yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()}
            elif last_error_token["data"] in (
                "eof-in-attribute-name",
                "eof-in-attribute-value-no-quotes",
            ):
                # Handle the case where the text being parsed ends with <
                # followed by a series of characters and then space and then
                # more characters. It's treated as a tag name followed by an
                # attribute that abruptly ends, but we should treat that like
                # character data.
                yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()}
            else:
                yield last_error_token

    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        # If this tokenizer is set to consume entities, then we can let the
        # superclass do its thing.
        if self.consume_entities:
            return super().consumeEntity(allowedChar, fromAttribute)

        # If this tokenizer is set to not consume entities, then we don't want
        # to consume and convert them, so this overrides the html5lib tokenizer's
        # consumeEntity so that it's now a no-op.
        #
        # However, when that gets called, it's consumed an &, so we put that back in
        # the stream.
        if fromAttribute:
            self.currentToken["data"][-1][1] += "&"
        else:
            self.tokenQueue.append({"type": TAG_TOKEN_TYPE_CHARACTERS, "data": "&"})

    def tagOpenState(self):
        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
        # or ParseError. In all cases, we want to drop any stream history
        # we've collected so far and we do that by calling start_tag() on
        # the input stream wrapper.
        self.stream.start_tag()
        return super().tagOpenState()

    def emitCurrentToken(self):
        token = self.currentToken

        if (
            self.parser.tags is not None
            and token["type"] in TAG_TOKEN_TYPES
            and token["name"].lower() not in self.parser.tags
        ):
            # If this is a start/end/empty tag for a tag that's not in our
            # allowed list, then it gets stripped or escaped. In both of these
            # cases it gets converted to a Characters token.
            if self.parser.strip:
                if (
                    self.emitted_last_token
                    and token["type"] == TAG_TOKEN_TYPE_START
                    and token["name"].lower() in HTML_TAGS_BLOCK_LEVEL
                ):
                    # If this is a block level tag we're stripping, we drop it
                    # for a newline because that's what a browser would parse
                    # it as
                    new_data = "\n"
                else:
                    # For all other things being stripped, we throw in an empty
                    # string token
                    new_data = ""
            else:
                # If we're escaping the token, we want to escape the exact
                # original string. Since tokenizing also normalizes data
                # and this is a tag-like thing, we've lost some information.
                # So we go back through the stream to get the original
                # string and use that.
                new_data = self.stream.get_tag()

            new_token = {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": new_data}

            self.currentToken = self.emitted_last_token = new_token
            self.tokenQueue.append(new_token)
            self.state = self.dataState
            return

        self.emitted_last_token = self.currentToken
        super().emitCurrentToken()


class BleachHTMLParser(HTMLParser):
    """Parser that uses BleachHTMLTokenizer"""

    def __init__(self, tags, strip, consume_entities, **kwargs):
        """
        :arg tags: set of allowed tags--everything else is either stripped or
            escaped; if None, then this doesn't look at tags at all
        :arg strip: whether to strip disallowed tags (True) or escape them (False);
            if tags=None, then this doesn't have any effect
        :arg consume_entities: whether to consume entities (default behavior) or
            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)

        """
        self.tags = (
            frozenset((tag.lower() for tag in tags)) if tags is not None else None
        )
        self.strip = strip
        self.consume_entities = consume_entities
        super().__init__(**kwargs)

    def _parse(
        self, stream, innerHTML=False, container="div", scripting=True, **kwargs
    ):
        # set scripting=True to parse <noscript> as though JS is enabled to
        # match the expected context in browsers
        #
        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
        #
        # Override HTMLParser so we can swap out the tokenizer for our own.
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = BleachHTMLTokenizer(
            stream=stream, consume_entities=self.consume_entities, parser=self, **kwargs
        )
        self.reset()

        try:
            self.mainLoop()
        except ReparseException:
            self.reset()
            self.mainLoop()
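
# Rough usage sketch (illustrative; bleach's clean() pairs this parser with a
# treewalker and BleachHTMLSerializer rather than using it alone):
#
#   >>> parser = BleachHTMLParser(tags={"p", "b"}, strip=False, consume_entities=False)
#   >>> dom = parser.parseFragment("<p>an <i>italic</i> bit</p>")
#
# Here "i" is not an allowed tag, so the tokenizer emits its start/end tags
# as Characters tokens and they end up as escaped text in the tree.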


def convert_entity(value):
    """Convert an entity (minus the & and ; part) into what it represents

    This handles numeric, hex, and text entities.

    :arg value: the string (minus the ``&`` and ``;`` part) to convert

    :returns: unicode character or None if it's an ambiguous ampersand that
        doesn't match a character entity

    """
    if value[0] == "#":
        if len(value) < 2:
            return None

        if value[1] in ("x", "X"):
            # hex-encoded code point
            int_as_string, base = value[2:], 16
        else:
            # decimal code point
            int_as_string, base = value[1:], 10

        if int_as_string == "":
            return None

        code_point = int(int_as_string, base)
        if 0 < code_point < 0x110000:
            return chr(code_point)
        else:
            return None

    return ENTITIES.get(value, None)
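
# Doctest-style examples (illustrative):
#
#   >>> convert_entity("#x41")    # hex numeric entity
#   'A'
#   >>> convert_entity("#65")     # decimal numeric entity
#   'A'
#   >>> convert_entity("amp")     # named entity
#   '&'
#   >>> convert_entity("bogus") is None   # ambiguous ampersand
#   True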


def convert_entities(text):
    """Converts all found entities in the text

    :arg text: the text to convert entities in

    :returns: unicode text with converted entities

    """
    if "&" not in text:
        return text

    new_text = []
    for part in next_possible_entity(text):
        if not part:
            continue

        if part.startswith("&"):
            entity = match_entity(part)
            if entity is not None:
                converted = convert_entity(entity)

                # If it's not an ambiguous ampersand, then replace with the
                # unicode character. Otherwise, we leave the entity in.
                if converted is not None:
                    new_text.append(converted)
                    remainder = part[len(entity) + 2 :]
                    if remainder:
                        new_text.append(remainder)
                    continue

        new_text.append(part)

    return "".join(new_text)


def match_entity(stream):
    """Returns first entity in stream or None if no entity exists

    Note: For Bleach purposes, entities must start with a "&" and end with a
    ";". This ignores ambiguous character entities that have no ";" at the end.

    :arg stream: the character stream

    :returns: the entity string without "&" or ";" if it's a valid character
        entity; ``None`` otherwise

    """
    # Nix the & at the beginning
    if stream[0] != "&":
        raise ValueError('Stream should begin with "&"')
    stream = stream[1:]

    stream = list(stream)
    possible_entity = ""
    end_characters = "<&=;" + string.whitespace

    # Handle number entities
    if stream and stream[0] == "#":
        possible_entity = "#"
        stream.pop(0)

        if stream and stream[0] in ("x", "X"):
            allowed = "0123456789abcdefABCDEF"
            possible_entity += stream.pop(0)
        else:
            allowed = "0123456789"

        # FIXME(willkg): Do we want to make sure these are valid number
        # entities? This doesn't do that currently.
        while stream and stream[0] not in end_characters:
            c = stream.pop(0)
            if c not in allowed:
                break
            possible_entity += c

        if possible_entity and stream and stream[0] == ";":
            return possible_entity
        return None

    # Handle character entities
    while stream and stream[0] not in end_characters:
        c = stream.pop(0)
        possible_entity += c
        if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
            # If it's not a prefix, then it's not an entity and we're
            # out
            return None

    if possible_entity and stream and stream[0] == ";":
        return possible_entity

    return None
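
# Doctest-style examples (illustrative):
#
#   >>> match_entity("&amp; more")
#   'amp'
#   >>> match_entity("&#x41;")
#   '#x41'
#   >>> match_entity("&amp more") is None   # no terminating ";"
#   True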


AMP_SPLIT_RE = re.compile("(&)")


def next_possible_entity(text):
    """Takes a text and generates a list of possible entities

    :arg text: the text to look at

    :returns: generator where each part (except the first) starts with an
        "&"

    """
    for i, part in enumerate(AMP_SPLIT_RE.split(text)):
        if i == 0:
            yield part
        elif i % 2 == 0:
            yield "&" + part


class BleachHTMLSerializer(HTMLSerializer):
    """HTMLSerializer that undoes & -> &amp; in attributes and sets
    escape_rcdata to True
    """

    # per the HTMLSerializer.__init__ docstring:
    #
    # Whether to escape characters that need to be
    # escaped within normal elements within rcdata elements such as
    # style.
    #
    escape_rcdata = True

    def escape_base_amp(self, stoken):
        """Escapes just bare & in HTML attribute values"""
        # First, undo escaping of &. We need to do this because html5lib's
        # HTMLSerializer expected the tokenizer to consume all the character
        # entities and convert them to their respective characters, but the
        # BleachHTMLTokenizer doesn't do that. For example, this fixes
        # &amp;entity; back to &entity; .
        stoken = stoken.replace("&amp;", "&")

        # However, we do want all bare & that are not marking character
        # entities to be changed to &amp;, so let's do that carefully here.
        for part in next_possible_entity(stoken):
            if not part:
                continue

            if part.startswith("&"):
                entity = match_entity(part)
                # Only leave entities in that are not ambiguous. If they're
                # ambiguous, then we escape the ampersand.
                if entity is not None and convert_entity(entity) is not None:
                    yield f"&{entity};"

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    part = part[len(entity) + 2 :]
                    if part:
                        yield part
                    continue

            yield part.replace("&", "&amp;")
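
    # Doctest-style example (illustrative):
    #
    #   >>> s = BleachHTMLSerializer()
    #   >>> "".join(s.escape_base_amp('&quot;x & y'))
    #   '&quot;x &amp; y'
    #
    # The real entity survives while the bare ampersand gets escaped.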

    def serialize(self, treewalker, encoding=None):
        """Wrap HTMLSerializer.serialize and convert & to &amp; in attribute values

        Note that this converts & to &amp; in attribute values where the & isn't
        already part of an unambiguous character entity.

        """
        in_tag = False
        after_equals = False

        for stoken in super().serialize(treewalker, encoding):
            if in_tag:
                if stoken == ">":
                    in_tag = False

                elif after_equals:
                    if stoken != '"':
                        yield from self.escape_base_amp(stoken)

                        after_equals = False
                        continue

                elif stoken == "=":
                    after_equals = True

                yield stoken

            else:
                if stoken.startswith("<"):
                    in_tag = True
                yield stoken
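

# End-to-end sketch of how these pieces fit together (illustrative; bleach's
# clean() sets more serializer options than shown here):
#
#   >>> parser = BleachHTMLParser(tags={"a"}, strip=False, consume_entities=False)
#   >>> dom = parser.parseFragment('<a href="/?a=1&b=2">link</a>')
#   >>> walker = getTreeWalker("etree")
#   >>> serializer = BleachHTMLSerializer(
#   ...     quote_attr_values="always", omit_optional_tags=False
#   ... )
#   >>> serializer.render(walker(dom))
#   '<a href="/?a=1&amp;b=2">link</a>'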