Spaces:
Runtime error
Runtime error
# -*- coding: utf-8 -*- | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |
# implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
"""Module for the regular expressions crafted from ABNF.""" | |
import sys | |
# https://tools.ietf.org/html/rfc3986#page-13 | |
GEN_DELIMS = GENERIC_DELIMITERS = ":/?#[]@" | |
GENERIC_DELIMITERS_SET = set(GENERIC_DELIMITERS) | |
# https://tools.ietf.org/html/rfc3986#page-13 | |
SUB_DELIMS = SUB_DELIMITERS = "!$&'()*+,;=" | |
SUB_DELIMITERS_SET = set(SUB_DELIMITERS) | |
# Escape the '*' for use in regular expressions | |
SUB_DELIMITERS_RE = r"!$&'()\*+,;=" | |
RESERVED_CHARS_SET = GENERIC_DELIMITERS_SET.union(SUB_DELIMITERS_SET) | |
ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" | |
DIGIT = "0123456789" | |
# https://tools.ietf.org/html/rfc3986#section-2.3 | |
UNRESERVED = UNRESERVED_CHARS = ALPHA + DIGIT + r"._!-~" | |
UNRESERVED_CHARS_SET = set(UNRESERVED_CHARS) | |
NON_PCT_ENCODED_SET = RESERVED_CHARS_SET.union(UNRESERVED_CHARS_SET) | |
# We need to escape the '-' in this case: | |
UNRESERVED_RE = r"A-Za-z0-9._~\-" | |
# Percent encoded character values | |
PERCENT_ENCODED = PCT_ENCODED = "%[A-Fa-f0-9]{2}" | |
PCHAR = "([" + UNRESERVED_RE + SUB_DELIMITERS_RE + ":@]|%s)" % PCT_ENCODED | |
# NOTE(sigmavirus24): We're going to use more strict regular expressions | |
# than appear in Appendix B for scheme. This will prevent over-eager | |
# consuming of items that aren't schemes. | |
SCHEME_RE = "[a-zA-Z][a-zA-Z0-9+.-]*" | |
_AUTHORITY_RE = "[^\\\\/?#]*" | |
_PATH_RE = "[^?#]*" | |
_QUERY_RE = "[^#]*" | |
_FRAGMENT_RE = ".*" | |
# Extracted from http://tools.ietf.org/html/rfc3986#appendix-B | |
COMPONENT_PATTERN_DICT = { | |
"scheme": SCHEME_RE, | |
"authority": _AUTHORITY_RE, | |
"path": _PATH_RE, | |
"query": _QUERY_RE, | |
"fragment": _FRAGMENT_RE, | |
} | |
# See http://tools.ietf.org/html/rfc3986#appendix-B | |
# In this case, we name each of the important matches so we can use | |
# SRE_Match#groupdict to parse the values out if we so choose. This is also | |
# modified to ignore other matches that are not important to the parsing of | |
# the reference so we can also simply use SRE_Match#groups. | |
URL_PARSING_RE = ( | |
r"(?:(?P<scheme>{scheme}):)?(?://(?P<authority>{authority}))?" | |
r"(?P<path>{path})(?:\?(?P<query>{query}))?" | |
r"(?:#(?P<fragment>{fragment}))?" | |
).format(**COMPONENT_PATTERN_DICT) | |
# ######################### | |
# Authority Matcher Section | |
# ######################### | |
# Host patterns, see: http://tools.ietf.org/html/rfc3986#section-3.2.2 | |
# The pattern for a regular name, e.g., www.google.com, api.github.com | |
REGULAR_NAME_RE = REG_NAME = "((?:{0}|[{1}])*)".format( | |
"%[0-9A-Fa-f]{2}", SUB_DELIMITERS_RE + UNRESERVED_RE | |
) | |
# The pattern for an IPv4 address, e.g., 192.168.255.255, 127.0.0.1, | |
IPv4_RE = r"([0-9]{1,3}\.){3}[0-9]{1,3}" | |
# Hexadecimal characters used in each piece of an IPv6 address | |
HEXDIG_RE = "[0-9A-Fa-f]{1,4}" | |
# Least-significant 32 bits of an IPv6 address | |
LS32_RE = "({hex}:{hex}|{ipv4})".format(hex=HEXDIG_RE, ipv4=IPv4_RE) | |
# Substitutions into the following patterns for IPv6 patterns defined | |
# http://tools.ietf.org/html/rfc3986#page-20 | |
_subs = {"hex": HEXDIG_RE, "ls32": LS32_RE} | |
# Below: h16 = hexdig, see: https://tools.ietf.org/html/rfc5234 for details | |
# about ABNF (Augmented Backus-Naur Form) use in the comments | |
variations = [ | |
# 6( h16 ":" ) ls32 | |
"(%(hex)s:){6}%(ls32)s" % _subs, | |
# "::" 5( h16 ":" ) ls32 | |
"::(%(hex)s:){5}%(ls32)s" % _subs, | |
# [ h16 ] "::" 4( h16 ":" ) ls32 | |
"(%(hex)s)?::(%(hex)s:){4}%(ls32)s" % _subs, | |
# [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 | |
"((%(hex)s:)?%(hex)s)?::(%(hex)s:){3}%(ls32)s" % _subs, | |
# [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 | |
"((%(hex)s:){0,2}%(hex)s)?::(%(hex)s:){2}%(ls32)s" % _subs, | |
# [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 | |
"((%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s" % _subs, | |
# [ *4( h16 ":" ) h16 ] "::" ls32 | |
"((%(hex)s:){0,4}%(hex)s)?::%(ls32)s" % _subs, | |
# [ *5( h16 ":" ) h16 ] "::" h16 | |
"((%(hex)s:){0,5}%(hex)s)?::%(hex)s" % _subs, | |
# [ *6( h16 ":" ) h16 ] "::" | |
"((%(hex)s:){0,6}%(hex)s)?::" % _subs, | |
] | |
IPv6_RE = "(({0})|({1})|({2})|({3})|({4})|({5})|({6})|({7})|({8}))".format( | |
*variations | |
) | |
IPv_FUTURE_RE = r"v[0-9A-Fa-f]+\.[%s]+" % ( | |
UNRESERVED_RE + SUB_DELIMITERS_RE + ":" | |
) | |
# RFC 6874 Zone ID ABNF | |
ZONE_ID = "(?:[" + UNRESERVED_RE + "]|" + PCT_ENCODED + ")+" | |
IPv6_ADDRZ_RFC4007_RE = IPv6_RE + "(?:(?:%25|%)" + ZONE_ID + ")?" | |
IPv6_ADDRZ_RE = IPv6_RE + "(?:%25" + ZONE_ID + ")?" | |
IP_LITERAL_RE = r"\[({0}|{1})\]".format( | |
IPv6_ADDRZ_RFC4007_RE, | |
IPv_FUTURE_RE, | |
) | |
# Pattern for matching the host piece of the authority | |
HOST_RE = HOST_PATTERN = "({0}|{1}|{2})".format( | |
REG_NAME, | |
IPv4_RE, | |
IP_LITERAL_RE, | |
) | |
USERINFO_RE = ( | |
"^([" + UNRESERVED_RE + SUB_DELIMITERS_RE + ":]|%s)+" % (PCT_ENCODED) | |
) | |
PORT_RE = "[0-9]{1,5}" | |
# #################### | |
# Path Matcher Section | |
# #################### | |
# See http://tools.ietf.org/html/rfc3986#section-3.3 for more information | |
# about the path patterns defined below. | |
segments = { | |
"segment": PCHAR + "*", | |
# Non-zero length segment | |
"segment-nz": PCHAR + "+", | |
# Non-zero length segment without ":" | |
"segment-nz-nc": PCHAR.replace(":", "") + "+", | |
} | |
# Path types taken from Section 3.3 (linked above) | |
PATH_EMPTY = "^$" | |
PATH_ROOTLESS = "%(segment-nz)s(/%(segment)s)*" % segments | |
PATH_NOSCHEME = "%(segment-nz-nc)s(/%(segment)s)*" % segments | |
PATH_ABSOLUTE = "/(%s)?" % PATH_ROOTLESS | |
PATH_ABEMPTY = "(/%(segment)s)*" % segments | |
PATH_RE = "^(%s|%s|%s|%s|%s)$" % ( | |
PATH_ABEMPTY, | |
PATH_ABSOLUTE, | |
PATH_NOSCHEME, | |
PATH_ROOTLESS, | |
PATH_EMPTY, | |
) | |
FRAGMENT_RE = QUERY_RE = ( | |
"^([/?:@" + UNRESERVED_RE + SUB_DELIMITERS_RE + "]|%s)*$" % PCT_ENCODED | |
) | |
# ########################## | |
# Relative reference matcher | |
# ########################## | |
# See http://tools.ietf.org/html/rfc3986#section-4.2 for details | |
RELATIVE_PART_RE = "(//%s%s|%s|%s|%s)" % ( | |
COMPONENT_PATTERN_DICT["authority"], | |
PATH_ABEMPTY, | |
PATH_ABSOLUTE, | |
PATH_NOSCHEME, | |
PATH_EMPTY, | |
) | |
# See http://tools.ietf.org/html/rfc3986#section-3 for definition | |
HIER_PART_RE = "(//%s%s|%s|%s|%s)" % ( | |
COMPONENT_PATTERN_DICT["authority"], | |
PATH_ABEMPTY, | |
PATH_ABSOLUTE, | |
PATH_ROOTLESS, | |
PATH_EMPTY, | |
) | |
# ############### | |
# IRIs / RFC 3987 | |
# ############### | |
# Only wide-unicode gets the high-ranges of UCSCHAR | |
if sys.maxunicode > 0xFFFF: # pragma: no cover | |
IPRIVATE = u"\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD" | |
UCSCHAR_RE = ( | |
u"\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF" | |
u"\U00010000-\U0001FFFD\U00020000-\U0002FFFD" | |
u"\U00030000-\U0003FFFD\U00040000-\U0004FFFD" | |
u"\U00050000-\U0005FFFD\U00060000-\U0006FFFD" | |
u"\U00070000-\U0007FFFD\U00080000-\U0008FFFD" | |
u"\U00090000-\U0009FFFD\U000A0000-\U000AFFFD" | |
u"\U000B0000-\U000BFFFD\U000C0000-\U000CFFFD" | |
u"\U000D0000-\U000DFFFD\U000E1000-\U000EFFFD" | |
) | |
else: # pragma: no cover | |
IPRIVATE = u"\uE000-\uF8FF" | |
UCSCHAR_RE = u"\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF" | |
IUNRESERVED_RE = u"A-Za-z0-9\\._~\\-" + UCSCHAR_RE | |
IPCHAR = u"([" + IUNRESERVED_RE + SUB_DELIMITERS_RE + u":@]|%s)" % PCT_ENCODED | |
isegments = { | |
"isegment": IPCHAR + u"*", | |
# Non-zero length segment | |
"isegment-nz": IPCHAR + u"+", | |
# Non-zero length segment without ":" | |
"isegment-nz-nc": IPCHAR.replace(":", "") + u"+", | |
} | |
IPATH_ROOTLESS = u"%(isegment-nz)s(/%(isegment)s)*" % isegments | |
IPATH_NOSCHEME = u"%(isegment-nz-nc)s(/%(isegment)s)*" % isegments | |
IPATH_ABSOLUTE = u"/(?:%s)?" % IPATH_ROOTLESS | |
IPATH_ABEMPTY = u"(?:/%(isegment)s)*" % isegments | |
IPATH_RE = u"^(?:%s|%s|%s|%s|%s)$" % ( | |
IPATH_ABEMPTY, | |
IPATH_ABSOLUTE, | |
IPATH_NOSCHEME, | |
IPATH_ROOTLESS, | |
PATH_EMPTY, | |
) | |
IREGULAR_NAME_RE = IREG_NAME = u"(?:{0}|[{1}])*".format( | |
u"%[0-9A-Fa-f]{2}", SUB_DELIMITERS_RE + IUNRESERVED_RE | |
) | |
IHOST_RE = IHOST_PATTERN = u"({0}|{1}|{2})".format( | |
IREG_NAME, | |
IPv4_RE, | |
IP_LITERAL_RE, | |
) | |
IUSERINFO_RE = ( | |
u"^(?:[" + IUNRESERVED_RE + SUB_DELIMITERS_RE + u":]|%s)+" % (PCT_ENCODED) | |
) | |
IFRAGMENT_RE = ( | |
u"^(?:[/?:@" | |
+ IUNRESERVED_RE | |
+ SUB_DELIMITERS_RE | |
+ u"]|%s)*$" % PCT_ENCODED | |
) | |
IQUERY_RE = ( | |
u"^(?:[/?:@" | |
+ IUNRESERVED_RE | |
+ SUB_DELIMITERS_RE | |
+ IPRIVATE | |
+ u"]|%s)*$" % PCT_ENCODED | |
) | |
IRELATIVE_PART_RE = u"(//%s%s|%s|%s|%s)" % ( | |
COMPONENT_PATTERN_DICT["authority"], | |
IPATH_ABEMPTY, | |
IPATH_ABSOLUTE, | |
IPATH_NOSCHEME, | |
PATH_EMPTY, | |
) | |
IHIER_PART_RE = u"(//%s%s|%s|%s|%s)" % ( | |
COMPONENT_PATTERN_DICT["authority"], | |
IPATH_ABEMPTY, | |
IPATH_ABSOLUTE, | |
IPATH_ROOTLESS, | |
PATH_EMPTY, | |
) | |