Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| """ | |
| emoji.tokenizer | |
| ~~~~~~~~~~~~~~~ | |
| Components for detecting and tokenizing emoji in strings. | |
| """ | |
| from typing import NamedTuple, Dict, Union, Iterator, Any | |
| from emoji import unicode_codes | |
# Public names exported by a star-import of this module.
__all__ = [
    'EmojiMatch', 'EmojiMatchZWJ', 'EmojiMatchZWJNonRGI', 'Token',
    'tokenize', 'filter_tokens',
]

# The zero-width joiner (U+200D) that glues several emoji into one sequence.
_ZWJ = '\u200D'

# Cache for the lookup tree; stays ``None`` until get_search_tree() fills it.
_SEARCH_TREE = None
class EmojiMatch:
    """
    One "recommended for general interchange" (RGI) emoji located inside
    a string, stored together with its position and its data entry.
    """

    __slots__ = ('emoji', 'start', 'end', 'data')

    def __init__(self, emoji: str, start: int,
                 end: int, data: Union[dict, None]):
        #: The emoji substring
        self.emoji = emoji
        #: The start index of the match in the string
        self.start = start
        #: The end index of the match in the string
        self.end = end
        #: The entry from :data:`EMOJI_DATA` for this emoji or ``None`` if the emoji is non-RGI
        self.data = data

    def data_copy(self) -> Dict[str, Any]:
        """
        Build a fresh dict for this match: a shallow copy of the
        :data:`EMOJI_DATA` entry (when there is one) extended by the keys
        ``match_start`` and ``match_end``.

        :returns: A new dict; the stored ``data`` is never modified
        """
        if not self.data:
            # No (or empty) data entry: only the position can be reported
            return {
                'match_start': self.start,
                'match_end': self.end
            }

        enriched = self.data.copy()
        enriched['match_start'] = self.start
        enriched['match_end'] = self.end
        return enriched

    def is_zwj(self) -> bool:
        """
        Tells whether the matched emoji contains a zero-width-joiner.

        :returns: True if this is a ZWJ-emoji, False otherwise
        """
        return _ZWJ in self.emoji

    def split(self) -> Union['EmojiMatchZWJ', 'EmojiMatch']:
        """
        Breaks a ZWJ-emoji up into the emoji it is made of.

        :returns: An :class:`EmojiMatchZWJ` containing the "sub-emoji" if this is a ZWJ-emoji, otherwise self
        """
        return EmojiMatchZWJ(self) if self.is_zwj() else self

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f'{cls_name}({self.emoji}, {self.start}:{self.end})'
class EmojiMatchZWJ(EmojiMatch):
    """
    A match consisting of several emoji that are glued together by
    zero-width-joiners (ZWJ/``\\u200D``).
    """

    __slots__ = ('emojis', )

    def __init__(self, match: EmojiMatch):
        super().__init__(match.emoji, match.start, match.end, match.data)

        #: List of sub emoji as EmojiMatch objects
        self.emojis = []

        offset = match.start
        for part in match.emoji.split(_ZWJ):
            stop = offset + len(part)
            sub_match = EmojiMatch(
                part, offset, stop, unicode_codes.EMOJI_DATA.get(part))
            self.emojis.append(sub_match)
            # The next part begins one position after the joiner
            offset = stop + 1

    def join(self) -> str:
        """
        Glues the sub emoji back together with ZWJ characters.

        :returns: The joined emoji string
        """
        return _ZWJ.join([sub.emoji for sub in self.emojis])

    def is_zwj(self) -> bool:
        # Instances of this class are ZWJ sequences by construction
        return True

    def split(self) -> 'EmojiMatchZWJ':
        # Already split
        return self

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f'{cls_name}({self.join()}, {self.start}:{self.end})'
class EmojiMatchZWJNonRGI(EmojiMatchZWJ):
    """
    A match consisting of several emoji that are glued together by
    zero-width-joiners (ZWJ/``\\u200D``), used only for sequences that
    are not "recommended for general interchange" (non-RGI) by Unicode.org.
    The ``data`` attribute of this class is always None.
    """

    def __init__(self, first_emoji_match: EmojiMatch,
                 second_emoji_match: EmojiMatch):
        # The parent constructors are intentionally not invoked: every
        # attribute is derived from the list of sub matches in _update()

        #: List of sub emoji as EmojiMatch objects
        self.emojis = [first_emoji_match, second_emoji_match]
        self._update()

    def _update(self):
        """Recompute ``emoji``, ``start``, ``end`` and ``data`` from ``emojis``."""
        parts = self.emojis
        self.emoji = _ZWJ.join([part.emoji for part in parts])
        self.start = parts[0].start
        self.end = parts[-1].end
        self.data = None

    def _add(self, next_emoji_match: EmojiMatch):
        """Append one more sub emoji and refresh the derived attributes."""
        self.emojis.append(next_emoji_match)
        self._update()
class Token(NamedTuple):
    """
    A named tuple pairing a piece of the input string with what it stands
    for: an :class:`EmojiMatch` object when the piece is an emoji, or the
    single character itself when it is not a unicode emoji.
    """

    # The matched substring
    chars: str
    # The EmojiMatch for an emoji, otherwise the plain character
    value: Union[str, EmojiMatch]
def tokenize(string: str, keep_zwj: bool) -> Iterator[Token]:
    """
    Finds unicode emoji in a string. Yields all normal characters as a named
    tuple :class:`Token` ``(char, char)`` and all emoji as :class:`Token` ``(chars, EmojiMatch)``.

    Tokens are buffered and only yielded once a character is reached that
    is neither the start of an emoji nor a ZWJ following an emoji, because
    a later ZWJ may require the preceding emoji to be scanned again.
    Variation selectors (``\\uFE0E``/``\\uFE0F``) that are not part of a
    matched emoji are not yielded.

    :param string: String contains unicode characters. MUST BE UNICODE.
    :param keep_zwj: Should ZWJ-characters (``\\u200D``) that join non-RGI emoji be
        skipped or should be yielded as normal characters
    :return: An iterable of tuples :class:`Token` ``(char, char)`` or :class:`Token` ``(chars, EmojiMatch)``
    """
    tree = get_search_tree()
    EMOJI_DATA = unicode_codes.EMOJI_DATA

    # result: [ Token(oldsubstring0, EmojiMatch), Token(char1, char1), ... ]
    # Holds the tokens that have been found but not yet yielded.
    result = []
    i = 0
    length = len(string)

    # Indices of chars in string that are skipped, i.e. the ZWJ-char in
    # non-RGI-ZWJ-sequences. A set, because membership is tested for every
    # scanned character and inside the tree walk below.
    ignore = set()

    while i < length:
        consumed = False
        char = string[i]

        if i in ignore:
            # A ZWJ that was marked during an earlier pass over this position
            i += 1
            if char == _ZWJ and keep_zwj:
                result.append(Token(char, char))
            continue

        elif char in tree:
            # Walk the search tree as far as the string allows
            j = i + 1
            sub_tree = tree[char]
            while j < length and string[j] in sub_tree:
                if j in ignore:
                    # Never match across a ZWJ that was marked as ignored
                    break
                sub_tree = sub_tree[string[j]]
                j += 1

            if 'data' in sub_tree:
                emj_data = sub_tree['data']
                code_points = string[i:j]

                # We cannot yield the result here, we need to defer
                # the call until we are sure that the emoji is finished
                # i.e. we're not inside an ongoing ZWJ-sequence
                match_obj = EmojiMatch(code_points, i, j, emj_data)

                # Continue at the last char of the emoji, the shared
                # `i += 1` at the end of the loop moves past it
                i = j - 1
                consumed = True
                result.append(Token(code_points, match_obj))

        elif (char == _ZWJ and result and result[-1].chars in EMOJI_DATA
              and i > 0 and string[i - 1] in tree):
            # the current char is ZWJ and the last match was an emoji
            ignore.add(i)

            if EMOJI_DATA[result[-1].chars]["status"] == unicode_codes.STATUS["component"]:
                # last match was a component, it could be ZWJ+EMOJI+COMPONENT
                # or ZWJ+COMPONENT
                i = i - sum(len(t.chars) for t in result[-2:])
                if string[i] == _ZWJ:
                    # It's ZWJ+COMPONENT, move one back
                    i += 1
                    del result[-1]
                else:
                    # It's ZWJ+EMOJI+COMPONENT, move two back
                    del result[-2:]
            else:
                # last match result[-1] was a normal emoji, move cursor
                # before the emoji
                i = i - len(result[-1].chars)
                del result[-1]

            # Scan the removed tokens again, now with this ZWJ ignored
            continue

        elif result:
            # An ordinary character ends any ongoing sequence: flush
            yield from result
            result = []

        if not consumed and char != '\uFE0E' and char != '\uFE0F':
            result.append(Token(char, char))
        i += 1

    yield from result
def filter_tokens(matches: Iterator[Token], emoji_only: bool, join_emoji: bool) -> Iterator[Token]:
    """
    Filters the output of `tokenize()`

    :param matches: An iterable of tuples of the form ``(match_str, result)``
        where ``result`` is either an EmojiMatch or a string.
    :param emoji_only: If True, only EmojiMatch are returned in the output.
        If False all characters are returned
    :param join_emoji: If True, multiple EmojiMatch are merged into
        a single :class:`EmojiMatchZWJNonRGI` if they are separated only by a ZWJ.
        In this mode a ZWJ that directly follows an emoji is never yielded
        as a token of its own.

    :return: An iterable of tuples :class:`Token` ``(char, char)``,
        :class:`Token` ``(chars, EmojiMatch)`` or :class:`Token` ``(chars, EmojiMatchZWJNonRGI)``
    """
    if not join_emoji:
        if emoji_only:
            # Keep emoji tokens only. This drops ZWJ tokens as well as every
            # other plain character, as promised for ``emoji_only``.
            for token in matches:
                if isinstance(token.value, EmojiMatch):
                    yield token
        else:
            yield from matches
        return

    # Combine multiple EmojiMatch that are separated by ZWJs into
    # a single EmojiMatchZWJNonRGI
    previous_is_emoji = False
    previous_is_zwj = False
    # Emoji tokens that may still be extended by a following ZWJ + emoji
    accumulator = []

    for token in matches:
        if previous_is_emoji and token.value == _ZWJ:
            # Remember the joiner; whether it joins anything is decided
            # by the next token
            previous_is_zwj = True
        elif isinstance(token.value, EmojiMatch):
            if previous_is_emoji and previous_is_zwj:
                # EMOJI + ZWJ + EMOJI: merge with the last accumulated token
                if isinstance(accumulator[-1].value, EmojiMatchZWJNonRGI):
                    # Extend the existing non-RGI sequence in place
                    accumulator[-1].value._add(token.value)
                    accumulator[-1] = Token(
                        accumulator[-1].chars + _ZWJ + token.chars,
                        accumulator[-1].value)
                else:
                    # Start a new non-RGI sequence from the two emoji
                    prev = accumulator.pop()
                    accumulator.append(
                        Token(prev.chars + _ZWJ + token.chars,
                              EmojiMatchZWJNonRGI(prev.value, token.value)))
            else:
                accumulator.append(token)
            previous_is_emoji = True
            previous_is_zwj = False
        else:
            # Other character, not an emoji
            previous_is_emoji = False
            previous_is_zwj = False
            yield from accumulator
            if not emoji_only:
                yield token
            accumulator = []

    yield from accumulator
def get_search_tree() -> Dict[str, Any]:
    """
    Generate a search tree for demojize().

    The tree is built from :data:`EMOJI_DATA` on the first call and cached
    in the module-level ``_SEARCH_TREE``; later calls return the cached
    object. Every key of :data:`EMOJI_DATA` becomes a path of single
    characters, and the node at the end of the path stores the entry of
    that emoji under the key ``'data'``.

    Example of a search tree::

        EMOJI_DATA =
            {'a': {'en': ':Apple:'},
             'b': {'en': ':Bus:'},
             'ba': {'en': ':Bat:'},
             'band': {'en': ':Beatles:'},
             'bandit': {'en': ':Outlaw:'},
             'bank': {'en': ':BankOfEngland:'},
             'bb': {'en': ':BB-gun:'},
             'c': {'en': ':Car:'}}

        _SEARCH_TREE =
            {'a': {'data': {'en': ':Apple:'}},
             'b': {'a': {'data': {'en': ':Bat:'},
                         'n': {'d': {'data': {'en': ':Beatles:'},
                                     'i': {'t': {'data': {'en': ':Outlaw:'}}}},
                               'k': {'data': {'en': ':BankOfEngland:'}}}},
                   'b': {'data': {'en': ':BB-gun:'}},
                   'data': {'en': ':Bus:'}},
             'c': {'data': {'en': ':Car:'}}}

                        _SEARCH_TREE
                       /     |      ⧵
                     /       |        ⧵
                    a        b          c
                    |      / |  ⧵       |
                    |     /  |    ⧵     |
                :Apple:  ba :Bus:  bb  :Car:
                        /  ⧵        |
                       /    ⧵       |
                   :Bat:    ban   :BB-gun:
                           /   ⧵
                          /     ⧵
                       band     bank
                      /   ⧵       |
                     /     ⧵      |
                 bandi  :Beatles: :BankOfEngland:
                   |
                 bandit
                   |
               :Outlaw:

    :return: The (cached) search tree
    """
    global _SEARCH_TREE
    if _SEARCH_TREE is None:
        # Build into a local and publish only the finished tree. Assigning
        # the empty dict to the global first would cache a partially built
        # tree if the build is interrupted, and would expose it to any
        # other caller that arrives while the build is still running.
        tree = {}
        for emj, data in unicode_codes.EMOJI_DATA.items():
            sub_tree = tree
            for char in emj:
                sub_tree = sub_tree.setdefault(char, {})
            if emj:
                # An empty key has no path and therefore stores nothing
                sub_tree['data'] = data
        _SEARCH_TREE = tree
    return _SEARCH_TREE