|
import re |
|
from opencc import OpenCC |
|
|
|
|
|
# OpenCC converters, constructed once at import time and shared by all calls.
# 't2s' is OpenCC's Traditional -> Simplified Chinese configuration;
# 's2t' is the Simplified -> Traditional Chinese configuration.
t2s_converter = OpenCC("t2s")
s2t_converter = OpenCC("s2t")
|
|
|
|
|
# Matches runs of one or more code points in U+1F600-U+1F64F (the Unicode
# "Emoticons" block). Code points outside that range are not matched.
EMOJI_PATTERN = re.compile("[\U0001F600-\U0001F64F]+", flags=re.UNICODE)
|
|
|
|
|
# Per-character cleanup table for ``str.translate``.
#   * value ``None`` -> the character is deleted
#   * value ``' '``  -> the character is replaced by an ASCII space
# The full-width / CJK forms are spelled as ``\u`` escapes on purpose: written
# literally they are easily folded into their ASCII look-alikes, which turns
# them into duplicate dict keys and leaves the full-width punctuation
# unhandled.
TRANSLATION_TABLE = str.maketrans({
    # Separators that should become a plain space.
    '-': ' ',
    '\u3000': ' ',   # ideographic (full-width) space

    # ASCII punctuation, removed.
    ',': None,
    '.': None,
    '!': None,
    '?': None,
    ';': None,
    ':': None,

    # Full-width / CJK punctuation, removed.
    '\uff0c': None,  # full-width comma
    '\u3002': None,  # ideographic full stop
    '\uff01': None,  # full-width exclamation mark
    '\uff1f': None,  # full-width question mark
    '\u2026': None,  # horizontal ellipsis
    '\uff1b': None,  # full-width semicolon
    '\uff1a': None,  # full-width colon
})
|
|
|
|
|
# Despite its name, this matches bracketed spans rather than backslashes:
# the shortest "(...)" or "[...]" run on a single line ('.' does not match
# a newline because DOTALL is not set).
BACKSLASH_PATTERN = re.compile(
    r'\(.*?\)'     # non-greedy parenthesised span
    r'|\[.*?\]'    # or non-greedy square-bracketed span
)
|
|
|
# Matches a run of whitespace that does not begin at the very start of the
# string and is not immediately followed by the end of the string, so interior
# runs can be collapsed with ``SPACE_PATTERN.sub(' ', text)``.
# Written as a raw string: in a normal literal ``\s`` is an invalid escape
# sequence, which newer Python versions report as a SyntaxWarning.
# NOTE: the first leading and the last trailing whitespace character are never
# part of a match; longer leading/trailing runs are shortened, not preserved.
SPACE_PATTERN = re.compile(r'(?<!^)\s+(?!$)')
|
|
|
|
|
def normalize_text(text, language, strip=True):
    """Normalize *text*: remove punctuation and convert to lowercase.

    Steps, applied in this order:

    1. Translate characters through ``TRANSLATION_TABLE``.
    2. Delete every ``EMOJI_PATTERN`` match.
    3. Replace every ``SPACE_PATTERN`` match with a single space.
    4. If *strip* is true, strip leading and trailing whitespace.
    5. Lowercase the result.
    6. Run the result through ``t2s_converter`` when *language* is ``"zh"``,
       or through ``s2t_converter`` when it is ``"yue"``; any other value
       leaves the text as it is.

    Args:
        text: The string to normalize.
        language: Language code selecting the optional conversion step.
        strip: Whether to strip surrounding whitespace (default ``True``).

    Returns:
        The normalized string.
    """
    # Character-level cleanup first, then drop emoji matches.
    cleaned = EMOJI_PATTERN.sub('', text.translate(TRANSLATION_TABLE))

    # Collapse whitespace runs matched by SPACE_PATTERN into one space.
    cleaned = SPACE_PATTERN.sub(' ', cleaned)

    if strip:
        cleaned = cleaned.strip()

    cleaned = cleaned.lower()

    # Script conversion is only applied for the two handled language codes.
    if language == "zh":
        return t2s_converter.convert(cleaned)
    if language == "yue":
        return s2t_converter.convert(cleaned)
    return cleaned
|
|