|
import re
|
|
|
|
|
|
|
|
class Tokenizer:
    """Tokenize raw text into words, sentences, and length-bounded clauses.

    Handles Arabic-script punctuation ('؟', '؛') and zero-width non-joiner
    characters — presumably targets Persian text (TODO: confirm).
    """

    def __init__(self):
        # Stateless: all work happens in the tokenize_* methods.
        pass
|
|
|
|
def tokenize_words(self, doc_string):
|
|
token_list = doc_string.strip().split()
|
|
token_list = [x.strip("\u200c") for x in token_list if len(x.strip("\u200c")) != 0]
|
|
return token_list
|
|
|
|
def tokenize_sentences(self, doc_string):
|
|
|
|
pattern = r"[-+]?\d*\.\d+|\d+"
|
|
print(doc_string)
|
|
nums_list = re.findall(pattern, doc_string)
|
|
doc_string = re.sub(pattern, 'floatingpointnumber', doc_string)
|
|
|
|
pattern = r'([!\.\?؟]+)[\n]*'
|
|
tmp = re.findall(pattern, doc_string)
|
|
doc_string = re.sub(pattern, self.add_tab, doc_string)
|
|
|
|
pattern = r':\n'
|
|
tmp = re.findall(pattern, doc_string)
|
|
doc_string = re.sub(pattern, self.add_tab, doc_string)
|
|
|
|
pattern = r';[\n]*'
|
|
tmp = re.findall(pattern, doc_string)
|
|
doc_string = re.sub(pattern, self.add_tab, doc_string)
|
|
|
|
pattern = r'؛[\n]*'
|
|
tmp = re.findall(pattern, doc_string)
|
|
doc_string = re.sub(pattern, self.add_tab, doc_string)
|
|
|
|
pattern = r'^[\s\r\n]*$'
|
|
doc_string = re.sub(pattern, '', doc_string)
|
|
pattern = r'[\n\r]+'
|
|
doc_string = re.sub(pattern, self.add_tab, doc_string)
|
|
|
|
for number in nums_list:
|
|
pattern = 'floatingpointnumber'
|
|
doc_string = re.sub(pattern, number, doc_string, 1)
|
|
|
|
doc_string = doc_string.split('\t\t')
|
|
doc_string = [x for x in doc_string if len(x) > 0 and not x.isspace()]
|
|
return doc_string
|
|
|
|
def add_tab(self, mystring):
|
|
mystring = mystring.group()
|
|
mystring = mystring.strip(' ')
|
|
mystring = mystring.strip('\n')
|
|
mystring = " " + mystring + "\t\t"
|
|
return mystring
|
|
|
|
def tokenize_long_clauses(self, string_list, max_char_num):
|
|
result = []
|
|
pos_tagger = POSTagger('./g2p_resources/model/perpos.model')
|
|
for string in string_list:
|
|
tmp = string
|
|
if len(tmp) > max_char_num:
|
|
while (len(tmp) > max_char_num):
|
|
first_space_ind = tmp.find(' ', max_char_num)
|
|
second_space_ind = tmp.find(' ', 2*max_char_num)
|
|
pos = [pos_tagger.parse([word])[0] for word in tmp[first_space_ind:second_space_ind].split()]
|
|
try:
|
|
word_ind = [x[1] for x in pos[:20]].index('V')
|
|
split_index = sum(len(i[0]) for i in pos[:word_ind+1]) + word_ind + 1 + first_space_ind
|
|
except:
|
|
split_index = first_space_ind
|
|
result.append(tmp[:split_index])
|
|
tmp = tmp[split_index:]
|
|
if len(tmp) <= max_char_num:
|
|
result.append(tmp)
|
|
else:
|
|
result.append(tmp)
|
|
|
|
return result
|
|
|