# mana-tts / Parsivar / tokenizer.py
import re
class Tokenizer:
    def __init__(self):
        pass

    def tokenize_words(self, doc_string):
        # Split on whitespace, then strip leading/trailing ZWNJs (U+200C)
        # from each token and drop tokens that are nothing but ZWNJs.
        token_list = doc_string.strip().split()
        token_list = [x.strip("\u200c") for x in token_list if len(x.strip("\u200c")) != 0]
        return token_list
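    # Illustrative example (not from the original source):
    #   Tokenizer().tokenize_words("این\u200cجا خانه\u200c ماست")
    #   -> ["این\u200cجا", "خانه", "ماست"]
    # strip("\u200c") only removes leading/trailing ZWNJs; ZWNJs inside a
    # token are preserved.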
    def tokenize_sentences(self, doc_string):
        # Temporarily mask numbers so that decimal points are not
        # mistaken for sentence-ending periods.
        pattern = r"[-+]?\d*\.\d+|\d+"
        nums_list = re.findall(pattern, doc_string)
        doc_string = re.sub(pattern, 'floatingpointnumber', doc_string)

        # Mark sentence boundaries: add_tab puts a space before each
        # boundary punctuation mark and a double tab after it.
        pattern = r'([!\.\?؟]+)[\n]*'
        doc_string = re.sub(pattern, self.add_tab, doc_string)
        pattern = r':\n'
        doc_string = re.sub(pattern, self.add_tab, doc_string)
        pattern = r'[;؛][\n]*'  # Latin and Arabic semicolons
        doc_string = re.sub(pattern, self.add_tab, doc_string)

        # Clear whitespace-only input, then treat any remaining line
        # break as a sentence boundary as well.
        pattern = r'^[\s\r\n]*$'
        doc_string = re.sub(pattern, '', doc_string)
        pattern = r'[\n\r]+'
        doc_string = re.sub(pattern, self.add_tab, doc_string)

        # Unmask the numbers in their original order.
        for number in nums_list:
            doc_string = re.sub('floatingpointnumber', number, doc_string, count=1)

        sentence_list = doc_string.split('\t\t')
        return [x for x in sentence_list if len(x) > 0 and not x.isspace()]
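    # Illustrative usage (sketch): sentence-ending punctuation is detached
    # with a preceding space and each sentence becomes one list element, e.g.
    #   Tokenizer().tokenize_sentences("امروز هوا خوب است. فردا چطور؟")
    #   -> ["امروز هوا خوب است .", " فردا چطور ؟"]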
    def add_tab(self, mystring):
        # re.sub replacement callback: receives the match object.
        mystring = mystring.group()      # the text matched by the pattern
        mystring = mystring.strip(' ')   # omit spaces around the punctuation
        mystring = mystring.strip('\n')  # omit newlines around the punctuation
        # A space goes before the punctuation and a double tab after it;
        # the double tab is later used as the sentence delimiter.
        return " " + mystring + "\t\t"
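    # Illustrative example (sketch): re.sub with the sentence-punctuation
    # pattern above rewrites "خوب است.\n" as "خوب است .\t\t".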
    def tokenize_long_clauses(self, string_list, max_char_num):
        # Split clauses longer than max_char_num characters, preferring to
        # cut just after a verb so that the pieces stay grammatical.
        # NOTE: POSTagger is not imported anywhere in the original file;
        # the import path below is a guess, so point it at wherever this
        # repo actually defines POSTagger.
        from .pos_tagger import POSTagger
        result = []
        pos_tagger = POSTagger('./g2p_resources/model/perpos.model')
        for string in string_list:
            tmp = string
            if len(tmp) > max_char_num:
                while len(tmp) > max_char_num:
                    # Candidate window: from the first space after
                    # max_char_num to the first space after 2 * max_char_num.
                    first_space_ind = tmp.find(' ', max_char_num)
                    second_space_ind = tmp.find(' ', 2 * max_char_num)
                    pos = [pos_tagger.parse([word])[0]
                           for word in tmp[first_space_ind:second_space_ind].split()]
                    try:
                        # Cut right after the first verb ('V') among the
                        # first 20 tagged (word, tag) pairs in the window.
                        word_ind = [x[1] for x in pos[:20]].index('V')
                        split_index = sum(len(i[0]) for i in pos[:word_ind + 1]) + word_ind + 1 + first_space_ind
                    except ValueError:
                        # No verb found: fall back to the first space.
                        split_index = first_space_ind
                    result.append(tmp[:split_index])
                    tmp = tmp[split_index:]
                    if len(tmp) <= max_char_num:
                        result.append(tmp)
            else:
                result.append(tmp)
        return result
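
if __name__ == "__main__":
    # Minimal smoke test (illustrative; not part of the original module).
    # tokenize_long_clauses is skipped because it needs the POS-tagger
    # model file on disk.
    tokenizer = Tokenizer()
    text = "سلام دنیا. دما 12.5 درجه است! فردا چطور؟"
    print(tokenizer.tokenize_words(text))
    print(tokenizer.tokenize_sentences(text))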