|
import re
|
|
|
|
|
|
|
|
class Tokenizer:
    """Tokenize raw text into words, sentences, and length-bounded clauses.

    Handles Arabic-script punctuation ('؟', '؛') and zero-width non-joiner
    characters — presumably targets Persian text (TODO: confirm).
    """

    def __init__(self):
        # Stateless: all work happens in the tokenize_* methods.
        pass
|
|
|
|
def tokenize_words(self, doc_string):
|
|
token_list = doc_string.strip().split()
|
|
token_list = [x.strip("\u200c") for x in token_list if len(x.strip("\u200c")) != 0]
|
|
return token_list
|
|
|
|
def tokenize_sentences(self, doc_string):
|
|
|
|
pattern = r"[-+]?\d*\.\d+|\d+"
|
|
print(doc_string)
|
|
nums_list = re.findall(pattern, doc_string)
|
|
doc_string = re.sub(pattern, 'floatingpointnumber', doc_string)
|
|
|
|
pattern = r'([!\.\?؟]+)[\n]*'
|
|
tmp = re.findall(pattern, doc_string)
|
|
doc_string = re.sub(pattern, self.add_tab, doc_string)
|
|
|
|
pattern = r':\n'
|
|
tmp = re.findall(pattern, doc_string)
|
|
doc_string = re.sub(pattern, self.add_tab, doc_string)
|
|
|
|
pattern = r';[\n]*'
|
|
tmp = re.findall(pattern, doc_string)
|
|
doc_string = re.sub(pattern, self.add_tab, doc_string)
|
|
|
|
pattern = r'؛[\n]*'
|
|
tmp = re.findall(pattern, doc_string)
|
|
doc_string = re.sub(pattern, self.add_tab, doc_string)
|
|
|
|
pattern = r'^[\s\r\n]*$'
|
|
doc_string = re.sub(pattern, '', doc_string)
|
|
pattern = r'[\n\r]+'
|
|
doc_string = re.sub(pattern, self.add_tab, doc_string)
|
|
|
|
for number in nums_list:
|
|
pattern = 'floatingpointnumber'
|
|
doc_string = re.sub(pattern, number, doc_string, 1)
|
|
|
|
doc_string = doc_string.split('\t\t')
|
|
doc_string = [x for x in doc_string if len(x) > 0 and not x.isspace()]
|
|
return doc_string
|
|
|
|
def add_tab(self, mystring):
|
|
mystring = mystring.group()
|
|
mystring = mystring.strip(' ')
|
|
mystring = mystring.strip('\n')
|
|
mystring = " " + mystring + "\t\t"
|
|
return mystring
|
|
|
|
def tokenize_long_clauses(self, string_list, max_char_num):
|
|
result = []
|
|
pos_tagger = POSTagger('./g2p_resources/model/perpos.model')
|
|
for string in string_list:
|
|
tmp = string
|
|
if len(tmp) > max_char_num:
|
|
while (len(tmp) > max_char_num):
|
|
first_space_ind = tmp.find(' ', max_char_num)
|
|
second_space_ind = tmp.find(' ', 2*max_char_num)
|
|
pos = [pos_tagger.parse([word])[0] for word in tmp[first_space_ind:second_space_ind].split()]
|
|
try:
|
|
word_ind = [x[1] for x in pos[:20]].index('V')
|
|
split_index = sum(len(i[0]) for i in pos[:word_ind+1]) + word_ind + 1 + first_space_ind
|
|
except:
|
|
split_index = first_space_ind
|
|
result.append(tmp[:split_index])
|
|
tmp = tmp[split_index:]
|
|
if len(tmp) <= max_char_num:
|
|
result.append(tmp)
|
|
else:
|
|
result.append(tmp)
|
|
|
|
return result
|
|
|