|
|
|
import codecs
import re
import textwrap

import nltk
import numpy as np
from num2words import num2words

# Let NLTK also search the current working directory for its data files.
nltk.data.path.append('.')
|
|
|
# Symbol inventory of the text front-end, in index order: one padding symbol,
# punctuation, ASCII letters, then IPA phoneme characters.
_pad = "$"
_punctuation = ';:,.!?¡¿—…"«»“” '
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

# Flat list of every symbol; a symbol's position in this list is its id.
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)

# Symbol -> id lookup. A symbol that occurs more than once in `symbols`
# (the apostrophe appears twice in `_letters_ipa`) keeps its LAST position,
# exactly as repeated assignment in a loop would.
dicts = {symbol: index for index, symbol in enumerate(symbols)}
|
|
|
|
|
class TextCleaner:
    """Callable that converts a string into a list of integer symbol ids."""

    def __init__(self, dummy=None):
        """Bind the module-level symbol table and print its size.

        Args:
            dummy: Accepted but never used.
        """
        self.word_index_dictionary = dicts
        print(len(dicts))

    def __call__(self, text):
        """Return the ids of the characters of ``text`` found in the table.

        A character missing from ``word_index_dictionary`` is left out of the
        result; each time that happens the whole input ``text`` is printed.

        Args:
            text: String (or any iterable of single characters) to encode.

        Returns:
            List of ids, one per known character, in input order.
        """
        indexes = []
        for char in text:
            try:
                indexes.append(self.word_index_dictionary[char])
            except KeyError:
                # Unknown symbol: report the offending input and skip it.
                print(text)
        return indexes
|
|
|
|
|
def split_into_sentences(text, max_len=120):
    """Split ``text`` into sentences and wrap each one to ``max_len`` columns.

    Sentences are found with ``nltk.sent_tokenize``; every sentence is then
    passed through ``textwrap.wrap`` so that a long sentence yields several
    consecutive pieces.

    Args:
        text: The text to split.
        max_len: Width handed to ``textwrap.wrap`` for each sentence.

    Returns:
        Flat list of the wrapped pieces, in reading order.
    """
    limited_sentences = []
    for sentence in nltk.sent_tokenize(text):
        limited_sentences.extend(textwrap.wrap(sentence, width=max_len))
    return limited_sentences
|
|
|
|
|
def store_ssml(text=None,
               voice=None):
    """Build an SSML document from sentences and write it to ``_tmp_ssml.txt``.

    Each sentence becomes
    ``<prosody volume><prosody rate><voice><s>…</s></voice></prosody></prosody>``
    inside a single ``<speak>`` element. The rate is ``len(sentence) / 76``
    clamped to ``[0.87, 1.14]``; the volume is a random integer from 24 to 97
    drawn with ``np.random.rand``. The document is also printed to stdout.

    text : list of sentences (``None`` or an empty list gives ``<speak></speak>``)
    voice: https://github.com/MycroftAI/mimic3-voices
    """
    sentences = [] if text is None else text
    # Debug banner: sentence count and the first sentence (blank when empty).
    print('\n___________________________\n', len(sentences),
          sentences[0] if sentences else '', '\n___________________________________\n')

    parts = ['<speak>']
    for short_text in sentences:
        rate = min(max(.87, len(short_text) / 76), 1.14)
        volume = int(74 * np.random.rand() + 24)
        # NOTE(review): short_text is inserted verbatim; a raw '&' or '<' in a
        # sentence yields malformed XML - confirm whether callers pre-escape.
        parts.append(
            f"<prosody volume='{volume}'>"
            f"<prosody rate='{rate}'>"
            f"<voice name='{voice}'>"
            f"<s>{short_text}</s>"
            "</voice>"
            "</prosody>"
            "</prosody>"
        )
    parts.append('</speak>')
    _s = ''.join(parts)
    print(len(sentences), '\n\n\n\n\n\n\n', _s)

    with codecs.open('_tmp_ssml.txt', 'w', "utf-8-sig") as f:
        f.write(_s)
|
|
|
|
|
# Per-language settings, keyed by the language tag callers pass in:
# (num2words language code, phrase joining mantissa and exponent,
#  word spoken for the decimal separator).
_NUMBER_PHRASES = {
    'rmc-script_latin': ('sr', ' puta deset na stepen od ', ' tačka '),
    'ron': ('ro', ' ori zece la puterea ', ' virgulă '),
    # NOTE(review): Hungarian phrases kept as found - have a speaker confirm.
    'hun': ('hu', ' tízszer a erejéig ', ' virgula '),
    'deu': ('de', ' mal zehn hoch ', ' komma '),
}

# An unsigned numeral: digits, optional fraction, optional exponent.
_NUMERAL_RE = re.compile(r'\d+(?:\.\d+)?(?:[Ee][+-]?\d+)?')


def transliterate_number(number_string, lang='en'):
    """Spell out every numeral in ``number_string`` in the given language.

    Handles integers, decimals (fraction read digit by digit) and scientific
    notation; all text around the numerals is preserved unchanged.

    Args:
        number_string: Text that may contain numerals.
        lang: Language tag. ``'rmc-script_latin'``, ``'ron'``, ``'hun'`` and
            ``'deu'`` have dedicated phrases; any other tag is cut to its
            first two characters and uses the English phrases.

    Returns:
        ``number_string`` with each numeral replaced by words. A numeral that
        cannot be converted (ValueError / OverflowError) is left as digits.
    """
    if lang in _NUMBER_PHRASES:
        lang, exponential_pronoun, comma = _NUMBER_PHRASES[lang]
    else:
        lang = lang[:2]
        exponential_pronoun = ' times ten to the power of '
        comma = ' point '

    def replace_number(match):
        """Return the spoken form of one matched numeral."""
        number_part = match.group(0)
        try:
            mantissa, marker, exponent = number_part.lower().partition('e')
            if marker:
                # Scientific notation: "<mantissa> times ten to the power of <exp>".
                return (num2words(float(mantissa), lang=lang)
                        + exponential_pronoun
                        + num2words(int(exponent), lang=lang))
            integer_part, dot, decimal_part = number_part.partition('.')
            words = num2words(int(integer_part), lang=lang)
            if dot:
                # Fractional digits are read one at a time.
                words += comma + " ".join(
                    num2words(int(digit), lang=lang) for digit in decimal_part)
            return words
        except (ValueError, OverflowError):
            # Best effort: keep the original digits when conversion fails.
            return number_part

    return _NUMERAL_RE.sub(replace_number, number_string)
|
|
|
|
|
def discard_leading_numeral(text):
    """Discards a leading numeral (integer or float) from a string.

    Leading whitespace, the numeral itself (digits with an optional ``.`` and
    further digits) and the whitespace after it are all removed.

    Args:
        text: The input string.

    Returns:
        The string with the leading numeral removed, or the original string
        if it doesn't start with a numeral.
    """
    leading = re.match(r"^\s*(\d+(\.\d*)?)\s*", text)
    if leading is None:
        return text
    return text[leading.end():].lstrip()
|
|