# -*- coding: utf-8 -*-
"""Text helpers: symbol-to-index table, sentence splitting, SSML export
and number transliteration.
"""
import codecs
import random
import re
import textwrap
from xml.sax.saxutils import escape

# The third-party packages are only needed by some helpers
# (``split_into_sentences`` -> nltk, ``transliterate_number`` -> num2words).
# Guarding the imports keeps the symbol table and the pure-regex helpers
# usable without them; behaviour is unchanged when they are installed.
try:
    import nltk
except ImportError:
    nltk = None

try:
    from num2words import num2words
except ImportError:
    num2words = None

# IPA Phonemizer: https://github.com/bootphon/phonemizer

# One-time setup of the sentence tokenizer data into the working directory:
#   nltk.download('punkt', download_dir='./')
#   nltk.download('punkt_tab', download_dir='./')
if nltk is not None:
    # Let nltk find tokenizer data stored in the current directory.
    nltk.data.path.append('.')

_pad = "$"
_punctuation = ';:,.!?¡¿—…"«»“” '
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

# Export all symbols:
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)

# Symbol -> index. A symbol listed more than once in ``symbols`` keeps the
# index of its last occurrence (later entries overwrite earlier ones).
dicts = {symbol: index for index, symbol in enumerate(symbols)}


class TextCleaner:
    """Map a string to the list of indices of its characters in ``dicts``."""

    def __init__(self, dummy=None):
        """Bind the shared symbol table; ``dummy`` is accepted and ignored."""
        self.word_index_dictionary = dicts
        print(len(dicts))

    def __call__(self, text):
        """Return the index of every known character of ``text``.

        Characters missing from the table are skipped; the whole input is
        printed each time one is encountered.
        """
        indexes = []
        for char in text:
            try:
                indexes.append(self.word_index_dictionary[char])
            except KeyError:
                print(text)
        return indexes


def split_into_sentences(text, max_len=120):
    """Split ``text`` into sentences and wrap each to ``max_len`` characters.

    Args:
        text: The input string.
        max_len: Maximum width passed to ``textwrap.wrap`` for each sentence.

    Returns:
        A flat list of strings, in reading order.

    Raises:
        ImportError: If ``nltk`` is not installed.
    """
    if nltk is None:
        raise ImportError("split_into_sentences requires the 'nltk' package")
    sentences = nltk.sent_tokenize(text)
    limited_sentences = [
        piece
        for sent in sentences
        for piece in textwrap.wrap(sent, width=max_len)
    ]
    return limited_sentences


def store_ssml(text=None, voice=None):
    """Write an SSML document for ``text`` to ``_tmp_ssml.txt``.

    Each sentence is wrapped in ``<prosody volume>``, ``<prosody rate>``,
    ``<voice>`` and ``<s>`` elements inside a single ``<speak>`` root.

    Args:
        text: List of sentences. ``None`` or an empty list yields an empty
            ``<speak></speak>`` document.
        voice: Voice name, see https://github.com/MycroftAI/mimic3-voices

    The file is written in the current working directory, encoded as
    UTF-8 with a BOM ("utf-8-sig").
    """
    sentences = list(text or [])
    preview = sentences[0] if sentences else ''
    print('\n___________________________\n', len(sentences), preview,
          '\n___________________________________\n')

    parts = ['<speak>']
    for short_text in sentences:
        # Speaking rate grows with sentence length, clamped to [0.87, 1.14].
        rate = min(max(.87, len(short_text) / 76), 1.14)
        # Random volume in [24, 97]; not every voice honours the attribute.
        volume = int(74 * random.random() + 24)
        parts.append(
            f"<prosody volume='{volume}'>"
            f"<prosody rate='{rate}'>"
            f"<voice name='{voice}'>"
            # Escape &, < and > so the sentence cannot break the XML.
            f"<s>{escape(short_text)}</s>"
            "</voice></prosody></prosody>"
        )
    parts.append('</speak>')
    _s = ''.join(parts)

    print(len(sentences), '\n\n\n\n\n\n\n', _s)
    with codecs.open('_tmp_ssml.txt', 'w', "utf-8-sig") as f:
        f.write(_s)


# lang code -> (num2words language, exponent phrase, decimal separator word)
# NOTE(review): the Hungarian phrases are kept as found; they look doubtful
# (' virgula ' is not a Hungarian word) - confirm with a native speaker.
_NUMBER_WORDING = {
    'rmc-script_latin': ('sr', ' puta deset na stepen od ', ' tačka '),
    'ron': ('ro', ' ori zece la puterea ', ' virgulă '),
    'hun': ('hu', ' tízszer a erejéig ', ' virgula '),
    'deu': ('de', ' mal zehn hoch ', ' komma '),
}

# Groups: 1 = text before, 2 = the numeral, 5 = text after.
_NUMBER_PATTERN = re.compile(
    r'([^\d]*)(\d+(\.\d+)?([Ee][+-]?\d+)?)([^\d]*)')


def transliterate_number(number_string, lang='en'):
    """Replace every numeral in ``number_string`` with its spelled-out form.

    Handles integers, decimals (digits after the point are read one by one)
    and scientific notation, and preserves the text before and after each
    numeral.

    Args:
        number_string: Text that may contain numerals.
        lang: Language code. 'rmc-script_latin', 'ron', 'hun' and 'deu' have
            dedicated wording; any other code is cut to its first two
            characters and uses the English connecting words.

    Returns:
        The text with numerals spelled out. A numeral that cannot be
        converted (malformed, too large, unsupported language) is left as is.

    Raises:
        ImportError: If ``num2words`` is not installed.
    """
    if num2words is None:
        raise ImportError(
            "transliterate_number requires the 'num2words' package")

    if lang in _NUMBER_WORDING:
        lang, exponential_pronoun, comma = _NUMBER_WORDING[lang]
    else:
        lang = lang[:2]
        exponential_pronoun = ' times ten to the power of '
        comma = ' point '

    def replace_number(match):
        prefix = match.group(1) or ""
        number_part = match.group(2)
        suffix = match.group(5) or ""
        try:
            if 'e' in number_part.lower():
                base, exponent = number_part.lower().split('e')
                words = (num2words(float(base), lang=lang)
                         + exponential_pronoun
                         + num2words(int(exponent), lang=lang))
            elif '.' in number_part:
                integer_part, decimal_part = number_part.split('.')
                words = (num2words(int(integer_part), lang=lang)
                         + comma
                         + " ".join(num2words(int(digit), lang=lang)
                                    for digit in decimal_part))
            else:
                words = num2words(int(number_part), lang=lang)
        except (ValueError, OverflowError, NotImplementedError):
            # Return original if conversion fails (bad numeral, number too
            # large for num2words, or language not supported).
            return match.group(0)
        return prefix + words + suffix

    return _NUMBER_PATTERN.sub(replace_number, number_string)


def discard_leading_numeral(text):
    """Discard a leading numeral (integer or float) from a string.

    Args:
        text: The input string.

    Returns:
        The string with the leading numeral and the whitespace around it
        removed, or the original string if it doesn't start with a numeral.
    """
    match = re.match(r"^\s*(\d+(\.\d*)?)\s*", text)
    if match is None:
        return text
    # The pattern already consumed the whitespace that follows the numeral.
    return text[match.end():]