from re import sub from unidecode import unidecode import copy import os from .tokenizer import Tokenizer from .data_helper import DataHelper from .token_merger import ClassifierChunkParser import pickle import re class Normalizer: def __init__(self, half_space_char='\u200c', statistical_space_correction=False, date_normalizing_needed=True, time_normalizing_needed=True, pinglish_conversion_needed=False, number2text_needed=True, half_space_corrector=True, train_file_path="resource/tokenizer/Bijan_khan_chunk.txt", token_merger_path="resource/tokenizer/TokenMerger.pckl"): self.time_normalizing_needed = time_normalizing_needed self.dir_path = os.path.dirname(os.path.realpath(__file__)) + "/" self.dic1_path = self.dir_path + 'resource/normalizer/Dic1_new.txt' self.dic2_path = self.dir_path + 'resource/normalizer/Dic2_new.txt' self.dic3_path = self.dir_path + 'resource/normalizer/Dic3_new.txt' self.dic1 = self.load_dictionary(self.dic1_path) self.dic2 = self.load_dictionary(self.dic2_path) self.dic3 = self.load_dictionary(self.dic3_path) self.tokenizer = Tokenizer() self.dictaition_normalizer = DictatioinNormalizer() self.statistical_space_correction = statistical_space_correction self.date_normalizing_needed = date_normalizing_needed self.pinglish_conversion_needed = pinglish_conversion_needed self.number2text_needed = number2text_needed self.half_space_corrector = half_space_corrector self.data_helper = DataHelper() self.token_merger = ClassifierChunkParser() if self.date_normalizing_needed or self.pinglish_conversion_needed: self.date_normalizer = DateNormalizer() self.pinglish_conversion = PinglishNormalizer() if self.time_normalizing_needed: self.time_normalizer = TimeNormalizer() if self.statistical_space_correction: self.token_merger_path = self.dir_path + token_merger_path self.train_file_path = train_file_path self.half_space_char = half_space_char if os.path.isfile(self.token_merger_path): self.token_merger_model = self.data_helper.load_var(self.token_merger_path) elif os.path.isfile(self.train_file_path): self.token_merger_model = self.token_merger.train_merger(self.train_file_path, test_split=0) self.data_helper.save_var(self.token_merger_path, self.token_merger_model) def load_dictionary(self, file_path): dictionary = {} with open(file_path, 'r', encoding='utf-8') as f: g = f.readlines() for Wrds in g: wrd = Wrds.split(' ') dictionary[wrd[0].strip()] = sub('\n', '', wrd[1].strip()) return dictionary def replace_abbrv(self, text): text = text.replace("( ص )", "( صلي‌الله‌عليه‌وآله )") text = text.replace("( ع )", "( عليه‌السلام )") text = text.replace("( س )", "( سلام‌الله )") text = text.replace("( ره )", "( رحمه‌الله‌عليه )") text = text.replace("( قده )", "( قدس‌سره )") text = text.replace("( رض )", "( رضي‌الله‌عنه )") text = text.replace(" ج ا ا ", " جمهوري اسلامي ايران ") text = text.replace(" ج اا ", " جمهوري اسلامي ايران ") text = text.replace(" ج‌اا ", " جمهوري اسلامي ايران ") text = text.replace(" ج ا ايران ", " جمهوري اسلامي ايران ") text = text.replace(" ج اايران ", " جمهوري اسلامي ايران ") text = text.replace(" ج‌اايران ", " جمهوري اسلامي ايران ") text = text.replace(" صص ", " صفحات ") text = text.replace(" ه ق ", " هجري قمري ") text = text.replace(" ه ش ", " هجري شمسي ") text = text.replace(" ق م ", " قبل از ميلاد ") text = text.replace(" صص ", " صفحات ") text = text.replace(" الخ ", " الي‌آخر ") return text def replace_special_symbols(self, doc_string): special_dict = { "﷼": " ریال ", "ﷴ": " محمد ", "ﷺ": " صلي‌الله‌عليه‌وآله‌وسلم ", "ﷲ": " الله ", "ﷻ": " جَل جلاله ", "ﷱ": " قلي ", "ﷳ": " صلي ", "ﷰ": " اكبر ", "ﷵ": " صلي‌الله‌عليه‌وآله‌وسلم ", "ﷶ": " رسول ", "ﷷ": " عليه‌السلام ", "ﷸ": " و سلم ", "﷽": " بسم‌الله‌الرحمن‌الرحيم ", } result = doc_string for i, j in special_dict.items(): result = sub(i, j, result) return result def sub_alphabets(self, doc_string): # try: # doc_string = doc_string.decode('utf-8') # except UnicodeEncodeError: # pass a0 = "ء" b0 = "ئ" c0 = sub(a0, b0, doc_string) a1 = r"ٲ|ٱ|إ|ﺍ|أ" a11 = r"ﺁ|آ" b1 = r"ا" b11 = r"آ" c11 = sub(a11, b11, c0) c1 = sub(a1, b1, c11) a2 = r"ﺐ|ﺏ|ﺑ" b2 = r"ب" c2 = sub(a2, b2, c1) a3 = r"ﭖ|ﭗ|ﭙ|ﺒ|ﭘ" b3 = r"پ" c3 = sub(a3, b3, c2) a4 = r"ﭡ|ٺ|ٹ|ﭞ|ٿ|ټ|ﺕ|ﺗ|ﺖ|ﺘ" b4 = r"ت" c4 = sub(a4, b4, c3) a5 = r"ﺙ|ﺛ" b5 = r"ث" c5 = sub(a5, b5, c4) a6 = r"ﺝ|ڃ|ﺠ|ﺟ" b6 = r"ج" c6 = sub(a6, b6, c5) a7 = r"ڃ|ﭽ|ﭼ" b7 = r"چ" c7 = sub(a7, b7, c6) a8 = r"ﺢ|ﺤ|څ|ځ|ﺣ" b8 = r"ح" c8 = sub(a8, b8, c7) a9 = r"ﺥ|ﺦ|ﺨ|ﺧ" b9 = r"خ" c9 = sub(a9, b9, c8) a10 = r"ڏ|ډ|ﺪ|ﺩ" b10 = r"د" c10 = sub(a10, b10, c9) a11 = r"ﺫ|ﺬ|ﻧ" b11 = r"ذ" c11 = sub(a11, b11, c10) a12 = r"ڙ|ڗ|ڒ|ڑ|ڕ|ﺭ|ﺮ" b12 = r"ر" c12 = sub(a12, b12, c11) a13 = r"ﺰ|ﺯ" b13 = r"ز" c13 = sub(a13, b13, c12) a14 = r"ﮊ" b14 = r"ژ" c14 = sub(a14, b14, c13) a15 = r"ݭ|ݜ|ﺱ|ﺲ|ښ|ﺴ|ﺳ" b15 = r"س" c15 = sub(a15, b15, c14) a16 = r"ﺵ|ﺶ|ﺸ|ﺷ" b16 = r"ش" c16 = sub(a16, b16, c15) a17 = r"ﺺ|ﺼ|ﺻ" b17 = r"ص" c17 = sub(a17, b17, c16) a18 = r"ﺽ|ﺾ|ﺿ|ﻀ" b18 = r"ض" c18 = sub(a18, b18, c17) a19 = r"ﻁ|ﻂ|ﻃ|ﻄ" b19 = r"ط" c19 = sub(a19, b19, c18) a20 = r"ﻆ|ﻇ|ﻈ" b20 = r"ظ" c20 = sub(a20, b20, c19) a21 = r"ڠ|ﻉ|ﻊ|ﻋ" b21 = r"ع" c21 = sub(a21, b21, c20) a22 = r"ﻎ|ۼ|ﻍ|ﻐ|ﻏ" b22 = r"غ" c22 = sub(a22, b22, c21) a23 = r"ﻒ|ﻑ|ﻔ|ﻓ" b23 = r"ف" c23 = sub(a23, b23, c22) a24 = r"ﻕ|ڤ|ﻖ|ﻗ" b24 = r"ق" c24 = sub(a24, b24, c23) a25 = r"ڭ|ﻚ|ﮎ|ﻜ|ﮏ|ګ|ﻛ|ﮑ|ﮐ|ڪ|ک" b25 = r"ك" c25 = sub(a25, b25, c24) a26 = r"ﮚ|ﮒ|ﮓ|ﮕ|ﮔ" b26 = r"گ" c26 = sub(a26, b26, c25) a27 = r"ﻝ|ﻞ|ﻠ|ڵ" b27 = r"ل" c27 = sub(a27, b27, c26) a28 = r"ﻡ|ﻤ|ﻢ|ﻣ" b28 = r"م" c28 = sub(a28, b28, c27) a29 = r"ڼ|ﻦ|ﻥ|ﻨ" b29 = r"ن" c29 = sub(a29, b29, c28) a30 = r"ވ|ﯙ|ۈ|ۋ|ﺆ|ۊ|ۇ|ۏ|ۅ|ۉ|ﻭ|ﻮ|ؤ" b30 = r"و" c30 = sub(a30, b30, c29) a31 = r"ﺔ|ﻬ|ھ|ﻩ|ﻫ|ﻪ|ۀ|ە|ة|ہ|\u06C1" b31 = r"ه" c31 = sub(a31, b31, c30) a32 = r"ﭛ|ﻯ|ۍ|ﻰ|ﻱ|ﻲ|ں|ﻳ|ﻴ|ﯼ|ې|ﯽ|ﯾ|ﯿ|ێ|ے|ى|ی" b32 = r"ي" c32 = sub(a32, b32, c31) a33 = r'¬' b33 = r'‌' c33 = sub(a33, b33, c32) pa0 = r'•|·|●|·|・|∙|。|ⴰ' pb0 = r'.' pc0 = sub(pa0, pb0, c33) pa1 = r',|٬|‚|,' pb1 = r'،' pc1 = sub(pa1, pb1, pc0) pa2 = r'ʕ' pb2 = r'؟' pc2 = sub(pa2, pb2, pc1) pa3 = r'٪' pb3 = r'%' pc3 = sub(pa3, pb3, pc2) na0 = r'۰|٠' nb0 = r'0' nc0 = sub(na0, nb0, pc3) na1 = r'۱|١' nb1 = r'1' nc1 = sub(na1, nb1, nc0) na2 = r'۲|٢' nb2 = r'2' nc2 = sub(na2, nb2, nc1) na3 = r'۳|٣' nb3 = r'3' nc3 = sub(na3, nb3, nc2) na4 = r'۴|٤' nb4 = r'4' nc4 = sub(na4, nb4, nc3) na5 = r'۵' nb5 = r'5' nc5 = sub(na5, nb5, nc4) na6 = r'۶|٦' nb6 = r'6' nc6 = sub(na6, nb6, nc5) na7 = r'۷|٧' nb7 = r'7' nc7 = sub(na7, nb7, nc6) na8 = r'۸|٨' nb8 = r'8' nc8 = sub(na8, nb8, nc7) na9 = r'۹|٩' nb9 = r'9' nc9 = sub(na9, nb9, nc8) ea1 = r'ـ|ِ|ُ|َ' eb1 = r'' #ec1 = sub(ea1, eb1, nc9) ec1 = nc9 sa1 = r'( )+' sb1 = r' ' sc1 = sub(sa1, sb1, ec1) sa2 = r'(\n)+' sb2 = r'\n' sc2 = sub(sa2, sb2, sc1) espa0 = u'\u200e|\u200f| ' espb0 = ' ' espc0 = sub(espa0, espb0, sc2) return espc0 def space_correction(self, doc_string): a00 = r'^(بی|می|نمی)( )' b00 = r'\1‌' c00 = sub(a00, b00, doc_string) a0 = r'( )(می|نمی|بی)( )' b0 = r'\1\2‌' c0 = sub(a0, b0, c00) a1 = r'( )(هایی|ها|های|ایی|هایم|هایت|هایش|هایمان|هایتان|هایشان|ات|ان|ین' \ r'|انی|بان|ام|ای|یم|ید|اید|اند|بودم|بودی|بود|بودیم|بودید|بودند|ست)( )' b1 = r'‌\2\3' c1 = sub(a1, b1, c0) a2 = r'( )(شده|نشده)( )' b2 = r'‌\2‌' c2 = sub(a2, b2, c1) a3 = r'( )(طلبان|طلب|گرایی|گرایان|شناس|شناسی|گذاری|گذار|گذاران|شناسان|گیری|پذیری|بندی|آوری|سازی|' \ r'بندی|کننده|کنندگان|گیری|پرداز|پردازی|پردازان|آمیز|سنجی|ریزی|داری|دهنده|آمیز|پذیری' \ r'|پذیر|پذیران|گر|ریز|ریزی|رسانی|یاب|یابی|گانه|گانه‌ای|انگاری|گا|بند|رسانی|دهندگان|دار)( )' b3 = r'‌\2\3' c3 = sub(a3, b3, c2) return c3 def space_correction_plus1(self, doc_string): out_sentences = '' for wrd in doc_string.split(' '): try: out_sentences = out_sentences + ' ' + self.dic1[wrd] except KeyError: out_sentences = out_sentences + ' ' + wrd return out_sentences def space_correction_plus2(self, doc_string): out_sentences = '' wrds = doc_string.split(' ') word_len = wrds.__len__() if word_len < 2: return doc_string cnt = 1 for i in range(0, word_len - 1): w = wrds[i] + wrds[i + 1] try: out_sentences = out_sentences + ' ' + self.dic2[w] cnt = 0 except KeyError: if cnt == 1: out_sentences = out_sentences + ' ' + wrds[i] cnt = 1 if cnt == 1: out_sentences = out_sentences + ' ' + wrds[i + 1] return out_sentences def space_correction_plus3(self, doc_string): # Dict = {'گفتوگو': 'گفت‌وگو'} out_sentences = '' wrds = doc_string.split(' ') word_len = wrds.__len__() if word_len < 3: return doc_string cnt = 1 cnt2 = 0 for i in range(0, word_len - 2): w = wrds[i] + wrds[i + 1] + wrds[i + 2] try: out_sentences = out_sentences + ' ' + self.dic3[w] cnt = 0 cnt2 = 2 except KeyError: if cnt == 1 and cnt2 == 0: out_sentences = out_sentences + ' ' + wrds[i] else: cnt2 -= 1 cnt = 1 if cnt == 1 and cnt2 == 0: out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2] elif cnt == 1 and cnt2 == 1: out_sentences = out_sentences + ' ' + wrds[i + 2] return out_sentences def replace_puncs(self, doc_string): repeat_pattern = \ r"((?|\-|;|\[|\]|\{|\}|»|«|\^|'|\\|¡|~|©|،|؟|؛|\")\1+" doc_strig = re.sub(repeat_pattern, r"\1", doc_string) pattern = r'(?|\-|;|\[|\]|\{|\}|»|«|\^|\'|\\|¡|~|©|،|؟|؛|\"' clauses = [i.strip() for i in list(filter(None, re.split(pattern, doc_string)))] result = ' | '.join(clauses) return result def split_digit_from_alphabet(self, doc_string): doc_string = re.sub(r'(\d)([^\d\s])', r'\1 \2', doc_string) doc_string = re.sub(r'(\D)([^\D\s])', r'\1 \2', doc_string) return doc_string def normalize(self, doc_string, new_line_elimination=False): normalized_string = doc_string normalized_string = self.dictaition_normalizer.remove_extra_space_zwnj(normalized_string) normalized_string = self.sub_alphabets(normalized_string) normalized_string = self.replace_abbrv(normalized_string) normalized_string = self.replace_special_symbols(normalized_string) normalized_string = self.data_helper.clean_text(normalized_string, new_line_elimination).strip() normalized_string = self.split_digit_from_alphabet(normalized_string) normalized_string = self.split_phoneme_in_persian_and_eng_from_rest(normalized_string) if self.date_normalizing_needed: normalized_string = self.date_normalizer.normalize_dates(normalized_string) if self.time_normalizing_needed: normalized_string = self.time_normalizer.normalize_time(normalized_string) if self.number2text_needed: normalized_string = sub(r'[\u06F0-\u06F90-9]+', lambda x: unidecode(x.group(0)), normalized_string) n = NumberNormalizer() normalized_string = sub(r'[0-9]+', lambda x: n.convert(x.group(0)), normalized_string) if self.statistical_space_correction: token_list = normalized_string.strip().split() token_list = [x.strip("\u200c") for x in token_list if len(x.strip("\u200c")) != 0] token_list = self.token_merger.merg_tokens(token_list, self.token_merger_model, self.half_space_char) normalized_string = " ".join(x for x in token_list) normalized_string = self.data_helper.clean_text(normalized_string, new_line_elimination) else: normalized_string = self.space_correction(self.space_correction_plus1(self.space_correction_plus2( self.space_correction_plus3(normalized_string)))).strip() if self.pinglish_conversion_needed: normalized_string = self.pinglish_conversion.pingilish2persian( self.tokenizer.tokenize_words(normalized_string)) normalized_string = self.replace_puncs(normalized_string) if self.half_space_corrector: normalized_string = self.dictaition_normalizer.remove_extra_space_zwnj(normalized_string) normalized_string = self.dictaition_normalizer.join_words_without_rules(normalized_string) normalized_string = self.dictaition_normalizer.correct_compound(normalized_string, 5) normalized_string = self.dictaition_normalizer.half_space_corrector(normalized_string) return normalized_string def split_phoneme_in_persian_and_eng_from_rest(self, doc_string): doc_string = re.sub(r'([\u0622-\u06f0])([^\u0622-\u06f0])', r'\1 \2', doc_string) doc_string = re.sub(r'([a-z|A-Z])([^a-z|^A-Z])', r'\1 \2', doc_string) return doc_string class DateNormalizer: def __init__(self): self.nn = NumberNormalizer() self.persian_month_dict = {'فروردين': 1, 'ارديبهشت': 2, 'خرداد': 3, 'تير': 4, 'مرداد': 5, 'شهريور': 6, 'مهر': 7, 'آبان': 8, 'آذر': 9, 'دي': 10, 'بهمن': 11, 'اسفند': 12} self.christian_month_dict = {'ژانویه': 1, 'فوریه': 2, 'مارس': 3, 'آپریل': 4, 'می': 5, 'ژوئن': 6, 'جولای': 7, 'آگوست': 8, 'سپتامبر': 9, 'اکتبر': 10, 'نوامبر': 11, 'دسامبر': 12} self.num_dict = {'چهار': 4, 'سه': 3, 'دو': 2, 'يك': 1, 'يازده': 11, 'سيزده': 13, 'چهارده': 14, 'دوازده': 12, 'پانزده': 15, 'شانزده': 16, 'چهارم': 4, 'سوم': 3, 'دوم': 2, 'يكم': 1, 'اول': 1, 'يازدهم': 11, 'سيزدهم': 13, 'چهاردهم': 14, 'دوازدهم': 12, 'پانزدهم': 15, 'شانزدهم': 16, 'هفدهم': 17, 'هجدهم': 18, 'نوزدهم': 19, 'بيستم': 20, 'چهلم': 40, 'پنجاهم': 50, 'شصتم': 60, 'هفتادم': 70, 'نودم': 90, 'سيصدم': 300, 'چهارصدم': 400, 'پانصدم': 500, 'ششصدم': 600, 'هفتصدم': 700, 'هشتصدم': 800, 'نهصدم': 900, 'هشتادم': 80, 'هزار': 1000, 'ميليون': 1000000, 'دويست': 200, 'ده': 10, 'نه': 9, 'هشت': 8, 'هفت': 7, 'شش': 6, 'پنج': 5, 'هفده': 17, 'هجده': 18, 'نوزده': 19, 'بيست': 20, 'سي': 30, 'چهل': 40, 'پنجاه': 50, 'شصت': 60, 'هفتاد': 70, 'نود': 90, 'سيصد': 300, 'چهارصد': 400, 'پانصد': 500, 'ششصد': 600, 'هفتصد': 700, 'هشتصد': 800, 'نهصد': 900, 'هشتاد': 80, ' ': 0, 'ميليارد': 1000000000, 'صدم': 100, 'هزارم': 1000, 'دويستم': 200, 'دهم': 10, 'نهم': 9, 'هشتم': 8, 'هفتم': 7, 'ششم': 6, 'پنجم': 5} def find_date_part(self, doc_string): persian_date_regex = re.compile(r'\b(1[0-4]\d\d|[3-9][2-9])([\s]*[/.\-][\s]*)([1-9]|0[1-9]|1[0-2])' r'([\s]*[/.\-][\s]*)([1-9]|0[1-9]|1[0-9]|2[0-9]|3[0-1])\b') persian_date_regex_rev = re.compile( r'\b([1-9]|0[1-9]|1[0-9]|2[0-9]|3[0-1])([\s]*[/.\-][\s]*)([1-9]|0[1-9]|1[0-2])' r'([\s]*[/.-][\s]*)(1[0-4][\d][\d]|[3-9][2-9])\b') persian_date_md_regex = re.compile(r'(? 0: value += self.list2num(tmp_section_list) tmp_section_list[:] = [] if value - int(value) == 0: return int(value) else: return value def is_number(self, word): return word in self.num_dict def find_number_location(self, token_list): start_index = 0 number_section = [] for i, el in enumerate(token_list): if self.is_number(el) or (el.replace('.', '', 1).isdigit()): start_index = i number_section.append(start_index) break i = start_index + 1 while i < len(token_list): if token_list[i] == "و" and (i + 1) < len(token_list): if self.is_number(token_list[i + 1]) or (token_list[i + 1].replace('.', '', 1).isdigit()): number_section.append(i) number_section.append(i + 1) i += 2 else: break elif self.is_number(token_list[i]) or (token_list[i].replace('.', '', 1).isdigit()): number_section.append(i) i += 1 else: break return number_section def normalize_numbers(self, token_list, converted=""): for i, el in enumerate(token_list): if el.endswith("ین") and self.is_number(el[:-2]): token_list[i] = el[:-2] finded = self.find_number_location(token_list) if len(finded) == 0: rest_of_string = " ".join(t for t in token_list) return converted + " " + rest_of_string else: numerical_subsection = [token_list[x] for x in finded] numerical_subsection = self.convert2num(numerical_subsection) converted = converted + " " + " ".join(x for x in token_list[:finded[0]]) + " " + str(numerical_subsection) new_index = finded[-1] + 1 return self.normalize_numbers(token_list[new_index:], converted) class NumberNormalizer: def __init__(self): self.faBaseNum = {1: 'يك', 2: 'دو', 3: 'سه', 4: 'چهار', 5: 'پنج', 6: 'شِش', 7: 'هفت', 8: 'هشت', 9: 'نُه', 10: 'دَه', 11: 'يازده', 12: 'دوازده', 13: 'سيزده', 14: 'چهارده', 15: 'پانزده', 16: 'شانزده', 17: 'هفده', 18: 'هجده', 19: 'نوزده', 20: 'بيست', 30: 'سي', 40: 'چهل', 50: 'پنجاه', 60: 'شصت', 70: 'هفتاد', 80: 'هشتاد', 90: 'نود', 100: 'صد', 200: 'دويست', 300: 'سيصد', 500: 'پانصد'} self.faBaseNumKeys = self.faBaseNum.keys() self.faBigNum = ["يك", "هزار", "ميليون", "ميليارد"] self.faBigNumSize = len(self.faBigNum) def split3(self, st): parts = [] n = len(st) d, m = divmod(n, 3) for i in range(d): parts.append(int(st[n - 3 * i - 3:n - 3 * i])) if m > 0: parts.append(int(st[:m])) return parts def convert(self, st): st = str(st) if len(st) > 3: parts = self.split3(st) k = len(parts) wparts = [] for i in range(k): p = parts[i] if p == 0: continue if i == 0: wpart = self.convert(p) else: if i < self.faBigNumSize: fa_order = self.faBigNum[i] else: fa_order = '' (d, m) = divmod(i, 3) t9 = self.faBigNum[3] for j in range(d): if j > 0: fa_order += "‌" fa_order += t9 if m != 0: if fa_order != '': fa_order = "‌" + fa_order fa_order = self.faBigNum[m] + fa_order wpart = fa_order if i == 1 and p == 1 else self.convert(p) + " " + fa_order wparts.append(wpart) return " وَ ".join(reversed(wparts)) # now assume that n <= 999 n = int(st) if n in self.faBaseNumKeys: return self.faBaseNum[n] y = n % 10 d = int((n % 100) / 10) s = int(n / 100) # print s, d, y dy = 10 * d + y fa = '' if s != 0: if s * 100 in self.faBaseNumKeys: fa += self.faBaseNum[s * 100] else: fa += (self.faBaseNum[s] + self.faBaseNum[100]) if d != 0 or y != 0: fa += " وَ " if d != 0: if dy in self.faBaseNumKeys: fa += self.faBaseNum[dy] return fa fa += self.faBaseNum[d * 10] if y != 0: fa += " وَ " if y != 0: fa += self.faBaseNum[y] return fa def convert_ordinary(self, arg): if isinstance(arg, int): num = arg st = str(arg) elif isinstance(arg, str): num = int(arg) st = arg else: raise TypeError('bad type "%s"' % type(arg)) if num == 1: return 'اول' # OR 'یکم'fixme elif num == 10: return 'دهم' norm_fa = self.convert(st) if len(norm_fa) == 0: return '' if norm_fa.endswith(u'ی'): norm_fa += u'‌ام' elif norm_fa.endswith(u'سه'): norm_fa = norm_fa[:-1] + u'وم' else: norm_fa += u'م' return norm_fa class PinglishNormalizer: def __init__(self): self.data_helper = DataHelper() self.file_dir = os.path.dirname(os.path.realpath(__file__)) + "/" self.en_dict_filename = self.file_dir + "resource/tokenizer/enDict" self.en_dict = self.data_helper.load_var(self.en_dict_filename) self.fa_dict_filename = self.file_dir + "resource/tokenizer/faDict" self.fa_dict = self.data_helper.load_var(self.fa_dict_filename) def pingilish2persian(self, pinglish_words_list): for i, word in enumerate(pinglish_words_list): if word in self.en_dict: pinglish_words_list[i] = self.en_dict[word] # .decode("utf-8") # inp = inp.replace(word, enDict[word], 1) else: ch = self.characterize(word) pr = self.map_char(ch) amir = self.make_word(pr) for wd in amir: am = self.escalation(wd) asd = ''.join(am) if asd in self.fa_dict: pinglish_words_list[i] = asd # .decode("utf-8") # inp = inp.replace(word, asd, 1) inp = " ".join(x for x in pinglish_words_list) return inp def characterize(self, word): list_of_char = [] i = 0 while i < len(word): char = word[i] sw_out = self.switcher(char) if sw_out is None: esp_out = None if i < len(word) - 1: esp_out = self.esp_check(word[i], word[i + 1]) if esp_out is None: list_of_char.append(word[i]) else: list_of_char.append(esp_out) i += 1 else: list_of_char.append(sw_out) i += 1 return list_of_char def switcher(self, ch): switcher = { "c": None, "k": None, "z": None, "s": None, "g": None, "a": None, "u": None, "e": None, "o": None } return switcher.get(ch, ch) def esp_check(self, char1, char2): st = char1 + char2 if st == "ch": return "ch" elif st == "kh": return "kh" elif st == "zh": return "zh" elif st == "sh": return "sh" elif st == "gh": return "gh" elif st == "aa": return "aa" elif st == "ee": return "ee" elif st == "oo": return "oo" elif st == "ou": return "ou" else: return None def map_char(self, word): listm = [] sw_out = self.map_switcher(word[0]) i = 0 if sw_out is None: listm.append(["ا"]) i += 1 if word[0] == "oo": listm.append(["او"]) i += 1 while i < len(word): listm.append(self.char_switcher(word[i])) i += 1 if word[len(word) - 1] == "e": listm.append(["ه"]) elif word[len(word) - 1] == "a": listm.append(["ا"]) elif word[len(word) - 1] == "o": listm.append(["و"]) elif word[len(word) - 1] == "u": listm.append(["و"]) return listm def map_switcher(self, ch): switcher = { "a": None, "e": None, "o": None, "u": None, "ee": None, "ou": None } return switcher.get(ch, ch) def make_word(self, chp): word_list = [[]] for char in chp: word_list_temp = [] for tmp_word_list in word_list: for chch in char: tmp = copy.deepcopy(tmp_word_list) tmp.append(chch) word_list_temp.append(tmp) word_list = word_list_temp return word_list def escalation(self, word): tmp = [] i = 0 t = len(word) while i < t - 1: tmp.append(word[i]) if word[i] == word[i + 1]: i += 1 i += 1 if i != t: tmp.append(word[i]) return tmp def char_switcher(self, ch): switcher = { 'a': ["", "ا"], 'c': ["ث", "ص", "ص"], 'h': ["ه", "ح"], 'b': ["ب"], 'p': ["پ"], 't': ["ت", "ط"], 's': ["س", "ص", "ث"], 'j': ["ج"], 'ch': ["چ"], 'kh': ["خ"], 'q': ["ق", "غ"], 'd': ["د"], 'z': ["ز", "ذ", "ض", "ظ"], 'r': ["ر"], 'zh': ["ژ"], 'sh': ["ش"], 'gh': [",ق", "غ"], 'f': ["ف"], 'k': ["ک"], 'g': ["گ"], 'l': ["ل"], 'm': ["م"], 'n': ["ن"], 'v': ["و"], 'aa': ["ا"], 'ee': ["ی"], 'oo': ["و"], 'ou': ["و"], 'i': ["ی"], 'y': ["ی"], ' ': [""], 'w': ["و"], 'e': ["", "ه"], 'o': ["", "و"] } return switcher.get(ch, "") class DictatioinNormalizer: def __init__(self, zwnj_database_path='./Parsivar/resource/normalizer/N_cctt.txt', compound_table_path='./Parsivar/resource/normalizer/Normalizer_WrongCompound.txt'): self.zwnj_table, self.zwnj_type = self.load_zwnj_database(zwnj_database_path) self.compound_table = self.load_compound_table(compound_table_path) look_up_t_farsi_str = './g2p_resources/look_up_t_farsi_extra_of_ariana_b_punch_EN_extra_of_nevisa.pkl' homograph_t_str = './g2p_resources/list_homograph.pkl' with open(look_up_t_farsi_str, 'rb') as f: self.look_up_t_farsi = pickle.load(f) with open(homograph_t_str, 'rb') as f: self.homograph = pickle.load(f) self.half_space_data = [half_space_group for half_space_group in self.look_up_t_farsi if isinstance(half_space_group, str) and re.findall('\u200c', half_space_group)] + \ [half_space_group for half_space_group in self.homograph if isinstance(half_space_group, str) and re.findall('\u200c', half_space_group)] self.half_space_data_splited = [word_group.split('\u200c') for word_group in self.half_space_data] self.half_space_data_joined = dict() for i in self.half_space_data_splited: for j in range(len(i)): for k in range(j + 1, len(i)): self.half_space_data_joined["".join(i[j:k + 1])] = '\u200c'.join(i[j:k + 1]) def remove_extra_space_zwnj(self, doc_string): extra_regex = re.compile(r'([\u0648\u0624\u062f\u0630\u0631\u0632\u0698\u0627' r'\u0622\u0654\u0621\u0623\u0625\u0060][\u0627\u064b][\u0621\u064b])(\u200c)') doc_string = extra_regex.sub('\1', doc_string) extra_regex = re.compile(r'([\u0030-\u0039])(\u200c)') doc_string = extra_regex.sub('\1', doc_string) extra_regex = re.compile(r'(\u200c)([\u0030-\u0039])') doc_string = extra_regex.sub('\2', doc_string) extra_regex = re.compile(r'([a-z|A-Z])(\u200c)') doc_string = extra_regex.sub('\1', doc_string) extra_regex = re.compile(r'(\u200c)([a-z|A-Z])') doc_string = extra_regex.sub('\2', doc_string) extra_regex = re.compile(r'[\s]{2,}') doc_string = extra_regex.sub(' ', doc_string) extra_regex = re.compile(r'[\u200c]{2,}') doc_string = extra_regex.sub('\u200c', doc_string) extra_regex = re.compile(r'\u200c\s') doc_string = extra_regex.sub(' ', doc_string) extra_regex = re.compile(r'\s\u200c') doc_string = extra_regex.sub(' ', doc_string) return doc_string def correct_compound(self, doc_string, neighborhood): word_list = doc_string.split() if neighborhood > len(word_list): neighborhood = len(word_list) output_list = [] i = 0 while i < len(word_list): for j in range(neighborhood, 0, -1): if i + j < len(word_list): comp = " ".join(word_list[i:i + j]) else: comp = " ".join(word_list[i:]) if comp in self.compound_table: output_list.append(self.compound_table[comp]) i += j break else: output_list.append(word_list[i]) i += 1 return " ".join(output_list) def half_space_corrector(self, string): word_list = string.split() word_list = [word.strip('\u200c') for word in word_list] word_index = 0 result = [] while word_index < len(word_list): a=1 if word_list[word_index] in self.half_space_data_joined and \ word_list[word_index] not in self.look_up_t_farsi and word_list[word_index] not in self.homograph: word_list[word_index] = self.half_space_data_joined[word_list[word_index]] possible_words = [i for i in self.half_space_data_splited if word_list[word_index] == i[0] and word_index + 1 < len(word_list) and word_list[word_index + 1] == i[1]] best_words = [word_list[word_index]] max_match = 0 if possible_words: for j in possible_words: for k in range(len(j)): if k >= len(word_list): break if len(word_list) <= word_index+k or j[k] != word_list[word_index + k]: break if max_match < k: max_match = k best_words = j[:k + 1] else: if word_index + 1 < len(word_list) and (word_list[word_index + 1] in ['ام', 'ات', 'اش', 'مان', 'شان', 'تان', 'ايم', 'ايد', 'اند', 'اي', 'ها', 'هاي', 'هايي', 'هايم', 'هايشان', 'هايتان', 'هايمان', 'هايت', 'هايش', 'هائي' 'تر', 'ترين', 'ي', 'يي'] or word_list[word_index] in ['مي', 'نمي', 'برنمي', 'درمي', 'درنمي']): best_words = [word_list[word_index], word_list[word_index + 1]] max_match = 1 result.append('\u200c'.join(best_words)) word_index += max_match + 1 result = " ".join(result) return result def hamze_corrector(self, string): word_list = string.split() for word in word_list: if word not in self.homograph + self.look_up_t_farsi: pass def load_zwnj_database(self, zwnj_database_file): table_words = dict() words_type = dict() with open(zwnj_database_file, 'r') as f: for line in f: cols = line.split('\t') connection_type = cols[0] incorrect_word = cols[1] correct_word = cols[2].replace(' ', '_') table_words[incorrect_word.replace('\u200c', '^')] = correct_word if connection_type in ["مجزا", "به قبلي", "به بعدي"]: words_type[correct_word.replace('\u200c', '^')] = connection_type return table_words, words_type def is_english_grapheme(self, char): return 'a' < char < 'z' or 'A' < char < 'Z' def is_english_word(self, word): return self.is_english_grapheme(word[0]) def is_punctuation(self, char): return char in ['-', '=', '~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+', '[', '{', ']', '}', ':', ';', '\'', '\"', ',', '<', '.', '>', '/', '?', '|', '\\', '÷', '٬', '٫', '٪', '×', '،', '»', '«', '؛', '؟', '…', '”', 'ˈ'] def join_words_without_rules(self, docstring): words = docstring.split() correct_docstring = [] for ind in range(len(words)): word = words[ind] table_word = word.replace('\u200c', '^') if table_word in self.zwnj_type.keys(): if self.zwnj_type[table_word] == "مجزا": correct_docstring.append(self.zwnj_table[table_word].replace('_', ' ')) elif self.zwnj_type[table_word] == "به قبلي": if correct_docstring and not self.is_english_word(correct_docstring[-1]) \ and not self.is_punctuation(correct_docstring[-1][-1]): correct_docstring.append('\u200c' + self.zwnj_table[table_word]) else: correct_docstring.append(self.zwnj_table[table_word]) elif self.zwnj_type[table_word] == "به بعدي": if ind < len(words) - 1 and not self.is_english_word(words[ind + 1]) \ and not self.is_punctuation(words[ind + 1][0]): correct_docstring.append(self.zwnj_table[table_word] + '\u200c') else: correct_docstring.append(self.zwnj_table[table_word]) else: correct_docstring.append(word) return " ".join(correct_docstring) def load_compound_table(self, compound_table_path): compound_table = dict() with open(compound_table_path, 'r') as f: for line in f: firstcol, secondcol = line.split('\t') compound_table[firstcol] = secondcol return compound_table class TimeNormalizer: def __init__(self, spanning=2): self.numberNorm = NumberNormalizer() self.spanning = spanning def normalize_time(self, doc_string): time_regex = re.compile(r'([0-9]|0[0-9]|1[0-9]|2[0-4])[\s]*([:\-])[\s]*([0-5][\d]|60)[\']?' r'[\s]*(([:\-])[\s]*([0-5][\d]|60)[\"]?)?') keywords_time1 = ['وقت', 'مدت', 'زمان'] keywords_time2 = ['عصر', 'شب', 'غروب', 'ظهر', 'صبح', 'ساعت', 'بامداد'] while time_regex.search(doc_string): match = time_regex.search(doc_string) if ngram_lookup(doc_string, match.start(), match.end(), keywords_time1, self.spanning) and \ not ngram_lookup(doc_string, match.start(), match.end(), keywords_time2, self.spanning): if match[6] is not None: hour = int(match[1]) minute = int(match[3]) second = int(match[6]) doc_string = self.time_duration_to_text_sec(doc_string, match.start(), match.end(), hour, minute, second) else: hour = int(match[1]) minute = int(match[3]) doc_string = self.time_duration_to_text(doc_string, match.start(), match.end(), hour, minute) else: if match[6] is not None: hour = int(match[1]) minute = int(match[3]) second = int(match[6]) doc_string = self.time_to_text_sec(doc_string, match.start(), match.end(), hour, minute, second) else: hour = int(match[1]) minute = int(match[3]) doc_string = self.time_to_text(doc_string, match.start(), match.end(), hour, minute) return doc_string def time_duration_to_text_sec(self, doc_string, start, end, hour, minute, second): string = self.numberNorm.convert(hour) + ' ساعت وَ ' + self.numberNorm.convert(minute) + ' دقیقه وَ ' + \ self.numberNorm.convert(second) + ' ثانیه' doc_string = doc_string[:start] + string + doc_string[end:] return doc_string def time_duration_to_text(self, doc_string, start, end, hour, minute): string = self.numberNorm.convert(hour) + ' ساعت وَ ' + self.numberNorm.convert(minute) + ' دقیقه ' doc_string = doc_string[:start] + string + doc_string[end:] return doc_string def time_to_text_sec(self, doc_string, start, end, hour, minute, second): string = self.numberNorm.convert(hour) + ' وَ ' + self.numberNorm.convert(minute) + ' وَ ' + \ self.numberNorm.convert(second) + ' ثانیه ' doc_string = doc_string[:start] + string + doc_string[end:] return doc_string def time_to_text(self, doc_string, start, end, hour, minute): string = self.numberNorm.convert(hour) + ' وَ' \ ' ' + self.numberNorm.convert(minute) + ' دقیقه ' doc_string = doc_string[:start] + string + doc_string[end:] return doc_string def ngram_lookup(doc_string, start, end, word_list, span): result = False for spanns in range(1, span): if any(x in word_list for x in doc_string[:start].rsplit(maxsplit=spanns + 1)[-spanns:]): result = True if any(x in word_list for x in doc_string[end:].split(maxsplit=spanns + 1)[:spanns]): result = True return result