# Source: Hugging Face Space "abreza/mana-tts" (running on Zero).
# File size: 50,165 bytes; revision eb57aa1.

from re import sub
from unidecode import unidecode
import copy
import os
from .tokenizer import Tokenizer
from .data_helper import DataHelper
from .token_merger import ClassifierChunkParser
import pickle
import re


class Normalizer:

    def __init__(self,
                 half_space_char='\u200c',
                 statistical_space_correction=False,
                 date_normalizing_needed=True,
                 time_normalizing_needed=True,
                 pinglish_conversion_needed=False,
                 number2text_needed=True,
                 half_space_corrector=True,
                 train_file_path="resource/tokenizer/Bijan_khan_chunk.txt",
                 token_merger_path="resource/tokenizer/TokenMerger.pckl"):
        """Store the stage switches and load the resources each stage needs.

        Args:
            half_space_char: Character passed to the token merger; only
                stored when ``statistical_space_correction`` is enabled.
            statistical_space_correction: Use the trained token merger
                instead of the dictionary/regex based spacing rules.
            date_normalizing_needed: Create a ``DateNormalizer``.
            time_normalizing_needed: Create a ``TimeNormalizer``.
            pinglish_conversion_needed: Create a ``PinglishNormalizer``.
            number2text_needed: Let ``normalize`` spell out digit runs.
            half_space_corrector: Let ``normalize`` run the dictation
                normalizer's half-space passes.
            train_file_path: Training corpus for the token merger; used as
                given (it is not joined with the module directory).
            token_merger_path: Pickled merger model, relative to the module
                directory.
        """
        self.time_normalizing_needed = time_normalizing_needed
        self.dir_path = os.path.dirname(os.path.realpath(__file__)) + "/"

        # Word-joining dictionaries for 1-, 2- and 3-token windows.
        self.dic1_path = self.dir_path + 'resource/normalizer/Dic1_new.txt'
        self.dic2_path = self.dir_path + 'resource/normalizer/Dic2_new.txt'
        self.dic3_path = self.dir_path + 'resource/normalizer/Dic3_new.txt'
        self.dic1 = self.load_dictionary(self.dic1_path)
        self.dic2 = self.load_dictionary(self.dic2_path)
        self.dic3 = self.load_dictionary(self.dic3_path)

        self.tokenizer = Tokenizer()
        self.dictaition_normalizer = DictatioinNormalizer()
        self.statistical_space_correction = statistical_space_correction
        self.date_normalizing_needed = date_normalizing_needed
        self.pinglish_conversion_needed = pinglish_conversion_needed
        self.number2text_needed = number2text_needed
        self.half_space_corrector = half_space_corrector
        self.data_helper = DataHelper()
        self.token_merger = ClassifierChunkParser()

        # Each helper is built only when its own stage is enabled, so the
        # default configuration no longer loads the Pinglish dictionaries
        # just because date normalization is on.
        if self.date_normalizing_needed:
            self.date_normalizer = DateNormalizer()
        if self.pinglish_conversion_needed:
            self.pinglish_conversion = PinglishNormalizer()

        if self.time_normalizing_needed:
            self.time_normalizer = TimeNormalizer()

        if self.statistical_space_correction:
            self.token_merger_path = self.dir_path + token_merger_path
            self.train_file_path = train_file_path
            self.half_space_char = half_space_char

            # Prefer a previously saved model; otherwise train one and cache it.
            # NOTE(review): when neither file exists, token_merger_model stays
            # undefined and normalize() will fail later with AttributeError.
            if os.path.isfile(self.token_merger_path):
                self.token_merger_model = self.data_helper.load_var(self.token_merger_path)
            elif os.path.isfile(self.train_file_path):
                self.token_merger_model = self.token_merger.train_merger(self.train_file_path, test_split=0)
                self.data_helper.save_var(self.token_merger_path, self.token_merger_model)

    def load_dictionary(self, file_path):
        """Read a space-separated two-column file into a ``dict``.

        Every line is split on single spaces; the first field is the key and
        the second field the value, both stripped of surrounding whitespace.
        Lines with fewer than two fields (blank lines, for example) are
        skipped instead of raising ``IndexError``.

        Args:
            file_path: Path of a UTF-8 encoded text file.

        Returns:
            dict mapping first column to second column.
        """
        dictionary = {}
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                fields = line.split(' ')
                if len(fields) < 2:
                    continue
                dictionary[fields[0].strip()] = fields[1].strip()
        return dictionary

    def replace_abbrv(self, text):
        """Expand a fixed list of space-delimited abbreviations.

        The replacements are plain ``str.replace`` calls applied one after
        another in the order listed below.
        """
        expansions = (
            ("( ص )", "( صلي‌الله‌عليه‌وآله )"),
            ("( ع )", "( عليه‌السلام )"),
            ("( س )", "( سلام‌الله )"),
            ("( ره )", "( رحمه‌الله‌عليه )"),
            ("( قده )", "( قدس‌سره )"),
            ("( رض )", "( رضي‌الله‌عنه )"),
            (" ج ا ا ", " جمهوري اسلامي ايران "),
            (" ج اا ", " جمهوري اسلامي ايران "),
            (" ج‌اا ", " جمهوري اسلامي ايران "),
            (" ج ا ايران ", " جمهوري اسلامي ايران "),
            (" ج اايران ", " جمهوري اسلامي ايران "),
            (" ج‌اايران ", " جمهوري اسلامي ايران "),
            (" صص ", " صفحات "),
            (" ه ق ", " هجري قمري "),
            (" ه ش ", " هجري شمسي "),
            (" ق م ", " قبل از ميلاد "),
            # Second pass over the same abbreviation: it catches an occurrence
            # whose leading space was consumed by the previous match.
            (" صص ", " صفحات "),
            (" الخ ", " الي‌آخر "),
        )
        for short_form, long_form in expansions:
            text = text.replace(short_form, long_form)
        return text

    def replace_special_symbols(self, doc_string):
        """Spell out single-code-point Arabic ligature symbols as words.

        Each symbol is replaced by its expansion padded with one space on
        both sides.
        """
        ligature_words = (
            ("﷼", " ریال "),
            ("ﷴ", " محمد "),
            ("ﷺ", " صلي‌الله‌عليه‌وآله‌وسلم "),
            ("ﷲ", " الله "),
            ("ﷻ", " جَل جلاله "),
            ("ﷱ", " قلي "),
            ("ﷳ", " صلي "),
            ("ﷰ", " اكبر "),
            ("ﷵ", " صلي‌الله‌عليه‌وآله‌وسلم "),
            ("ﷶ", " رسول "),
            ("ﷷ", " عليه‌السلام "),
            ("ﷸ", " و سلم "),
            ("﷽", " بسم‌الله‌الرحمن‌الرحيم "),
        )
        expanded = doc_string
        # Keys are single literal characters and the expansions contain no
        # backslashes, so a plain replace matches the regex substitution.
        for symbol, words in ligature_words:
            expanded = expanded.replace(symbol, words)
        return expanded

    def sub_alphabets(self, doc_string):
        """Map variant letters, punctuation and digits onto canonical forms.

        The rules below are applied in order with ``re.sub``:

        * presentation forms and look-alike letters become one base letter,
        * ``¬`` becomes a zero-width non-joiner,
        * bullet, comma, question-mark and percent variants are unified,
        * Extended and Arabic-Indic digits become ASCII digits,
        * runs of spaces and of newlines are collapsed, and the directional
          marks U+200E / U+200F are replaced by a space.

        Diacritics and tatweel are left untouched.

        Args:
            doc_string: Text to normalize.

        Returns:
            The rewritten text.
        """
        substitutions = (
            # --- letters -------------------------------------------------
            (r"ء", r"ئ"),
            (r"ﺁ|آ", r"آ"),
            (r"ٲ|ٱ|إ|ﺍ|أ", r"ا"),
            (r"ﺐ|ﺏ|ﺑ", r"ب"),
            (r"ﭖ|ﭗ|ﭙ|ﺒ|ﭘ", r"پ"),
            (r"ﭡ|ٺ|ٹ|ﭞ|ٿ|ټ|ﺕ|ﺗ|ﺖ|ﺘ", r"ت"),
            (r"ﺙ|ﺛ", r"ث"),
            (r"ﺝ|ڃ|ﺠ|ﺟ", r"ج"),
            # "ڃ" is already consumed by the rule above; kept for parity.
            (r"ڃ|ﭽ|ﭼ", r"چ"),
            (r"ﺢ|ﺤ|څ|ځ|ﺣ", r"ح"),
            (r"ﺥ|ﺦ|ﺨ|ﺧ", r"خ"),
            (r"ڏ|ډ|ﺪ|ﺩ", r"د"),
            # NOON INITIAL FORM (U+FEE7) used to be listed here and was
            # turned into zal; it now lives in the noon rule below.
            (r"ﺫ|ﺬ", r"ذ"),
            (r"ڙ|ڗ|ڒ|ڑ|ڕ|ﺭ|ﺮ", r"ر"),
            (r"ﺰ|ﺯ", r"ز"),
            (r"ﮊ", r"ژ"),
            (r"ݭ|ݜ|ﺱ|ﺲ|ښ|ﺴ|ﺳ", r"س"),
            (r"ﺵ|ﺶ|ﺸ|ﺷ", r"ش"),
            (r"ﺺ|ﺼ|ﺻ", r"ص"),
            (r"ﺽ|ﺾ|ﺿ|ﻀ", r"ض"),
            (r"ﻁ|ﻂ|ﻃ|ﻄ", r"ط"),
            (r"ﻆ|ﻇ|ﻈ", r"ظ"),
            (r"ڠ|ﻉ|ﻊ|ﻋ", r"ع"),
            (r"ﻎ|ۼ|ﻍ|ﻐ|ﻏ", r"غ"),
            (r"ﻒ|ﻑ|ﻔ|ﻓ", r"ف"),
            (r"ﻕ|ڤ|ﻖ|ﻗ", r"ق"),
            (r"ڭ|ﻚ|ﮎ|ﻜ|ﮏ|ګ|ﻛ|ﮑ|ﮐ|ڪ|ک", r"ك"),
            (r"ﮚ|ﮒ|ﮓ|ﮕ|ﮔ", r"گ"),
            (r"ﻝ|ﻞ|ﻠ|ڵ", r"ل"),
            (r"ﻡ|ﻤ|ﻢ|ﻣ", r"م"),
            (r"ڼ|ﻦ|ﻥ|ﻨ|\uFEE7", r"ن"),
            (r"ވ|ﯙ|ۈ|ۋ|ﺆ|ۊ|ۇ|ۏ|ۅ|ۉ|ﻭ|ﻮ|ؤ", r"و"),
            (r"ﺔ|ﻬ|ھ|ﻩ|ﻫ|ﻪ|ۀ|ە|ة|ہ|\u06C1", r"ه"),
            (r"ﭛ|ﻯ|ۍ|ﻰ|ﻱ|ﻲ|ں|ﻳ|ﻴ|ﯼ|ې|ﯽ|ﯾ|ﯿ|ێ|ے|ى|ی", r"ي"),
            # --- half space ------------------------------------------------
            (r'¬', '\u200c'),
            # --- punctuation -----------------------------------------------
            (r'•|·|●|·|・|∙|｡|ⴰ', r'.'),
            (r',|٬|‚|，', r'،'),
            (r'ʕ', r'؟'),
            (r'٪', r'%'),
            # --- digits ------------------------------------------------------
            (r'۰|٠', r'0'),
            (r'۱|١', r'1'),
            (r'۲|٢', r'2'),
            (r'۳|٣', r'3'),
            (r'۴|٤', r'4'),
            # Arabic-Indic five (U+0665) was missing from this rule.
            (r'۵|\u0665', r'5'),
            (r'۶|٦', r'6'),
            (r'۷|٧', r'7'),
            (r'۸|٨', r'8'),
            (r'۹|٩', r'9'),
            # --- whitespace ------------------------------------------------
            (r'( )+', r' '),
            (r'(\n)+', r'\n'),
            ('\u200e|\u200f| ', ' '),
        )
        result = doc_string
        for pattern, replacement in substitutions:
            result = sub(pattern, replacement, result)
        return result

    def space_correction(self, doc_string):
        """Turn spaces around common prefixes and suffixes into half-spaces.

        Five regex rewrites run in sequence: a line-initial prefix, a prefix
        between spaces, a list of inflectional endings, the participles
        "شده"/"نشده", and a list of derivational suffixes.
        """
        rewrites = (
            (r'^(بی|می|نمی)( )',
             r'\1‌'),
            (r'( )(می|نمی|بی)( )',
             r'\1\2‌'),
            (r'( )(هایی|ها|های|ایی|هایم|هایت|هایش|هایمان|هایتان|هایشان|ات|ان|ین'
             r'|انی|بان|ام|ای|یم|ید|اید|اند|بودم|بودی|بود|بودیم|بودید|بودند|ست)( )',
             r'‌\2\3'),
            (r'( )(شده|نشده)( )',
             r'‌\2‌'),
            (r'( )(طلبان|طلب|گرایی|گرایان|شناس|شناسی|گذاری|گذار|گذاران|شناسان|گیری|پذیری|بندی|آوری|سازی|'
             r'بندی|کننده|کنندگان|گیری|پرداز|پردازی|پردازان|آمیز|سنجی|ریزی|داری|دهنده|آمیز|پذیری'
             r'|پذیر|پذیران|گر|ریز|ریزی|رسانی|یاب|یابی|گانه|گانه‌ای|انگاری|گا|بند|رسانی|دهندگان|دار)( )',
             r'‌\2\3'),
        )
        corrected = doc_string
        for pattern, replacement in rewrites:
            corrected = sub(pattern, replacement, corrected)
        return corrected

    def space_correction_plus1(self, doc_string):
        out_sentences = ''
        for wrd in doc_string.split(' '):
            try:
                out_sentences = out_sentences + ' ' + self.dic1[wrd]
            except KeyError:
                out_sentences = out_sentences + ' ' + wrd
        return out_sentences

    def space_correction_plus2(self, doc_string):
        out_sentences = ''
        wrds = doc_string.split(' ')
        word_len = wrds.__len__()
        if word_len < 2:
            return doc_string
        cnt = 1
        for i in range(0, word_len - 1):
            w = wrds[i] + wrds[i + 1]
            try:
                out_sentences = out_sentences + ' ' + self.dic2[w]
                cnt = 0
            except KeyError:
                if cnt == 1:
                    out_sentences = out_sentences + ' ' + wrds[i]
                cnt = 1
        if cnt == 1:
            out_sentences = out_sentences + ' ' + wrds[i + 1]
        return out_sentences

    def space_correction_plus3(self, doc_string):
        # Dict = {'گفتوگو': 'گفت‌وگو'}
        out_sentences = ''
        wrds = doc_string.split(' ')
        word_len = wrds.__len__()
        if word_len < 3:
            return doc_string
        cnt = 1
        cnt2 = 0
        for i in range(0, word_len - 2):
            w = wrds[i] + wrds[i + 1] + wrds[i + 2]
            try:
                out_sentences = out_sentences + ' ' + self.dic3[w]
                cnt = 0
                cnt2 = 2
            except KeyError:
                if cnt == 1 and cnt2 == 0:
                    out_sentences = out_sentences + ' ' + wrds[i]
                else:
                    cnt2 -= 1
                cnt = 1
        if cnt == 1 and cnt2 == 0:
            out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2]
        elif cnt == 1 and cnt2 == 1:
            out_sentences = out_sentences + ' ' + wrds[i + 2]
        return out_sentences

    def replace_puncs(self, doc_string):
        """Split the text on punctuation and join the clauses with ``' | '``.

        Runs of one repeated punctuation mark are collapsed first. The text
        is then split on every punctuation mark, each clause is stripped,
        and clauses that end up empty are dropped.

        Args:
            doc_string: Text to split.

        Returns:
            The non-empty clauses joined by ``' | '``.
        """
        repeat_pattern = \
            r"((?<!\.)\.(?!\.)|\.{3}\.*|_|،|,|\(|\)|:|\?|!|<|>|\-|;|\[|\]|\{|\}|»|«|\^|'|\\|¡|~|©|،|؟|؛|\")\1+"
        pattern = r'(?<!\.)\.(?!\.)|\.{3}\.*|_|،|,|\(|\)|:|\?|!|<|>|\-|;|\[|\]|\{|\}|»|«|\^|\'|\\|¡|~|©|،|؟|؛|\"'
        # The collapsed text used to be stored under a misspelled name and
        # was never used; it is now what gets split.
        collapsed = re.sub(repeat_pattern, r"\1", doc_string)
        # Strip before filtering so whitespace-only clauses between two marks
        # do not survive as empty entries.
        clauses = [clause.strip() for clause in re.split(pattern, collapsed)]
        return ' | '.join(clause for clause in clauses if clause)

    def split_digit_from_alphabet(self, doc_string):
        """Insert a space wherever a digit touches a non-digit character.

        Whitespace never counts as the non-digit side, so text that is
        already separated is left alone.
        """
        # digit followed by a non-digit, non-space character
        doc_string = re.sub(r'(\d)([^\d\s])', r'\1 \2', doc_string)
        # non-digit, non-space character followed by a digit; the earlier
        # pattern let whitespace match on the left and doubled spaces
        doc_string = re.sub(r'([^\d\s])(\d)', r'\1 \2', doc_string)
        return doc_string

    def normalize(self, doc_string, new_line_elimination=False):
        """Run the enabled normalization stages over ``doc_string``.

        Stages, in order: extra space/ZWNJ removal, letter unification,
        abbreviation and symbol expansion, text cleaning, digit/letter and
        script splitting, optional date and time normalization, optional
        number spelling, spacing correction (statistical or rule based),
        optional Pinglish conversion, punctuation splitting and the optional
        half-space passes of the dictation normalizer.

        Args:
            doc_string: Text to normalize.
            new_line_elimination: Forwarded to ``data_helper.clean_text``.

        Returns:
            The normalized text.
        """
        text = self.dictaition_normalizer.remove_extra_space_zwnj(doc_string)
        text = self.sub_alphabets(text)
        text = self.replace_abbrv(text)
        text = self.replace_special_symbols(text)
        text = self.data_helper.clean_text(text, new_line_elimination).strip()
        text = self.split_digit_from_alphabet(text)
        text = self.split_phoneme_in_persian_and_eng_from_rest(text)
        if self.date_normalizing_needed:
            text = self.date_normalizer.normalize_dates(text)
        if self.time_normalizing_needed:
            text = self.time_normalizer.normalize_time(text)

        if self.number2text_needed:
            text = sub(r'[\u06F0-\u06F90-9]+', lambda m: unidecode(m.group(0)), text)
            number_normalizer = NumberNormalizer()
            # convert() returns '' for an all-zero run, which used to delete
            # the number from the text; spell it as "zero" instead.
            text = sub(r'[0-9]+',
                       lambda m: number_normalizer.convert(m.group(0)) or 'صفر',
                       text)
        if self.statistical_space_correction:
            token_list = text.strip().split()
            token_list = [x.strip("\u200c") for x in token_list if len(x.strip("\u200c")) != 0]
            token_list = self.token_merger.merg_tokens(token_list, self.token_merger_model, self.half_space_char)
            text = " ".join(token_list)
            text = self.data_helper.clean_text(text, new_line_elimination)
        else:
            # Widest window first: 3-token, then 2-token, then 1-token
            # dictionary joins, followed by the regex rules.
            text = self.space_correction_plus3(text)
            text = self.space_correction_plus2(text)
            text = self.space_correction_plus1(text)
            text = self.space_correction(text).strip()
        if self.pinglish_conversion_needed:
            text = self.pinglish_conversion.pingilish2persian(
                self.tokenizer.tokenize_words(text))
        text = self.replace_puncs(text)
        if self.half_space_corrector:
            text = self.dictaition_normalizer.remove_extra_space_zwnj(text)
            text = self.dictaition_normalizer.join_words_without_rules(text)
            text = self.dictaition_normalizer.correct_compound(text, 5)
            text = self.dictaition_normalizer.half_space_corrector(text)

        return text

    def split_phoneme_in_persian_and_eng_from_rest(self, doc_string):
        """Insert a space after a letter that is followed by a non-letter.

        Two passes are made: one for characters in U+0622..U+06F0 and one
        for ASCII letters. Any character outside the respective range counts
        as "the rest", including whitespace.
        """
        doc_string = re.sub(r'([\u0622-\u06f0])([^\u0622-\u06f0])', r'\1 \2', doc_string)
        # The classes used to contain literal '|' and '^' characters, so
        # "a|b" and "a^b" were not split like other punctuation.
        doc_string = re.sub(r'([a-zA-Z])([^a-zA-Z])', r'\1 \2', doc_string)
        return doc_string


class DateNormalizer:
    def __init__(self):
        """Create the number helper and the month / number-word tables."""
        self.nn = NumberNormalizer()
        # Month name -> month number. date_to_text_* index the key list by
        # position, so the insertion order must stay 1..12.
        self.persian_month_dict = {'فروردين': 1, 'ارديبهشت': 2, 'خرداد': 3, 'تير': 4, 'مرداد': 5,
                                   'شهريور': 6, 'مهر': 7, 'آبان': 8, 'آذر': 9, 'دي': 10, 'بهمن': 11, 'اسفند': 12}
        self.christian_month_dict = {'ژانویه': 1, 'فوریه': 2, 'مارس': 3, 'آپریل': 4, 'می': 5, 'ژوئن': 6, 'جولای': 7,
                                     'آگوست': 8, 'سپتامبر': 9, 'اکتبر': 10, 'نوامبر': 11, 'دسامبر': 12}

        # Number word (cardinal or ordinal) -> numeric value.
        self.num_dict = {'چهار': 4, 'سه': 3, 'دو': 2, 'يك': 1, 'يازده': 11, 'سيزده': 13, 'چهارده': 14, 'دوازده': 12,
                         'پانزده': 15, 'شانزده': 16, 'چهارم': 4, 'سوم': 3, 'دوم': 2, 'يكم': 1, 'اول': 1, 'يازدهم': 11,
                         'سيزدهم': 13, 'چهاردهم': 14, 'دوازدهم': 12, 'پانزدهم': 15, 'شانزدهم': 16, 'هفدهم': 17,
                         'هجدهم': 18, 'نوزدهم': 19, 'بيستم': 20, 'چهلم': 40, 'پنجاهم': 50, 'شصتم': 60, 'هفتادم': 70,
                         'نودم': 90, 'سيصدم': 300, 'چهارصدم': 400, 'پانصدم': 500, 'ششصدم': 600, 'هفتصدم': 700,
                         'هشتصدم': 800, 'نهصدم': 900, 'هشتادم': 80, 'هزار': 1000, 'ميليون': 1000000, 'دويست': 200,
                         'ده': 10, 'نه': 9, 'هشت': 8, 'هفت': 7, 'شش': 6, 'پنج': 5, 'هفده': 17, 'هجده': 18, 'نوزده': 19,
                         'بيست': 20, 'سي': 30, 'چهل': 40, 'پنجاه': 50, 'شصت': 60, 'هفتاد': 70, 'نود': 90, 'سيصد': 300,
                         'چهارصد': 400, 'پانصد': 500, 'ششصد': 600, 'هفتصد': 700, 'هشتصد': 800, 'نهصد': 900,
                         'هشتاد': 80, ' ': 0, 'ميليارد': 1000000000, 'صدم': 100, 'هزارم': 1000, 'دويستم': 200,
                         'دهم': 10, 'نهم': 9, 'هشتم': 8, 'هفتم': 7, 'ششم': 6, 'پنجم': 5,
                         # The cardinal for 100 was missing although its
                         # ordinal 'صدم' was present.
                         'صد': 100}

    def find_date_part(self, doc_string):
        """Locate numeric dates in ``doc_string``.

        Args:
            doc_string: Text to scan.

        Returns:
            ``[persian, christian]``: two sorted lists whose entries are
            ``[start, end, day, month, year]``. Month/day-only matches would
            be stored as ``[start, end, day, month]``.
        """
        persian_date_regex = re.compile(r'\b(1[0-4]\d\d|[3-9][2-9])([\s]*[/.\-][\s]*)([1-9]|0[1-9]|1[0-2])'
                                        r'([\s]*[/.\-][\s]*)([1-9]|0[1-9]|1[0-9]|2[0-9]|3[0-1])\b')
        persian_date_regex_rev = re.compile(
            r'\b([1-9]|0[1-9]|1[0-9]|2[0-9]|3[0-1])([\s]*[/.\-][\s]*)([1-9]|0[1-9]|1[0-2])'
            r'([\s]*[/.-][\s]*)(1[0-4][\d][\d]|[3-9][2-9])\b')
        # The month group used to start with the malformed class "[0[1-9]",
        # which also accepted a literal '['.
        persian_date_md_regex = re.compile(r'(?<![/.\-])(0[1-9]|[1-9]|1[0-2])([\s]*[/.\-][\s]*)'
                                           r'([1-9]|0[1-9]|1[0-9]|2[0-9]|3[0-1])[\s]*(?![/.\-])')
        # Three alternatives: day/month/year (groups 1-5), month/day/year
        # (groups 6-10) and year/month/day (groups 11-15).
        christian_date_regex = re.compile(r'\b([1-9]|0[1-9]|1[0-9]|2[0-9]|3[0-1])([\s]*[/.\-][\s]*)'
                                          r'([1-9]|0[1-9]|1[0-2])([\s]*[/.\-][\s]*)(1[5-9][0-9][0-9]|2[0][0-9][0-9])|'
                                          r'([1-9]|0[1-9]|1[0-2])([\s]*[/.\-][\s]*)([1-9]|0[1-9]|1[0-9]|2[0-9]|3[0-1])'
                                          r'([\s]*[/.\-][\s]*)(1[5-9][0-9][0-9]|2[0][0-9][0-9])|'
                                          r'(1[5-9][0-9][0-9]|2[0][0-9][0-9])([\s]*[/.\-][\s]*)([1-9]|0[1-9]|1[0-2])'
                                          r'([\s]*[/.\-][\s]*)([1-9]|0[1-9]|1[0-9]|2[0-9]|3[0-1])\b')
        keywords_date = ['مورخ', 'مورخه', 'تاريخ', 'شمسي', 'ميلادي', 'قمري', 'هجري']

        def overlaps(match, found):
            """Return True when ``match`` shares characters with any entry."""
            return any(max(entry[0], match.start()) < min(entry[1], match.end())
                       for entry in found)

        persian_result = []
        christian_result = []
        for match in persian_date_regex.finditer(doc_string):
            persian_result.append([*match.span(), int(match[5]), int(match[3]), int(match[1])])
        for match in persian_date_regex_rev.finditer(doc_string):
            persian_result.append([*match.span(), int(match[1]), int(match[3]), int(match[5])])

        for match in christian_date_regex.finditer(doc_string):
            if overlaps(match, persian_result):
                continue
            # Always store day, month, year in that order; the first two
            # alternatives used to be stored with the fields scrambled.
            if match[11] is not None:
                christian_result.append([*match.span(), int(match[15]), int(match[13]), int(match[11])])
            elif match[6] is not None:
                christian_result.append([*match.span(), int(match[8]), int(match[6]), int(match[10])])
            elif match[1] is not None:
                christian_result.append([*match.span(), int(match[1]), int(match[3]), int(match[5])])

        for match in persian_date_md_regex.finditer(doc_string):
            # NOTE(review): both lookups take identical arguments, so this
            # condition can never hold and month/day-only dates are never
            # reported. The second call presumably wanted a different
            # keyword list - confirm the intent before changing it.
            if ngram_lookup(doc_string, match.start(), match.end(), keywords_date, 3) and \
                    not ngram_lookup(doc_string, match.start(), match.end(), keywords_date, 3):
                if not overlaps(match, persian_result + christian_result):
                    persian_result.append([*match.span(), int(match[3]), int(match[1])])

        return [sorted(persian_result), sorted(christian_result)]

    def date_to_text_persian(self, doc_string, finded_dates, christian_dates):
        for i in range(len(finded_dates)):
            finded_date = finded_dates[i]
            str_date = ''
            str_date += self.nn.convert_ordinary(finded_date[2]) + ' '
            str_date += list(self.persian_month_dict.keys())[int(finded_date[3]) - 1] + ' '
            if len(finded_date) == 5:
                str_date += self.nn.convert(finded_date[4]) + ' '
            if str_date and str_date[-1] != ' ':
                str_date += ' '
            start_date_index = finded_date[0]
            end_date_index = finded_date[1]
            doc_string = doc_string[:start_date_index] + str_date + doc_string[end_date_index:]
            for j in range(i + 1, len(finded_dates)):
                finded_dates[j][0] += len(str_date) - (end_date_index - start_date_index)
                finded_dates[j][1] += len(str_date) - (end_date_index - start_date_index)
            for j in range(i + 1, len(christian_dates)):
                christian_dates[j][0] += len(str_date) - (end_date_index - start_date_index)
                christian_dates[j][1] += len(str_date) - (end_date_index - start_date_index)
        return doc_string, christian_dates

    def date_to_text_christian(self, doc_string, finded_dates):
        for i in range(len(finded_dates)):
            finded_date = finded_dates[i]
            str_date = ''
            str_date += self.nn.convert_ordinary(finded_date[2]) + ' '
            str_date += list(self.christian_month_dict.keys())[int(finded_date[3]) - 1] + ' '
            if len(finded_date) == 5:
                str_date += self.nn.convert(finded_date[4]) + ' '
            if str_date and str_date[-1] != ' ':
                str_date += ' '
            start_date_index = finded_date[0]
            end_date_index = finded_date[1]
            doc_string = doc_string[:start_date_index] + str_date + doc_string[end_date_index:]
            for j in range(i + 1, len(finded_dates)):
                finded_dates[j][0] += len(str_date) - (end_date_index - start_date_index)
                finded_dates[j][1] += len(str_date) - (end_date_index - start_date_index)
        return doc_string

    def normalize_dates(self, doc_string):
        finded_dates = self.find_date_part(doc_string)
        doc_string, finded_dates[1] = self.date_to_text_persian(doc_string, finded_dates[0], finded_dates[1])
        doc_string = self.date_to_text_christian(doc_string, finded_dates[1])
        return doc_string

    def list2num(self, numerical_section_list):
        value = 1
        for index, el in enumerate(numerical_section_list):
            if self.is_number(el):
                value *= self.num_dict[el]
            else:
                value *= float(el)
        return value

    def convert2num(self, numerical_section_list):
        value = 0
        tmp_section_list = []
        for index, el in enumerate(numerical_section_list):
            if self.is_number(el) or (el.replace('.', '', 1).isdigit()):
                tmp_section_list.append(el)
            elif el == "و":
                value += self.list2num(tmp_section_list)
                tmp_section_list[:] = []
        if len(tmp_section_list) > 0:
            value += self.list2num(tmp_section_list)
            tmp_section_list[:] = []
        if value - int(value) == 0:
            return int(value)
        else:
            return value

    def is_number(self, word):
        return word in self.num_dict

    def find_number_location(self, token_list):
        start_index = 0
        number_section = []
        for i, el in enumerate(token_list):
            if self.is_number(el) or (el.replace('.', '', 1).isdigit()):
                start_index = i
                number_section.append(start_index)
                break

        i = start_index + 1
        while i < len(token_list):
            if token_list[i] == "و" and (i + 1) < len(token_list):
                if self.is_number(token_list[i + 1]) or (token_list[i + 1].replace('.', '', 1).isdigit()):
                    number_section.append(i)
                    number_section.append(i + 1)
                    i += 2
                else:
                    break
            elif self.is_number(token_list[i]) or (token_list[i].replace('.', '', 1).isdigit()):
                number_section.append(i)
                i += 1
            else:
                break
        return number_section

    def normalize_numbers(self, token_list, converted=""):
        for i, el in enumerate(token_list):
            if el.endswith("ین") and self.is_number(el[:-2]):
                token_list[i] = el[:-2]
        finded = self.find_number_location(token_list)
        if len(finded) == 0:
            rest_of_string = " ".join(t for t in token_list)
            return converted + " " + rest_of_string
        else:
            numerical_subsection = [token_list[x] for x in finded]
            numerical_subsection = self.convert2num(numerical_subsection)

            converted = converted + " " + " ".join(x for x in token_list[:finded[0]]) + " " + str(numerical_subsection)

            new_index = finded[-1] + 1
            return self.normalize_numbers(token_list[new_index:], converted)


class NumberNormalizer:
    def __init__(self):
        self.faBaseNum = {1: 'يك', 2: 'دو', 3: 'سه', 4: 'چهار', 5: 'پنج', 6: 'شِش', 7: 'هفت', 8: 'هشت', 9: 'نُه',
                          10: 'دَه', 11: 'يازده', 12: 'دوازده', 13: 'سيزده', 14: 'چهارده', 15: 'پانزده', 16: 'شانزده',
                          17: 'هفده', 18: 'هجده', 19: 'نوزده', 20: 'بيست', 30: 'سي', 40: 'چهل', 50: 'پنجاه', 60: 'شصت',
                          70: 'هفتاد', 80: 'هشتاد', 90: 'نود', 100: 'صد', 200: 'دويست', 300: 'سيصد', 500: 'پانصد'}

        self.faBaseNumKeys = self.faBaseNum.keys()
        self.faBigNum = ["يك", "هزار", "ميليون", "ميليارد"]
        self.faBigNumSize = len(self.faBigNum)

    def split3(self, st):
        """Split a digit string into 3-digit integers, lowest group first.

        A leading group of one or two digits comes last in the result.
        """
        length = len(st)
        head = length % 3
        # Walk backwards from the end in steps of three characters.
        groups = [int(st[stop - 3:stop]) for stop in range(length, head, -3)]
        if head > 0:
            groups.append(int(st[:head]))
        return groups

    def convert(self, st):
        st = str(st)
        if len(st) > 3:
            parts = self.split3(st)
            k = len(parts)
            wparts = []
            for i in range(k):
                p = parts[i]
                if p == 0:
                    continue
                if i == 0:
                    wpart = self.convert(p)
                else:
                    if i < self.faBigNumSize:
                        fa_order = self.faBigNum[i]
                    else:
                        fa_order = ''
                        (d, m) = divmod(i, 3)
                        t9 = self.faBigNum[3]
                        for j in range(d):
                            if j > 0:
                                fa_order += "‌"
                            fa_order += t9
                        if m != 0:
                            if fa_order != '':
                                fa_order = "‌" + fa_order
                            fa_order = self.faBigNum[m] + fa_order
                    wpart = fa_order if i == 1 and p == 1 else self.convert(p) + " " + fa_order
                wparts.append(wpart)
            return "  وَ ".join(reversed(wparts))
        # now assume that n <= 999
        n = int(st)
        if n in self.faBaseNumKeys:
            return self.faBaseNum[n]
        y = n % 10
        d = int((n % 100) / 10)
        s = int(n / 100)
        # print s, d, y
        dy = 10 * d + y
        fa = ''
        if s != 0:
            if s * 100 in self.faBaseNumKeys:
                fa += self.faBaseNum[s * 100]
            else:
                fa += (self.faBaseNum[s] + self.faBaseNum[100])
            if d != 0 or y != 0:
                fa += "  وَ "
        if d != 0:
            if dy in self.faBaseNumKeys:
                fa += self.faBaseNum[dy]
                return fa
            fa += self.faBaseNum[d * 10]
            if y != 0:
                fa += "  وَ "
        if y != 0:
            fa += self.faBaseNum[y]
        return fa

    def convert_ordinary(self, arg):
        if isinstance(arg, int):
            num = arg
            st = str(arg)
        elif isinstance(arg, str):
            num = int(arg)
            st = arg
        else:
            raise TypeError('bad type "%s"' % type(arg))
        if num == 1:
            return 'اول'  # OR 'یکم'fixme
        elif num == 10:
            return 'دهم'
        norm_fa = self.convert(st)
        if len(norm_fa) == 0:
            return ''
        if norm_fa.endswith(u'ی'):
            norm_fa += u'‌ام'
        elif norm_fa.endswith(u'سه'):
            norm_fa = norm_fa[:-1] + u'وم'
        else:
            norm_fa += u'م'
        return norm_fa


class PinglishNormalizer:
    """Convert Pinglish (Persian typed with Latin letters) words to Persian script.

    A word is first looked up in ``en_dict``.  Otherwise it is split into
    Latin units (single letters and two-letter combinations), every unit is
    mapped to its possible Persian spellings, all combinations are generated
    and a combination found in ``fa_dict`` replaces the word.
    """

    # Letters for which ``switcher`` answers None, i.e. the ones that may be
    # the first half of a two-letter unit and need a look-ahead.
    _LOOKAHEAD_LETTERS = frozenset("ckzsgaueo")

    # Two-letter combinations treated as a single unit.
    _DIGRAPHS = frozenset(("ch", "kh", "zh", "sh", "gh", "aa", "ee", "oo", "ou"))

    # Units for which ``map_switcher`` answers None (word-initial vowels).
    _LEADING_VOWELS = frozenset(("a", "e", "o", "u", "ee", "ou"))

    # Extra Persian letter appended when a word ends with one of these units.
    _TRAILING_VOWELS = {"e": "ه", "a": "ا", "o": "و", "u": "و"}

    # Candidate Persian spellings for every Latin unit.
    _CHAR_OPTIONS = {
        'a': ["", "ا"],
        # The original listed "ص" twice; the duplicate slot now carries "س",
        # mirroring the 's' entry.
        'c': ["ث", "س", "ص"],
        'h': ["ه", "ح"],
        'b': ["ب"],
        'p': ["پ"],
        't': ["ت", "ط"],
        's': ["س", "ص", "ث"],
        'j': ["ج"],
        'ch': ["چ"],
        'kh': ["خ"],
        'q': ["ق", "غ"],
        'd': ["د"],
        'z': ["ز", "ذ", "ض", "ظ"],
        'r': ["ر"],
        'zh': ["ژ"],
        'sh': ["ش"],
        # A stray comma inside the first literal (",ق") used to make every
        # 'gh' candidate impossible to find in the dictionary.
        'gh': ["ق", "غ"],
        'f': ["ف"],
        'k': ["ک"],
        'g': ["گ"],
        'l': ["ل"],
        'm': ["م"],
        'n': ["ن"],
        'v': ["و"],
        'aa': ["ا"],
        'ee': ["ی"],
        'oo': ["و"],
        'ou': ["و"],
        'i': ["ی"],
        'y': ["ی"],
        ' ': [""],
        'w': ["و"],
        'e': ["", "ه"],
        'o': ["", "و"]
    }

    def __init__(self):
        """Load the two dictionaries stored next to this module."""
        self.data_helper = DataHelper()
        self.file_dir = os.path.dirname(os.path.realpath(__file__)) + "/"

        self.en_dict_filename = self.file_dir + "resource/tokenizer/enDict"
        self.en_dict = self.data_helper.load_var(self.en_dict_filename)

        self.fa_dict_filename = self.file_dir + "resource/tokenizer/faDict"
        self.fa_dict = self.data_helper.load_var(self.fa_dict_filename)

    def pingilish2persian(self, pinglish_words_list):
        """Convert every word of ``pinglish_words_list`` and join them with spaces.

        The list is updated in place (converted words overwrite the originals)
        and the space-joined result is returned.  Words with no match in
        either dictionary are left unchanged.
        """
        for index, word in enumerate(pinglish_words_list):
            if word in self.en_dict:
                pinglish_words_list[index] = self.en_dict[word]
                continue
            units = self.characterize(word)
            for candidate in self.make_word(self.map_char(units)):
                spelled = ''.join(self.escalation(candidate))
                if spelled in self.fa_dict:
                    # Deliberately no ``break``: as before, the last matching
                    # candidate is the one that is kept.
                    pinglish_words_list[index] = spelled
        return " ".join(pinglish_words_list)

    def characterize(self, word):
        """Split ``word`` into Latin units, merging known two-letter combinations."""
        units = []
        i = 0
        while i < len(word):
            char = word[i]
            sw_out = self.switcher(char)
            if sw_out is not None:
                units.append(sw_out)
                i += 1
                continue
            # ``char`` may open a digraph: peek at the next letter if any.
            digraph = self.esp_check(char, word[i + 1]) if i < len(word) - 1 else None
            if digraph is None:
                units.append(char)
                i += 1
            else:
                units.append(digraph)
                i += 2
        return units

    def switcher(self, ch):
        """Return None when ``ch`` may start a two-letter unit, else ``ch`` itself."""
        return None if ch in self._LOOKAHEAD_LETTERS else ch

    def esp_check(self, char1, char2):
        """Return ``char1 + char2`` if it is a known two-letter unit, else None."""
        pair = char1 + char2
        return pair if pair in self._DIGRAPHS else None

    def map_char(self, word):
        """Map a list of Latin units to a list of candidate-spelling lists.

        A word-initial vowel unit is replaced by a fixed Persian opening, and
        a word-final ``e``/``a``/``o``/``u`` adds one extra trailing letter.
        An empty ``word`` yields an empty list instead of raising IndexError.
        """
        if not word:
            return []
        options = []
        first = word[0]
        start = 0
        if self.map_switcher(first) is None:
            options.append(["ا"])
            start += 1
        if first == "oo":
            options.append(["او"])
            start += 1
        for unit in word[start:]:
            options.append(self.char_switcher(unit))
        tail = self._TRAILING_VOWELS.get(word[-1])
        if tail is not None:
            options.append([tail])
        return options

    def map_switcher(self, ch):
        """Return None when ``ch`` is a word-initial vowel unit, else ``ch`` itself."""
        return None if ch in self._LEADING_VOWELS else ch

    def make_word(self, chp):
        """Return every combination that picks one spelling per position of ``chp``.

        Combinations are ordered with the last position varying fastest.  A
        position with no options makes the result empty.
        """
        candidates = [[]]
        for options in chp:
            candidates = [prefix + [option] for prefix in candidates for option in options]
        return candidates

    def escalation(self, word):
        """Collapse adjacent equal items of ``word`` pairwise and return a new list."""
        collapsed = []
        position = 0
        length = len(word)
        while position < length - 1:
            collapsed.append(word[position])
            # when the neighbour is identical, step over it as well
            position += 2 if word[position] == word[position + 1] else 1
        if position != length:
            collapsed.append(word[position])
        return collapsed

    def char_switcher(self, ch):
        """Return the candidate Persian spellings of unit ``ch``.

        Unknown units return ``""``, which gives ``make_word`` nothing to
        iterate over and therefore produces no candidates for the word.
        """
        options = self._CHAR_OPTIONS.get(ch)
        if options is None:
            return ""
        # hand out a copy so callers cannot alter the shared table
        return list(options)


class DictatioinNormalizer:
    """Dictation-level clean-up: half-space (ZWNJ, U+200C) handling and compound repair.

    The constructor loads two tab-separated tables and two pickled lookup
    collections; the methods use them to strip superfluous ZWNJ/space
    characters, re-join known compounds and attach affixes with a ZWNJ.
    """

    # (pattern, replacement) pairs applied in order by remove_extra_space_zwnj.
    # The replacements are raw strings on purpose: a non-raw '\1' is the
    # control character U+0001, not a reference to group 1.
    # NOTE(review): the `|` inside [a-z|A-Z] is matched literally;
    # presumably [a-zA-Z] was intended - confirm before changing.
    _CLEANUP_RULES = (
        (re.compile(r'([\u0648\u0624\u062f\u0630\u0631\u0632\u0698\u0627'
                    r'\u0622\u0654\u0621\u0623\u0625\u0060][\u0627\u064b][\u0621\u064b])(\u200c)'), r'\1'),
        (re.compile(r'([\u0030-\u0039])(\u200c)'), r'\1'),
        (re.compile(r'(\u200c)([\u0030-\u0039])'), r'\2'),
        (re.compile(r'([a-z|A-Z])(\u200c)'), r'\1'),
        (re.compile(r'(\u200c)([a-z|A-Z])'), r'\2'),
        (re.compile(r'[\s]{2,}'), ' '),
        (re.compile(r'[\u200c]{2,}'), '\u200c'),
        (re.compile(r'\u200c\s'), ' '),
        (re.compile(r'\s\u200c'), ' '),
    )

    # Words that get attached to the preceding word with a ZWNJ.  Every entry
    # is separated by a comma: the original list had two adjacent literals
    # that Python silently fused into one string.
    _ATTACHED_SUFFIXES = frozenset((
        'ام', 'ات', 'اش', 'مان', 'شان', 'تان', 'ايم', 'ايد', 'اند',
        'اي', 'ها', 'هاي', 'هايي', 'هايم', 'هايشان', 'هايتان',
        'هايمان', 'هايت', 'هايش', 'هائي', 'تر', 'ترين', 'ي', 'يي',
    ))

    # Words that get attached to the following word with a ZWNJ.
    _ATTACHED_PREFIXES = frozenset(('مي', 'نمي', 'برنمي', 'درمي', 'درنمي'))

    _PUNCTUATION = frozenset((
        '-', '=', '~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+', '[', '{', ']', '}',
        ':', ';', '\'', '\"', ',', '<', '.', '>', '/', '?', '|', '\\', '÷', '٬', '٫', '٪', '×', '،',
        '»', '«', '؛', '؟', '…', '”', 'ˈ',
    ))

    def __init__(self, zwnj_database_path='./Parsivar/resource/normalizer/N_cctt.txt',
                 compound_table_path='./Parsivar/resource/normalizer/Normalizer_WrongCompound.txt'):
        """Load all lookup tables.

        :param zwnj_database_path: tab-separated ZWNJ table (see load_zwnj_database).
        :param compound_table_path: tab-separated compound table (see load_compound_table).

        All default paths, including the two pickle files, are relative to the
        current working directory.
        """
        self.zwnj_table, self.zwnj_type = self.load_zwnj_database(zwnj_database_path)
        self.compound_table = self.load_compound_table(compound_table_path)
        look_up_t_farsi_str = './g2p_resources/look_up_t_farsi_extra_of_ariana_b_punch_EN_extra_of_nevisa.pkl'
        homograph_t_str = './g2p_resources/list_homograph.pkl'
        # NOTE: unpickling can execute arbitrary code; these files must only
        # ever come from a trusted source.
        with open(look_up_t_farsi_str, 'rb') as f:
            self.look_up_t_farsi = pickle.load(f)
        with open(homograph_t_str, 'rb') as f:
            self.homograph = pickle.load(f)

        # Every string entry of either collection that contains a ZWNJ.
        self.half_space_data = [entry
                                for table in (self.look_up_t_farsi, self.homograph)
                                for entry in table
                                if isinstance(entry, str) and '\u200c' in entry]
        self.half_space_data_splited = [entry.split('\u200c') for entry in self.half_space_data]

        # Map each run of two or more consecutive parts, glued without a
        # separator, to the same run joined with ZWNJ.
        self.half_space_data_joined = dict()
        for parts in self.half_space_data_splited:
            for begin in range(len(parts)):
                for stop in range(begin + 2, len(parts) + 1):
                    window = parts[begin:stop]
                    self.half_space_data_joined["".join(window)] = '\u200c'.join(window)

    def remove_extra_space_zwnj(self, doc_string):
        """Strip ZWNJs next to digits/Latin letters and collapse repeated spaces/ZWNJs.

        The rules run in a fixed order; each rule works on the output of the
        previous one.
        """
        for pattern, replacement in self._CLEANUP_RULES:
            doc_string = pattern.sub(replacement, doc_string)
        return doc_string

    def correct_compound(self, doc_string, neighborhood):
        """Replace word sequences found in ``compound_table`` by their table value.

        At every position the longest window (at most ``neighborhood`` words)
        is tried first; unmatched words are copied through unchanged.
        """
        word_list = doc_string.split()
        total = len(word_list)
        output_list = []
        i = 0
        while i < total:
            # never look past the end: a shorter tail would only repeat the
            # same candidate
            widest = min(neighborhood, total - i)
            for j in range(widest, 0, -1):
                comp = " ".join(word_list[i:i + j])
                if comp in self.compound_table:
                    output_list.append(self.compound_table[comp])
                    i += j
                    break
            else:
                output_list.append(word_list[i])
                i += 1
        return " ".join(output_list)

    def half_space_corrector(self, string):
        """Join words that belong together with a ZWNJ and return the new text.

        For each word, in order:
        * a word equal to a glued form in ``half_space_data_joined`` (and not
          itself present in either lookup collection) is replaced by its
          ZWNJ-joined form;
        * if the word and its successor start a known split entry, the
          longest matching prefix of that entry is joined;
        * otherwise a following suffix word, or a current prefix word, is
          joined to its neighbour.
        """
        word_list = [word.strip('\u200c') for word in string.split()]
        total = len(word_list)
        result = []
        word_index = 0
        while word_index < total:
            current = word_list[word_index]
            if current in self.half_space_data_joined and \
                    current not in self.look_up_t_farsi and current not in self.homograph:
                current = self.half_space_data_joined[current]
                word_list[word_index] = current

            has_next = word_index + 1 < total
            possible_words = [group for group in self.half_space_data_splited
                              if current == group[0] and has_next and word_list[word_index + 1] == group[1]]
            best_words = [current]
            max_match = 0
            if possible_words:
                for group in possible_words:
                    for k, part in enumerate(group):
                        if word_index + k >= total or part != word_list[word_index + k]:
                            break
                        if max_match < k:
                            max_match = k
                            best_words = group[:k + 1]
            elif has_next and (word_list[word_index + 1] in self._ATTACHED_SUFFIXES
                               or current in self._ATTACHED_PREFIXES):
                best_words = [current, word_list[word_index + 1]]
                max_match = 1
            result.append('\u200c'.join(best_words))
            # skip the words that were merged into this one
            word_index += max_match + 1
        return " ".join(result)

    def hamze_corrector(self, string):
        """Placeholder: no correction is implemented; always returns None."""
        # TODO: implement.  The previous body only scanned the words without
        # acting on them.
        return None

    def load_zwnj_database(self, zwnj_database_file):
        """Read the ZWNJ table and return ``(table_words, words_type)``.

        Each line needs at least three tab-separated columns: connection
        type, incorrect word, correct word.  ``table_words`` maps the
        incorrect word to the correct one (spaces replaced by ``_``);
        ``words_type`` maps the correct word to its connection type when that
        type is one of the three recognised values.  In both key sets a ZWNJ
        is written as ``^``.  Lines with fewer than three columns are skipped.
        """
        table_words = dict()
        words_type = dict()
        with open(zwnj_database_file, 'r', encoding='utf-8') as f:
            for line in f:
                # drop the line terminator so it cannot leak into the last column
                cols = line.rstrip('\r\n').split('\t')
                if len(cols) < 3:
                    continue
                connection_type = cols[0]
                incorrect_word = cols[1]
                correct_word = cols[2].replace(' ', '_')
                table_words[incorrect_word.replace('\u200c', '^')] = correct_word
                if connection_type in ["مجزا", "به قبلي", "به بعدي"]:
                    words_type[correct_word.replace('\u200c', '^')] = connection_type
        return table_words, words_type

    def is_english_grapheme(self, char):
        """Return True if ``char`` lies in ``a``-``z`` or ``A``-``Z`` (bounds included)."""
        return 'a' <= char <= 'z' or 'A' <= char <= 'Z'

    def is_english_word(self, word):
        """Return True if ``word`` is non-empty and starts with a Latin letter."""
        return bool(word) and self.is_english_grapheme(word[0])

    def is_punctuation(self, char):
        """Return True if ``char`` is one of the known punctuation characters."""
        return char in self._PUNCTUATION

    def join_words_without_rules(self, docstring):
        """Rewrite words listed in the ZWNJ tables according to their connection type.

        A word whose ``^``-encoded form is a key of ``zwnj_type`` is replaced
        by its ``zwnj_table`` entry (the word itself when there is none) and,
        depending on the type, gets its ``_`` turned into spaces, a leading
        ZWNJ, or a trailing ZWNJ.  The ZWNJ is only added when the neighbour
        exists and is neither a Latin word nor punctuation.  All other words
        are copied unchanged.
        """
        words = docstring.split()
        correct_docstring = []
        for ind, word in enumerate(words):
            table_word = word.replace('\u200c', '^')
            if table_word not in self.zwnj_type:
                correct_docstring.append(word)
                continue
            connection_type = self.zwnj_type[table_word]
            # zwnj_type and zwnj_table are keyed differently, so the entry may
            # be missing from zwnj_table; keep the word rather than crash.
            replacement = self.zwnj_table.get(table_word, word)
            if connection_type == "مجزا":
                correct_docstring.append(replacement.replace('_', ' '))
            elif connection_type == "به قبلي":
                previous = correct_docstring[-1] if correct_docstring else ''
                if previous and not self.is_english_word(previous) \
                        and not self.is_punctuation(previous[-1]):
                    correct_docstring.append('\u200c' + replacement)
                else:
                    correct_docstring.append(replacement)
            elif connection_type == "به بعدي":
                if ind < len(words) - 1 and not self.is_english_word(words[ind + 1]) \
                        and not self.is_punctuation(words[ind + 1][0]):
                    correct_docstring.append(replacement + '\u200c')
                else:
                    correct_docstring.append(replacement)
            else:
                # unknown connection type: never drop the word
                correct_docstring.append(word)
        return " ".join(correct_docstring)

    def load_compound_table(self, compound_table_path):
        """Read a two-column tab-separated file into a ``{first: second}`` dict.

        Blank lines are skipped and the line terminator is not kept in the
        second column.  A non-blank line without exactly two columns raises
        ValueError.
        """
        compound_table = dict()
        with open(compound_table_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.rstrip('\r\n')
                if not line:
                    continue
                firstcol, secondcol = line.split('\t')
                compound_table[firstcol] = secondcol
        return compound_table


class TimeNormalizer:
    """Replace ``H:MM`` / ``H:MM:SS`` style expressions in a text by Persian words."""

    def __init__(self, spanning=2):
        """Keep a number-to-text converter and the neighbourhood size given to ``ngram_lookup``."""
        self.numberNorm = NumberNormalizer()
        self.spanning = spanning

    def normalize_time(self, doc_string):
        """Spell out every time expression found in ``doc_string`` and return the result.

        An expression is worded as a duration when a duration keyword is found
        near it and no clock keyword is; otherwise it is worded as a clock
        time.  The text is searched again from the beginning after every
        replacement until no expression is left.
        """
        pattern = re.compile(r'([0-9]|0[0-9]|1[0-9]|2[0-4])[\s]*([:\-])[\s]*([0-5][\d]|60)[\']?'
                             r'[\s]*(([:\-])[\s]*([0-5][\d]|60)[\"]?)?')
        duration_cues = ['وقت', 'مدت', 'زمان']
        clock_cues = ['عصر', 'شب', 'غروب', 'ظهر', 'صبح', 'ساعت', 'بامداد']

        found = pattern.search(doc_string)
        while found:
            begin, finish = found.start(), found.end()
            is_duration = ngram_lookup(doc_string, begin, finish, duration_cues, self.spanning) and \
                not ngram_lookup(doc_string, begin, finish, clock_cues, self.spanning)
            hour = int(found[1])
            minute = int(found[3])
            # group 6 holds the optional seconds field
            second = None if found[6] is None else int(found[6])

            if is_duration and second is not None:
                doc_string = self.time_duration_to_text_sec(doc_string, begin, finish, hour, minute, second)
            elif is_duration:
                doc_string = self.time_duration_to_text(doc_string, begin, finish, hour, minute)
            elif second is not None:
                doc_string = self.time_to_text_sec(doc_string, begin, finish, hour, minute, second)
            else:
                doc_string = self.time_to_text(doc_string, begin, finish, hour, minute)
            found = pattern.search(doc_string)

        return doc_string

    def time_duration_to_text_sec(self, doc_string, start, end, hour, minute, second):
        """Replace ``doc_string[start:end]`` by an hours/minutes/seconds duration phrase."""
        spoken = ''.join((self.numberNorm.convert(hour), ' ساعت وَ ',
                          self.numberNorm.convert(minute), ' دقیقه وَ ',
                          self.numberNorm.convert(second), ' ثانیه'))
        return doc_string[:start] + spoken + doc_string[end:]

    def time_duration_to_text(self, doc_string, start, end, hour, minute):
        """Replace ``doc_string[start:end]`` by an hours/minutes duration phrase."""
        spoken = ''.join((self.numberNorm.convert(hour), ' ساعت وَ ',
                          self.numberNorm.convert(minute), ' دقیقه '))
        return doc_string[:start] + spoken + doc_string[end:]

    def time_to_text_sec(self, doc_string, start, end, hour, minute, second):
        """Replace ``doc_string[start:end]`` by a clock-time phrase including seconds.

        NOTE(review): unlike ``time_to_text`` no unit word follows the minute
        value here - confirm this is intended.
        """
        spoken = ''.join((self.numberNorm.convert(hour), ' وَ ',
                          self.numberNorm.convert(minute), ' وَ ',
                          self.numberNorm.convert(second), ' ثانیه '))
        return doc_string[:start] + spoken + doc_string[end:]

    def time_to_text(self, doc_string, start, end, hour, minute):
        """Replace ``doc_string[start:end]`` by a clock-time phrase (hour and minutes)."""
        spoken = ''.join((self.numberNorm.convert(hour), ' وَ ',
                          self.numberNorm.convert(minute), ' دقیقه '))
        return doc_string[:start] + spoken + doc_string[end:]


def ngram_lookup(doc_string, start, end, word_list, span):
    """Tell whether a word of ``word_list`` stands next to ``doc_string[start:end]``.

    For every width from 1 up to ``span - 1`` the last ``width``
    whitespace-separated words before ``start`` and the first ``width`` words
    after ``end`` are compared against ``word_list``.  With ``span`` of 1 or
    less nothing is inspected and the result is False.
    """
    before = doc_string[:start]
    after = doc_string[end:]
    for width in range(1, span):
        preceding = before.rsplit(maxsplit=width + 1)[-width:]
        following = after.split(maxsplit=width + 1)[:width]
        if any(token in word_list for token in preceding + following):
            return True
    return False