import re


class Tokenizer:

    def tokenize_words(self, doc_string):
        # Split on whitespace, then strip the zero-width non-joiner (U+200C,
        # common in Persian text) from token edges, dropping empty tokens.
        tokens = (x.strip("\u200c") for x in doc_string.strip().split())
        return [x for x in tokens if x]

    def tokenize_sentences(self, doc_string):
        # Shield numbers with a placeholder so the '.' in a float is not
        # mistaken for a sentence terminator.
        num_pattern = r"[-+]?\d*\.\d+|\d+"
        nums_list = re.findall(num_pattern, doc_string)
        doc_string = re.sub(num_pattern, 'floatingpointnumber', doc_string)

        # Mark clause boundaries with a double tab: Latin/Persian sentence
        # terminators, ':' at a line end, then Latin and Persian semicolons.
        doc_string = re.sub(r'([!\.\?؟]+)[\n]*', self.add_tab, doc_string)
        doc_string = re.sub(r':\n', self.add_tab, doc_string)
        doc_string = re.sub(r';[\n]*', self.add_tab, doc_string)
        doc_string = re.sub(r'؛[\n]*', self.add_tab, doc_string)

        # Clear whitespace-only lines (the original omitted re.MULTILINE, so
        # this pattern could only ever match an entirely blank document), then
        # treat any remaining line break as a boundary as well.
        doc_string = re.sub(r'^[\s\r\n]*$', '', doc_string, flags=re.MULTILINE)
        doc_string = re.sub(r'[\n\r]+', self.add_tab, doc_string)

        # Put the shielded numbers back, one placeholder at a time, in order.
        for number in nums_list:
            doc_string = re.sub('floatingpointnumber', number, doc_string, count=1)

        sentences = doc_string.split('\t\t')
        return [x for x in sentences if x and not x.isspace()]

    def add_tab(self, match):
        # Used as a re.sub callback: take the matched punctuation, drop the
        # spaces and newlines around it, then pad it with a leading space and
        # a trailing double tab as the clause delimiter.
        mystring = match.group()
        mystring = mystring.strip(' ').strip('\n')
        return " " + mystring + "\t\t"

    def tokenize_long_clauses(self, string_list, max_char_num):
        # POSTagger is never imported in the original file; the module path
        # below is an assumption about the project layout, imported lazily so
        # the rest of the tokenizer stays usable without the tagger.
        from pos_tagger import POSTagger

        # Split clauses longer than max_char_num characters, preferring to cut
        # just after the first verb ('V' tag) found past the length limit.
        result = []
        pos_tagger = POSTagger('./g2p_resources/model/perpos.model')
        for string in string_list:
            tmp = string
            while len(tmp) > max_char_num:
                first_space_ind = tmp.find(' ', max_char_num)
                if first_space_ind == -1:
                    # No space past the limit, so the remainder cannot be cut.
                    break
                second_space_ind = tmp.find(' ', 2 * max_char_num)
                if second_space_ind == -1:
                    second_space_ind = len(tmp)
                pos = [pos_tagger.parse([word])[0]
                       for word in tmp[first_space_ind:second_space_ind].split()]
                try:
                    # Cut right after the first verb among the next 20 tokens.
                    word_ind = [x[1] for x in pos[:20]].index('V')
                    split_index = (sum(len(i[0]) for i in pos[:word_ind + 1])
                                   + word_ind + 1 + first_space_ind)
                except ValueError:
                    # No verb found: fall back to the first space past the limit.
                    split_index = first_space_ind
                result.append(tmp[:split_index])
                tmp = tmp[split_index:]
            result.append(tmp)

        return result
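

# A minimal usage sketch. The Persian sample string below is illustrative
# only, and tokenize_long_clauses is skipped here because it needs the POS
# tagger model at './g2p_resources/model/perpos.model', which may be absent.
if __name__ == "__main__":
    tokenizer = Tokenizer()
    sample = "سلام! حال شما چطور است؟ دمای هوا 25.5 درجه است."

    # Whitespace word tokenization, with U+200C trimmed from token edges.
    print(tokenizer.tokenize_words(sample))

    # Clause splitting on ! . ? ؟ ; ؛ and line breaks; 25.5 survives intact.
    for clause in tokenizer.tokenize_sentences(sample):
        print(clause)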