File size: 3,233 Bytes
eb57aa1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import os
from nltk.tag.stanford import StanfordPOSTagger
import re

class POSTagger():
    """Part-of-speech tagger backed by either a Stanford or a Wapiti model.

    The backend is selected once, at construction time, and stored in
    ``self.tagging_model`` (``"stanford"`` or ``"wapiti"``). ``parse`` then
    tags a list of tokens and relabels every token made up solely of ASCII
    letters as ``"FW"``.
    """

    # Matches strings consisting only of ASCII letters; the empty string
    # matches as well. ``\Z`` anchors at the very end of the string, so a
    # trailing newline is not tolerated.
    _LATIN_WORD_RE = re.compile(r'[a-zA-Z]*\Z')

    def __init__(self,
                 stanford_postagger_model=None,
                 wapiti_postagger_model=None,
                 jar_tagger_path=None,
                 jdk_variable_path="C:/Program Files/Java/jdk1.8.0_121/bin/java.exe",
                 tagging_model="wapiti"):
        """Resolve model locations and build the tagger for the chosen backend.

        Args:
            stanford_postagger_model: Path of the Stanford model file. When
                ``None``, ``resource/postagger/NC_model`` next to this file
                is used.
            wapiti_postagger_model: Path of the Wapiti model file. When
                ``None``, ``resource/postagger/UPC_full_model_wapiti`` next
                to this file is used.
            jar_tagger_path: Path of the Stanford tagger jar. When ``None``,
                ``resource/postagger/stanford-postagger.jar`` next to this
                file is used.
            jdk_variable_path: Value exported as the ``JAVAHOME`` environment
                variable when the Stanford backend is selected.
            tagging_model: ``"stanford"`` or ``"wapiti"``. Ignored on
                Windows, where ``"stanford"`` is always used.

        Raises:
            ValueError: If the effective backend is neither ``"stanford"``
                nor ``"wapiti"``.
        """
        import platform

        # On Windows the Stanford backend is used whatever was requested.
        if platform.system() == "Windows":
            self.tagging_model = "stanford"
        else:
            self.tagging_model = tagging_model

        # Fail fast: an unknown backend would otherwise leave ``self.tagger``
        # unset and make ``parse`` silently return an empty list.
        if self.tagging_model not in ("stanford", "wapiti"):
            raise ValueError(
                "Unsupported tagging_model {!r}; expected 'stanford' or "
                "'wapiti'".format(self.tagging_model))

        self.dir_path = os.path.dirname(os.path.realpath(__file__)) + "/"

        if stanford_postagger_model is None:
            self.stanford_postagger_model = self.dir_path + "resource/postagger/NC_model"
        else:
            self.stanford_postagger_model = stanford_postagger_model

        if jar_tagger_path is None:
            self.jar_tagger_path = self.dir_path + 'resource/postagger/stanford-postagger.jar'
        else:
            self.jar_tagger_path = jar_tagger_path

        if wapiti_postagger_model is None:
            self.wapiti_postagger_model = self.dir_path + "resource/postagger/UPC_full_model_wapiti"
        else:
            self.wapiti_postagger_model = wapiti_postagger_model

        if self.tagging_model == "stanford":
            # NOTE(review): presumably read by NLTK to locate the java
            # binary -- confirm against the NLTK version in use.
            os.environ['JAVAHOME'] = jdk_variable_path

            self.tagger = StanfordPOSTagger(model_filename=self.stanford_postagger_model,
                                            path_to_jar=self.jar_tagger_path,
                                            encoding='utf-8',
                                            java_options='-mx5000m')
        else:
            # Imported lazily so the wapiti package is only required when
            # this backend is actually selected.
            from wapiti import Model
            self.tagger = Model(model=self.wapiti_postagger_model)

    def is_all_latin(self, word):
        """Return True when ``word`` contains nothing but ASCII letters.

        The empty string also yields ``True``. Digits, punctuation,
        whitespace and non-ASCII letters make the result ``False``.
        """
        return self._LATIN_WORD_RE.match(word) is not None

    def parse(self, token_list):
        """Tag every token of ``token_list`` with a part-of-speech label.

        Args:
            token_list: List of token strings forming one sentence.

        Returns:
            A list of ``(token, tag)`` tuples. Tokens made up solely of
            ASCII letters get the tag ``"FW"`` instead of the backend's
            label. An empty ``token_list`` yields ``[]`` without calling
            the backend, and so does a ``tagging_model`` that is neither
            ``"stanford"`` nor ``"wapiti"``.
        """
        tagged_tuples = []
        if not token_list:
            # Nothing to tag: skip the (potentially expensive) backend call.
            return tagged_tuples

        if self.tagging_model == "stanford":
            for element in self.tagger.tag(token_list):
                # Glue the pieces of the element together, then treat the
                # text after the last "/" as the tag and everything before
                # it as the word.
                # NOTE(review): presumably the model emits "word/TAG" rather
                # than the separator NLTK expects -- confirm.
                joined = '_'.join(element).strip("_")
                head, _, tag = joined.rpartition('/')
                word = head.strip('/')
                if self.is_all_latin(word):
                    tagged_tuples.append((word, "FW"))
                else:
                    tagged_tuples.append((word, tag))

        elif self.tagging_model == "wapiti":
            # The tokens are handed over one per line; the returned bytes
            # are decoded and split into one label per line.
            sent_line = "\n".join(token_list)
            labels = self.tagger.label_sequence(sent_line).decode('utf-8')
            labels = labels.strip().split('\n')
            for index, token in enumerate(token_list):
                if self.is_all_latin(token):
                    tagged_tuples.append((token, u"FW"))
                else:
                    tagged_tuples.append((token, labels[index]))
        return tagged_tuples