|
import os
|
|
from nltk.tag.stanford import StanfordPOSTagger
|
|
import re
|
|
|
|
class POSTagger:
    """Part-of-speech tagger backed by either a Stanford or a Wapiti model.

    The back-end is selected once, at construction time, and stored in
    ``self.tagger``; :meth:`parse` then dispatches on ``self.tagging_model``.
    Regardless of the back-end, every token that consists solely of ASCII
    letters is given the tag ``"FW"`` instead of the model's own prediction.
    """

    # Back-ends that __init__ knows how to build and parse knows how to query.
    SUPPORTED_MODELS = ("stanford", "wapiti")

    # Matches a string made up only of ASCII letters. The empty string matches
    # too, which mirrors the historical behaviour of ``is_all_latin``.
    _LATIN_ONLY = re.compile(r'[a-zA-Z]*\Z')

    def __init__(self,
                 stanford_postagger_model=None,
                 wapiti_postagger_model=None,
                 jar_tagger_path=None,
                 jdk_variable_path="C:/Program Files/Java/jdk1.8.0_121/bin/java.exe",
                 tagging_model="wapiti"):
        """Resolve resource paths and build the selected tagging back-end.

        Args:
            stanford_postagger_model: Path to the Stanford model file. Defaults
                to ``resource/postagger/NC_model`` next to this module.
            wapiti_postagger_model: Path to the Wapiti model file. Defaults to
                ``resource/postagger/UPC_full_model_wapiti`` next to this
                module.
            jar_tagger_path: Path to the Stanford tagger jar. Defaults to
                ``resource/postagger/stanford-postagger.jar`` next to this
                module.
            jdk_variable_path: Value written to the ``JAVAHOME`` environment
                variable when the Stanford back-end is used.
            tagging_model: ``"stanford"`` or ``"wapiti"``. Ignored on Windows,
                where ``"stanford"`` is always used.

        Raises:
            ValueError: If the effective tagging model is not one of
                ``SUPPORTED_MODELS``.
        """
        import platform

        # On Windows the caller's choice is overridden and the Stanford
        # back-end is always used.
        # NOTE(review): presumably because the wapiti binding is not available
        # on Windows - confirm.
        if platform.system() == "Windows":
            self.tagging_model = "stanford"
        else:
            self.tagging_model = tagging_model

        # Fail fast: an unknown model used to leave ``self.tagger`` undefined
        # and made ``parse`` silently return an empty list for every input.
        if self.tagging_model not in self.SUPPORTED_MODELS:
            raise ValueError(
                "Unsupported tagging_model {!r}; expected one of {}".format(
                    self.tagging_model, ", ".join(self.SUPPORTED_MODELS)))

        # Directory containing this module, with a trailing slash so that the
        # default resource paths below can be built by plain concatenation.
        self.dir_path = os.path.dirname(os.path.realpath(__file__)) + "/"

        if stanford_postagger_model is None:
            self.stanford_postagger_model = self.dir_path + "resource/postagger/NC_model"
        else:
            self.stanford_postagger_model = stanford_postagger_model

        if jar_tagger_path is None:
            self.jar_tagger_path = self.dir_path + 'resource/postagger/stanford-postagger.jar'
        else:
            self.jar_tagger_path = jar_tagger_path

        if wapiti_postagger_model is None:
            self.wapiti_postagger_model = self.dir_path + "resource/postagger/UPC_full_model_wapiti"
        else:
            self.wapiti_postagger_model = wapiti_postagger_model

        if self.tagging_model == "stanford":
            # Process-wide side effect: any existing JAVAHOME value is
            # replaced so the Stanford wrapper can locate the Java binary.
            os.environ['JAVAHOME'] = jdk_variable_path
            self.tagger = StanfordPOSTagger(model_filename=self.stanford_postagger_model,
                                            path_to_jar=self.jar_tagger_path,
                                            encoding='utf-8',
                                            java_options='-mx5000m')
        else:
            # Imported lazily so that the wapiti package is only required when
            # this back-end is actually selected.
            from wapiti import Model
            self.tagger = Model(model=self.wapiti_postagger_model)

    def is_all_latin(self, word):
        """Return True if ``word`` contains nothing but ASCII letters.

        The empty string counts as all-Latin (there is no non-Latin character
        in it), so it also yields True.
        """
        return self._LATIN_ONLY.match(word) is not None

    def parse(self, token_list):
        """Tag a tokenised sentence.

        Args:
            token_list: Sequence of token strings forming one sentence.

        Returns:
            A list of ``(token, tag)`` tuples. Tokens made up solely of ASCII
            letters are tagged ``"FW"``; all other tokens carry the tag
            produced by the back-end. An empty list is returned for empty
            input, and also when ``self.tagging_model`` names no known
            back-end.

        Raises:
            IndexError: With the Wapiti back-end, if the model returns fewer
                tag lines than there are tokens.
        """
        # Nothing to tag: skip the (potentially expensive) back-end call.
        if not token_list:
            return []

        tagged_tuples = []

        if self.tagging_model == "stanford":
            for element in self.tagger.tag(token_list):
                # Each element is an iterable of strings; glue its parts with
                # '_' and treat the text after the last '/' as the tag.
                # NOTE(review): this assumes the model emits "word/TAG" rather
                # than the "word_TAG" form - confirm against the model file.
                joined = '_'.join(element).strip('_')
                word, _, tag = joined.rpartition('/')
                word = word.strip('/')
                if self.is_all_latin(word):
                    tagged_tuples.append((word, "FW"))
                else:
                    tagged_tuples.append((word, tag))

        elif self.tagging_model == "wapiti":
            # The model is fed one token per line and its decoded output is
            # read back as one tag per line, in the same order.
            sent_line = "\n".join(token_list)
            raw_labels = self.tagger.label_sequence(sent_line).decode('utf-8')
            postags = raw_labels.strip().split('\n')
            for i, el in enumerate(token_list):
                if self.is_all_latin(el):
                    tagged_tuples.append((el, u"FW"))
                else:
                    tagged_tuples.append((el, postags[i]))

        return tagged_tuples
|
|
|