"""Part-of-speech tagging through either a Stanford (NLTK) or a Wapiti model."""
import os
import platform
import re

try:
    from nltk.tag.stanford import StanfordPOSTagger
except ImportError:
    # nltk is only needed by the "stanford" backend.  Keep the module
    # importable for wapiti-only installs, mirroring the lazy wapiti import
    # in POSTagger.__init__; the error is raised when "stanford" is requested.
    StanfordPOSTagger = None

# Matches runs of ASCII letters.  A token is "all Latin" when nothing is left
# after every such run has been removed.
_LATIN_RUN = re.compile('[a-zA-Z]*')

# Backends that POSTagger knows how to build and query.
_SUPPORTED_MODELS = ("stanford", "wapiti")


class POSTagger():
    """Tag token lists with a Stanford or Wapiti POS model.

    Tokens made only of ASCII letters are always tagged ``"FW"``, whatever
    the underlying model says.
    """

    def __init__(self, stanford_postagger_model=None, wapiti_postagger_model=None,
                 jar_tagger_path=None,
                 jdk_variable_path="C:/Program Files/Java/jdk1.8.0_121/bin/java.exe",
                 tagging_model="wapiti"):
        """Resolve model paths and build the selected tagger.

        Args:
            stanford_postagger_model: Path of the Stanford model file.  When
                ``None``, ``resource/postagger/NC_model`` next to this file.
            wapiti_postagger_model: Path of the Wapiti model file.  When
                ``None``, ``resource/postagger/UPC_full_model_wapiti`` next to
                this file.
            jar_tagger_path: Path of the Stanford tagger jar.  When ``None``,
                ``resource/postagger/stanford-postagger.jar`` next to this
                file.
            jdk_variable_path: Value stored in the ``JAVAHOME`` environment
                variable when the Stanford backend is used.
            tagging_model: ``"stanford"`` or ``"wapiti"``.  Ignored on
                Windows, where the Stanford backend is always used.

        Raises:
            ValueError: If the effective backend is not a supported one.
            ImportError: If the Stanford backend is selected but nltk is not
                installed, or the Wapiti backend is selected but the
                ``wapiti`` package is not installed.
        """
        # Windows always runs the Stanford backend, regardless of the argument.
        if platform.system() == "Windows":
            self.tagging_model = "stanford"
        else:
            self.tagging_model = tagging_model

        # Fail loudly instead of leaving self.tagger unset, which used to make
        # parse() return an empty list for every input.
        if self.tagging_model not in _SUPPORTED_MODELS:
            raise ValueError(
                "Unsupported tagging_model %r; expected one of %s"
                % (self.tagging_model, ", ".join(_SUPPORTED_MODELS)))

        # Directory containing this file, with a trailing slash.
        self.dir_path = os.path.dirname(os.path.realpath(__file__)) + "/"

        if stanford_postagger_model is None:
            self.stanford_postagger_model = self.dir_path + "resource/postagger/NC_model"
        else:
            self.stanford_postagger_model = stanford_postagger_model

        if jar_tagger_path is None:
            self.jar_tagger_path = self.dir_path + 'resource/postagger/stanford-postagger.jar'
        else:
            self.jar_tagger_path = jar_tagger_path

        if wapiti_postagger_model is None:
            self.wapiti_postagger_model = self.dir_path + "resource/postagger/UPC_full_model_wapiti"
        else:
            self.wapiti_postagger_model = wapiti_postagger_model

        if self.tagging_model == "stanford":
            if StanfordPOSTagger is None:
                raise ImportError(
                    "nltk is required for tagging_model='stanford'")
            # NOTE(review): presumably NLTK reads JAVAHOME to locate the java
            # binary -- confirm against the installed NLTK version.
            os.environ['JAVAHOME'] = jdk_variable_path
            self.tagger = StanfordPOSTagger(model_filename=self.stanford_postagger_model,
                                            path_to_jar=self.jar_tagger_path,
                                            encoding='utf-8',
                                            java_options='-mx5000m')
        else:
            # Imported lazily so the dependency is only needed for this backend.
            from wapiti import Model
            self.tagger = Model(model=self.wapiti_postagger_model)

    def is_all_latin(self, word):
        """Return True when ``word`` contains nothing but ASCII letters.

        The empty string also yields True, because nothing remains after the
        letter runs have been removed.
        """
        return not _LATIN_RUN.sub('', word)

    def parse(self, token_list):
        """Tag a sequence of tokens.

        Args:
            token_list: Iterable of token strings.  It is materialised once,
                so one-shot iterables such as generators are supported.

        Returns:
            A list of ``(token, tag)`` tuples.  Empty input returns an empty
            list without calling the underlying tagger.  For the Stanford
            backend the token is rebuilt from the tagger output; for the
            Wapiti backend it is the input token.
        """
        # The wapiti path walks the tokens twice (join, then enumerate), so a
        # generator must be materialised first or the second pass is empty.
        tokens = list(token_list)
        if not tokens:
            return []

        if self.tagging_model == "stanford":
            return self._parse_stanford(tokens)
        if self.tagging_model == "wapiti":
            return self._parse_wapiti(tokens)
        # Only reachable if tagging_model was changed after construction.
        return []

    def _parse_stanford(self, tokens):
        """Tag ``tokens`` with the Stanford tagger and split word from tag."""
        tagged_tuples = []
        for element in self.tagger.tag(tokens):
            # Glue the tuple parts back together and drop the underscores left
            # at either end, then take the text after the last '/' as the tag.
            # NOTE(review): presumably the model emits "word/TAG" inside the
            # first tuple field -- confirm against real tagger output.
            joined = '_'.join(element).strip('_')
            word, _, tag = joined.rpartition('/')
            word = word.strip('/')
            if self.is_all_latin(word):
                tagged_tuples.append((word, "FW"))
            else:
                tagged_tuples.append((word, tag))
        return tagged_tuples

    def _parse_wapiti(self, tokens):
        """Tag ``tokens`` with the Wapiti model, one token per input line."""
        raw_labels = self.tagger.label_sequence("\n".join(tokens)).decode('utf-8')
        labels = raw_labels.strip().split('\n')
        # Labels are matched to tokens by position; an IndexError here means
        # fewer labels than tokens came back for a non-Latin token.
        return [(token, u"FW" if self.is_all_latin(token) else labels[index])
                for index, token in enumerate(tokens)]