|
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer |
|
import gradio as gr |
|
from spacy import displacy |
|
|
|
# Hub checkpoint id shared by the tokenizer and the model below.
_CHECKPOINT = "lirondos/anglicisms-spanish-mbert"

tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(_CHECKPOINT)

# "ner" pipeline built from the explicitly loaded model/tokenizer pair;
# `infer` calls this object directly on the user's text.
nlp = pipeline(task="ner", model=model, tokenizer=tokenizer)
|
|
|
# Sample payload with the same keys that `infer` hands to displaCy in manual
# mode: the raw text, a list of labelled spans, and a title.
# NOTE: the name is spelled "diplacy" (sic); kept as-is so existing references
# to this module-level name keep working.
diplacy_dict_template = dict(
    text="But Google is starting from behind.",
    ents=[dict(start=4, end=10, label="ORG")],
    title=None,
)
|
|
|
|
|
# Background style for every tag highlighted in the output, keyed by label.
_TAG_COLORS = {
    "B-ENG": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
    "I-ENG": "linear-gradient(90deg, #99bfff, #a57cf0)",
    "B-OTHER": "linear-gradient(90deg, #79d0a5, #f6e395)",
    "I-OTHER": "linear-gradient(90deg, #f79a76, #fb6d6d)",
}

# displaCy renderer options: the labels to show and the colour of each one.
_DISPLACY_OPTIONS = {"ents": list(_TAG_COLORS), "colors": _TAG_COLORS}


def infer(input_text):
    """Tag borrowings in *input_text* and return them as highlighted HTML.

    Args:
        input_text: The text to run through the module-level ``nlp`` pipeline.

    Returns:
        The HTML markup produced by ``displacy.render`` (full-page, "ent"
        style), with one highlighted span per item the pipeline returned.
    """
    # Each pipeline hit carries "start", "end" and "entity"; displaCy's manual
    # mode wants the same offsets under "start"/"end" and the tag as "label".
    spans = [
        {"start": hit["start"], "end": hit["end"], "label": hit["entity"]}
        for hit in nlp(input_text)
    ]
    manual_doc = {"text": input_text, "ents": spans, "title": None}

    # jupyter=False: without it displaCy auto-detects a notebook kernel and
    # then displays the markup itself instead of returning it, which would
    # leave this function with nothing to hand back to the HTML output.
    return displacy.render(
        manual_doc,
        style="ent",
        page=True,
        manual=True,
        jupyter=False,
        options=_DISPLACY_OPTIONS,
    )
|
|
|
|
|
# Markdown blurb for the demo page (it is passed to the interface as its
# `description`). Built line by line; the empty entries produce the blank
# line between the two paragraphs and the trailing newline.
description = "\n".join(
    (
        "This space is a demo for the paper [Detecting Unassimilated Borrowings in Spanish:",
        "An Annotated Corpus and Approaches to Modeling](https://arxiv.org/pdf/2203.16169.pdf)",
        "",
        "The goal of the underlying model is to detect foreign words, e.g. anglicisms, in spanish texts.",
        "In general it has two types of tags for foreign words: *ENG* and *OTHER*. The authors used [BIO-tagging](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)),",
        "which is why in practice you will see a *B-* or *I-* in front of the tags.",
        "",
    )
)
|
|
|
# Sample sentences passed to the interface as its `examples`.
_EXAMPLE_SENTENCES = [
    "Buscamos data scientist para proyecto de machine learning.",
    "Las fake news sobre la celebrity se reprodujeron por los 'mass media' en prime time.",
    "Me gusta el cine noir y el anime.",
]

# One text input, one HTML output; `infer` maps the former to the latter.
demo = gr.Interface(
    fn=infer,
    inputs=gr.Text(),
    outputs=gr.HTML(),
    title="Borrowing Detection Español",
    description=description,
    examples=_EXAMPLE_SENTENCES,
)

# Launch unconditionally at module level, exactly as the script always has.
demo.launch()
|
|