Spaces:

vonewman
/

ner_app

Runtime error

App Files Files Community

vonewman committed on Oct 25, 2023

Commit

cb0039e

1 Parent(s): 967c296

Update app.py

Browse files

Files changed (1) hide show

app.py +114 -15

app.py CHANGED Viewed

@@ -1,24 +1,123 @@
 import streamlit as st
-from transformers import pipeline
-# Create a widget for uploading the file
-uploaded_file = st.file_uploader("Téléchargez un document (PDF, TXT, CSV, JSON)", type=["pdf", "txt", "csv", "json"])
-# Load the DistilBERT model for named entity recognition
-nlp = pipeline("ner", model="distilbert-base-cased",
-               aggregation_strategy="simple")
-if uploaded_file is not None:
-    # Read the file contents
-    text = uploaded_file.read()
-    # Use the natural language processing model for named entity recognition
-    entities = nlp(text)
-    st.subheader("Entités nommées détectées dans le document :")
-    for entity in entities:
-        st.write(f"Texte : {entity['word']}, Étiquette : {entity['entity']}")
-    # You can also display other information about the detected entities if needed.

 import streamlit as st
+import pandas as pd
+import numpy as np
+import re
+import json
+import base64
+import uuid
+import transformers
+from datasets import Dataset,load_dataset, load_from_disk
+from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer
+# Set the page title and page icon for the app.
+st.set_page_config(
+    page_title="Named Entity Recognition Tagger", page_icon="📘"
+)
+def convert_df(df:pd.DataFrame):
+    """Serialize *df* as CSV text without the index column and return it as UTF-8 bytes."""
+    csv_text = df.to_csv(index=False)
+    return csv_text.encode('utf-8')
+#@st.cache
+def convert_json(df:pd.DataFrame):
+    """Return *df* as a JSON string keyed by row index.
+
+    The pandas output is parsed and re-serialized with the standard ``json``
+    module, so the returned text uses ``json.dumps`` default formatting.
+    """
+    rows_by_index = json.loads(df.to_json(orient="index"))
+    return json.dumps(rows_by_index)
+# Main heading of the page.
+st.title("📘Named Entity Recognition Tagger")
+# Single source of truth for the checkpoint used by both the model and the tokenizer.
+_MODEL_ID = "vonewman/xlm-roberta-base-finetuned-wolof"
+# st.cache is deprecated; st.cache_resource is its replacement for objects such as
+# models. Fall back to the legacy decorator only on Streamlit releases that lack it.
+_cache_model = (
+    st.cache_resource
+    if hasattr(st, "cache_resource")
+    else st.cache(allow_output_mutation=True)
+)
+@_cache_model
+def load_model():
+    """Load the token-classification model and its tokenizer, cached by Streamlit.
+
+    Returns:
+        A ``(trainer, model, tokenizer)`` tuple. The ``Trainer`` wrapper is kept
+        in the tuple so existing callers that unpack three values keep working.
+    """
+    model = AutoModelForTokenClassification.from_pretrained(_MODEL_ID)
+    trainer = Trainer(model=model)
+    tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)
+    return trainer, model, tokenizer
+# Maps the class index predicted by the model to its entity tag name.
+# NOTE(review): hard-coded here; presumably mirrors the label order the model was
+# fine-tuned with — confirm against the model's config.
+id2tag = {
+    0: 'O',
+    1: 'B-LOC',
+    2: 'B-PER',
+    3: 'I-PER',
+    4: 'B-ORG',
+    5: 'I-DATE',
+    6: 'B-DATE',
+    7: 'I-ORG',
+    8: 'I-LOC',
+}
+def tag_sentence(text:str):
+    """Tag every token of *text* with its most probable entity label.
+
+    Reads the module-level ``tokenizer``, ``model`` and ``id2tag``, so those
+    names must be bound before this function is called.
+
+    Returns:
+        A DataFrame with one row per token id produced by the tokenizer and the
+        columns ``word`` (the decoded token), ``tag`` (label looked up in
+        ``id2tag``) and ``probability`` (softmax score of that label as a
+        percentage rounded to two decimals).
+    """
+    # Imported locally because the file has no top-level torch import; the
+    # tokenizer call below already requires PyTorch via return_tensors="pt".
+    import torch
+    # convert our text to a tokenized sequence
+    inputs = tokenizer(text, truncation=True, return_tensors="pt")
+    # inference only: no need to record operations for autograd
+    with torch.no_grad():
+        outputs = model(**inputs)
+    # first output for the single input sequence (presumably the logits),
+    # converted to probabilities with softmax over dimension 1
+    probs = outputs[0][0].softmax(1)
+    token_ids = inputs['input_ids'][0]
+    # keep, for every position, the tag with the highest probability
+    word_tags = [
+        (
+            tokenizer.decode(token_ids[i].item()),
+            id2tag[tagid.item()],
+            np.round(probs[i][tagid].item() * 100, 2),
+        )
+        for i, tagid in enumerate(probs.argmax(axis=1))
+    ]
+    return pd.DataFrame(word_tags, columns=['word', 'tag', 'probability'])
+# Input form: one sentence (up to 250 characters) plus a submit button.
+with st.form(key='my_form'):
+    x1 = st.text_input(label='Enter a sentence:', max_chars=250)
+    submit_button = st.form_submit_button(label='🏷️ Create tags')
+if submit_button:
+    # Input made only of whitespace strips down to the empty string.
+    if not x1.strip():
+        st.error('Please enter a non-empty sentence.')
+    # A single run of word characters is a lone word, which is rejected.
+    elif re.match(r'\A\s*\w+\s*\Z', x1):
+        st.error("Please enter a sentence with more than one word.")
+    else:
+        st.markdown("### Tagged Sentence")
+        st.header("")
+        # The first element (a Trainer instance) is not used here; bind it to a
+        # throwaway name so the imported Trainer class is not overwritten.
+        # model and tokenizer stay module-level because tag_sentence reads them
+        # as globals.
+        _trainer, model, tokenizer = load_model()
+        results = tag_sentence(x1)
+        # The .csv and .txt downloads share the same CSV payload; build it once.
+        csv_bytes = convert_df(results)
+        cs, c1, c2, c3, cLast = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])
+        with c1:
+            csvbutton = st.download_button(label="📥 Download .csv", data=csv_bytes, file_name="results.csv", mime='text/csv', key='csv')
+        with c2:
+            # File extension matches the ".txt" promised by the button label.
+            textbutton = st.download_button(label="📥 Download .txt", data=csv_bytes, file_name="results.txt", mime='text/plain', key='text')
+        with c3:
+            jsonbutton = st.download_button(label="📥 Download .json", data=convert_json(results), file_name="results.json", mime='application/json', key='json')
+        st.header("")
+        c1, c2, c3 = st.columns([1, 3, 1])
+        with c2:
+            st.table(results.style.background_gradient(subset=['probability']).format(precision=2))
+# Empty headers used as vertical spacing before the "About" section.
+st.header("")
+st.header("")
+st.header("")
+# "About" panel. The text names the model and tags this file actually uses
+# (see load_model and id2tag) instead of an unrelated checkpoint.
+with st.expander("ℹ️ - About this app", expanded=True):
+    st.write(
+        """
+-   The **Named Entity Recognition Tagger** app is a tool that performs named entity recognition.
+-   The available entities are: *location* (LOC), *person* (PER), *organization* (ORG) and *date* (DATE).
+-   The app uses the [vonewman/xlm-roberta-base-finetuned-wolof](https://huggingface.co/vonewman/xlm-roberta-base-finetuned-wolof) model, a fine-tuned XLM-RoBERTa base checkpoint.
+-   The model uses a **SentencePiece subword tokenizer**. Each sentence is first tokenized.
+-   For more info regarding the data science part, check this [post](https://towardsdatascience.com/named-entity-recognition-with-deep-learning-bert-the-essential-guide-274c6965e2d?sk=c3c3699e329e45a8ed93d286ae04ef10).
+       """
+    )