# NOTE: removed non-Python scrape residue (HuggingFace Spaces page header,
# commit hashes, and line-number gutter) that made this file unparseable.
# Refactored Streamlit App for Setswana NER using HuggingFace Models
import streamlit as st
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
import pandas as pd
import spacy
# -------------------- PAGE CONFIG --------------------
st.set_page_config(layout="wide")
# -------------------- UI HEADER --------------------
# NOTE(review): `use_column_width` is deprecated in newer Streamlit releases
# in favor of `use_container_width` — confirm against the pinned Streamlit version.
st.image("logo_transparent_small.png", use_column_width="always")
st.title("Demo for Setswana PuoBERTa NER Model")
# -------------------- MODEL SELECTION --------------------
# Only one checkpoint today; the sidebar radio keeps the UI ready for more.
model_list = ['dsfsi/PuoBERTa-NER']
model_checkpoint = st.sidebar.radio("Select NER Model", model_list)
# Aggregation strategy forwarded to the HF token-classification pipeline.
aggregation_strategy = "simple"
# -------------------- TEXT INPUT --------------------
input_method = st.radio("Select Input Method", ['Example Text', 'Write Text', 'Upload CSV'])
def get_input_text():
    """Render the widget(s) for the chosen input method and return the text.

    Depending on the `input_method` radio selection, yields the picked
    example sentence, the free-form text-area contents, or the newline-joined
    rows of a chosen column from an uploaded CSV.  Returns an empty string
    when no CSV file has been uploaded yet.
    """
    if input_method == 'Example Text':
        example_sentences = [
            "Moso ono mo dikgang tsa ura le ura, o tsoga le Oarabile Moamogwe go simolola ka 05:00 - 10:00"
        ]
        return st.selectbox("Example Sentences", example_sentences)
    if input_method == 'Write Text':
        return st.text_area("Enter text", height=128)
    if input_method == 'Upload CSV':
        csv_file = st.file_uploader("Upload CSV", type="csv")
        if csv_file:
            frame = pd.read_csv(csv_file)
            text_column = st.selectbox("Choose column with text", frame.columns)
            return "\n".join(frame[text_column].dropna().astype(str).tolist())
    # No CSV uploaded (or nothing selected yet) — analyse nothing.
    return ""
input_text = get_input_text()
# -------------------- MODEL LOADING --------------------
@st.cache_resource
def load_ner_pipeline(model_checkpoint, strategy):
    """Build a HF token-classification pipeline for the given checkpoint.

    Cached by Streamlit across reruns, so the model and tokenizer are
    downloaded and loaded only once per (model_checkpoint, strategy) pair.
    """
    return pipeline(
        "token-classification",
        model=AutoModelForTokenClassification.from_pretrained(model_checkpoint),
        tokenizer=AutoTokenizer.from_pretrained(model_checkpoint),
        aggregation_strategy=strategy,
    )
# -------------------- ENTITY MERGE --------------------
def merge_entities(output):
    """Fuse character-adjacent entities of the same group into single spans.

    The HF pipeline (even with aggregation_strategy="simple") can split one
    surface word into touching fragments.  A fragment whose "start" equals
    the previous fragment's "end" and whose "entity_group" matches is merged
    into the previous span.

    Args:
        output: position-ordered list of entity dicts with at least the
            keys "word", "entity_group", "start" and "end".

    Returns:
        A new list of merged entity dicts.  Unlike the previous version,
        the input list and its dicts are never mutated (the old code
        appended the caller's dicts by reference and then modified them
        in place, corrupting the pipeline output).
    """
    merged = []
    prev = None
    for ent in output:
        if (prev is not None
                and ent["start"] == prev["end"]
                and ent["entity_group"] == prev["entity_group"]):
            # Extend the previous span instead of starting a new one.
            prev["word"] += ent["word"]
            prev["end"] = ent["end"]
        else:
            prev = dict(ent)  # shallow copy so callers' dicts stay untouched
            merged.append(prev)
    return merged
# -------------------- RUN NER --------------------
if st.button("Run NER") and input_text.strip():
    with st.spinner("Running NER..."):
        ner_pipe = load_ner_pipeline(model_checkpoint, aggregation_strategy)
        entities = merge_entities(ner_pipe(input_text))
        if not entities:
            st.info("No entities recognized in the input.")
        else:
            # Tabular view of the recognised spans.
            results = pd.DataFrame(entities)[['word', 'entity_group', 'score', 'start', 'end']]
            st.subheader("Recognized Entities")
            st.dataframe(results)
            # displaCy manual mode: rename "PER" to the colour-coded "PERSON" label.
            ents = [
                {
                    "start": e["start"],
                    "end": e["end"],
                    "label": "PERSON" if e["entity_group"] == "PER" else e["entity_group"],
                }
                for e in entities
            ]
            doc = {"text": input_text, "ents": ents, "title": None}
            rendered = spacy.displacy.render(doc, style="ent", manual=True, minify=True)
            wrapped = f"<style>mark.entity {{ display: inline-block; }}</style><div style='overflow-x:auto;'>{rendered}</div>"
            st.markdown(wrapped, unsafe_allow_html=True)
# -------------------- AUTHORS, CITATION & FEEDBACK --------------------
# Fixes: removed a stray trailing " |" (page-scrape residue) after the call,
# which was a SyntaxError, and closed the previously unterminated ```bibtex
# fence so the citation renders as a code block.
st.markdown("""
---
### 📚 Authors & Citation

**Authors**
Vukosi Marivate, Moseli Mots'Oehli, Valencia Wagner, Richard Lastrucci, Isheanesu Dzingirai

**Citation**
```bibtex
@inproceedings{marivate2023puoberta,
title = {PuoBERTa: Training and evaluation of a curated language model for Setswana},
author = {Vukosi Marivate and Moseli Mots'Oehli and Valencia Wagner and Richard Lastrucci and Isheanesu Dzingirai},
year = {2023},
booktitle= {Artificial Intelligence Research. SACAIR 2023. Communications in Computer and Information Science},
url= {https://link.springer.com/chapter/10.1007/978-3-031-49002-6_17},
keywords = {NLP},
preprint_url = {https://arxiv.org/abs/2310.09141},
dataset_url = {https://github.com/dsfsi/PuoBERTa},
software_url = {https://huggingface.co/dsfsi/PuoBERTa}
}
```
""")