PuoBERTaSpace

Sleeping

File size: 4,357 Bytes

# Refactored Streamlit App for Setswana NER using HuggingFace Models

import streamlit as st
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
import pandas as pd
import spacy

# -------------------- PAGE CONFIG --------------------
# Use the full browser width for the app layout.
# NOTE(review): Streamlit's docs have historically required set_page_config to be
# the first st.* command on a page — keep it ahead of every other Streamlit call.
st.set_page_config(layout="wide")

# -------------------- UI HEADER --------------------
# The logo is referenced by a relative path, so it is resolved against the
# process working directory — presumably the app's root folder; confirm on deploy.
st.image("logo_transparent_small.png", use_column_width="always")
st.title("Demo for Setswana PuoBERTa NER Model")

# -------------------- MODEL SELECTION --------------------
# Checkpoints offered in the sidebar; only one is listed at the moment.
model_list = ['dsfsi/PuoBERTa-NER']
# The chosen checkpoint name is later handed to load_ner_pipeline().
model_checkpoint = st.sidebar.radio("Select NER Model", model_list)
# Fixed (not user-selectable) aggregation strategy forwarded to the
# token-classification pipeline.
aggregation_strategy = "simple"

# -------------------- TEXT INPUT --------------------
# The selected label is read by get_input_text() to decide which input widget to show.
input_method = st.radio("Select Input Method", ['Example Text', 'Write Text', 'Upload CSV'])

def get_input_text():
    """Render the widget for the selected input method and return its text.

    Reads the module-level ``input_method`` choice and shows exactly one of:
    a select box of example sentences, a free-text area, or a CSV uploader
    followed by a column picker.

    Returns:
        str: The text to analyse. For a CSV upload this is every non-null
        value of the chosen column, cast to ``str`` and joined with newlines.
        An empty string is returned when no file has been uploaded yet, when
        the uploaded file cannot be parsed, or when ``input_method`` is not
        one of the known options.
    """
    if input_method == 'Example Text':
        examples = [
            "Moso ono mo dikgang tsa ura le ura, o tsoga le Oarabile Moamogwe go simolola ka 05:00 - 10:00"
        ]
        return st.selectbox("Example Sentences", examples)
    elif input_method == 'Write Text':
        return st.text_area("Enter text", height=128)
    elif input_method == 'Upload CSV':
        uploaded = st.file_uploader("Upload CSV", type="csv")
        if uploaded:
            # An empty, malformed or non-UTF-8 upload would otherwise raise
            # out of read_csv and replace the whole app with a traceback.
            try:
                df = pd.read_csv(uploaded)
            except (
                pd.errors.EmptyDataError,
                pd.errors.ParserError,
                UnicodeDecodeError,
            ) as exc:
                st.error(f"Could not read the uploaded CSV: {exc}")
                return ""
            col = st.selectbox("Choose column with text", df.columns)
            return "\n".join(df[col].dropna().astype(str).tolist())
    return ""

# Calling get_input_text() here is what renders the input widget for the selected
# method; the returned string is what the "Run NER" step analyses.
input_text = get_input_text()

# -------------------- MODEL LOADING --------------------
@st.cache_resource
def load_ner_pipeline(model_checkpoint, strategy):
    """Build a HuggingFace token-classification pipeline for a checkpoint.

    Wrapped in ``st.cache_resource`` so the tokenizer and model are not
    reloaded for repeated calls with the same arguments.

    Args:
        model_checkpoint: Model identifier or path passed to ``from_pretrained``.
        strategy: Value forwarded as the pipeline's ``aggregation_strategy``.

    Returns:
        The pipeline object created by ``transformers.pipeline``.
    """
    # Keyword arguments are evaluated left to right, so the tokenizer is
    # still loaded before the model weights.
    return pipeline(
        "token-classification",
        tokenizer=AutoTokenizer.from_pretrained(model_checkpoint),
        model=AutoModelForTokenClassification.from_pretrained(model_checkpoint),
        aggregation_strategy=strategy,
    )

# -------------------- ENTITY MERGE --------------------
def merge_entities(output):
    """Fuse directly adjacent entity fragments that share a label.

    A fragment is folded into the span before it when its ``start`` equals
    that span's ``end`` and both have the same ``entity_group``. The fused
    span concatenates ``word``, extends ``end`` and keeps every other field
    (including ``score``) from the first fragment of the run.

    Args:
        output: Sequence of dicts, in text order, each with at least the
            keys ``word``, ``entity_group``, ``start`` and ``end``.

    Returns:
        A new list of new dicts. Neither ``output`` nor the dicts inside it
        are modified.
    """
    merged = []
    for ent in output:
        prev = merged[-1] if merged else None
        touches_prev = (
            prev is not None
            and ent["start"] == prev["end"]
            and ent["entity_group"] == prev["entity_group"]
        )
        if touches_prev:
            prev["word"] += ent["word"]
            prev["end"] = ent["end"]
        else:
            # Shallow-copy so that extending a span later never writes
            # through to the caller's dict.
            merged.append(dict(ent))
    return merged

# -------------------- RUN NER --------------------
# The button is always rendered; inference only runs when it was clicked and
# the input contains something other than whitespace.
run_clicked = st.button("Run NER")
if run_clicked and input_text.strip():
    with st.spinner("Running NER..."):
        ner = load_ner_pipeline(model_checkpoint, aggregation_strategy)
        output = ner(input_text)
        entities = merge_entities(output)

        if not entities:
            st.info("No entities recognized in the input.")
        else:
            # Tabular view of the merged spans.
            entity_table = pd.DataFrame(entities)[['word', 'entity_group', 'score', 'start', 'end']]
            st.subheader("Recognized Entities")
            st.dataframe(entity_table)

            # Build the dict that displaCy accepts in manual mode; the "PER"
            # label is shown as "PERSON", all other labels are passed through.
            highlighted_spans = [
                {
                    "start": ent["start"],
                    "end": ent["end"],
                    "label": "PERSON" if ent["entity_group"] == "PER" else ent["entity_group"],
                }
                for ent in entities
            ]
            displacy_doc = {"text": input_text, "ents": highlighted_spans, "title": None}

            rendered = spacy.displacy.render(displacy_doc, style="ent", manual=True, minify=True)
            # Wrap the markup in a horizontally scrollable container.
            wrapped = f"<style>mark.entity {{ display: inline-block; }}</style><div style='overflow-x:auto;'>{rendered}</div>"
            st.markdown(wrapped, unsafe_allow_html=True)

# -------------------- AUTHORS, CITATION & FEEDBACK --------------------
# Footer with the author list and the BibTeX entry for the PuoBERTa paper.
# The ```bibtex fence is closed explicitly so the citation does not rely on the
# renderer's end-of-document rule, and so any markdown appended after it is
# not absorbed into the code block.
st.markdown("""
---  
### 📚 Authors & Citation

**Authors**  
Vukosi Marivate, Moseli Mots'Oehli, Valencia Wagner, Richard Lastrucci, Isheanesu Dzingirai  

**Citation**  
```bibtex
@inproceedings{marivate2023puoberta,
  title   = {PuoBERTa: Training and evaluation of a curated language model for Setswana},
  author  = {Vukosi Marivate and Moseli Mots'Oehli and Valencia Wagner and Richard Lastrucci and Isheanesu Dzingirai},
  year    = {2023},
  booktitle= {Artificial Intelligence Research. SACAIR 2023. Communications in Computer and Information Science},
  url= {https://link.springer.com/chapter/10.1007/978-3-031-49002-6_17},
  keywords = {NLP},
  preprint_url = {https://arxiv.org/abs/2310.09141},
  dataset_url = {https://github.com/dsfsi/PuoBERTa},
  software_url = {https://huggingface.co/dsfsi/PuoBERTa}
}
```
""")