🇿🇦 South African Language Identification
Multilingual Language Detection for South African Languages
# coding=utf-8 import streamlit as st import pandas as pd import matplotlib.pyplot as plt import altair as alt from transformers import pipeline import fasttext from huggingface_hub import hf_hub_download import json import os import re import string import base64 from typing import List, Tuple, Dict, Optional import logging # Configure page st.set_page_config( page_title="South African Language Identification", page_icon="πΏπ¦", layout="wide", initial_sidebar_state="expanded" ) # Custom CSS for better styling st.markdown(""" """, unsafe_allow_html=True) # Constants and Configuration MODEL_CONFIGS = { "za-bert": { "name": "ZA-BERT", "model_id": "dsfsi/za-lid-bert", "description": "Lightweight BERT-based model trained on South African languages", "recommended": True }, "xlmr-large": { "name": "XLM-R Large", "model_id": "dsfsi/za-xlmrlarge-lid", "description": "XLM-RoBERTa Large model fine-tuned for SA languages" }, "serengeti": { "name": "Serengeti", "model_id": "dsfsi/za-serengeti-lid", "description": "Afri-centric model with superior performance" }, "afriberta": { "name": "AfriBERTa", "model_id": "dsfsi/za-afriberta-lid", "description": "African-focused BERT model" }, "afro-xlmr": { "name": "Afro-XLM-R", "model_id": "dsfsi/za-afro-xlmr-base-lid", "description": "African-centric XLM-RoBERTa model" }, "afrolm": { "name": "AfroLM", "model_id": "dsfsi/za-afrolm-lid", "description": "African language model" } } # Utility Functions @st.cache_data def load_language_names() -> Dict[str, str]: """Load language names mapping""" try: with open("assets/language_names.json", 'r') as f: return json.load(f) except FileNotFoundError: # Fallback mapping for common South African languages return { "afr": "Afrikaans", "eng": "English", "nso": "Northern Sotho", "sot": "Sesotho", "ssw": "Siswati", "tsn": "Setswana", "tso": "Xitsonga", "ven": "Tshivenda", "xho": "isiXhosa", "zul": "isiZulu", "nbl": "isiNdebele", "und": "Undetermined" } @st.cache_resource def load_model(model_key: str): 
"""Load and cache models""" try: config = MODEL_CONFIGS[model_key] model = pipeline("text-classification", model=config["model_id"]) return model except Exception as e: st.error(f"Error loading model {model_key}: {str(e)}") return None def preprocess_text(text: str) -> str: """Clean and preprocess input text""" if not text or not text.strip(): return "" # Basic cleaning text = text.replace('\n', ' ') # Remove problematic characters replacement_map = {ord(c): ' ' for c in ':β’#{|}' + string.digits} text = text.translate(replacement_map) # Normalize whitespace text = re.sub(r'\s+', ' ', text).strip() return text def get_language_name(label: str, lang_names: Dict[str, str]) -> str: """Get language name from label""" if '_' in label: iso_code = label.split('_')[0] else: iso_code = label return lang_names.get(iso_code, label) def predict_language(text: str, model, lang_names: Dict[str, str]) -> Tuple[str, float, str]: """Predict language for given text""" if not model or not text.strip(): return "und", 0.0, "Undetermined" try: processed_text = preprocess_text(text) if not processed_text: return "und", 0.0, "Undetermined" result = model(processed_text) if isinstance(result, list) and len(result) > 0: prediction = result[0] label = prediction['label'] confidence = prediction['score'] language_name = get_language_name(label, lang_names) return label, confidence, language_name return "und", 0.0, "Undetermined" except Exception as e: st.error(f"Prediction error: {str(e)}") return "und", 0.0, "Error" def create_confidence_plot(language: str, confidence: float) -> plt.Figure: """Create a confidence visualization""" fig, ax = plt.subplots(figsize=(10, 2)) # Colors primary_color = "#ff6b35" bg_color = "#f8f9fa" text_color = "#2c3e50" # Create horizontal bar ax.barh([0], [confidence], color=primary_color, height=0.6, alpha=0.8) ax.barh([0], [1-confidence], left=[confidence], color=bg_color, height=0.6, alpha=0.3) # Styling ax.set_xlim(0, 1) ax.set_ylim(-0.5, 0.5) 
ax.set_xlabel("Confidence Score", fontsize=12, color=text_color) ax.set_title(f"Language: {language} (Confidence: {confidence:.3f})", fontsize=14, fontweight='bold', color=text_color, pad=20) # Remove y-axis and spines ax.set_yticks([]) ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) ax.spines['left'].set_visible(False) # Add confidence text ax.text(confidence/2, 0, f"{confidence:.1%}", ha='center', va='center', fontweight='bold', color='white') plt.tight_layout() return fig def render_paper_info(): """Render paper information and citation""" st.markdown("### π Research Paper") col1, col2 = st.columns([2, 1]) with col1: st.markdown(""" **"From N-grams to Pre-trained Multilingual Models For Language Identification"** *Authors: Thapelo Andrew Sindane, Vukosi Marivate* Published in: Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities (2024) This research investigates N-gram models and large pre-trained multilingual models for Language Identification across 11 South African languages, showing that Serengeti performs best across all model types. 
""") with col2: st.markdown(""" **Links:** - [π Paper](https://aclanthology.org/2024.nlp4dh-1.22/) - [π€ HuggingFace](https://huggingface.co/dsfsi) - [π» GitHub](https://github.com/dsfsi/za-lid) """) def render_citation(): """Render BibTeX citation""" citation = """@inproceedings{sindane-marivate-2024-n, title = "From N-grams to Pre-trained Multilingual Models For Language Identification", author = "Sindane, Thapelo Andrew and Marivate, Vukosi", editor = "HΓ€mΓ€lΓ€inen, Mika and Γhman, Emily and Miyagawa, So and Alnajjar, Khalid and Bizzoni, Yuri", booktitle = "Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities", month = nov, year = "2024", address = "Miami, USA", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2024.nlp4dh-1.22/", doi = "10.18653/v1/2024.nlp4dh-1.22", pages = "229--239" }""" st.code(citation, language='bibtex') def main(): # Header st.markdown("""
Multilingual Language Detection for South African Languages
{model_config['description']}
Detected Language
Confidence
Language Code