"""Gradio app: Northern Sotho (Sepedi) to English translation with text metrics.

The module loads the ``dsfsi/nso-en-m2m100-gov`` M2M100 model at import time
and exposes three tools through a Gradio UI: single-text translation with a
comparative metrics report, batch translation of an uploaded ``.txt``/``.csv``
file, and a stand-alone text-metrics analysis.
"""

import os
import re
import shutil
import tempfile
import time
import uuid

import gradio as gr
import pandas as pd
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Model loading
model_name = "dsfsi/nso-en-m2m100-gov"
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)
tokenizer.src_lang = "ns"
model.config.forced_bos_token_id = tokenizer.get_lang_id("en")

# Limits and shared messages
EMPTY_INPUT_MESSAGE = "Please enter Northern Sotho (Sepedi) text."
SUPPORTED_EXTENSIONS = (".txt", ".csv")
MAX_UPLOAD_BYTES = 5 * 1024 * 1024  # matches the "Max 5MB" label in the UI
MAX_BATCH_SIZE = 10
MAX_TEXT_CHARS = 1000
PREVIEW_CHARS = 100
MAX_TOKENS = 512

_SENTENCE_SPLIT_RE = re.compile(r'[.!?]+')

# (table label, metrics key, format spec or None for "show the raw number")
_TABLE_ROWS = (
    ("Words", "word_count", None),
    ("Characters", "char_count", None),
    ("Sentences", "sentence_count", None),
    ("Unique Words", "unique_words", None),
    ("Avg Word Length", "avg_word_length", ".1f"),
    ("Lexical Diversity", "lexical_diversity", ".3f"),
)

# (report label, metrics key, format spec, smallest divisor used for the ratio)
_REPORT_ROWS = (
    ("Word Count", "word_count", "", 1),
    ("Character Count", "char_count", "", 1),
    ("Sentence Count", "sentence_count", "", 1),
    ("Avg Word Length", "avg_word_length", ".1f", 1),
    ("Lexical Diversity", "lexical_diversity", ".3f", 0.001),
)


def _is_blank(text):
    """Return True when *text* is None, empty, or whitespace only."""
    return not text or not text.strip()


# Translation function (single)
def translate_nso_en(text):
    """Translate one Northern Sotho text to English.

    Args:
        text: Source text. Input longer than ``MAX_TOKENS`` tokens is
            truncated by the tokenizer.

    Returns:
        The decoded English translation, or a prompt message when *text* is
        None, empty, or whitespace only.
    """
    if _is_blank(text):
        return EMPTY_INPUT_MESSAGE
    inputs = tokenizer(
        text, return_tensors="pt", max_length=MAX_TOKENS, truncation=True
    )
    translated_tokens = model.generate(
        **inputs,
        max_length=MAX_TOKENS,
        forced_bos_token_id=tokenizer.get_lang_id("en"),
    )
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)


# Linguistic analysis
def calculate_metrics(text):
    """Compute simple surface statistics for *text*.

    Words are whitespace-separated tokens; sentences are the non-blank pieces
    obtained by splitting on runs of ``.``, ``!`` and ``?``.

    Returns:
        A dict with the keys ``char_count``, ``word_count``,
        ``sentence_count``, ``unique_words``, ``avg_word_length`` and
        ``lexical_diversity``. The two averages are 0 when there are no words.
    """
    words = text.split()
    word_count = len(words)
    unique_words = len(set(words))
    sentences = [s for s in _SENTENCE_SPLIT_RE.split(text) if s.strip()]
    if word_count:
        avg_word_length = sum(len(w) for w in words) / word_count
        lexical_div = unique_words / word_count
    else:
        avg_word_length = 0
        lexical_div = 0
    return {
        'char_count': len(text),
        'word_count': word_count,
        'sentence_count': len(sentences),
        'unique_words': unique_words,
        'avg_word_length': avg_word_length,
        'lexical_diversity': lexical_div,
    }


def _table_column(metrics):
    """Return one metrics-table column; missing keys count as 0."""
    column = []
    for _label, key, fmt in _TABLE_ROWS:
        value = metrics.get(key, 0)
        column.append(format(value, fmt) if fmt else value)
    return column


def create_metrics_table(src_metrics, tgt_metrics):
    """Build the side-by-side metrics table shown in the UI.

    Args:
        src_metrics: Metrics dict for the source text (may be empty).
        tgt_metrics: Metrics dict for the translated text (may be empty).

    Returns:
        A DataFrame with the columns ``Metric``, ``Source Text`` and
        ``Target Text``.
    """
    return pd.DataFrame({
        'Metric': [label for label, _key, _fmt in _TABLE_ROWS],
        'Source Text': _table_column(src_metrics),
        'Target Text': _table_column(tgt_metrics),
    })


def _build_report(src_metrics, tgt_metrics, elapsed):
    """Render the Markdown analysis report comparing source and target."""
    lines = [
        "## 📊 Linguistic Analysis Report",
        "",
        "### Translation Details",
        f"- **Processing Time**: {elapsed:.2f} seconds",
        "",
        "### Text Complexity Metrics",
        "| Metric | Source | Target | Ratio |",
        "|--------|--------|--------|-------|",
    ]
    for label, key, fmt, floor in _REPORT_ROWS:
        src = src_metrics.get(key, 0)
        tgt = tgt_metrics.get(key, 0)
        # The floor keeps the ratio defined when the source value is 0.
        ratio = tgt / max(src, floor)
        lines.append(
            f"| {label} | {format(src, fmt)} | {format(tgt, fmt)} | {ratio:.2f} |"
        )
    return "\n".join(lines) + "\n"


def translate_and_analyze(text):
    """Translate *text* and compare source/target surface metrics.

    Returns:
        A ``(translation, markdown_report, metrics_table)`` tuple. For blank
        input the tuple holds a prompt message, a placeholder report and an
        all-zero table.
    """
    if _is_blank(text):
        return (
            EMPTY_INPUT_MESSAGE,
            "No analysis available.",
            create_metrics_table({}, {}),
        )

    start = time.time()
    translated = translate_nso_en(text)
    src_metrics = calculate_metrics(text)
    tgt_metrics = calculate_metrics(translated)
    elapsed = time.time() - start

    report = _build_report(src_metrics, tgt_metrics, elapsed)
    table = create_metrics_table(src_metrics, tgt_metrics)
    return translated, report, table


# Batch processing
class _BatchInputError(Exception):
    """An uploaded file was rejected; the message is shown to the user as-is."""


def _resolve_upload_path(file_obj):
    """Return the filesystem path of an upload.

    ``gr.File(type="filepath")`` hands the callback a plain path string, while
    file-like upload wrappers expose the path as ``.name``; accept both.
    """
    if isinstance(file_obj, (str, os.PathLike)):
        return os.fspath(file_obj)
    return file_obj.name


def _read_batch_texts(path, file_ext):
    """Read the candidate texts from a ``.csv`` (first column) or ``.txt`` file."""
    if file_ext == '.csv':
        try:
            df = pd.read_csv(path)
        except pd.errors.EmptyDataError:
            # A zero-byte CSV raises instead of yielding an empty frame.
            raise _BatchInputError("The uploaded CSV file is empty.") from None
        if df.empty:
            raise _BatchInputError("The uploaded CSV file is empty.")
        return df.iloc[:, 0].dropna().astype(str).tolist()

    # utf-8-sig also reads plain UTF-8 and drops a leading BOM, which would
    # otherwise end up inside the first entry.
    with open(path, 'r', encoding='utf-8-sig') as handle:
        return [line.strip() for line in handle if line.strip()]


def _preview(text):
    """Shorten *text* to ``PREVIEW_CHARS`` characters for table display."""
    if len(text) > PREVIEW_CHARS:
        return text[:PREVIEW_CHARS] + '...'
    return text


def secure_batch_processing(file_obj):
    """Translate the entries of an uploaded ``.txt`` or ``.csv`` file.

    The upload is copied into a private temporary directory, at most
    ``MAX_BATCH_SIZE`` entries are translated (each cut to ``MAX_TEXT_CHARS``
    characters), and the temporary directory is removed afterwards.

    Args:
        file_obj: Path of the uploaded file (str or path-like), a file-like
            object exposing the path as ``.name``, or None.

    Returns:
        A ``(summary_message, results_dataframe)`` tuple. On any problem the
        message describes it and the DataFrame is empty; this function does
        not raise.
    """
    if file_obj is None:
        return "Please upload a file.", pd.DataFrame()

    temp_dir = None
    try:
        source_path = _resolve_upload_path(file_obj)
        file_ext = os.path.splitext(source_path)[1].lower()
        if file_ext not in SUPPORTED_EXTENSIONS:
            return "Only .txt and .csv files are supported.", pd.DataFrame()
        if os.path.getsize(source_path) > MAX_UPLOAD_BYTES:
            return "File is too large. Maximum size is 5MB.", pd.DataFrame()

        session_id = str(uuid.uuid4())
        temp_dir = tempfile.mkdtemp(prefix=f"translation_{session_id}_")
        temp_file_path = os.path.join(temp_dir, f"upload_{session_id}{file_ext}")
        shutil.copy2(source_path, temp_file_path)

        texts = _read_batch_texts(temp_file_path, file_ext)
        if not texts:
            return "No text found in the uploaded file.", pd.DataFrame()

        warning_msg = ""
        if len(texts) > MAX_BATCH_SIZE:
            texts = texts[:MAX_BATCH_SIZE]
            warning_msg = (
                f"Processing limited to first {MAX_BATCH_SIZE} entries "
                "for performance."
            )

        results = []
        for index, text in enumerate(texts, start=1):
            if not text.strip():
                continue
            if len(text) > MAX_TEXT_CHARS:
                text = text[:MAX_TEXT_CHARS] + "..."
            translated = translate_nso_en(text)
            results.append({
                'Index': index,
                'Original': _preview(text),
                'Translation': _preview(translated),
            })

        if not results:
            return "No valid text entries found to translate.", pd.DataFrame()

        summary = f"Successfully processed {len(results)} text entries."
        if warning_msg:
            summary = f"{summary} {warning_msg}"
        return summary, pd.DataFrame(results)
    except _BatchInputError as exc:
        return str(exc), pd.DataFrame()
    except Exception as e:
        # UI boundary: report the failure in the summary box instead of raising.
        return f"Error processing file: {str(e)}", pd.DataFrame()
    finally:
        if temp_dir and os.path.exists(temp_dir):
            try:
                shutil.rmtree(temp_dir)
            except OSError as e:
                # Best effort: a failed cleanup must not hide the result.
                print(f"Warning: Could not clean up temporary directory: {e}")


# Examples
EXAMPLES = [
    ["Leina la ka ke Vukosi."],
    ["Ke leboga thušo ya gago."],
    ["Re a go amogela mo Pretoria."],
    ["Go tloga ka letšatši la lehono, dilo di tlo kaonafala."],
    ["O swanetše go hwetša thušo ge go kgonega."],
    ["Ngwana o ya sekolong letšatšing le lengwe le le lengwe."],
]


# Research tools
def detailed_analysis(text):
    """Return the surface metrics of *text* as a JSON-friendly dict.

    Returns:
        An empty dict for blank input; otherwise a dict with the keys
        ``basic_metrics``, ``text_length`` and ``analysis_completed``.
    """
    if _is_blank(text):
        return {}
    return {
        "basic_metrics": calculate_metrics(text),
        "text_length": len(text),
        "analysis_completed": True,
    }


# UI text
APP_TITLE = "🔬 Northern Sotho-English Linguistic Translation Tool"

APP_CSS = """
.gradio-container {font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;}
.main-header {text-align: center; padding: 2rem 0;}
.dsfsi-logo {text-align: center; margin-bottom: 1rem;}
.dsfsi-logo img {max-width: 300px; height: auto;}
.metric-table {font-size: 0.9em;}
"""

# Wrapped in .main-header so the matching rule in APP_CSS applies.
HEADER_HTML = """
<div class="main-header">
    <h1>🔬 Northern Sotho-English Linguistic Translation Tool</h1>
    <p>AI-powered translation system for Northern Sotho (Sepedi) to English with detailed linguistic analysis, designed for linguists, researchers, and language documentation projects.</p>
</div>
"""

TRANSLATION_INTRO_MD = """
### Real-time Translation with Linguistic Analysis
Translate from Northern Sotho (Sepedi) to English and get detailed linguistic insights.
"""

BATCH_INTRO_MD = """
### Secure Corpus Analysis & Batch Translation
Upload text or CSV files for batch translation and analysis. Files are processed securely and temporarily.
"""

BATCH_FORMATS_MD = """
**Supported formats:**
- `.txt` files: One text per line
- `.csv` files: Text in first column
- **Security limits**: Max 10 entries, 1000 chars per text
- **Privacy**: Files are deleted after processing
"""

RESEARCH_INTRO_MD = """
### Advanced Linguistic Analysis Tools
Analyze text for linguistic features.
"""

ABOUT_MD = """
### 🗣️ About Northern Sotho (Sepedi) Language

**Northern Sotho (Sepedi)** is a Bantu language spoken by millions of people, primarily in:
- 🇿🇦 **South Africa** – Official language

**Key Linguistic Features:**
- **Language Family**: Niger-Congo → Bantu → Sotho-Tswana
- **Script**: Latin alphabet
- **Characteristics**: Agglutinative, noun-class system
- **ISO Code**: nso (ISO 639-2/3)
"""

FOOTER_MD = """
---
### 📚 Model Information & Citation

**Model Used:** [`dsfsi/nso-en-m2m100-gov`](https://huggingface.co/dsfsi/nso-en-m2m100-gov)

Based on Meta's M2M100, fine-tuned specifically for Northern Sotho-English by the **Data Science for Social Impact Research Group**.

**Training Data:** Vuk'uzenzele and ZA-gov-multilingual South African corpora.

### 🔒 Privacy & Security
- No conversation history stored
- Uploaded files deleted after processing
- All processing in isolated temporary environments
- No user data persistence

### 🙏 Acknowledgments
We thank **Thapelo Sindani** and **Zion Nia Van Wyk** for their assistance in creating this space.

### 📖 Citation
```bibtex
@inproceedings{lastrucci-etal-2023-preparing,
    title = "Preparing the Vuk'uzenzele and ZA-gov-multilingual South African multilingual corpora",
    author = "Richard Lastrucci and Isheanesu Dzingirai and Jenalea Rajab and Andani Madodonga and Matimba Shingange and Daniel Njini and Vukosi Marivate",
    booktitle = "Proceedings of the Fourth workshop on Resources for African Indigenous Languages (RAIL 2023)",
    pages = "18--25",
    year = "2023"
}
```

**Links**:
- [DSFSI](https://www.dsfsi.co.za/)
- [Model](https://huggingface.co/dsfsi/nso-en-m2m100-gov)
- [Vuk'uzenzele Data](https://github.com/dsfsi/vukuzenzele-nlp)
- [ZA-gov Data](https://github.com/dsfsi/gov-za-multilingual)
- [Research Feedback](https://docs.google.com/forms/d/e/1FAIpQLSf7S36dyAUPx2egmXbFpnTBuzoRulhL5Elu-N1eoMhaO7v10w/viewform)

---
**Built for the African NLP community**
"""


def _build_translation_tab():
    """Add the single-text translation tab to the enclosing ``gr.Tabs``."""
    with gr.Tab("🌐 Translation & Analysis"):
        gr.Markdown(TRANSLATION_INTRO_MD)
        with gr.Row():
            with gr.Column(scale=1):
                input_text = gr.Textbox(
                    label="Northern Sotho (Sepedi) Input",
                    placeholder="Enter text to translate...",
                    lines=4,
                    max_lines=10,
                )
                translate_btn = gr.Button(
                    "🔄 Translate & Analyze", variant="primary", size="lg"
                )
            with gr.Column(scale=1):
                output_text = gr.Textbox(
                    label="Translation (English)",
                    lines=4,
                    interactive=False,
                )

        gr.Markdown("### 📚 Example Translations")
        gr.Examples(
            examples=EXAMPLES,
            inputs=[input_text],
            label="Click an example to try it:",
        )

        with gr.Accordion("📊 Detailed Linguistic Analysis", open=False):
            analysis_output = gr.Markdown(label="Analysis Report")
        with gr.Accordion("📈 Metrics Table", open=False):
            metrics_table = gr.Dataframe(
                label="Comparative Metrics",
                headers=["Metric", "Source Text", "Target Text"],
                interactive=False,
            )

        translate_btn.click(
            fn=translate_and_analyze,
            inputs=input_text,
            outputs=[output_text, analysis_output, metrics_table],
        )


def _build_batch_tab():
    """Add the file-upload batch translation tab to the enclosing ``gr.Tabs``."""
    with gr.Tab("📁 Batch Processing"):
        gr.Markdown(BATCH_INTRO_MD)
        with gr.Row():
            with gr.Column():
                file_upload = gr.File(
                    label="Upload File (Max 5MB)",
                    file_types=[".txt", ".csv"],
                    type="filepath",
                    file_count="single",
                )
                batch_btn = gr.Button("🔄 Process Batch", variant="primary")
                gr.Markdown(BATCH_FORMATS_MD)
            with gr.Column():
                batch_summary = gr.Textbox(
                    label="Processing Summary",
                    lines=3,
                    interactive=False,
                )
                batch_results = gr.Dataframe(
                    label="Translation Results",
                    interactive=False,
                    wrap=True,
                )

        batch_btn.click(
            fn=secure_batch_processing,
            inputs=file_upload,
            outputs=[batch_summary, batch_results],
        )


def _build_research_tab():
    """Add the stand-alone text analysis tab to the enclosing ``gr.Tabs``."""
    with gr.Tab("🔬 Research Tools"):
        gr.Markdown(RESEARCH_INTRO_MD)
        with gr.Row():
            with gr.Column():
                research_text = gr.Textbox(
                    label="Text for Analysis",
                    lines=6,
                    placeholder="Enter Northern Sotho (Sepedi) or English text...",
                    max_lines=15,
                )
                analyze_btn = gr.Button("🔍 Analyze Text", variant="primary")
            with gr.Column():
                research_output = gr.JSON(label="Detailed Analysis Results")

        analyze_btn.click(
            fn=detailed_analysis,
            inputs=research_text,
            outputs=research_output,
        )


def create_gradio_interface():
    """Assemble the application UI.

    Returns:
        The ``gr.Blocks`` app holding the header, the three tool tabs, and the
        about/citation sections. The caller is responsible for launching it.
    """
    with gr.Blocks(
        title=APP_TITLE,
        theme=gr.themes.Soft(),
        css=APP_CSS,
    ) as demo:
        gr.HTML(HEADER_HTML)

        with gr.Tabs():
            _build_translation_tab()
            _build_batch_tab()
            _build_research_tab()

        gr.Markdown(ABOUT_MD)
        gr.Markdown(FOOTER_MD)

    return demo


if __name__ == "__main__":
    demo = create_gradio_interface()
    # NOTE(review): share=True requests a public share link and 0.0.0.0 binds
    # all interfaces; confirm both are intended for the deployment target.
    demo.launch(
        share=True,
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
    )