nso-en-translation

Sleeping

File size: 15,881 Bytes

import gradio as gr
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import pandas as pd
import time
import re
import tempfile
import os
import uuid

# Model loading
# Runs once at import time: load the tokenizer and model for the checkpoint
# named by `model_name`. Both objects are module-level globals that
# translate_nso_en reads on every call.
model_name = "dsfsi/nso-en-m2m100-gov"
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)
# "ns" is the source-language code handed to the tokenizer; the output
# language is pinned by forcing the first generated token to the tokenizer's
# id for "en".
tokenizer.src_lang = "ns"
model.config.forced_bos_token_id = tokenizer.get_lang_id("en")

# Translation function (single)
def translate_nso_en(text):
    """Translate a single Northern Sotho (Sepedi) text into English.

    Args:
        text: Source text. ``None``, empty and whitespace-only values are
            all treated as missing input.

    Returns:
        The decoded English translation, or a prompt asking for input when
        ``text`` is missing.
    """
    # Check for None/empty before calling .strip(): a None value would
    # otherwise raise AttributeError instead of returning the prompt.
    if not text or not text.strip():
        return "Please enter Northern Sotho (Sepedi) text."
    # Inputs longer than 512 tokens are truncated rather than rejected.
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    translated_tokens = model.generate(
        **inputs,
        max_length=512,
        forced_bos_token_id=tokenizer.get_lang_id("en")
    )
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

# Linguistic analysis
def calculate_metrics(text):
    """Compute surface-level statistics for *text*.

    Words are whitespace-separated tokens (punctuation stays attached and
    comparison is case-sensitive). Sentences are the non-blank fragments
    left after splitting on runs of '.', '!' or '?'.

    Returns:
        A dict with the keys ``char_count``, ``word_count``,
        ``sentence_count``, ``unique_words``, ``avg_word_length`` and
        ``lexical_diversity``. The last two are 0 when there are no words.
    """
    tokens = text.split()
    n_tokens = len(tokens)
    n_distinct = len(set(tokens))

    fragments = re.split(r'[.!?]+', text)
    n_sentences = sum(1 for fragment in fragments if fragment.strip())

    # Avoid dividing by zero for empty / whitespace-only input.
    if n_tokens:
        mean_length = sum(map(len, tokens)) / n_tokens
        diversity = n_distinct / n_tokens
    else:
        mean_length = 0
        diversity = 0

    return {
        'char_count': len(text),
        'word_count': n_tokens,
        'sentence_count': n_sentences,
        'unique_words': n_distinct,
        'avg_word_length': mean_length,
        'lexical_diversity': diversity,
    }

def create_metrics_table(src_metrics, tgt_metrics):
    """Build a side-by-side DataFrame of source and target metrics.

    Args:
        src_metrics: Metrics dict for the source text; missing keys count as 0.
        tgt_metrics: Metrics dict for the target text; missing keys count as 0.

    Returns:
        A DataFrame with the columns 'Metric', 'Source Text' and
        'Target Text'. Counts are kept as numbers; average word length and
        lexical diversity are pre-formatted strings (1 and 3 decimals).
    """
    def as_column(metrics):
        # One table column, in the same row order as `row_labels` below.
        counts = [
            metrics.get(key, 0)
            for key in ('word_count', 'char_count', 'sentence_count', 'unique_words')
        ]
        formatted = [
            f"{metrics.get('avg_word_length', 0):.1f}",
            f"{metrics.get('lexical_diversity', 0):.3f}",
        ]
        return counts + formatted

    row_labels = ['Words', 'Characters', 'Sentences', 'Unique Words', 'Avg Word Length', 'Lexical Diversity']
    return pd.DataFrame({
        'Metric': row_labels,
        'Source Text': as_column(src_metrics),
        'Target Text': as_column(tgt_metrics),
    })

def translate_and_analyze(text):
    """Translate *text* and build a comparative linguistic report.

    Args:
        text: Northern Sotho (Sepedi) source text. ``None``, empty and
            whitespace-only values are all treated as missing input.

    Returns:
        A 3-tuple ``(translation, markdown_report, metrics_table)``. For
        missing input the tuple holds a prompt message, the string
        "No analysis available." and a table built from empty metrics.
    """
    # Check for None/empty before calling .strip(): a None value would
    # otherwise raise AttributeError instead of returning the prompt.
    if not text or not text.strip():
        return "Please enter Northern Sotho (Sepedi) text.", "No analysis available.", create_metrics_table({}, {})

    # perf_counter is monotonic, so the reported duration cannot go negative
    # or jump if the system clock is adjusted while the request runs.
    start = time.perf_counter()
    translated = translate_nso_en(text)
    src_metrics = calculate_metrics(text)
    tgt_metrics = calculate_metrics(translated)
    elapsed = time.perf_counter() - start

    def ratio(key, floor=1):
        # Target/source ratio; the source value is clamped to `floor` so a
        # zero source metric cannot cause a division by zero.
        return tgt_metrics.get(key, 0) / max(src_metrics.get(key, floor), floor)

    report = f"""## 📊 Linguistic Analysis Report

### Translation Details
- **Processing Time**: {elapsed:.2f} seconds

### Text Complexity Metrics
| Metric | Source | Target | Ratio |
|--------|--------|--------|-------|
| Word Count | {src_metrics.get('word_count', 0)} | {tgt_metrics.get('word_count', 0)} | {ratio('word_count'):.2f} |
| Character Count | {src_metrics.get('char_count', 0)} | {tgt_metrics.get('char_count', 0)} | {ratio('char_count'):.2f} |
| Sentence Count | {src_metrics.get('sentence_count', 0)} | {tgt_metrics.get('sentence_count', 0)} | {ratio('sentence_count'):.2f} |
| Avg Word Length | {src_metrics.get('avg_word_length', 0):.1f} | {tgt_metrics.get('avg_word_length', 0):.1f} | {ratio('avg_word_length'):.2f} |
| Lexical Diversity | {src_metrics.get('lexical_diversity', 0):.3f} | {tgt_metrics.get('lexical_diversity', 0):.3f} | {ratio('lexical_diversity', 0.001):.2f} |
"""
    table = create_metrics_table(src_metrics, tgt_metrics)
    return translated, report, table

# Batch processing
def secure_batch_processing(file_obj):
    """Translate the first entries of an uploaded .txt or .csv file.

    The upload is copied into a private temporary directory, read, and the
    directory is removed again before returning.

    Args:
        file_obj: The uploaded file, either as a filesystem path (what a
            ``gr.File(type="filepath")`` component passes to its callback)
            or as an object exposing the path through a ``name`` attribute.
            ``None`` means nothing was uploaded.

    Returns:
        A 2-tuple ``(summary, results)``. ``summary`` is a status or error
        message; ``results`` is a DataFrame with the columns 'Index',
        'Original' and 'Translation', or an empty DataFrame on failure.
    """
    import shutil

    if file_obj is None:
        return "Please upload a file.", pd.DataFrame()

    temp_dir = None
    try:
        # The UI's File component uses type="filepath", so the callback
        # receives a plain path string; reading `.name` unconditionally made
        # every upload fail with AttributeError. File-like objects that carry
        # the path in `.name` are still accepted.
        if isinstance(file_obj, (str, os.PathLike)):
            source_path = file_obj
        else:
            source_path = file_obj.name

        # Validate the extension before touching the filesystem.
        file_ext = os.path.splitext(source_path)[1].lower()
        if file_ext not in ('.txt', '.csv'):
            return "Only .txt and .csv files are supported.", pd.DataFrame()

        session_id = str(uuid.uuid4())
        temp_dir = tempfile.mkdtemp(prefix=f"translation_{session_id}_")
        temp_file_path = os.path.join(temp_dir, f"upload_{session_id}{file_ext}")
        shutil.copy2(source_path, temp_file_path)

        if file_ext == '.csv':
            df = pd.read_csv(temp_file_path)
            if df.empty:
                return "The uploaded CSV file is empty.", pd.DataFrame()
            # Only the first column is treated as text.
            texts = df.iloc[:, 0].dropna().astype(str).tolist()
        else:
            # utf-8-sig reads plain UTF-8 unchanged and drops a leading BOM,
            # which would otherwise end up inside the first entry.
            with open(temp_file_path, 'r', encoding='utf-8-sig') as f:
                texts = [line.strip() for line in f if line.strip()]

        if not texts:
            return "No text found in the uploaded file.", pd.DataFrame()

        max_batch_size = 10
        warning_msg = ""
        if len(texts) > max_batch_size:
            texts = texts[:max_batch_size]
            warning_msg = f"Processing limited to first {max_batch_size} entries for performance."

        results = []
        for index, text in enumerate(texts, start=1):
            # CSV cells can still be whitespace-only after astype(str).
            if not text.strip():
                continue
            # Cap the model input at 1000 characters. No ellipsis is appended
            # here: it would be fed to the model as part of the source text,
            # and the preview below already marks shortened text.
            text = text[:1000]
            translated = translate_nso_en(text)
            results.append({
                'Index': index,
                'Original': text[:100] + '...' if len(text) > 100 else text,
                'Translation': translated[:100] + '...' if len(translated) > 100 else translated
            })

        if not results:
            return "No valid text entries found to translate.", pd.DataFrame()

        summary = f"Successfully processed {len(results)} text entries."
        if warning_msg:
            summary = f"{summary} {warning_msg}"
        return summary, pd.DataFrame(results)
    except Exception as e:
        # UI boundary: report any failure as a message instead of raising.
        return f"Error processing file: {str(e)}", pd.DataFrame()
    finally:
        # Best-effort cleanup; a failure here must not mask the result.
        if temp_dir and os.path.exists(temp_dir):
            try:
                shutil.rmtree(temp_dir)
            except OSError as e:
                print(f"Warning: Could not clean up temporary directory: {e}")

# Examples
# Sample inputs shown in the translation tab. Each entry is a one-element
# list because gr.Examples is wired to a single input component
# (inputs=[input_text]).
EXAMPLES = [
    ["Leina la ka ke Vukosi."],
    ["Ke leboga thušo ya gago."],
    ["Re a go amogela mo Pretoria."],
    ["Go tloga ka letšatši la lehono, dilo di tlo kaonafala."],
    ["O swanetše go hwetša thušo ge go kgonega."],
    ["Ngwana o ya sekolong letšatšing le lengwe le le lengwe."]
]

# Research tools
def detailed_analysis(text):
    """Return a JSON-serialisable analysis of *text* for the research tab.

    Args:
        text: Text to analyse. ``None``, empty and whitespace-only values
            are all treated as missing input.

    Returns:
        An empty dict for missing input; otherwise a dict with
        ``basic_metrics`` (the output of ``calculate_metrics``),
        ``text_length`` and ``analysis_completed``.
    """
    # Check for None/empty before calling .strip(): a None value would
    # otherwise raise AttributeError instead of yielding an empty result.
    if not text or not text.strip():
        return {}
    metrics = calculate_metrics(text)
    return {
        "basic_metrics": metrics,
        "text_length": len(text),
        "analysis_completed": True
    }

def create_gradio_interface():
    """Build the Gradio Blocks UI and return it without launching it.

    The layout is a header, three tabs and a footer:
      * "Translation & Analysis" - button wired to translate_and_analyze.
      * "Batch Processing" - file upload wired to secure_batch_processing.
      * "Research Tools" - text box wired to detailed_analysis.

    Returns:
        The constructed ``gr.Blocks`` instance.
    """
    with gr.Blocks(
        title="🔬 Northern Sotho-English Linguistic Translation Tool",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;}
        .main-header {text-align: center; padding: 2rem 0;}
        .dsfsi-logo {text-align: center; margin-bottom: 1rem;}
        .dsfsi-logo img {max-width: 300px; height: auto;}
        .metric-table {font-size: 0.9em;}
        """
    ) as demo:

        # Page header: logo plus title/description, styled by the CSS above.
        gr.HTML("""
        <div class="dsfsi-logo">
            <img src="https://www.dsfsi.co.za/images/logo_transparent_expanded.png" alt="DSFSI Logo" />
        </div>
        <div class="main-header">
            <h1>🔬 Northern Sotho-English Linguistic Translation Tool</h1>
            <p style="font-size: 1.1em; color: #666; max-width: 800px; margin: 0 auto;">
                AI-powered translation system for Northern Sotho (Sepedi) to English with detailed linguistic analysis, designed for linguists, researchers, and language documentation projects.
            </p>
        </div>
        """)

        with gr.Tabs():
            # Tab 1: single-text translation with report and metrics table.
            with gr.Tab("🌐 Translation & Analysis"):
                gr.Markdown("""
                ### Real-time Translation with Linguistic Analysis
                Translate from Northern Sotho (Sepedi) to English and get detailed linguistic insights.
                """)
                with gr.Row():
                    with gr.Column(scale=1):
                        input_text = gr.Textbox(
                            label="Northern Sotho (Sepedi) Input",
                            placeholder="Enter text to translate...",
                            lines=4,
                            max_lines=10
                        )
                        translate_btn = gr.Button("🔄 Translate & Analyze", variant="primary", size="lg")
                    with gr.Column(scale=1):
                        output_text = gr.Textbox(
                            label="Translation (English)",
                            lines=4,
                            interactive=False
                        )
                gr.Markdown("### 📚 Example Translations")
                # Examples only fill the input box; no fn is attached here.
                gr.Examples(
                    examples=EXAMPLES,
                    inputs=[input_text],
                    label="Click an example to try it:"
                )
                with gr.Accordion("📊 Detailed Linguistic Analysis", open=False):
                    analysis_output = gr.Markdown(label="Analysis Report")
                with gr.Accordion("📈 Metrics Table", open=False):
                    metrics_table = gr.Dataframe(
                        label="Comparative Metrics",
                        headers=["Metric", "Source Text", "Target Text"],
                        interactive=False
                    )
                # The three outputs match the 3-tuple returned by
                # translate_and_analyze: translation, report, table.
                translate_btn.click(
                    fn=translate_and_analyze,
                    inputs=input_text,
                    outputs=[output_text, analysis_output, metrics_table]
                )

            # Tab 2: batch translation of an uploaded .txt/.csv file.
            with gr.Tab("📁 Batch Processing"):
                gr.Markdown("""
                ### Secure Corpus Analysis & Batch Translation
                Upload text or CSV files for batch translation and analysis. Files are processed securely and temporarily.
                """)
                with gr.Row():
                    with gr.Column():
                        # NOTE(review): the label advertises a 5MB limit, but
                        # no size check is visible in this file - confirm
                        # where (or whether) it is enforced.
                        file_upload = gr.File(
                            label="Upload File (Max 5MB)",
                            file_types=[".txt", ".csv"],
                            type="filepath",
                            file_count="single"
                        )
                        batch_btn = gr.Button("🔄 Process Batch", variant="primary")
                        gr.Markdown("""
                        **Supported formats:**
                        - `.txt` files: One text per line
                        - `.csv` files: Text in first column
                        - **Security limits**: Max 10 entries, 1000 chars per text
                        - **Privacy**: Files are deleted after processing
                        """)
                    with gr.Column():
                        batch_summary = gr.Textbox(
                            label="Processing Summary",
                            lines=3,
                            interactive=False
                        )
                        batch_results = gr.Dataframe(
                            label="Translation Results",
                            interactive=False,
                            wrap=True
                        )
                # Outputs match the (summary, DataFrame) pair returned by
                # secure_batch_processing.
                batch_btn.click(
                    fn=secure_batch_processing,
                    inputs=file_upload,
                    outputs=[batch_summary, batch_results]
                )

            # Tab 3: metrics-only analysis of arbitrary text (no translation).
            with gr.Tab("🔬 Research Tools"):
                gr.Markdown("""
                ### Advanced Linguistic Analysis Tools
                Analyze text for linguistic features.
                """)
                with gr.Row():
                    with gr.Column():
                        research_text = gr.Textbox(
                            label="Text for Analysis",
                            lines=6,
                            placeholder="Enter Northern Sotho (Sepedi) or English text...",
                            max_lines=15
                        )
                        analyze_btn = gr.Button("🔍 Analyze Text", variant="primary")
                    with gr.Column():
                        research_output = gr.JSON(
                            label="Detailed Analysis Results"
                        )
                analyze_btn.click(
                    fn=detailed_analysis,
                    inputs=research_text,
                    outputs=research_output
                )
                gr.Markdown("""
                ### 🗣️ About Northern Sotho (Sepedi) Language
                
                **Northern Sotho (Sepedi)** is a Bantu language spoken by millions of people, primarily in:
                - 🇿🇦 **South Africa** – Official language
                
                **Key Linguistic Features:**
                - **Language Family**: Niger-Congo → Bantu → Sotho-Tswana
                - **Script**: Latin alphabet
                - **Characteristics**: Agglutinative, noun-class system
                - **ISO Code**: nso (ISO 639-2/3)
                """)

        # Footer shown below the tabs: model info, privacy notes, citation.
        gr.Markdown("""
        ---
        ### 📚 Model Information & Citation

        **Model Used:** [`dsfsi/nso-en-m2m100-gov`](https://huggingface.co/dsfsi/nso-en-m2m100-gov)

        Based on Meta's M2M100, fine-tuned specifically for Northern Sotho-English by the **Data Science for Social Impact Research Group**.

        **Training Data:** Vuk'uzenzele and ZA-gov-multilingual South African corpora.

        ### 🔒 Privacy & Security
        - No conversation history stored
        - Uploaded files deleted after processing
        - All processing in isolated temporary environments
        - No user data persistence

        ### 🙏 Acknowledgments
        We thank **Thapelo Sindani** and **Zion Nia Van Wyk** for their assistance in creating this space.

        ### 📖 Citation
        ```bibtex
        @inproceedings{lastrucci-etal-2023-preparing,
            title = "Preparing the Vuk'uzenzele and ZA-gov-multilingual South African multilingual corpora",
            author = "Richard Lastrucci and Isheanesu Dzingirai and Jenalea Rajab 
                      and Andani Madodonga and Matimba Shingange and Daniel Njini and Vukosi Marivate",
            booktitle = "Proceedings of the Fourth workshop on Resources for African Indigenous Languages (RAIL 2023)",
            pages = "18--25",
            year = "2023"
        }
        ```
        **Links**: 
        - [DSFSI](https://www.dsfsi.co.za/)
        - [Model](https://huggingface.co/dsfsi/nso-en-m2m100-gov)
        - [Vuk'uzenzele Data](https://github.com/dsfsi/vukuzenzele-nlp)
        - [ZA-gov Data](https://github.com/dsfsi/gov-za-multilingual)
        - [Research Feedback](https://docs.google.com/forms/d/e/1FAIpQLSf7S36dyAUPx2egmXbFpnTBuzoRulhL5Elu-N1eoMhaO7v10w/viewform)
        ---
        **Built for the African NLP community**
        """)
    return demo

if __name__ == "__main__":
    # Build and serve the UI only when this file is executed as a script.
    demo = create_gradio_interface()
    # NOTE(review): share=True together with server_name="0.0.0.0" presumably
    # exposes the app both via a public share link and on every local network
    # interface - confirm that both are intended for this deployment.
    demo.launch(
        share=True,
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )