# vukosi's picture
# Update app.py
# aae664e verified
import gradio as gr
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import pandas as pd
import time
import re
import tempfile
import os
import uuid
# Model loading
# Executed once at import time: the tokenizer and the fine-tuned M2M100
# checkpoint are resolved by name and shared by every request handler below.
model_name = "dsfsi/nso-en-m2m100-gov"
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)
# Source-language code the tokenizer uses when encoding input text
# (presumably M2M100's code for Northern Sotho -- confirm against the model card).
tokenizer.src_lang = "ns"
# Default the first generated token to the English language id so decoding
# targets English even when a caller does not pass forced_bos_token_id.
model.config.forced_bos_token_id = tokenizer.get_lang_id("en")
# Translation function (single)
def translate_nso_en(text):
    """Translate a Northern Sotho (Sepedi) string into English.

    Args:
        text: Source text. ``None``, an empty string, or whitespace-only
            input is treated as missing.

    Returns:
        The decoded English translation, or a prompt asking for input when
        ``text`` is missing.
    """
    # Guard against None as well as blank strings so a cleared input or an
    # API call sending null gets the friendly prompt instead of an
    # AttributeError from ``None.strip()``.
    if text is None or not text.strip():
        return "Please enter Northern Sotho (Sepedi) text."

    # Over-long inputs are truncated to 512 tokens rather than rejected.
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    translated_tokens = model.generate(
        **inputs,
        max_length=512,
        # Start generation with the English language token (target language).
        forced_bos_token_id=tokenizer.get_lang_id("en"),
    )
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
# Linguistic analysis
def calculate_metrics(text):
    """Compute simple surface statistics for a piece of text.

    Words are whitespace-separated tokens (punctuation stays attached and
    comparison is case-sensitive); sentences are the non-blank segments
    obtained by splitting on runs of ``.``, ``!`` and ``?``.

    Args:
        text: The string to analyse.

    Returns:
        A dict with ``char_count``, ``word_count``, ``sentence_count``,
        ``unique_words``, ``avg_word_length`` and ``lexical_diversity``.
        The two averages are ``0`` when the text contains no words.
    """
    tokens = text.split()
    n_tokens = len(tokens)
    distinct = len(set(tokens))
    sentences = [part for part in re.split(r'[.!?]+', text) if part.strip()]

    if n_tokens:
        mean_length = sum(map(len, tokens)) / n_tokens
        diversity = distinct / n_tokens
    else:
        # No words: avoid dividing by zero and report plain zeros.
        mean_length = 0
        diversity = 0

    return dict(
        char_count=len(text),
        word_count=n_tokens,
        sentence_count=len(sentences),
        unique_words=distinct,
        avg_word_length=mean_length,
        lexical_diversity=diversity,
    )
def create_metrics_table(src_metrics, tgt_metrics):
    """Build a side-by-side comparison table of source and target metrics.

    Args:
        src_metrics: Metrics dict for the source text (missing keys count as 0).
        tgt_metrics: Metrics dict for the translated text (missing keys count as 0).

    Returns:
        A DataFrame with the columns ``Metric``, ``Source Text`` and
        ``Target Text``; the counts are kept as numbers while the two
        averages are pre-formatted strings.
    """
    count_keys = ('word_count', 'char_count', 'sentence_count', 'unique_words')

    def as_column(metrics):
        # Raw counts first, then the averages rendered to fixed precision.
        column = [metrics.get(key, 0) for key in count_keys]
        column.append(f"{metrics.get('avg_word_length', 0):.1f}")
        column.append(f"{metrics.get('lexical_diversity', 0):.3f}")
        return column

    labels = ['Words', 'Characters', 'Sentences', 'Unique Words', 'Avg Word Length', 'Lexical Diversity']
    return pd.DataFrame({
        'Metric': labels,
        'Source Text': as_column(src_metrics),
        'Target Text': as_column(tgt_metrics),
    })
def _format_ratio_row(label, src_value, tgt_value, spec="", floor=1):
    """Return one Markdown table row: label, source, target and target/source ratio."""
    # Clamp the denominator so an empty source never divides by zero.
    ratio = tgt_value / max(src_value, floor)
    return f"| {label} | {src_value:{spec}} | {tgt_value:{spec}} | {ratio:.2f} |"


def translate_and_analyze(text):
    """Translate ``text`` and produce a comparative linguistic report.

    Args:
        text: Northern Sotho (Sepedi) input. ``None``, empty, or
            whitespace-only input is treated as missing.

    Returns:
        A ``(translation, report, table)`` tuple: the English translation,
        a Markdown report comparing source and target metrics, and the same
        metrics as a DataFrame. When the input is missing, the first two
        items are placeholder messages and the table is all zeros.
    """
    # None is handled explicitly so it yields the prompt rather than an
    # AttributeError from ``None.strip()``.
    if text is None or not text.strip():
        return (
            "Please enter Northern Sotho (Sepedi) text.",
            "No analysis available.",
            create_metrics_table({}, {}),
        )

    start = time.time()
    translated = translate_nso_en(text)
    src_metrics = calculate_metrics(text)
    tgt_metrics = calculate_metrics(translated)
    # The reported time covers translation plus metric computation.
    elapsed = time.time() - start

    # (label, metrics key, number format, minimum denominator for the ratio)
    row_specs = [
        ("Word Count", 'word_count', "", 1),
        ("Character Count", 'char_count', "", 1),
        ("Sentence Count", 'sentence_count', "", 1),
        ("Avg Word Length", 'avg_word_length', ".1f", 1),
        ("Lexical Diversity", 'lexical_diversity', ".3f", 0.001),
    ]
    rows = [
        _format_ratio_row(
            label,
            src_metrics.get(key, 0),
            tgt_metrics.get(key, 0),
            spec,
            floor,
        )
        for label, key, spec, floor in row_specs
    ]

    report_lines = [
        "## 📊 Linguistic Analysis Report",
        "### Translation Details",
        f"- **Processing Time**: {elapsed:.2f} seconds",
        "### Text Complexity Metrics",
        "| Metric | Source | Target | Ratio |",
        "|--------|--------|--------|-------|",
        *rows,
        "",  # keeps the trailing newline after the last table row
    ]
    report = "\n".join(report_lines)

    table = create_metrics_table(src_metrics, tgt_metrics)
    return translated, report, table
# Batch processing
def secure_batch_processing(file_obj):
    """Translate the entries of an uploaded ``.txt`` or ``.csv`` file.

    The upload is copied into a private temporary directory, read, translated
    entry by entry with ``translate_nso_en``, and the directory is removed
    again before returning.

    Args:
        file_obj: Either a filesystem path (what ``gr.File(type="filepath")``
            passes) or an object exposing the path as ``.name``. ``None``
            means nothing was uploaded.

    Returns:
        A ``(summary, results)`` tuple: a status message and a DataFrame with
        ``Index``, ``Original`` and ``Translation`` columns. On any failure
        the DataFrame is empty and the message describes the problem.
    """
    import shutil  # local import: not among the module's top-level imports

    max_upload_bytes = 5 * 1024 * 1024  # the upload widget advertises "Max 5MB"
    max_batch_size = 10
    max_text_chars = 1000
    preview_chars = 100

    def preview(value):
        # Shorten long cells so the results table stays readable.
        return value[:preview_chars] + '...' if len(value) > preview_chars else value

    if file_obj is None:
        return "Please upload a file.", pd.DataFrame()

    temp_dir = None
    try:
        # With type="filepath" Gradio hands over a plain path string, which has
        # no ``.name`` attribute; file-like uploads expose the path as ``.name``.
        if isinstance(file_obj, (str, os.PathLike)):
            source_path = os.fspath(file_obj)
        else:
            source_path = file_obj.name

        file_ext = os.path.splitext(source_path)[1].lower()
        if file_ext not in ('.txt', '.csv'):
            return "Only .txt and .csv files are supported.", pd.DataFrame()

        if os.path.getsize(source_path) > max_upload_bytes:
            return "The uploaded file exceeds the 5MB limit.", pd.DataFrame()

        # Work on a private copy so processing never touches the upload itself.
        session_id = str(uuid.uuid4())
        temp_dir = tempfile.mkdtemp(prefix=f"translation_{session_id}_")
        temp_file_path = os.path.join(temp_dir, f"upload_{session_id}{file_ext}")
        shutil.copy2(source_path, temp_file_path)

        if file_ext == '.csv':
            df = pd.read_csv(temp_file_path)
            if df.empty:
                return "The uploaded CSV file is empty.", pd.DataFrame()
            # Only the first column is treated as text to translate.
            texts = df.iloc[:, 0].dropna().astype(str).tolist()
        else:
            # utf-8-sig reads plain UTF-8 too, but drops a leading BOM that
            # would otherwise end up inside the first entry.
            with open(temp_file_path, 'r', encoding='utf-8-sig') as f:
                texts = [line.strip() for line in f if line.strip()]

        if not texts:
            return "No text found in the uploaded file.", pd.DataFrame()

        warning_msg = ""
        if len(texts) > max_batch_size:
            texts = texts[:max_batch_size]
            warning_msg = f"Processing limited to first {max_batch_size} entries for performance."

        results = []
        for index, text in enumerate(texts, start=1):
            if not text.strip():
                continue
            if len(text) > max_text_chars:
                text = text[:max_text_chars] + "..."
            translated = translate_nso_en(text)
            results.append({
                'Index': index,
                'Original': preview(text),
                'Translation': preview(translated),
            })

        if not results:
            return "No valid text entries found to translate.", pd.DataFrame()

        summary = f"Successfully processed {len(results)} text entries."
        if warning_msg:
            summary = f"{summary} {warning_msg}"
        return summary, pd.DataFrame(results)
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing the app.
        return f"Error processing file: {str(e)}", pd.DataFrame()
    finally:
        # Best-effort cleanup; a failure here must not mask the real result.
        if temp_dir and os.path.exists(temp_dir):
            try:
                shutil.rmtree(temp_dir)
            except Exception as e:
                print(f"Warning: Could not clean up temporary directory: {e}")
# Examples
# Sample Northern Sotho sentences offered as one-click inputs in the UI.
# Each inner list holds the value for the single input textbox.
# The "š" characters had been corrupted by a wrong-codepage decode; they are
# restored here so the examples are valid Sepedi for the model.
EXAMPLES = [
    ["Leina la ka ke Vukosi."],
    ["Ke leboga thušo ya gago."],
    ["Re a go amogela mo Pretoria."],
    ["Go tloga ka letšatši la lehono, dilo di tlo kaonafala."],
    ["O swanetše go hwetša thušo ge go kgonega."],
    ["Ngwana o ya sekolong letšatšing le lengwe le le lengwe."],
]
# Research tools
def detailed_analysis(text):
    """Return a JSON-friendly analysis summary for ``text``.

    Args:
        text: The string to analyse.

    Returns:
        An empty dict for empty or whitespace-only input; otherwise a dict
        holding the ``calculate_metrics`` result under ``basic_metrics``,
        the raw character length under ``text_length`` and an
        ``analysis_completed`` flag set to ``True``.
    """
    if text.strip():
        return {
            "basic_metrics": calculate_metrics(text),
            "text_length": len(text),
            "analysis_completed": True,
        }
    # Nothing to analyse.
    return {}
def create_gradio_interface():
    """Build the Gradio Blocks UI for the translation tool.

    The layout has three tabs -- single-text translation with analysis,
    batch file processing, and a standalone text-analysis tool -- followed
    by a footer with model information, privacy notes and citation.

    Emoji and arrows in the labels and Markdown had been corrupted by a
    wrong-codepage decode and are restored here.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """
    with gr.Blocks(
        title="🔬 Northern Sotho-English Linguistic Translation Tool",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;}
        .main-header {text-align: center; padding: 2rem 0;}
        .dsfsi-logo {text-align: center; margin-bottom: 1rem;}
        .dsfsi-logo img {max-width: 300px; height: auto;}
        .metric-table {font-size: 0.9em;}
        """
    ) as demo:
        # Page header: logo plus title and one-paragraph description.
        gr.HTML("""
        <div class="dsfsi-logo">
        <img src="https://www.dsfsi.co.za/images/logo_transparent_expanded.png" alt="DSFSI Logo" />
        </div>
        <div class="main-header">
        <h1>🔬 Northern Sotho-English Linguistic Translation Tool</h1>
        <p style="font-size: 1.1em; color: #666; max-width: 800px; margin: 0 auto;">
        AI-powered translation system for Northern Sotho (Sepedi) to English with detailed linguistic analysis, designed for linguists, researchers, and language documentation projects.
        </p>
        </div>
        """)
        with gr.Tabs():
            # --- Tab 1: translate a single text and show its metrics ---
            with gr.Tab("🌐 Translation & Analysis"):
                gr.Markdown("""
                ### Real-time Translation with Linguistic Analysis
                Translate from Northern Sotho (Sepedi) to English and get detailed linguistic insights.
                """)
                with gr.Row():
                    with gr.Column(scale=1):
                        input_text = gr.Textbox(
                            label="Northern Sotho (Sepedi) Input",
                            placeholder="Enter text to translate...",
                            lines=4,
                            max_lines=10
                        )
                        translate_btn = gr.Button("🔄 Translate & Analyze", variant="primary", size="lg")
                    with gr.Column(scale=1):
                        output_text = gr.Textbox(
                            label="Translation (English)",
                            lines=4,
                            interactive=False
                        )
                gr.Markdown("### 📚 Example Translations")
                gr.Examples(
                    examples=EXAMPLES,
                    inputs=[input_text],
                    label="Click an example to try it:"
                )
                with gr.Accordion("📊 Detailed Linguistic Analysis", open=False):
                    analysis_output = gr.Markdown(label="Analysis Report")
                with gr.Accordion("📈 Metrics Table", open=False):
                    metrics_table = gr.Dataframe(
                        label="Comparative Metrics",
                        headers=["Metric", "Source Text", "Target Text"],
                        interactive=False
                    )
                translate_btn.click(
                    fn=translate_and_analyze,
                    inputs=input_text,
                    outputs=[output_text, analysis_output, metrics_table]
                )
            # --- Tab 2: translate the entries of an uploaded file ---
            with gr.Tab("📁 Batch Processing"):
                gr.Markdown("""
                ### Secure Corpus Analysis & Batch Translation
                Upload text or CSV files for batch translation and analysis. Files are processed securely and temporarily.
                """)
                with gr.Row():
                    with gr.Column():
                        # type="filepath" means the handler receives a path string.
                        file_upload = gr.File(
                            label="Upload File (Max 5MB)",
                            file_types=[".txt", ".csv"],
                            type="filepath",
                            file_count="single"
                        )
                        batch_btn = gr.Button("🔄 Process Batch", variant="primary")
                        gr.Markdown("""
                        **Supported formats:**
                        - `.txt` files: One text per line
                        - `.csv` files: Text in first column
                        - **Security limits**: Max 10 entries, 1000 chars per text
                        - **Privacy**: Files are deleted after processing
                        """)
                    with gr.Column():
                        batch_summary = gr.Textbox(
                            label="Processing Summary",
                            lines=3,
                            interactive=False
                        )
                        batch_results = gr.Dataframe(
                            label="Translation Results",
                            interactive=False,
                            wrap=True
                        )
                batch_btn.click(
                    fn=secure_batch_processing,
                    inputs=file_upload,
                    outputs=[batch_summary, batch_results]
                )
            # --- Tab 3: metrics for arbitrary text, no translation ---
            with gr.Tab("🔬 Research Tools"):
                gr.Markdown("""
                ### Advanced Linguistic Analysis Tools
                Analyze text for linguistic features.
                """)
                with gr.Row():
                    with gr.Column():
                        research_text = gr.Textbox(
                            label="Text for Analysis",
                            lines=6,
                            placeholder="Enter Northern Sotho (Sepedi) or English text...",
                            max_lines=15
                        )
                        analyze_btn = gr.Button("🔍 Analyze Text", variant="primary")
                    with gr.Column():
                        research_output = gr.JSON(
                            label="Detailed Analysis Results"
                        )
                analyze_btn.click(
                    fn=detailed_analysis,
                    inputs=research_text,
                    outputs=research_output
                )
                gr.Markdown("""
                ### 🗣️ About Northern Sotho (Sepedi) Language
                **Northern Sotho (Sepedi)** is a Bantu language spoken by millions of people, primarily in:
                - 🇿🇦 **South Africa** – Official language
                **Key Linguistic Features:**
                - **Language Family**: Niger-Congo → Bantu → Sotho-Tswana
                - **Script**: Latin alphabet
                - **Characteristics**: Agglutinative, noun-class system
                - **ISO Code**: nso (ISO 639-2/3)
                """)
        # Footer shown below the tabs.
        gr.Markdown("""
        ---
        ### 📚 Model Information & Citation
        **Model Used:** [`dsfsi/nso-en-m2m100-gov`](https://huggingface.co/dsfsi/nso-en-m2m100-gov)
        Based on Meta's M2M100, fine-tuned specifically for Northern Sotho-English by the **Data Science for Social Impact Research Group**.
        **Training Data:** Vuk'uzenzele and ZA-gov-multilingual South African corpora.
        ### 🔒 Privacy & Security
        - No conversation history stored
        - Uploaded files deleted after processing
        - All processing in isolated temporary environments
        - No user data persistence
        ### 🙏 Acknowledgments
        We thank **Thapelo Sindani** and **Zion Nia Van Wyk** for their assistance in creating this space.
        ### 📖 Citation
        ```bibtex
        @inproceedings{lastrucci-etal-2023-preparing,
        title = "Preparing the Vuk'uzenzele and ZA-gov-multilingual South African multilingual corpora",
        author = "Richard Lastrucci and Isheanesu Dzingirai and Jenalea Rajab
        and Andani Madodonga and Matimba Shingange and Daniel Njini and Vukosi Marivate",
        booktitle = "Proceedings of the Fourth workshop on Resources for African Indigenous Languages (RAIL 2023)",
        pages = "18--25",
        year = "2023"
        }
        ```
        **Links**:
        - [DSFSI](https://www.dsfsi.co.za/)
        - [Model](https://huggingface.co/dsfsi/nso-en-m2m100-gov)
        - [Vuk'uzenzele Data](https://github.com/dsfsi/vukuzenzele-nlp)
        - [ZA-gov Data](https://github.com/dsfsi/gov-za-multilingual)
        - [Research Feedback](https://docs.google.com/forms/d/e/1FAIpQLSf7S36dyAUPx2egmXbFpnTBuzoRulhL5Elu-N1eoMhaO7v10w/viewform)
        ---
        **Built for the African NLP community**
        """)
    return demo
if __name__ == "__main__":
    # Script entry point: build the UI, then serve it on every network
    # interface at port 7860 with a share link requested and error
    # reporting switched on.
    launch_options = dict(
        share=True,
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
    )
    demo = create_gradio_interface()
    demo.launch(**launch_options)