# Source: Hugging Face Space (status at capture time: Sleeping)
import os
import re
import shutil
import tempfile
import time
import uuid

import gradio as gr
import pandas as pd
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
# Model loading: the checkpoint is loaded once at import time so that every
# request handler in this module shares the same tokenizer and model objects.
model_name = "dsfsi/nso-en-m2m100-gov"

# Tokenizer, with the source side tagged using the "ns" language code.
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
tokenizer.src_lang = "ns"

# Model, with generation forced to begin with the English language token.
model = M2M100ForConditionalGeneration.from_pretrained(model_name)
model.config.forced_bos_token_id = tokenizer.get_lang_id("en")
# Translation function (single)
def translate_nso_en(text):
    """Translate one Northern Sotho (Sepedi) string into English.

    Args:
        text: Source text. ``None`` and whitespace-only values are treated
            as empty input.

    Returns:
        The decoded English translation, or a prompt message when there is
        nothing to translate.
    """
    # A cleared or missing textbox value can arrive as None; calling
    # .strip() on it would raise instead of showing the prompt.
    if text is None or not text.strip():
        return "Please enter Northern Sotho (Sepedi) text."

    # Inputs longer than 512 tokens are truncated rather than rejected.
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    translated_tokens = model.generate(
        **inputs,
        max_length=512,
        forced_bos_token_id=tokenizer.get_lang_id("en"),
    )
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
# Linguistic analysis
def calculate_metrics(text):
    """Compute surface-level statistics for a piece of text.

    Words are whitespace-separated tokens (punctuation stays attached and
    comparison is case-sensitive). Sentences are the non-blank fragments left
    after splitting on runs of ``.``, ``!`` or ``?``.

    Args:
        text: The string to measure.

    Returns:
        A dict with the keys ``char_count``, ``word_count``,
        ``sentence_count``, ``unique_words``, ``avg_word_length`` and
        ``lexical_diversity``. The two averages are ``0`` for text that
        contains no words.
    """
    tokens = text.split()
    n_tokens = len(tokens)
    n_distinct = len(set(tokens))
    sentences = [frag for frag in re.split(r'[.!?]+', text) if frag.strip()]

    if n_tokens:
        mean_token_length = sum(len(tok) for tok in tokens) / n_tokens
        diversity = n_distinct / n_tokens
    else:
        # No words: avoid dividing by zero and report zero for both ratios.
        mean_token_length = 0
        diversity = 0

    return {
        'char_count': len(text),
        'word_count': n_tokens,
        'sentence_count': len(sentences),
        'unique_words': n_distinct,
        'avg_word_length': mean_token_length,
        'lexical_diversity': diversity,
    }
def create_metrics_table(src_metrics, tgt_metrics):
    """Arrange source and target metrics side by side in a DataFrame.

    Args:
        src_metrics: Metrics dict for the source text (missing keys count
            as 0).
        tgt_metrics: Metrics dict for the translated text (missing keys
            count as 0).

    Returns:
        A DataFrame with the columns ``Metric``, ``Source Text`` and
        ``Target Text``, one row per metric. Counts are kept as numbers;
        average word length and lexical diversity are pre-formatted strings
        with one and three decimals respectively.
    """
    def as_column(metrics):
        # Same row order as the 'Metric' labels below.
        return [
            metrics.get('word_count', 0),
            metrics.get('char_count', 0),
            metrics.get('sentence_count', 0),
            metrics.get('unique_words', 0),
            f"{metrics.get('avg_word_length', 0):.1f}",
            f"{metrics.get('lexical_diversity', 0):.3f}",
        ]

    row_labels = [
        'Words',
        'Characters',
        'Sentences',
        'Unique Words',
        'Avg Word Length',
        'Lexical Diversity',
    ]
    return pd.DataFrame({
        'Metric': row_labels,
        'Source Text': as_column(src_metrics),
        'Target Text': as_column(tgt_metrics),
    })
def translate_and_analyze(text):
    """Translate *text* and build a comparative linguistic report.

    Args:
        text: Northern Sotho (Sepedi) input. ``None`` and blank strings are
            treated as empty input.

    Returns:
        A 3-tuple ``(translation, report, table)``: the English translation,
        a Markdown report comparing source and target metrics, and the
        DataFrame produced by ``create_metrics_table``. For empty input the
        first two items are fixed messages and the table is all zeros.
    """
    if text is None or not text.strip():
        return (
            "Please enter Northern Sotho (Sepedi) text.",
            "No analysis available.",
            create_metrics_table({}, {}),
        )

    # perf_counter is monotonic, so the reported duration cannot be skewed
    # by wall-clock adjustments during the request.
    start = time.perf_counter()
    translated = translate_nso_en(text)
    src_metrics = calculate_metrics(text)
    tgt_metrics = calculate_metrics(translated)
    elapsed = time.perf_counter() - start

    # (row label, metrics key, value format, floor for the ratio denominator)
    row_specs = [
        ("Word Count", "word_count", "{}", 1),
        ("Character Count", "char_count", "{}", 1),
        ("Sentence Count", "sentence_count", "{}", 1),
        ("Avg Word Length", "avg_word_length", "{:.1f}", 1),
        ("Lexical Diversity", "lexical_diversity", "{:.3f}", 0.001),
    ]
    table_rows = []
    for label, key, value_format, floor in row_specs:
        src_value = src_metrics.get(key, 0)
        tgt_value = tgt_metrics.get(key, 0)
        # Clamp the denominator so an empty source never divides by zero.
        ratio = tgt_value / max(src_value, floor)
        table_rows.append(
            f"| {label} | {value_format.format(src_value)} "
            f"| {value_format.format(tgt_value)} | {ratio:.2f} |"
        )

    report = "\n".join([
        "## Linguistic Analysis Report",
        "### Translation Details",
        f"- **Processing Time**: {elapsed:.2f} seconds",
        "### Text Complexity Metrics",
        "| Metric | Source | Target | Ratio |",
        "|--------|--------|--------|-------|",
        *table_rows,
        "",
    ])

    table = create_metrics_table(src_metrics, tgt_metrics)
    return translated, report, table
# Batch processing
def secure_batch_processing(file_obj):
    """Translate the first entries of an uploaded ``.txt`` or ``.csv`` file.

    The upload is copied into a private temporary directory, read from
    there, and the directory is removed again before returning.

    Args:
        file_obj: The upload handed over by the UI: either a filesystem path
            (``str`` / ``os.PathLike``) or an object exposing the path as
            ``.name``. ``None`` means nothing was uploaded.

    Returns:
        A 2-tuple ``(summary, results)``. ``summary`` is a status or error
        message. ``results`` is a DataFrame with the columns ``Index``,
        ``Original`` and ``Translation`` (previews capped at 100
        characters), or an empty DataFrame when nothing was translated.
    """
    if file_obj is None:
        return "Please upload a file.", pd.DataFrame()

    max_file_bytes = 5 * 1024 * 1024  # the upload widget advertises "Max 5MB"
    max_batch_size = 10
    max_text_chars = 1000
    preview_chars = 100

    temp_dir = None
    try:
        # The upload widget is configured with type="filepath", which hands
        # over a plain path string; file-like wrappers expose it as .name.
        if isinstance(file_obj, (str, os.PathLike)):
            source_path = os.fspath(file_obj)
        else:
            source_path = file_obj.name

        # Validate before touching the disk so rejected uploads never get
        # copied anywhere.
        file_ext = os.path.splitext(source_path)[1].lower()
        if file_ext not in ('.txt', '.csv'):
            return "Only .txt and .csv files are supported.", pd.DataFrame()
        if os.path.getsize(source_path) > max_file_bytes:
            return "The uploaded file exceeds the 5MB limit.", pd.DataFrame()

        session_id = str(uuid.uuid4())
        temp_dir = tempfile.mkdtemp(prefix=f"translation_{session_id}_")
        temp_file_path = os.path.join(temp_dir, f"upload_{session_id}{file_ext}")
        shutil.copy2(source_path, temp_file_path)

        if file_ext == '.csv':
            df = pd.read_csv(temp_file_path)
            if df.empty:
                return "The uploaded CSV file is empty.", pd.DataFrame()
            # Only the first column is treated as text to translate.
            texts = df.iloc[:, 0].dropna().astype(str).tolist()
        else:
            # utf-8-sig drops a leading byte-order mark so it does not end
            # up inside the first entry sent to the model.
            with open(temp_file_path, 'r', encoding='utf-8-sig') as f:
                content = f.read()
            texts = [line.strip() for line in content.split('\n') if line.strip()]

        if not texts:
            return "No text found in the uploaded file.", pd.DataFrame()

        warning_msg = ""
        if len(texts) > max_batch_size:
            texts = texts[:max_batch_size]
            warning_msg = f"Processing limited to first {max_batch_size} entries for performance."

        results = []
        for index, text in enumerate(texts, start=1):
            # CSV cells can still be whitespace-only after astype(str).
            if not text.strip():
                continue
            if len(text) > max_text_chars:
                text = text[:max_text_chars] + "..."
            translated = translate_nso_en(text)
            results.append({
                'Index': index,
                'Original': text[:preview_chars] + '...' if len(text) > preview_chars else text,
                'Translation': (
                    translated[:preview_chars] + '...'
                    if len(translated) > preview_chars
                    else translated
                ),
            })

        if not results:
            return "No valid text entries found to translate.", pd.DataFrame()

        summary = f"Successfully processed {len(results)} text entries."
        if warning_msg:
            summary = f"{summary} {warning_msg}"
        return summary, pd.DataFrame(results)
    except Exception as e:
        # UI boundary: report the failure in the summary box instead of
        # letting the handler crash.
        return f"Error processing file: {str(e)}", pd.DataFrame()
    finally:
        # Best-effort cleanup; a failure here must not mask the result.
        if temp_dir and os.path.exists(temp_dir):
            try:
                shutil.rmtree(temp_dir)
            except OSError as e:
                print(f"Warning: Could not clean up temporary directory: {e}")
# Examples: sample Northern Sotho sentences offered in the UI. Each inner
# list is one example row holding the single text input.
EXAMPLES = [
    ["Leina la ka ke Vukosi."],
    ["Ke leboga thušo ya gago."],
    ["Re a go amogela mo Pretoria."],
    ["Go tloga ka letšatši la lehono, dilo di tlo kaonafala."],
    ["O swanetše go hwetša thušo ge go kgonega."],
    ["Ngwana o ya sekolong letšatšing le lengwe le le lengwe."],
]
# Research tools
def detailed_analysis(text):
    """Return a JSON-serialisable analysis of *text*.

    Args:
        text: Text to analyse. ``None`` and blank strings are treated as
            empty input.

    Returns:
        An empty dict for empty input; otherwise a dict with
        ``basic_metrics`` (the result of ``calculate_metrics``),
        ``text_length`` (character count) and ``analysis_completed``
        (always ``True``).
    """
    # A cleared or missing textbox value can arrive as None; calling
    # .strip() on it would raise instead of returning the empty result.
    if text is None or not text.strip():
        return {}

    return {
        "basic_metrics": calculate_metrics(text),
        "text_length": len(text),
        "analysis_completed": True,
    }
def _build_translation_tab():
    """Lay out the single-text translation tab and wire its button.

    Intended to be called inside an open ``gr.Tab`` context.
    """
    gr.Markdown("""
    ### Real-time Translation with Linguistic Analysis

    Translate from Northern Sotho (Sepedi) to English and get detailed linguistic insights.
    """)
    with gr.Row():
        with gr.Column(scale=1):
            input_text = gr.Textbox(
                label="Northern Sotho (Sepedi) Input",
                placeholder="Enter text to translate...",
                lines=4,
                max_lines=10,
            )
            translate_btn = gr.Button("Translate & Analyze", variant="primary", size="lg")
        with gr.Column(scale=1):
            output_text = gr.Textbox(
                label="Translation (English)",
                lines=4,
                interactive=False,
            )

    gr.Markdown("### Example Translations")
    gr.Examples(
        examples=EXAMPLES,
        inputs=[input_text],
        label="Click an example to try it:",
    )

    with gr.Accordion("Detailed Linguistic Analysis", open=False):
        analysis_output = gr.Markdown(label="Analysis Report")
    with gr.Accordion("Metrics Table", open=False):
        metrics_table = gr.Dataframe(
            label="Comparative Metrics",
            headers=["Metric", "Source Text", "Target Text"],
            interactive=False,
        )

    translate_btn.click(
        fn=translate_and_analyze,
        inputs=input_text,
        outputs=[output_text, analysis_output, metrics_table],
    )


def _build_batch_tab():
    """Lay out the file-upload batch translation tab and wire its button.

    Intended to be called inside an open ``gr.Tab`` context.
    """
    gr.Markdown("""
    ### Secure Corpus Analysis & Batch Translation

    Upload text or CSV files for batch translation and analysis. Files are processed securely and temporarily.
    """)
    with gr.Row():
        with gr.Column():
            # type="filepath": the click handler receives the upload as a
            # path string.
            file_upload = gr.File(
                label="Upload File (Max 5MB)",
                file_types=[".txt", ".csv"],
                type="filepath",
                file_count="single",
            )
            batch_btn = gr.Button("Process Batch", variant="primary")
            gr.Markdown("""
            **Supported formats:**

            - `.txt` files: One text per line
            - `.csv` files: Text in first column
            - **Security limits**: Max 10 entries, 1000 chars per text
            - **Privacy**: Files are deleted after processing
            """)
        with gr.Column():
            batch_summary = gr.Textbox(
                label="Processing Summary",
                lines=3,
                interactive=False,
            )
            batch_results = gr.Dataframe(
                label="Translation Results",
                interactive=False,
                wrap=True,
            )

    batch_btn.click(
        fn=secure_batch_processing,
        inputs=file_upload,
        outputs=[batch_summary, batch_results],
    )


def _build_research_tab():
    """Lay out the text-analysis tab, its button and the language notes.

    Intended to be called inside an open ``gr.Tab`` context.
    """
    gr.Markdown("""
    ### Advanced Linguistic Analysis Tools

    Analyze text for linguistic features.
    """)
    with gr.Row():
        with gr.Column():
            research_text = gr.Textbox(
                label="Text for Analysis",
                lines=6,
                placeholder="Enter Northern Sotho (Sepedi) or English text...",
                max_lines=15,
            )
            analyze_btn = gr.Button("Analyze Text", variant="primary")
        with gr.Column():
            research_output = gr.JSON(label="Detailed Analysis Results")

    analyze_btn.click(
        fn=detailed_analysis,
        inputs=research_text,
        outputs=research_output,
    )

    gr.Markdown("""
    ### 🗣️ About Northern Sotho (Sepedi) Language

    **Northern Sotho (Sepedi)** is a Bantu language spoken by millions of people, primarily in:

    - 🇿🇦 **South Africa** – Official language

    **Key Linguistic Features:**

    - **Language Family**: Niger-Congo → Bantu → Sotho-Tswana
    - **Script**: Latin alphabet
    - **Characteristics**: Agglutinative, noun-class system
    - **ISO Code**: nso (ISO 639-2/3)
    """)


def create_gradio_interface():
    """Assemble the complete Gradio UI for the translation tool.

    The page consists of a logo/header, three tabs (single translation,
    batch processing, research tools) and a footer with model information,
    privacy notes, acknowledgments and the citation.

    Returns:
        The constructed ``gr.Blocks`` app. It is not launched here.
    """
    with gr.Blocks(
        title="🔬 Northern Sotho-English Linguistic Translation Tool",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;}
        .main-header {text-align: center; padding: 2rem 0;}
        .dsfsi-logo {text-align: center; margin-bottom: 1rem;}
        .dsfsi-logo img {max-width: 300px; height: auto;}
        .metric-table {font-size: 0.9em;}
        """
    ) as demo:
        gr.HTML("""
        <div class="dsfsi-logo">
            <img src="https://www.dsfsi.co.za/images/logo_transparent_expanded.png" alt="DSFSI Logo" />
        </div>
        <div class="main-header">
            <h1>🔬 Northern Sotho-English Linguistic Translation Tool</h1>
            <p style="font-size: 1.1em; color: #666; max-width: 800px; margin: 0 auto;">
                AI-powered translation system for Northern Sotho (Sepedi) to English with detailed linguistic analysis, designed for linguists, researchers, and language documentation projects.
            </p>
        </div>
        """)

        with gr.Tabs():
            with gr.Tab("Translation & Analysis"):
                _build_translation_tab()
            with gr.Tab("Batch Processing"):
                _build_batch_tab()
            with gr.Tab("🔬 Research Tools"):
                _build_research_tab()

        gr.Markdown("""
        ---

        ### Model Information & Citation

        **Model Used:** [`dsfsi/nso-en-m2m100-gov`](https://huggingface.co/dsfsi/nso-en-m2m100-gov)

        Based on Meta's M2M100, fine-tuned specifically for Northern Sotho-English by the **Data Science for Social Impact Research Group**.

        **Training Data:** Vuk'uzenzele and ZA-gov-multilingual South African corpora.

        ### Privacy & Security

        - No conversation history stored
        - Uploaded files deleted after processing
        - All processing in isolated temporary environments
        - No user data persistence

        ### Acknowledgments

        We thank **Thapelo Sindani** and **Zion Nia Van Wyk** for their assistance in creating this space.

        ### Citation

        ```bibtex
        @inproceedings{lastrucci-etal-2023-preparing,
            title = "Preparing the Vuk'uzenzele and ZA-gov-multilingual South African multilingual corpora",
            author = "Richard Lastrucci and Isheanesu Dzingirai and Jenalea Rajab
                      and Andani Madodonga and Matimba Shingange and Daniel Njini and Vukosi Marivate",
            booktitle = "Proceedings of the Fourth workshop on Resources for African Indigenous Languages (RAIL 2023)",
            pages = "18--25",
            year = "2023"
        }
        ```

        **Links**:

        - [DSFSI](https://www.dsfsi.co.za/)
        - [Model](https://huggingface.co/dsfsi/nso-en-m2m100-gov)
        - [Vuk'uzenzele Data](https://github.com/dsfsi/vukuzenzele-nlp)
        - [ZA-gov Data](https://github.com/dsfsi/gov-za-multilingual)
        - [Research Feedback](https://docs.google.com/forms/d/e/1FAIpQLSf7S36dyAUPx2egmXbFpnTBuzoRulhL5Elu-N1eoMhaO7v10w/viewform)

        ---

        **Built for the African NLP community**
        """)
    return demo
if __name__ == "__main__":
    # Script entry point: build the interface, then serve it on every
    # network interface at port 7860 with a share link and visible errors.
    launch_options = {
        "share": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    demo = create_gradio_interface()
    demo.launch(**launch_options)