# 🔬 Northern Sotho-English Linguistic Translation Tool
AI-powered translation system for Northern Sotho (Sepedi) to English with detailed linguistic analysis, designed for linguists, researchers, and language documentation projects.
Model loading, single-text translation, and the linguistic analysis helpers:

```python
import gradio as gr
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import pandas as pd
import time
import re
import tempfile
import shutil
import os
import uuid

# Model loading
model_name = "dsfsi/nso-en-m2m100-gov"
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)
tokenizer.src_lang = "ns"  # M2M100 language code for Northern Sotho
model.config.forced_bos_token_id = tokenizer.get_lang_id("en")

# Translation function (single)
def translate_nso_en(text):
    if not text.strip():
        return "Please enter Northern Sotho (Sepedi) text."
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    translated_tokens = model.generate(
        **inputs,
        max_length=512,
        forced_bos_token_id=tokenizer.get_lang_id("en")
    )
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

# Linguistic analysis
def calculate_metrics(text):
    words = text.split()
    word_count = len(words)
    char_count = len(text)
    sentence_count = len([s for s in re.split(r'[.!?]+', text) if s.strip()])
    unique_words = len(set(words))
    avg_word_length = sum(len(w) for w in words) / word_count if word_count else 0
    lexical_div = unique_words / word_count if word_count else 0
    return {
        'char_count': char_count,
        'word_count': word_count,
        'sentence_count': sentence_count,
        'unique_words': unique_words,
        'avg_word_length': avg_word_length,
        'lexical_diversity': lexical_div
    }

def create_metrics_table(src_metrics, tgt_metrics):
    data = {
        'Metric': ['Words', 'Characters', 'Sentences', 'Unique Words',
                   'Avg Word Length', 'Lexical Diversity'],
        'Source Text': [
            src_metrics.get('word_count', 0),
            src_metrics.get('char_count', 0),
            src_metrics.get('sentence_count', 0),
            src_metrics.get('unique_words', 0),
            f"{src_metrics.get('avg_word_length', 0):.1f}",
            f"{src_metrics.get('lexical_diversity', 0):.3f}"
        ],
        'Target Text': [
            tgt_metrics.get('word_count', 0),
            tgt_metrics.get('char_count', 0),
            tgt_metrics.get('sentence_count', 0),
            tgt_metrics.get('unique_words', 0),
            f"{tgt_metrics.get('avg_word_length', 0):.1f}",
            f"{tgt_metrics.get('lexical_diversity', 0):.3f}"
        ]
    }
    return pd.DataFrame(data)

def translate_and_analyze(text):
    if not text.strip():
        return ("Please enter Northern Sotho (Sepedi) text.",
                "No analysis available.",
                create_metrics_table({}, {}))
    start = time.time()
    translated = translate_nso_en(text)
    src_metrics = calculate_metrics(text)
    tgt_metrics = calculate_metrics(translated)
    elapsed = time.time() - start
    report = f"""## 📊 Linguistic Analysis Report

### Translation Details
- **Processing Time**: {elapsed:.2f} seconds

### Text Complexity Metrics

| Metric | Source | Target | Ratio |
|--------|--------|--------|-------|
| Word Count | {src_metrics.get('word_count', 0)} | {tgt_metrics.get('word_count', 0)} | {tgt_metrics.get('word_count', 0) / max(src_metrics.get('word_count', 1), 1):.2f} |
| Character Count | {src_metrics.get('char_count', 0)} | {tgt_metrics.get('char_count', 0)} | {tgt_metrics.get('char_count', 0) / max(src_metrics.get('char_count', 1), 1):.2f} |
| Sentence Count | {src_metrics.get('sentence_count', 0)} | {tgt_metrics.get('sentence_count', 0)} | {tgt_metrics.get('sentence_count', 0) / max(src_metrics.get('sentence_count', 1), 1):.2f} |
| Avg Word Length | {src_metrics.get('avg_word_length', 0):.1f} | {tgt_metrics.get('avg_word_length', 0):.1f} | {tgt_metrics.get('avg_word_length', 0) / max(src_metrics.get('avg_word_length', 1), 1):.2f} |
| Lexical Diversity | {src_metrics.get('lexical_diversity', 0):.3f} | {tgt_metrics.get('lexical_diversity', 0):.3f} | {tgt_metrics.get('lexical_diversity', 0) / max(src_metrics.get('lexical_diversity', 0.001), 0.001):.2f} |
"""
    table = create_metrics_table(src_metrics, tgt_metrics)
    return translated, report, table
```
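For a quick sanity check outside the UI, you can call the translation function directly. A minimal sketch, assuming the checkpoint above has downloaded cleanly (the English gloss shown is approximate):

```python
# Quick smoke test using one of the example sentences below.
print(translate_nso_en("Ke leboga thušo ya gago."))
# → roughly "Thank you for your help."
```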
Batch processing accepts `.txt` and `.csv` uploads, with a per-request size cap and temporary-file isolation:

```python
# Batch processing
def secure_batch_processing(file_obj):
    if file_obj is None:
        return "Please upload a file.", pd.DataFrame()
    temp_dir = None
    try:
        # Isolate each upload in its own temporary directory
        session_id = str(uuid.uuid4())
        temp_dir = tempfile.mkdtemp(prefix=f"translation_{session_id}_")
        file_ext = os.path.splitext(file_obj.name)[1].lower()
        if file_ext not in ['.txt', '.csv']:
            return "Only .txt and .csv files are supported.", pd.DataFrame()
        temp_file_path = os.path.join(temp_dir, f"upload_{session_id}{file_ext}")
        shutil.copy2(file_obj.name, temp_file_path)
        texts = []
        if file_ext == '.csv':
            df = pd.read_csv(temp_file_path)
            if df.empty:
                return "The uploaded CSV file is empty.", pd.DataFrame()
            # Only the first column is translated
            texts = df.iloc[:, 0].dropna().astype(str).tolist()
        else:
            with open(temp_file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            texts = [line.strip() for line in content.split('\n') if line.strip()]
        if not texts:
            return "No text found in the uploaded file.", pd.DataFrame()
        # Cap the batch size to keep response times reasonable
        max_batch_size = 10
        if len(texts) > max_batch_size:
            texts = texts[:max_batch_size]
            warning_msg = f"Processing limited to first {max_batch_size} entries for performance."
        else:
            warning_msg = ""
        results = []
        for i, text in enumerate(texts):
            if len(text.strip()) == 0:
                continue
            if len(text) > 1000:
                text = text[:1000] + "..."
            translated = translate_nso_en(text)
            results.append({
                'Index': i + 1,
                'Original': text[:100] + '...' if len(text) > 100 else text,
                'Translation': translated[:100] + '...' if len(translated) > 100 else translated
            })
        if not results:
            return "No valid text entries found to translate.", pd.DataFrame()
        results_df = pd.DataFrame(results)
        summary = f"Successfully processed {len(results)} text entries."
        if warning_msg:
            summary = f"{summary} {warning_msg}"
        return summary, results_df
    except Exception as e:
        return f"Error processing file: {str(e)}", pd.DataFrame()
    finally:
        # Always remove the temporary directory, even on failure
        if temp_dir and os.path.exists(temp_dir):
            try:
                shutil.rmtree(temp_dir)
            except Exception as e:
                print(f"Warning: Could not clean up temporary directory: {e}")

# Examples
EXAMPLES = [
    ["Leina la ka ke Vukosi."],
    ["Ke leboga thušo ya gago."],
    ["Re a go amogela mo Pretoria."],
    ["Go tloga ka letšatši la lehono, dilo di tlo kaonafala."],
    ["O swanetše go hwetša thušo ge go kgonega."],
    ["Ngwana o ya sekolong letšatšing le lengwe le le lengwe."]
]

# Research tools
def detailed_analysis(text):
    if not text.strip():
        return {}
    metrics = calculate_metrics(text)
    return {
        "basic_metrics": metrics,
        "text_length": len(text),
        "analysis_completed": True
    }
```
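The batch handler only needs an object with a `.name` attribute pointing at a file, which is what Gradio's `File` component supplies. A minimal sketch for local testing, using a hypothetical `sample_sentences.txt` (one sentence per line) and a `SimpleNamespace` stand-in for the upload object:

```python
from types import SimpleNamespace

# Hypothetical local test; "sample_sentences.txt" is a placeholder path.
upload = SimpleNamespace(name="sample_sentences.txt")
summary, results_df = secure_batch_processing(upload)
print(summary)
print(results_df.to_string(index=False))
```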
Finally, the Gradio interface; the header markup reuses the `.main-header` style defined in the CSS:

```python
def create_gradio_interface():
    with gr.Blocks(
        title="🔬 Northern Sotho-English Linguistic Translation Tool",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;}
        .main-header {text-align: center; padding: 2rem 0;}
        .dsfsi-logo {text-align: center; margin-bottom: 1rem;}
        .dsfsi-logo img {max-width: 300px; height: auto;}
        .metric-table {font-size: 0.9em;}
        """
    ) as demo:
        # Page header
        gr.HTML("""
        <div class="main-header">
            <h1>🔬 Northern Sotho-English Linguistic Translation Tool</h1>
            <p>AI-powered translation system for Northern Sotho (Sepedi) to English
            with detailed linguistic analysis, designed for linguists, researchers,
            and language documentation projects.</p>
        </div>
        """)
```
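A minimal sketch of the entry point, assuming `create_gradio_interface` goes on to wire `translate_and_analyze`, `secure_batch_processing`, and `EXAMPLES` into the layout and returns the `demo` Blocks object:

```python
# Standard Gradio entry point; assumes create_gradio_interface()
# returns the Blocks object built above.
if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.launch()
```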