Krishna086 committed on
Commit
b912ba6
·
verified ·
1 Parent(s): 891f160

Create translation.py

Browse files
Files changed (1) hide show
  1. translation.py +41 -0
translation.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
from transformers import MarianTokenizer, MarianMTModel
2
+
3
# Display name -> language code used to build the MarianMT checkpoint name
# ("Helsinki-NLP/opus-mt-{src}-{tgt}").
# NOTE(review): listing a language here does not by itself guarantee that a
# checkpoint exists for every source/target pair -- confirm against the hub.
LANGUAGES = {
    "English": "en",
    "French": "fr",
    "Spanish": "es",
    "German": "de",
    "Chinese": "zh",
    "Arabic": "ar",
    "Russian": "ru",
    "Hindi": "hi",
    "Japanese": "ja",
}
15
+
16
# Cache the loaded tokenizer/model so repeated calls with the same language
# pair do not reload them. Requires `st` (presumably Streamlit) to be in scope.
@st.cache_resource
def load_model(src_lang, tgt_lang):
    """Load the MarianMT tokenizer and model for a language pair.

    Args:
        src_lang: Source language code inserted into the checkpoint name
            (e.g. ``"en"``).
        tgt_lang: Target language code inserted into the checkpoint name
            (e.g. ``"fr"``).

    Returns:
        A ``(tokenizer, model)`` tuple loaded from
        ``Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}``.

    Raises:
        Exception: If either the tokenizer or the model cannot be loaded.
            The underlying error is chained as ``__cause__``.
    """
    model_name = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
    try:
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
    except Exception as e:
        # Keep the generic exception type callers already handle, but chain
        # the original error so the real cause stays in the traceback.
        raise Exception(
            f"Model for {src_lang} to {tgt_lang} not available: {e}"
        ) from e
    return tokenizer, model
27
+
28
def translate(text, source_lang, target_lang):
    """Translate text from the source language to the target language.

    Args:
        text: The text to translate.
        source_lang: Display name of the source language; must be a key of
            ``LANGUAGES`` (e.g. ``"English"``).
        target_lang: Display name of the target language; must be a key of
            ``LANGUAGES``.

    Returns:
        The translated string. If ``text`` is empty or only whitespace, the
        prompt ``"Please provide text to translate."`` is returned instead.
        If both languages map to the same code, ``text`` is returned as is.

    Raises:
        ValueError: If ``source_lang`` or ``target_lang`` is not a key of
            ``LANGUAGES``.
        Exception: Propagated from ``load_model`` when the model for the
            pair cannot be loaded.
    """
    # Whitespace-only input has nothing to translate either.
    if not text or not text.strip():
        return "Please provide text to translate."

    # Map display names to language codes, rejecting unknown names up front
    # instead of building a checkpoint name that contains "None".
    src_code = LANGUAGES.get(source_lang)
    tgt_code = LANGUAGES.get(target_lang)
    if src_code is None or tgt_code is None:
        unknown = source_lang if src_code is None else target_lang
        raise ValueError(f"Unsupported language: {unknown!r}")

    # Same language on both sides: nothing to translate, so skip loading a
    # model for an identical source/target pair.
    if src_code == tgt_code:
        return text

    tokenizer, model = load_model(src_code, tgt_code)
    # Input is truncated to at most 400 tokens before generation.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=400,
    )
    translated = model.generate(**inputs)
    return tokenizer.decode(translated[0], skip_special_tokens=True)