pratik-19 committed on
Commit
86c8656
·
1 Parent(s): 54609e0

summarization and translation

Browse files
Files changed (1) hide show
  1. app.py +97 -0
app.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
3
+ from langdetect import detect
4
+
5
def load_models(model_name="facebook/mbart-large-50"):
    """Load the tokenizer and model once and build both pipelines around them.

    Args:
        model_name: Hugging Face Hub id (or local path) of the seq2seq
            checkpoint to load. Defaults to the checkpoint this app has
            always used, so existing zero-argument callers are unaffected.

    Returns:
        A ``(tokenizer, summarizer, translator)`` tuple. Both pipelines are
        built from the same model and tokenizer objects.
    """
    # NOTE(review): the default id looks like the pretrained base checkpoint
    # rather than a task fine-tuned one; confirm whether a fine-tuned variant
    # (e.g. "facebook/mbart-large-50-many-to-many-mmt") is what is wanted.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
    translator = pipeline("translation", model=model, tokenizer=tokenizer)
    return tokenizer, summarizer, translator
11
+
12
import streamlit as st

# Streamlit re-executes this script from the top on every widget interaction.
# Routing the call through st.cache_resource keeps one loaded copy of the
# tokenizer and pipelines alive across reruns instead of reloading the model
# each time the button is pressed.
tokenizer, summarizer, translator = st.cache_resource(load_models)()

# Maps the language codes returned by detect() to the language codes the
# model's tokenizer expects. Only these languages are processed by the app.
LANGUAGE_CODES = {
    "en": "en_XX",  # English
    "fr": "fr_XX",  # French
    "de": "de_DE",  # German
    "ru": "ru_RU",  # Russian
    "hi": "hi_IN",  # Hindi
    "mr": "mr_IN",  # Marathi
    "ja": "ja_XX",  # Japanese
}
25
+
26
def detect_language(text):
    """Return the language code that ``detect`` reports for *text*."""
    return detect(text)
29
+
30
+
31
def summarize_text(text, lang_code):
    """Generate a short summary of *text* in the text's own language.

    Args:
        text: The text to summarize. Input longer than 1024 tokens is
            truncated before generation.
        lang_code: Language code of *text* as returned by
            ``detect_language``. Codes missing from ``LANGUAGE_CODES`` fall
            back to English (``en_XX``).

    Returns:
        The decoded summary string with special tokens stripped.
    """
    mbart_lang_code = LANGUAGE_CODES.get(lang_code, "en_XX")  # Default to English if unsupported

    # The source language has to be set on the tokenizer so it emits the real
    # language token; writing "<xx_XX>" into the text only adds literal
    # characters to the input and leaves the tokenizer on its default language.
    tokenizer.src_lang = mbart_lang_code
    inputs = tokenizer(
        text,
        return_tensors="pt",
        max_length=1024,
        truncation=True,
    )

    summary_ids = summarizer.model.generate(
        **inputs,  # forwards the attention mask together with the input ids
        # Start decoding with the source language token so the summary stays
        # in the same language as the input.
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(mbart_lang_code),
        max_length=100,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
48
+
49
+
50
def translate_to_english(text, lang_code):
    """Translate *text* from its source language into English.

    Args:
        text: The text to translate. Input longer than 1024 tokens is
            truncated before generation.
        lang_code: Language code of *text* as returned by
            ``detect_language``. Codes missing from ``LANGUAGE_CODES`` are
            treated as English (``en_XX``).

    Returns:
        The decoded English text with special tokens stripped.
    """
    mbart_lang_code = LANGUAGE_CODES.get(lang_code, "en_XX")  # Default to English if unsupported

    # Declare the source language through the tokenizer instead of pasting a
    # literal "<xx_XX>" marker into the text, which is tokenized as ordinary
    # characters and does not select the language.
    tokenizer.src_lang = mbart_lang_code
    inputs = tokenizer(
        text,
        return_tensors="pt",
        max_length=1024,
        truncation=True,
    )

    translated_ids = translator.model.generate(
        **inputs,  # forwards the attention mask together with the input ids
        # Without a forced first token the target language is never specified;
        # pin it to English, which is what this function promises.
        forced_bos_token_id=tokenizer.convert_tokens_to_ids("en_XX"),
        max_length=100,
        length_penalty=2.0,
        num_beams=4,
    )
    return tokenizer.decode(translated_ids[0], skip_special_tokens=True)
66
+
67
+
68
def _process_text(text):
    """Detect the language of *text*, then render its summary and translation.

    Every failure path ends in a Streamlit warning or error message rather
    than an uncaught exception.
    """
    if not text.strip():
        st.warning("Please enter some text to process.")
        return

    # Detection is guarded separately: langdetect raises when the input has
    # no usable features (for example digits or punctuation only), and that
    # should be reported to the user instead of crashing the run.
    try:
        lang_code = detect_language(text)
    except Exception as e:
        st.error(f"Could not detect the language of the input: {e}")
        return

    st.write(f"**Detected Language Code:** {lang_code}")

    if lang_code not in LANGUAGE_CODES:
        st.warning(f"The detected language ({lang_code}) is not supported by the model.")
        return

    try:
        # The summary is written out before translating so it stays visible
        # even if the translation step fails afterwards.
        summary = summarize_text(text, lang_code)
        st.write(f"### Summarized Text ({lang_code}):")
        st.write(summary)

        translation = translate_to_english(summary, lang_code)
        st.write("### Translated Text (English):")
        st.write(translation)
    except Exception as e:
        st.error(f"An error occurred during processing: {e}")


st.title("Multilingual Summarization and Translation App")
st.markdown("""This app detects the language of the input text, summarizes it in the same language, and translates it into English.""")

user_input = st.text_area("Enter text in any language:", "")

if st.button("Process Text"):
    _process_text(user_input)