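"""Streamlit app that summarizes English text with mBART-50 and, when a
non-English target language is selected, translates the summary with an
Opus-MT (MarianMT) model from Helsinki-NLP."""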
from transformers import MBartForConditionalGeneration, MBart50Tokenizer, MarianMTModel, MarianTokenizer
import streamlit as st

# Load and cache the mBART-50 summarization model and tokenizer; Streamlit
# reruns this script on every interaction, so caching avoids reloading the
# weights each time. MBart50Tokenizer is the tokenizer class that matches the
# 50-language checkpoint, and src_lang marks the input text as English.
@st.cache_resource
def load_summarization_model():
    model = MBartForConditionalGeneration.from_pretrained('facebook/mbart-large-50')
    tokenizer = MBart50Tokenizer.from_pretrained('facebook/mbart-large-50', src_lang='en_XX')
    return model, tokenizer

multilingual_summarization_model, multilingual_summarization_tokenizer = load_summarization_model()

# Display names mapped to mBART-style language codes; the ISO 639-1 prefix
# (before the underscore) is also used to select Opus-MT translation models
LANGUAGES = {
    "English": "en_XX",
    "French": "fr_XX",
    "Spanish": "es_XX",
    "German": "de_DE",
    "Chinese": "zh_CN",
    "Russian": "ru_RU",
    "Arabic": "ar_AR",
    "Portuguese": "pt_PT",
    "Hindi": "hi_IN",
    "Italian": "it_IT",
    "Japanese": "ja_XX",
    "Korean": "ko_KR",
    "Dutch": "nl_NL",
    "Polish": "pl_PL",
    "Turkish": "tr_TR",
    "Swedish": "sv_SE",
    "Greek": "el_EL",
    "Finnish": "fi_FI",
    "Hungarian": "hu_HU",
    "Danish": "da_DK",
    "Norwegian": "no_NO",
    "Czech": "cs_CZ",
    "Romanian": "ro_RO",
    "Thai": "th_TH",
    "Hebrew": "he_IL",
    "Vietnamese": "vi_VN",
    "Indonesian": "id_ID",
    "Malay": "ms_MY",
    "Bengali": "bn_BD",
    "Ukrainian": "uk_UA",
    "Urdu": "ur_PK",
    "Swahili": "sw_KE",
    "Serbian": "sr_SR",
    "Croatian": "hr_HR",
    "Slovak": "sk_SK",
    "Lithuanian": "lt_LT",
    "Latvian": "lv_LV",
    "Estonian": "et_EE",
    "Bulgarian": "bg_BG",
    "Macedonian": "mk_MK",
    "Albanian": "sq_AL",
    "Georgian": "ka_GE",
    "Armenian": "hy_AM",
    "Kazakh": "kk_KZ",
    "Uzbek": "uz_UZ",
    "Tajik": "tg_TJ",
    "Kyrgyz": "ky_KG",
    "Turkmen": "tk_TM"
}

# Get the translation model and tokenizer for a language pair, cached so each
# pair is only loaded once per session. Opus-MT model names use bare ISO 639-1
# codes (e.g. "en", "fr"), so the mBART-style region suffix is stripped first
# ("en_XX" -> "en").
@st.cache_resource
def get_translation_model(source_lang, target_lang):
    src, tgt = source_lang.split("_")[0], target_lang.split("_")[0]
    model_name = f"Helsinki-NLP/opus-mt-{src}-{tgt}"
    try:
        model = MarianMTModel.from_pretrained(model_name)
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        print(f"Loaded translation model {model_name}")
        return model, tokenizer
    except Exception as e:
        print(f"Error loading translation model {model_name}: {e}")
        return None, None
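# For example, get_translation_model("en_XX", "fr_XX") resolves to the
# published "Helsinki-NLP/opus-mt-en-fr" checkpoint. Not every pair listed
# in LANGUAGES has a corresponding Opus-MT model, hence the try/except.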

# Translate text from the source language to the target language
def translate_text(text, source_lang, target_lang):
    model, tokenizer = get_translation_model(source_lang, target_lang)
    if model is None or tokenizer is None:
        return "Translation model error."
    inputs = tokenizer([text], return_tensors="pt", truncation=True)
    # Pass the attention mask along with the input ids; Marian models
    # support at most 512 positions, so cap generation there
    translated_ids = model.generate(**inputs, max_length=512)
    return tokenizer.decode(translated_ids[0], skip_special_tokens=True)

# Summarization function with multi-language support
def summarize_text(text, target_language="English"):
    # Summarize with mBART-50; the input text is assumed to be English
    inputs = multilingual_summarization_tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    summary_ids = multilingual_summarization_model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        # mBART-50 is many-to-many, so the output language must be forced
        # via the first decoder token; the summary is generated in English
        forced_bos_token_id=multilingual_summarization_tokenizer.lang_code_to_id["en_XX"],
        num_beams=6,          # more beams for higher-quality output
        max_length=1024,      # mBART's positional embeddings cap sequences at 1024 tokens
        min_length=400,       # enforce a minimum length for the summary
        length_penalty=1.5,   # values above 1.0 favor longer summaries
        early_stopping=True
    )
    summary = multilingual_summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    print(f"Generated summary in English: {summary}")

    target_lang_code = LANGUAGES.get(target_language, "en_XX")

    # Translate the summary into the target language if it is not English
    if target_lang_code != "en_XX":
        summary = translate_text(summary, "en_XX", target_lang_code)
        print(f"Translated summary to {target_language}: {summary}")

    return summary

# Streamlit interface
st.title("Multi-Language Text Summarization Tool")

text = st.text_area("Input Text (in English)")
target_language = st.selectbox("Target Language for Summary", options=list(LANGUAGES.keys()), index=list(LANGUAGES.keys()).index("English"))

if st.button("Summarize"):
    if text:
        # Show a spinner while the (potentially slow) models run
        with st.spinner("Summarizing..."):
            summary = summarize_text(text, target_language)
        st.subheader("Summary")
        st.write(summary)
    else:
        st.warning("Please enter text to summarize.")
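
# To launch the app (assuming this file is saved as app.py):
#   streamlit run app.py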