Spaces:
Sleeping
Sleeping
| import spaces | |
| import gradio as gr | |
| from transformers import AutoTokenizer, AutoModelForSeq2SeqLM | |
| from flores import code_mapping | |
| import platform | |
| import torch | |
| import nltk | |
# Sentence-splitting data used by nltk.sent_tokenize. Newer NLTK releases
# read the "punkt_tab" resource instead of the pickled "punkt" models, so
# both are fetched to keep tokenisation working regardless of NLTK version.
nltk.download("punkt")
nltk.download("punkt_tab")

# NOTE(review): every non-macOS host is assumed to expose a CUDA device —
# confirm this holds for the deployment target.
device = "cpu" if platform.system() == "Darwin" else "cuda"
MODEL_NAME = "facebook/nllb-200-3.3B"

# Re-order the mapping by its values (the language codes handed to the
# tokenizer); its keys are the entries offered in the language dropdowns.
code_mapping = dict(sorted(code_mapping.items(), key=lambda item: item[1]))
flores_codes = list(code_mapping.keys())
def load_model():
    """Load the pretrained ``MODEL_NAME`` seq2seq model and move it to ``device``."""
    return AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)


# Built once at import time; ``translate`` reuses this instance on every call.
model = load_model()
def load_tokenizer(src_lang, tgt_lang):
    """Build a ``MODEL_NAME`` tokenizer configured for one language pair.

    Both arguments are keys of ``code_mapping``; the mapped values are what
    the tokenizer receives as its ``src_lang`` and ``tgt_lang``.
    """
    source_code = code_mapping[src_lang]
    target_code = code_mapping[tgt_lang]
    return AutoTokenizer.from_pretrained(
        MODEL_NAME, src_lang=source_code, tgt_lang=target_code
    )
def translate(text: str, src_lang: str, tgt_lang: str):
    """Translate ``text`` from ``src_lang`` to ``tgt_lang`` sentence by sentence.

    The text is split into paragraphs on newlines and each paragraph into
    sentences with NLTK. Every sentence is translated on its own; sentences
    are re-joined with a single space and paragraphs with a newline, so the
    paragraph layout of the input (blank lines included) is preserved.

    Args:
        text: Input text; may hold several newline-separated paragraphs.
        src_lang: Source language, given as a key of ``code_mapping``.
        tgt_lang: Target language, given as a key of ``code_mapping``.

    Returns:
        The translated text.

    Raises:
        gr.Error: If either language is missing or not in ``code_mapping``
            (for example when a dropdown was left unselected).
    """
    # An unselected dropdown arrives as None; report it readably instead of
    # surfacing a bare KeyError from the mapping lookup.
    for label, choice in (("source", src_lang), ("target", tgt_lang)):
        if choice not in code_mapping:
            raise gr.Error(f"Please select a valid {label} language.")

    tokenizer = load_tokenizer(src_lang, tgt_lang)
    # `tokenizer.lang_code_to_id` is deprecated and absent from recent
    # transformers releases. The language code is an ordinary vocabulary
    # token, so it can be looked up directly; this is loop-invariant.
    target_token_id = tokenizer.convert_tokens_to_ids(code_mapping[tgt_lang])

    translated_paragraphs = []
    for paragraph in text.split("\n"):
        translated_sentences = []
        for sentence in nltk.sent_tokenize(paragraph):
            # Keep the encoded batch as tensors and move it to the model's
            # device in one step (input ids together with attention mask).
            encoded = tokenizer(sentence, return_tensors="pt").to(device)
            generated = model.generate(
                **encoded,
                forced_bos_token_id=target_token_id,
                # Allow the output to be up to 50 tokens longer than the input.
                max_length=encoded["input_ids"].shape[1] + 50,
                num_return_sequences=1,
            )
            translated_sentences.append(
                tokenizer.decode(generated[0], skip_special_tokens=True)
            )
        translated_paragraphs.append(" ".join(translated_sentences))
    return "\n".join(translated_paragraphs)
# Introductory blurb rendered as Markdown underneath the page title.
description = """
UNESCO, Meta, and Hugging Face have come together to create an accessible, high-quality translation experience in 200 languages.
This is made possible through an open approach to AI innovation using Meta’s open-sourced No Language Left Behind (NLLB) AI model, hosted on Hugging Face Spaces.
"""

with gr.Blocks() as demo:
    # Page header: title followed by the description.
    gr.Markdown("# UNESCO Language Translator, powered by Meta and Hugging Face")
    gr.Markdown(description)

    # Language pickers, side by side; choices are the keys of code_mapping.
    with gr.Row():
        src_lang = gr.Dropdown(choices=flores_codes, label="Source Language")
        target_lang = gr.Dropdown(choices=flores_codes, label="Target Language")

    # Text to translate.
    with gr.Row():
        input_text = gr.Textbox(lines=6, label="Input Text")

    with gr.Row():
        btn = gr.Button("Translate text")

    # Translation result.
    with gr.Row():
        output = gr.Textbox(lines=6, label="Output Text")

    # Clicking the button runs translate(text, source, target) and writes
    # its return value into the output box.
    btn.click(
        fn=translate,
        inputs=[input_text, src_lang, target_lang],
        outputs=output,
    )

demo.launch()