"""Gradio app: describe an uploaded image in Vietnamese.

Pipeline of two models:
  1. BLIP  — generates an English caption for the image.
  2. NLLB  — translates the English caption to Vietnamese.
"""

import gradio as gr
import torch
from PIL import Image
from transformers import (
    AutoModelForSeq2SeqLM,
    BlipForConditionalGeneration,
    BlipProcessor,
    NllbTokenizer,
)

# Model 1: English image captioning.
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)

# Model 2: machine translation EN -> VI (NLLB-200).
translator_tokenizer = NllbTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
translator_model = AutoModelForSeq2SeqLM.from_pretrained(
    "facebook/nllb-200-distilled-600M"
)

# NLLB reads the source language from the tokenizer attribute; passing it as a
# __call__ kwarg (the original code) is not the supported API.
translator_tokenizer.src_lang = "eng_Latn"

lang_code = "vie_Latn"
# Token id of the target-language tag; generation is forced to start with it so
# the decoder produces Vietnamese.
lang_token_id = translator_tokenizer.convert_tokens_to_ids(lang_code)


def caption_translate(image: Image.Image) -> str:
    """Caption *image* in English with BLIP, then translate it to Vietnamese.

    Args:
        image: PIL image supplied by the Gradio ``Image`` input.

    Returns:
        The Vietnamese caption as a plain string.
    """
    # Step 1: English caption. no_grad() skips autograd bookkeeping at inference.
    with torch.no_grad():
        inputs = blip_processor(image, return_tensors="pt")
        out = blip_model.generate(**inputs)
    eng_caption = blip_processor.decode(out[0], skip_special_tokens=True)

    # Step 2: translate EN -> VI; forced_bos_token_id selects the output language.
    with torch.no_grad():
        enc = translator_tokenizer(eng_caption, return_tensors="pt")
        translated = translator_model.generate(
            **enc,
            forced_bos_token_id=lang_token_id,
            max_length=100,
        )
    return translator_tokenizer.decode(translated[0], skip_special_tokens=True)


iface = gr.Interface(
    fn=caption_translate,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="🧠 AI Mô Tả Hình Ảnh Bằng Tiếng Việt",
    description="Upload ảnh, hệ thống sẽ mô tả nội dung bằng tiếng Việt bằng cách kết hợp 2 mô hình: caption → translate",
)

# Guard so importing this module (e.g. for tests) does not start the web server.
if __name__ == "__main__":
    iface.launch()