import gradio as gr
from transformers import BlipProcessor, BlipForConditionalGeneration, NllbTokenizer, AutoModelForSeq2SeqLM
from PIL import Image
import torch
# Load model 1: English image captioning
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
# Load model 2: Translate EN → VI
# Set the source language when loading the tokenizer: NLLB's __call__
# silently ignores a src_lang keyword argument
translator_tokenizer = NllbTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M", src_lang="eng_Latn"
)
translator_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
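
# Optional GPU placement (a sketch, not part of the original app): move both
# models to CUDA when available. The tensors returned by the processors below
# would then also need .to(device) before generate().
# device = "cuda" if torch.cuda.is_available() else "cpu"
# blip_model.to(device)
# translator_model.to(device)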
lang_code = "vie_Latn"
# NLLB language codes are special tokens in the tokenizer's vocabulary,
# so the target-language token ID can be looked up directly
lang_token_id = translator_tokenizer.convert_tokens_to_ids(lang_code)
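
# Optional sanity check (an assumption, not in the original): a misspelled
# language code would map to <unk>, and generation would then silently
# target the wrong language.
# assert lang_token_id != translator_tokenizer.unk_token_id, "unknown lang code"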
def caption_translate(image):
    # Step 1: generate an English caption with BLIP
    inputs = blip_processor(image, return_tensors="pt")
    with torch.no_grad():
        out = blip_model.generate(**inputs)
    eng_caption = blip_processor.decode(out[0], skip_special_tokens=True)

    # Step 2: translate the caption to Vietnamese with NLLB
    # (the source language was set on the tokenizer above)
    inputs = translator_tokenizer(eng_caption, return_tensors="pt")
    with torch.no_grad():
        translated = translator_model.generate(
            **inputs,
            forced_bos_token_id=lang_token_id,  # force decoding to start in Vietnamese
            max_length=100,
        )
    vi_caption = translator_tokenizer.decode(translated[0], skip_special_tokens=True)
    return vi_caption
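
# Quick local smoke test (hypothetical; assumes a file "example.jpg" exists
# next to this script):
# print(caption_translate(Image.open("example.jpg")))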
# Gradio UI. Title/description are Vietnamese:
# "AI Describes Images in Vietnamese" / "Upload an image and the system will
# describe its content in Vietnamese by chaining two models: caption → translate"
iface = gr.Interface(
    fn=caption_translate,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="🧠 AI Mô Tả Hình Ảnh Bằng Tiếng Việt",
    description="Upload ảnh, hệ thống sẽ mô tả nội dung bằng tiếng Việt bằng cách kết hợp 2 mô hình: caption → translate",
)

iface.launch()
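# On Spaces the default launch() is enough; for local testing,
# iface.launch(share=True) would also expose a temporary public URL (optional).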