Spaces:
Runtime error
Runtime error
File size: 2,703 Bytes
355d287 a1fde91 dcdb448 355d287 1e1d66d 84f3f84 355d287 a1fde91 94c8468 355d287 94c8468 b07e7dc 94c8468 355d287 94c8468 355d287 a1fde91 917196e d1ca20c 176961e a1fde91 176961e da13932 176961e 7d5aedc 176961e 7119ad5 7d5aedc 176961e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 |
import os

# BUG FIX: this environment variable is read by TensorFlow at import time,
# so it must be set BEFORE `import tensorflow` — previously it was set
# after the import and therefore had no effect.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

import re

import gradio as gr
import streamlit as st
import tensorflow as tf
import torch
from transformers import AutoTokenizer, ViTFeatureExtractor, VisionEncoderDecoderModel

# st.title("Image Caption Generator")
# --- One-time model setup (runs at import) ---
# Inference is done on CPU only.
device = "cpu"

# Encoder, decoder, and the combined model all come from the same
# pretrained image-captioning checkpoint.
encoder_checkpoint = decoder_checkpoint = model_checkpoint = "nlpconnect/vit-gpt2-image-captioning"

# Progress markers: model downloads can be slow on first run, so each
# stage is announced before it starts.
print("------------------------- 1 -------------------------\n")
feature_extractor = ViTFeatureExtractor.from_pretrained(encoder_checkpoint)
print("------------------------- 2 -------------------------\n")
tokenizer = AutoTokenizer.from_pretrained(decoder_checkpoint)
print("------------------------- 3 -------------------------\n")
model = VisionEncoderDecoderModel.from_pretrained(model_checkpoint).to(device)
print("------------------------- 4 -------------------------\n")
def predict(image, max_length=64, num_beams=4):
    """Generate a text caption for a PIL image.

    Args:
        image: a PIL image (any mode; converted to RGB internally).
        max_length: maximum number of tokens to generate.
        num_beams: beam-search width passed to ``model.generate``.

    Returns:
        The caption as a single-line string with special tokens stripped.
    """
    image = image.convert('RGB')
    pixel_values = feature_extractor(image, return_tensors="pt").pixel_values.to(device)
    # BUG FIX: num_beams was previously accepted but never forwarded to
    # generate(), so beam search silently never ran.
    caption_ids = model.generate(pixel_values, max_length=max_length, num_beams=num_beams)[0]
    # Strip the GPT-2 end-of-text marker and keep only the first line.
    caption_text = tokenizer.decode(caption_ids).replace('<|endoftext|>', '').split('\n')[0]
    return caption_text
print("------------------------- 5 -------------------------\n")
input = gr.inputs.Image(label="Upload any Image", type = 'pil', optional=True)
output = gr.outputs.Textbox(type="text",label="Captions")
examples = ["example1.jpg"]
print("------------------------- 6 -------------------------\n")
title = "Image to Text ViT with LORA"
# interface = gr.Interface(
# fn=predict,
# description=description,
# inputs = input,
# theme="grass",
# outputs=output,
# examples=examples,
# title=title,
# )
# interface.launch(debug=True)
# Build and launch the Gradio UI. BUG FIXES: the removed gr.inputs/gr.outputs
# API is replaced with modern components, and the components are now actually
# wired to `predict` via a button (previously no callback was registered, so
# the app could never produce a caption). The <h1> text was leftover from an
# unrelated TextDiffuser demo and now matches this app's title.
with gr.Blocks() as demo:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
        <h1 style="font-weight: 900; font-size: 3rem; margin: 0rem">
        Image to Text ViT with LORA
        </h1>
        <h2 style="text-align: left; font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
        We propose <b>Image to Text</b>, with ViT model but with LORA fine-tuning.
        </h2>
        </div>
        """)
    image_input = gr.Image(label="Upload any Image", type='pil')
    caption_output = gr.Textbox(label="Captions")
    caption_button = gr.Button("Generate Caption")
    # Connect the captioning function to the UI.
    caption_button.click(fn=predict, inputs=image_input, outputs=caption_output)
demo.launch()