import torch
import gradio as gr
# ViTFeatureExtractor is the older name for what recent transformers
# releases call ViTImageProcessor; it still works for this checkpoint.
from transformers import AutoTokenizer, ViTFeatureExtractor, VisionEncoderDecoderModel
# Use the GPU when one is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Encoder (ViT), decoder (GPT-2), and the combined model all ship in the
# same checkpoint.
model_checkpoint = "nlpconnect/vit-gpt2-image-captioning"

print("Loading feature extractor...")
feature_extractor = ViTFeatureExtractor.from_pretrained(model_checkpoint)
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
print("Loading model...")
model = VisionEncoderDecoderModel.from_pretrained(model_checkpoint).to(device)
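# Aside: the same checkpoint can also be driven through the higher-level
# transformers pipeline API. A minimal sketch, not used by this app
# (assumes an image file path is passed in):
#
#     from transformers import pipeline
#     captioner = pipeline("image-to-text", model=model_checkpoint)
#     print(captioner("example1.jpg"))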
def predict(image, max_length=64, num_beams=4):
    """Generate a caption for a PIL image with beam search."""
    image = image.convert("RGB")
    pixel_values = feature_extractor(image, return_tensors="pt").pixel_values.to(device)
    caption_ids = model.generate(pixel_values, max_length=max_length, num_beams=num_beams)[0]
    # Drop special tokens such as <|endoftext|> and keep only the first line.
    caption_text = tokenizer.decode(caption_ids, skip_special_tokens=True).split("\n")[0]
    return caption_text
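# Quick offline sanity check for predict(), kept commented out so the Space
# only runs the UI (assumes example1.jpg exists next to this script):
#
#     from PIL import Image
#     print(predict(Image.open("example1.jpg")))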
with gr.Blocks() as demo:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
            <h1 style="font-weight: 900; font-size: 3rem; margin: 0rem">
                Image Caption Generator
            </h1>
            <h2 style="text-align: left; font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
                Upload an image and the model generates a natural-language caption for it.
                The demo runs <a href="https://huggingface.co/nlpconnect/vit-gpt2-image-captioning">nlpconnect/vit-gpt2-image-captioning</a>,
                a <b>VisionEncoderDecoderModel</b> that pairs a ViT image encoder with a GPT-2 text decoder.
            </h2>
        </div>
        """)
    # UI widgets, wired to predict() through a button click.
    description = "NTT Data"
    gr.Markdown(description)
    input_image = gr.Image(label="Upload any Image", type="pil")
    output_caption = gr.Textbox(label="Captions")
    caption_button = gr.Button("Generate Caption")
    caption_button.click(fn=predict, inputs=input_image, outputs=output_caption)
    # Clickable examples; example1.jpg must exist next to this script.
    gr.Examples(examples=["example1.jpg"], inputs=input_image)

demo.launch(debug=True)
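# Dependency note (an assumption, not from the original Space config): the
# packages imported above would need to be listed in requirements.txt, e.g.
#
#     torch
#     transformers
#     gradio
#     Pillow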