Spaces:

nttdataspain
/

Image-To-Text-Lora-ViT

Runtime error

File size: 3,603 Bytes

355d287
 
a1fde91
dcdb448
 
355d287
1e1d66d
 
 
84f3f84
355d287
a1fde91
 
 
94c8468
355d287
94c8468
b07e7dc
94c8468
355d287
94c8468
355d287
 
 
a1fde91
 
 
 
 
 
 
917196e
 
 
 
 
d1ca20c
 
389eeec
47fb79d
 
 
 
389eeec
176961e
a1fde91
176961e

import torch 
import re 
import gradio as gr
import streamlit as st
# st.title("Image Caption Generator")
from transformers import AutoTokenizer, ViTFeatureExtractor, VisionEncoderDecoderModel 
import os

# The oneDNN switch is an environment variable that TensorFlow consults while
# it initialises, so it has to be in place *before* the tensorflow import;
# assigning it afterwards comes too late to influence that initialisation.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

import tensorflow as tf  # noqa: E402 - import intentionally follows the env setup

# Inference target; every tensor and the model itself are moved here.
device = 'cpu'

# The feature extractor, tokenizer and model weights all come from the same
# Hub repository, but each keeps its own name so they can diverge later.
encoder_checkpoint = decoder_checkpoint = model_checkpoint = (
    "nlpconnect/vit-gpt2-image-captioning"
)

# The numbered banners are start-up progress markers printed between the
# individual loading steps.
print("------------------------- 1 -------------------------\n")
feature_extractor = ViTFeatureExtractor.from_pretrained(encoder_checkpoint)

print("------------------------- 2 -------------------------\n")
tokenizer = AutoTokenizer.from_pretrained(decoder_checkpoint)

print("------------------------- 3 -------------------------\n")
model = VisionEncoderDecoderModel.from_pretrained(model_checkpoint)
model = model.to(device)

print("------------------------- 4 -------------------------\n")


def predict(image, max_length=64, num_beams=4):
  """Generate a one-line caption for an image.

  Args:
    image: Image object exposing ``convert('RGB')`` (the Gradio image
      components in this file are configured with ``type='pil'``), or
      ``None`` when no image was supplied.
    max_length: Forwarded to ``model.generate`` as ``max_length``.
    num_beams: Forwarded to ``model.generate`` as ``num_beams``.

  Returns:
    The decoded caption with every ``<|endoftext|>`` marker removed and
    everything after the first newline dropped. An empty string is returned
    when ``image`` is ``None``.
  """
  # The upload widget is optional, so the callback may be invoked without an
  # image; return an empty caption instead of failing on ``None.convert``.
  if image is None:
    return ''

  rgb_image = image.convert('RGB')
  pixel_values = feature_extractor(rgb_image, return_tensors="pt").pixel_values.to(device)

  # ``num_beams`` used to be accepted by this function but silently ignored;
  # it is now handed to the generator together with ``max_length``.
  caption_ids = model.generate(
      pixel_values,
      max_length=max_length,
      num_beams=num_beams,
  )[0]

  decoded = tokenizer.decode(caption_ids)
  # Strip end-of-text markers and keep only the first line of the output.
  return decoded.replace('<|endoftext|>', '').split('\n')[0]

print("------------------------- 5 -------------------------\n")
# ``gr.inputs`` / ``gr.outputs`` are legacy namespaces that newer Gradio
# releases no longer ship. The top-level component classes are available in
# every release that provides ``gr.Blocks`` (which this file relies on), so
# they are used instead. The legacy ``optional`` flag has no counterpart on
# ``gr.Image`` and is therefore dropped.
# NOTE: ``input`` shadows the builtin of the same name; the module-level name
# is kept unchanged so existing references to it keep working.
input = gr.Image(label="Upload any Image", type='pil')
output = gr.Textbox(label="Captions")
# Example image path(s) intended for the UI.
examples = ["example1.jpg"]
print("------------------------- 6 -------------------------\n")
title = "Image to Text ViT with LORA"

# Placeholder Markdown description.
description = """
        # This is a Heading
        This is a paragraph.
        - Item 1
        - Item 2
        """
# interface = gr.Interface(
            
#         fn=predict,
#         description=description,
#         inputs = input,
#         theme="grass",
#         outputs=output,
#         examples=examples,
#         title=title,
#     )
# interface.launch(debug=True)

# Blocks-based UI: a header, an image upload, a caption textbox and a button
# that runs ``predict`` on the uploaded image.
with gr.Blocks() as demo:

    # Header shows this app's own title and model; the previous markup was
    # boilerplate describing an unrelated project (TextDiffuser).
    gr.HTML(
        f"""
        <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
        <h1 style="font-weight: 900; font-size: 3rem; margin: 0rem">
            {title}
        </h1>
        <h2 style="text-align: left; font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
        Upload an image and the <b>{model_checkpoint}</b> vision encoder-decoder model will generate a short caption for it.
        </h2>
        </div>
        """)

    # Top-level component classes replace the legacy gr.inputs / gr.outputs
    # namespaces, which newer Gradio releases no longer provide.
    image_input = gr.Image(label="Upload any Image", type='pil')
    caption_output = gr.Textbox(label="Captions")
    caption_button = gr.Button("Generate caption")

    # Without an event listener the components are purely decorative; this
    # connects the button to the captioning function.
    caption_button.click(fn=predict, inputs=image_input, outputs=caption_output)

demo.launch()