import torch
import gradio as gr
from transformers import AutoTokenizer, ViTFeatureExtractor, VisionEncoderDecoderModel

# Use a GPU when available; the model also runs (more slowly) on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# A single checkpoint bundles the ViT encoder, the GPT-2 decoder, and the
# tokenizer, so one identifier serves the feature extractor, tokenizer,
# and model alike.
model_checkpoint = "nlpconnect/vit-gpt2-image-captioning"

feature_extractor = ViTFeatureExtractor.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = VisionEncoderDecoderModel.from_pretrained(model_checkpoint).to(device)


def predict(image, max_length=64, num_beams=4):
    """Caption a PIL image with beam-search decoding."""
    image = image.convert("RGB")
    pixel_values = feature_extractor(image, return_tensors="pt").pixel_values.to(device)
    # Beam search over the decoder; num_beams controls the search width.
    caption_ids = model.generate(pixel_values, max_length=max_length, num_beams=num_beams)[0]
    # skip_special_tokens removes <|endoftext|>; keep only the first line.
    caption_text = tokenizer.decode(caption_ids, skip_special_tokens=True).split("\n")[0]
    return caption_text
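
# Quick standalone check of predict(), kept commented out so it does not
# run on import; a minimal sketch assuming "example1.jpg" (the Gradio
# example used below) sits next to this script:
#
#   from PIL import Image
#   print(predict(Image.open("example1.jpg"), max_length=32, num_beams=3))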

# gr.inputs / gr.outputs were removed in later Gradio releases; the
# top-level gr.Image and gr.Textbox components replace them.
input_image = gr.Image(label="Upload any Image", type="pil")
output_text = gr.Textbox(label="Captions")
examples = ["example1.jpg"]
description = "NTT Data"

interface = gr.Interface(
    fn=predict,
    description=description,
    inputs=input_image,
    outputs=output_text,
    examples=examples,
)

# Wrap the Interface in a Blocks layout so the HTML header is rendered
# above it on the same page.
with gr.Blocks() as demo:

    gr.HTML(
        """
        <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
        <h1 style="font-weight: 900; font-size: 3rem; margin: 0rem">
            Image-to-Text with <a href="https://huggingface.co/blog/lora">LoRA</a> and ViT
        </h1>
        <h2 style="text-align: left; font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
        We propose <b>TextDiffuser</b>, a flexible and controllable framework to generate images with visually appealing text that is coherent with backgrounds. 
        Main features include: (a) <b><font color="#A52A2A">Text-to-Image</font></b>: The user provides a prompt and encloses the keywords with single quotes (e.g., a text image of ‘hello’). The model first determines the layout of the keywords and then draws the image based on the layout and prompt. (b) <b><font color="#A52A2A">Text-to-Image with Templates</font></b>: The user provides a prompt and a template image containing text, which can be a printed, handwritten, or scene text image. These template images can be used to determine the layout of the characters. (c) <b><font color="#A52A2A">Text Inpainting</font></b>: The user provides an image and specifies the region to be modified along with the desired text content. The model is able to modify the original text or add text to areas without text.
        </h2>
        </div>
        """)

    interface.render()

demo.launch(debug=True)
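
# launch(debug=True) blocks and surfaces errors in the console; passing
# share=True as well would create a temporary public link, which can help
# when testing outside Hugging Face Spaces.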