Spaces:

nttdataspain
/

Image-To-Text-Lora-ViT

Runtime error

File size: 1,538 Bytes

84f3f84
a6a6318
355d287
 
 
84f3f84
e59dcf6
 
84f3f84
e59dcf6
 
 
355d287
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
953807b
 
e59dcf6
b922d45
3c6c7ef
953807b
e59dcf6
 
355d287
e59dcf6
333c77f
e59dcf6

import gradio as gr
import streamlit as st
import torch 
import re 
from transformers import AutoTokenizer, ViTFeatureExtractor, VisionEncoderDecoderModel 

# def greet(name):
#     return "Hello " + name + "!!"

# iface = gr.Interface(fn=greet, inputs="text", outputs="text")
# iface.launch()

# All tensors and the model are placed on this device.
device = "cpu"

# The preprocessor, the tokenizer and the model weights are all fetched from
# the same checkpoint identifier; the three names are kept separate so each
# can be pointed elsewhere independently.
encoder_checkpoint = decoder_checkpoint = model_checkpoint = (
    "nlpconnect/vit-gpt2-image-captioning"
)

# Loaded once at module import and shared by every call to predict().
feature_extractor = ViTFeatureExtractor.from_pretrained(encoder_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(decoder_checkpoint)
model = VisionEncoderDecoderModel.from_pretrained(model_checkpoint)
model = model.to(device)


def predict(image, max_length=64, num_beams=4):
    """Generate a one-line caption for an image.

    Args:
        image: Image object exposing ``convert`` (a PIL image, as delivered
            by the ``type='pil'`` Gradio input), or ``None`` when nothing
            was uploaded (the input component is declared optional).
        max_length: ``max_length`` value forwarded to ``model.generate``.
        num_beams: ``num_beams`` value forwarded to ``model.generate``.

    Returns:
        The first line of the decoded caption with the ``<|endoftext|>``
        marker removed, or an empty string when ``image`` is ``None``.
    """
    if image is None:
        # No upload: return an empty caption rather than failing on
        # ``None.convert``.
        return ""

    rgb_image = image.convert("RGB")
    pixel_values = feature_extractor(rgb_image, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    # Forward num_beams so the caller-selected beam width actually takes
    # effect instead of being silently ignored.
    caption_ids = model.generate(
        pixel_values,
        max_length=max_length,
        num_beams=num_beams,
    )[0]

    decoded = tokenizer.decode(caption_ids)
    # Drop the end-of-text marker and keep only the first line.
    return decoded.replace("<|endoftext|>", "").split("\n")[0]

# Text shown in the UI.
title = "Image to Text using Lora"
description = "NTT Data Bilbao team"

# NOTE(review): this Streamlit call sits inside a Gradio app; confirm it is
# still wanted, since the same title is also passed to gr.Interface below.
st.title(title)

# NOTE(review): the gr.inputs / gr.outputs namespaces and the "grass" theme
# appear to belong to older Gradio releases -- confirm against the pinned
# Gradio version before upgrading.
inputs = gr.inputs.Image(type="pil", label="Upload any Image", optional=True)
output = gr.outputs.Textbox(label="Captions", type="text")

# Wire predict() to a single image input and a single text output.
interface = gr.Interface(
    fn=predict,
    inputs=inputs,
    outputs=output,
    title=title,
    description=description,
    theme="grass",
)

# Starts the web UI with debug output enabled.
interface.launch(debug=True)