import torch
import gradio as gr
from transformers import AutoTokenizer, ViTFeatureExtractor, VisionEncoderDecoderModel

device = 'cpu'
model_checkpoint = "nlpconnect/vit-gpt2-image-captioning"

# Load the ViT image processor, GPT-2 tokenizer, and the encoder-decoder model
# from the same checkpoint.
print("Loading feature extractor...")
feature_extractor = ViTFeatureExtractor.from_pretrained(model_checkpoint)
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
print("Loading model...")
model = VisionEncoderDecoderModel.from_pretrained(model_checkpoint).to(device)


def predict(image, max_length=64, num_beams=4):
    """Generate a text caption for a PIL image using beam search."""
    image = image.convert('RGB')
    pixel_values = feature_extractor(image, return_tensors="pt").pixel_values.to(device)
    caption_ids = model.generate(pixel_values, max_length=max_length, num_beams=num_beams)[0]
    # skip_special_tokens drops <|endoftext|>; keep only the first output line.
    caption_text = tokenizer.decode(caption_ids, skip_special_tokens=True).split('\n')[0]
    return caption_text
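
# Quick local sanity check (hedged example): uncomment to caption the bundled
# example image before launching the UI. Assumes "example1.jpg" sits next to
# this script, as listed in `examples` below.
# from PIL import Image
# print(predict(Image.open("example1.jpg")))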

print("------------------------- 5 -------------------------\n")
input = gr.inputs.Image(label="Upload any Image", type = 'pil', optional=True)
output = gr.outputs.Textbox(type="text",label="Captions")
examples = ["example1.jpg"]
print("------------------------- 6 -------------------------\n")
title = "Image to Text ViT with LORA"

# interface = gr.Interface(
            
#         fn=predict,
#         description=description,
#         inputs = input,
#         theme="grass",
#         outputs=output,
#         examples=examples,
#         title=title,
#     )
# interface.launch(debug=True)

with gr.Blocks() as demo:

    gr.HTML(
        """
        <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
        <h1 style="font-weight: 900; font-size: 3rem; margin: 0rem">
            Image to Text ViT with LoRA
        </h1>
        <h2 style="text-align: left; font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
        <b>Image to Text</b> with a ViT model, fine-tuned with LoRA.
        </h2>
        </div>
        """)
    image_input = gr.Image(label="Upload any Image", type='pil')
    caption_output = gr.Textbox(label="Captions")
    generate_button = gr.Button("Generate Caption")
    # Wire the button to predict() so the demo actually produces captions.
    generate_button.click(fn=predict, inputs=image_input, outputs=caption_output)
    gr.Examples(examples=examples, inputs=image_input)

demo.launch(debug=True)