import os

# Disable oneDNN optimizations; this must be set before TensorFlow is first
# imported (transformers may import it lazily) to take effect.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

import torch
import gradio as gr
from transformers import AutoTokenizer, ViTFeatureExtractor, VisionEncoderDecoderModel

device = 'cpu'
model_checkpoint = "nlpconnect/vit-gpt2-image-captioning"

# The same checkpoint provides the ViT encoder, the GPT-2 tokenizer, and the
# combined vision-encoder-decoder weights.
print("Loading feature extractor...")
feature_extractor = ViTFeatureExtractor.from_pretrained(model_checkpoint)
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
print("Loading model...")
model = VisionEncoderDecoderModel.from_pretrained(model_checkpoint).to(device)
print("Model ready.")


def predict(image, max_length=64, num_beams=4):
    """Caption a PIL image with the ViT-GPT2 captioning model."""
    image = image.convert('RGB')
    pixel_values = feature_extractor(image, return_tensors="pt").pixel_values.to(device)
    # Beam-search decode, then strip special tokens and keep the first line.
    caption_ids = model.generate(pixel_values, max_length=max_length, num_beams=num_beams)[0]
    caption_text = tokenizer.decode(caption_ids, skip_special_tokens=True).split('\n')[0]
    return caption_text
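
# Quick sanity check (a sketch, assuming the "example1.jpg" listed under
# examples below sits next to this script):
#
#   from PIL import Image
#   print(predict(Image.open("example1.jpg")))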

print("------------------------- 5 -------------------------\n")
input = gr.inputs.Image(label="Upload any Image", type = 'pil', optional=True)
output = gr.outputs.Textbox(type="text",label="Captions")
examples = ["example1.jpg"]
print("------------------------- 6 -------------------------\n")
title = "Image to Text ViT with LORA"

description = """
        # This is a Heading
        This is a paragraph.
        - Item 1
        - Item 2
        """
# interface = gr.Interface(
            
#         fn=predict,
#         description=description,
#         inputs = input,
#         theme="grass",
#         outputs=output,
#         examples=examples,
#         title=title,
#     )
# interface.launch(debug=True)

with gr.Blocks(title=title) as demo:

    gr.HTML(
        """
        <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
        <h1 style="font-weight: 900; font-size: 3rem; margin: 0rem">
            Image to Text ViT with LORA
        </h1>
        <h2 style="text-align: left; font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
        We propose <b>Image to Text</b> with a ViT model fine-tuned using LoRA.
        </h2>
        </div>
        """)
    # Wire the components to predict() so uploading an image and clicking the
    # button actually produces a caption.
    image_input = gr.Image(label="Upload any Image", type='pil')
    caption_output = gr.Textbox(label="Captions")
    caption_button = gr.Button("Generate Caption")
    caption_button.click(fn=predict, inputs=image_input, outputs=caption_output)
    gr.Examples(examples=examples, inputs=image_input)

demo.launch()
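# demo.launch(share=True) would additionally expose a temporary public URL,
# useful when testing outside a hosted Space.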