import torch
import gradio as gr
# ViTFeatureExtractor is the older name for what recent transformers
# releases call ViTImageProcessor; it still works for this checkpoint.
from transformers import AutoTokenizer, ViTFeatureExtractor, VisionEncoderDecoderModel
# Use the GPU when one is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Encoder (ViT), decoder (GPT-2), and the combined model all ship in the
# same checkpoint.
model_checkpoint = "nlpconnect/vit-gpt2-image-captioning"

print("Loading feature extractor...")
feature_extractor = ViTFeatureExtractor.from_pretrained(model_checkpoint)
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
print("Loading model...")
model = VisionEncoderDecoderModel.from_pretrained(model_checkpoint).to(device)
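# Aside: the same checkpoint can also be driven through the higher-level
# transformers pipeline API. A minimal sketch, not used by this app
# (assumes an image file path is passed in):
#
#     from transformers import pipeline
#     captioner = pipeline("image-to-text", model=model_checkpoint)
#     print(captioner("example1.jpg"))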
def predict(image, max_length=64, num_beams=4):
    """Generate a caption for a PIL image with beam search."""
    image = image.convert("RGB")
    pixel_values = feature_extractor(image, return_tensors="pt").pixel_values.to(device)
    caption_ids = model.generate(pixel_values, max_length=max_length, num_beams=num_beams)[0]
    # Drop special tokens such as <|endoftext|> and keep only the first line.
    caption_text = tokenizer.decode(caption_ids, skip_special_tokens=True).split("\n")[0]
    return caption_text
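# Quick offline sanity check for predict(), kept commented out so the Space
# only runs the UI (assumes example1.jpg exists next to this script):
#
#     from PIL import Image
#     print(predict(Image.open("example1.jpg")))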
with gr.Blocks() as demo:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
            <h1 style="font-weight: 900; font-size: 3rem; margin: 0rem">
                Image Caption Generator
            </h1>
            <h2 style="text-align: left; font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
                Upload an image and the model generates a natural-language caption for it.
                The demo runs <a href="https://huggingface.co/nlpconnect/vit-gpt2-image-captioning">nlpconnect/vit-gpt2-image-captioning</a>,
                a <b>VisionEncoderDecoderModel</b> that pairs a ViT image encoder with a GPT-2 text decoder.
            </h2>
        </div>
        """)
    # UI widgets, wired to predict() through a button click.
    description = "NTT Data"
    gr.Markdown(description)
    input_image = gr.Image(label="Upload any Image", type="pil")
    output_caption = gr.Textbox(label="Captions")
    caption_button = gr.Button("Generate Caption")
    caption_button.click(fn=predict, inputs=input_image, outputs=output_caption)
    # Clickable examples; example1.jpg must exist next to this script.
    gr.Examples(examples=["example1.jpg"], inputs=input_image)

demo.launch(debug=True)
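# Dependency note (an assumption, not from the original Space config): the
# packages imported above would need to be listed in requirements.txt, e.g.
#
#     torch
#     transformers
#     gradio
#     Pillow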