Spaces:

prithivMLmods
/

Qwen-Image-Diffusion

Running on Zero

App Files Files Community

Qwen-Image-Diffusion / app.py

prithivMLmods

upload app (#2)

d8019dd verified 17 days ago

raw

history blame

6.95 kB

	import os
	import time
	import threading
	import gradio as gr
	import spaces
	import torch
	from PIL import Image
	from transformers import (
	AutoModelForImageTextToText,
	AutoProcessor,
	TextIteratorStreamer,
	)
	from transformers.image_utils import load_image

	# Token budgets for generation; the input limit can be overridden through the
	# MAX_INPUT_TOKEN_LENGTH environment variable.
	MAX_MAX_NEW_TOKENS = 4096
	DEFAULT_MAX_NEW_TOKENS = 1024
	MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "4096"))

	# Use the first CUDA device when one is available, otherwise the CPU.
	if torch.cuda.is_available():
	    device = torch.device("cuda:0")
	else:
	    device = torch.device("cpu")

	# Hugging Face repo ids of the two LFM2-VL checkpoints served by this app
	MODEL_ID_M = "LiquidAI/LFM2-VL-1.6B"
	MODEL_ID_T = "LiquidAI/LFM2-VL-450M"


	def _load_checkpoint(model_id):
	    """Load the processor and model for ``model_id``.

	    The model is loaded with ``torch_dtype="bfloat16"``, moved to ``device``
	    and switched to eval mode.

	    Returns:
	        A ``(processor, model)`` tuple.
	    """
	    loaded_processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
	    loaded_model = AutoModelForImageTextToText.from_pretrained(
	        model_id,
	        trust_remote_code=True,
	        torch_dtype="bfloat16",
	    )
	    return loaded_processor, loaded_model.to(device).eval()


	# Load LFM2-VL-1.6B
	processor_m, model_m = _load_checkpoint(MODEL_ID_M)

	# Load LFM2-VL-450M
	processor_t, model_t = _load_checkpoint(MODEL_ID_T)

	@spaces.GPU
	def generate_image(model_name: str, text: str, image: Image.Image,
	                   max_new_tokens: int = 1024,
	                   temperature: float = 0.6,
	                   top_p: float = 0.9,
	                   top_k: int = 50,
	                   repetition_penalty: float = 1.2):
	    """Stream the selected model's answer to a text query about an image.

	    This is a generator: each item is a ``(text, text)`` pair holding the
	    response accumulated so far, one copy per output component it is wired to.

	    Args:
	        model_name: "LFM2-VL-1.6B" or "LFM2-VL-450M".
	        text: The user's query about the image.
	        image: The uploaded PIL image, or ``None`` when nothing was uploaded.
	        max_new_tokens: Upper bound on the number of generated tokens.
	        temperature: Sampling temperature forwarded to ``model.generate``.
	        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.
	        top_k: Top-k cutoff forwarded to ``model.generate``.
	        repetition_penalty: Repetition penalty forwarded to ``model.generate``.

	    Yields:
	        ``(buffer, buffer)`` with the growing response, or a single pair with
	        an error message when the model name is unknown or no image is given.
	    """
	    # Map each supported model label to its preloaded (processor, model) pair.
	    available = {
	        "LFM2-VL-1.6B": (processor_m, model_m),
	        "LFM2-VL-450M": (processor_t, model_t),
	    }
	    selected = available.get(model_name)
	    if selected is None:
	        yield "Invalid model selected.", "Invalid model selected."
	        return
	    processor, model = selected

	    if image is None:
	        yield "Please upload an image.", "Please upload an image."
	        return

	    messages = [{
	        "role": "user",
	        "content": [
	            {"type": "image", "image": image},
	            {"type": "text", "text": text},
	        ]
	    }]
	    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
	    # NOTE(review): with truncation disabled, max_length presumably has no
	    # effect on the encoded prompt -- confirm before relying on it as a limit.
	    inputs = processor(
	        text=[prompt_full],
	        images=[image],
	        return_tensors="pt",
	        padding=True,
	        truncation=False,
	        max_length=MAX_INPUT_TOKEN_LENGTH
	    ).to(device)
	    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
	    # Forward the sampling controls; previously they were accepted but never
	    # reached model.generate, so changing them had no effect on the output.
	    generation_kwargs = {
	        **inputs,
	        "streamer": streamer,
	        "max_new_tokens": int(max_new_tokens),
	        "do_sample": True,
	        "temperature": temperature,
	        "top_p": top_p,
	        "top_k": int(top_k),
	        "repetition_penalty": repetition_penalty,
	    }
	    # Run generation in a background thread so the streamer can be consumed here.
	    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
	    thread.start()
	    buffer = ""
	    for new_text in streamer:
	        buffer += new_text
	        time.sleep(0.01)
	        yield buffer, buffer
	    # The streamer is exhausted, so generation is finishing; reap the worker.
	    thread.join()

	# Example rows for image inference: [query text, path to sample image]
	image_examples = [
	    [prompt, image_path]
	    for prompt, image_path in (
	        ("According to this diagram, where do severe droughts occur?", "images/1.png"),
	        ("Could you describe this image?", "images/2.jpg"),
	        ("Provide a description of this image.", "images/3.jpg"),
	        ("Explain the movie shot in detail.", "images/4.png"),
	    )
	]

	# Custom CSS passed to gr.Blocks: colors the submit button (.submit-btn, with
	# a lighter hover shade) and draws a rounded border around the output panel
	# (.canvas-output).
	css = """
	.submit-btn {
	background-color: #2980b9 !important;
	color: white !important;
	}
	.submit-btn:hover {
	background-color: #3498db !important;
	}
	.canvas-output {
	border: 2px solid #4682B4;
	border-radius: 10px;
	padding: 20px;
	}
	"""

	# Create the Gradio interface: a left column with the query, image upload,
	# examples and sampling controls, and a right column with the streamed output,
	# the model selector and descriptive notes.
	# NOTE(review): nesting was reconstructed from a flattened source -- confirm
	# the intended layout.
	with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
	    gr.Markdown("# LFM2-VL by [LiquidAI](https://huggingface.co/collections/LiquidAI/lfm2-vl-68963bbc84a610f7638d5ffa)")
	    with gr.Row():
	        with gr.Column():
	            image_query = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your query")
	            image_upload = gr.Image(type="pil", label="Image")
	            image_submit = gr.Button("Submit", elem_classes="submit-btn")
	            gr.Examples(
	                examples=image_examples,
	                inputs=[image_query, image_upload]
	            )

	            with gr.Accordion("Advanced options", open=False):
	                max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
	                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
	                top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
	                top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
	                repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)

	        with gr.Column():
	            with gr.Column(elem_classes="canvas-output"):
	                gr.Markdown("## Output")
	                output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
	                with gr.Accordion("(Result.md)", open=False):
	                    markdown_output = gr.Markdown(label="(Result.md)")

	            model_choice = gr.Dropdown(
	                choices=["LFM2-VL-1.6B", "LFM2-VL-450M"],
	                label="Select Model",
	                value="LFM2-VL-1.6B"
	            )

	            # A plain "|" replaces the former "\|", which is an invalid escape
	            # sequence in a normal string literal; Markdown renders both the same.
	            gr.Markdown("Model Info 💻 | [Report Bug](https://huggingface.co/spaces/prithivMLmods/LFM2-VL-Demo/discussions)")
	            gr.Markdown("> [LFM2‑VL](https://huggingface.co/collections/LiquidAI/lfm2-vl-68963bbc84a610f7638d5ffa) is [Liquid AI’s](https://huggingface.co/LiquidAI) first multimodal model series, featuring models with 450M and 1.6B parameters designed for efficient processing of both text and images at native resolutions up to 512×512, ideal for low-latency edge AI applications; leveraging a hybrid conv+attention LFM2 backbone and SigLIP2 NaFlex vision encoders, it delivers flexible, user-tunable inference with rapid speeds (2× faster than existing VLMs on GPU)")
	            gr.Markdown("> Competitive accuracy, and dynamic image tokenization for scalable throughput, while supporting 32,768 text tokens and English language generation, and is best fine-tuned for targeted use cases using provided supervised fine-tuning tools, all released under the LFM Open License v1.0 for research and deployment scenarios not requiring safety-critical guarantees.")

	    # Define the submit button action: stream generate_image's output into both
	    # the raw textbox and the rendered Markdown panel.
	    image_submit.click(
	        fn=generate_image,
	        inputs=[
	            model_choice, image_query, image_upload,
	            max_new_tokens, temperature, top_p, top_k,
	            repetition_penalty
	        ],
	        outputs=[output, markdown_output]
	    )

	if __name__ == "__main__":
	    # Enable the request queue (at most 50 waiting), then start the server.
	    queued_demo = demo.queue(max_size=50)
	    queued_demo.launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)