# Provenance (from the Hugging Face file view this source was copied from):
#   author: prithivMLmods | commit: "Update app.py" (d11a6d0, verified) | size: 6.84 kB
import os
import time
import threading
import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import (
AutoModelForImageTextToText,
AutoProcessor,
TextIteratorStreamer,
)
from transformers.image_utils import load_image
# Text-generation limits: upper bound and default for the "Max new tokens"
# slider, plus the prompt-length value handed to the processor (overridable
# through the MAX_INPUT_TOKEN_LENGTH environment variable).
MAX_MAX_NEW_TOKENS = 4096
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "4096"))

# Use the first CUDA device when one is available; otherwise run on CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
def _load_checkpoint(model_id):
    """Return ``(processor, model)`` for ``model_id``.

    The model is loaded in bfloat16, moved to ``device`` and switched to
    eval mode; both objects are created with ``trust_remote_code=True``.
    """
    proc = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
    net = AutoModelForImageTextToText.from_pretrained(
        model_id,
        trust_remote_code=True,
        torch_dtype="bfloat16",
    )
    return proc, net.to(device).eval()


# Load LFM2-VL-1.6B
MODEL_ID_M = "LiquidAI/LFM2-VL-1.6B"
processor_m, model_m = _load_checkpoint(MODEL_ID_M)

# Load LFM2-VL-450M
MODEL_ID_T = "LiquidAI/LFM2-VL-450M"
processor_t, model_t = _load_checkpoint(MODEL_ID_T)
@spaces.GPU
def generate_image(model_name: str, text: str, image: Image.Image,
                   max_new_tokens: int = 1024,
                   temperature: float = 0.6,
                   top_p: float = 0.9,
                   top_k: int = 50,
                   repetition_penalty: float = 1.2):
    """
    Stream a response from the selected model for one image + text query.

    Args:
        model_name: "LFM2-VL-1.6B" or "LFM2-VL-450M"; any other value yields
            an error message and stops.
        text: The user's question or instruction about the image.
        image: The uploaded PIL image; ``None`` yields a prompt to upload one.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. Values <= 0 switch to greedy
            decoding instead of being passed to the sampler.
        top_p: Nucleus-sampling probability mass (used only when sampling).
        top_k: Top-k cutoff (used only when sampling).
        repetition_penalty: Penalty applied to already-generated tokens.

    Yields:
        ``(text_so_far, text_so_far)`` tuples: the accumulated output twice,
        one copy per output component this function is wired to.
    """
    if model_name == "LFM2-VL-1.6B":
        processor = processor_m
        model = model_m
    elif model_name == "LFM2-VL-450M":
        processor = processor_t
        model = model_t
    else:
        yield "Invalid model selected.", "Invalid model selected."
        return

    if image is None:
        yield "Please upload an image.", "Please upload an image."
        return

    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": text},
        ]
    }]
    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # NOTE(review): with truncation=False, max_length presumably has no effect;
    # left unchanged because truncating could cut image placeholder tokens.
    inputs = processor(
        text=[prompt_full],
        images=[image],
        return_tensors="pt",
        padding=True,
        truncation=False,
        max_length=MAX_INPUT_TOKEN_LENGTH
    ).to(device)

    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)

    # Forward the user-selected decoding options; previously they were accepted
    # but never reached model.generate, so the UI sliders had no effect.
    use_sampling = temperature is not None and float(temperature) > 0
    generation_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": int(max_new_tokens),
        "repetition_penalty": float(repetition_penalty),
        "do_sample": use_sampling,
    }
    if use_sampling:
        generation_kwargs.update(
            temperature=float(temperature),
            top_p=float(top_p),
            top_k=int(top_k),
        )

    # generate() blocks until completion, so it runs in a worker thread while
    # this generator drains the streamer and pushes partial text to the caller.
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)
        yield buffer, buffer

    # The streamer is exhausted, so generation is finishing; reap the worker.
    thread.join()
# Example (query, image path) pairs offered for image inference.
_EXAMPLE_PAIRS = (
    ("Explain the movie shot in detail.", "images/4.png"),
    ("OCR the image in the same format to .md.", "images/5.jpg"),
    ("According to this diagram, where do severe droughts occur?", "images/1.png"),
    ("Could you describe this image?", "images/2.jpg"),
    ("Provide a description of this image.", "images/3.jpg"),
)
# Each example is a [query, image_path] list, in the same order as the
# [image_query, image_upload] inputs they populate.
image_examples = [list(pair) for pair in _EXAMPLE_PAIRS]
# Custom CSS for the two element classes used by the interface:
#   .submit-btn    - blue submit button with a lighter blue hover colour
#   .canvas-output - bordered, rounded, padded panel around the output area
css = """
.submit-btn {
background-color: #2980b9 !important;
color: white !important;
}
.submit-btn:hover {
background-color: #3498db !important;
}
.canvas-output {
border: 2px solid #4682B4;
border-radius: 10px;
padding: 20px;
}
"""
# Build the Gradio interface: inputs and sampling controls on the left,
# streamed output, model selector and model information on the right.
# NOTE(review): the nesting below is reconstructed from component semantics;
# confirm the placement of model_choice and the info text against the
# intended layout.
with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
    gr.Markdown("# **LFM2-VL by [LiquidAI](https://huggingface.co/LiquidAI)**")

    with gr.Row():
        # Left column: query, image, submit button, examples, advanced options.
        with gr.Column():
            image_query = gr.Textbox(
                label="Query Input",
                placeholder="✦︎ Enter your query",
            )
            image_upload = gr.Image(type="pil", label="Image")
            image_submit = gr.Button("Submit", elem_classes="submit-btn")
            gr.Examples(
                examples=image_examples,
                inputs=[image_query, image_upload],
            )

            with gr.Accordion("Advanced options", open=False):
                max_new_tokens = gr.Slider(
                    label="Max new tokens",
                    minimum=1,
                    maximum=MAX_MAX_NEW_TOKENS,
                    step=1,
                    value=DEFAULT_MAX_NEW_TOKENS,
                )
                temperature = gr.Slider(
                    label="Temperature",
                    minimum=0.1,
                    maximum=4.0,
                    step=0.1,
                    value=0.6,
                )
                top_p = gr.Slider(
                    label="Top-p (nucleus sampling)",
                    minimum=0.05,
                    maximum=1.0,
                    step=0.05,
                    value=0.9,
                )
                top_k = gr.Slider(
                    label="Top-k",
                    minimum=1,
                    maximum=1000,
                    step=1,
                    value=50,
                )
                repetition_penalty = gr.Slider(
                    label="Repetition penalty",
                    minimum=1.0,
                    maximum=2.0,
                    step=0.05,
                    value=1.2,
                )

        # Right column: output panel, model selector, model information.
        with gr.Column():
            with gr.Column(elem_classes="canvas-output"):
                gr.Markdown("## Output")
                output = gr.Textbox(
                    label="Raw Output Stream",
                    interactive=False,
                    lines=2,
                    show_copy_button=True,
                )
                with gr.Accordion("(Result.md)", open=False):
                    markdown_output = gr.Markdown(label="(Result.md)")

            model_choice = gr.Dropdown(
                choices=["LFM2-VL-1.6B", "LFM2-VL-450M"],
                label="Select Model",
                value="LFM2-VL-1.6B",
            )
            gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/LFM2-VL-Demo/discussions)")
            gr.Markdown(
                "> [LFM2‑VL](https://huggingface.co/collections/LiquidAI/lfm2-vl-68963bbc84a610f7638d5ffa) "
                "is [Liquid AI’s](https://huggingface.co/LiquidAI) first multimodal model series, "
                "featuring models with 450M and 1.6B parameters designed for efficient processing "
                "of both text and images at native resolutions up to 512×512, ideal for low-latency "
                "edge AI applications; leveraging a hybrid conv+attention LFM2 backbone and SigLIP2 "
                "NaFlex vision encoders, it delivers flexible, user-tunable inference with rapid "
                "speeds (2× faster than existing VLMs on GPU)"
            )
            gr.Markdown(
                "> Competitive accuracy, and dynamic image tokenization for scalable throughput, "
                "while supporting 32,768 text tokens and English language generation, and is best "
                "fine-tuned for targeted use cases using provided supervised fine-tuning tools, "
                "all released under the LFM Open License v1.0 for research and deployment "
                "scenarios not requiring safety-critical guarantees."
            )

    # Submit streams generate_image's output into both output components.
    image_submit.click(
        fn=generate_image,
        inputs=[
            model_choice,
            image_query,
            image_upload,
            max_new_tokens,
            temperature,
            top_p,
            top_k,
            repetition_penalty,
        ],
        outputs=[output, markdown_output],
    )
if __name__ == "__main__":
    # Enable request queuing (up to 50 waiting requests), then start the app.
    queued_demo = demo.queue(max_size=50)
    queued_demo.launch(
        share=True,
        mcp_server=True,
        ssr_mode=False,
        show_error=True,
    )