Spaces:
Running
on
Zero
Running
on
Zero
File size: 6,837 Bytes
d959d44 5a141d3 d959d44 d11a6d0 d959d44 65b7fe8 d959d44 d8019dd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
import os
import time
import threading
import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import (
AutoModelForImageTextToText,
AutoProcessor,
TextIteratorStreamer,
)
from transformers.image_utils import load_image
# Token budgets for generation, plus the torch device used by this module
MAX_MAX_NEW_TOKENS = 4096
DEFAULT_MAX_NEW_TOKENS = 1024
# Overridable through the MAX_INPUT_TOKEN_LENGTH environment variable
MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "4096"))

# Use the first CUDA device when one is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
def _load_checkpoint(repo_id):
    """Load the processor and model for *repo_id*.

    The model is requested in bfloat16, moved to ``device`` and switched to
    eval mode. Returns a ``(processor, model)`` pair.
    """
    loaded_processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
    loaded_model = AutoModelForImageTextToText.from_pretrained(
        repo_id,
        trust_remote_code=True,
        torch_dtype="bfloat16",
    )
    return loaded_processor, loaded_model.to(device).eval()


# LFM2-VL-1.6B checkpoint
MODEL_ID_M = "LiquidAI/LFM2-VL-1.6B"
processor_m, model_m = _load_checkpoint(MODEL_ID_M)

# LFM2-VL-450M checkpoint
MODEL_ID_T = "LiquidAI/LFM2-VL-450M"
processor_t, model_t = _load_checkpoint(MODEL_ID_T)
@spaces.GPU
def generate_image(model_name: str, text: str, image: Image.Image,
                   max_new_tokens: int = 1024,
                   temperature: float = 0.6,
                   top_p: float = 0.9,
                   top_k: int = 50,
                   repetition_penalty: float = 1.2):
    """
    Stream a response from the selected model for an image + text query.

    This is a generator: every item is a ``(text, text)`` tuple holding the
    response accumulated so far, one copy for each of the two output
    components it is wired to.

    Args:
        model_name: "LFM2-VL-1.6B" or "LFM2-VL-450M"; any other value yields
            a single "Invalid model selected." message.
        text: The user's query about the image.
        image: The PIL image to analyse; ``None`` yields a single
            "Please upload an image." message.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature; a value that is not positive
            switches to greedy decoding.
        top_p: Nucleus-sampling probability mass (used only when sampling).
        top_k: Number of highest-probability tokens kept (used only when
            sampling).
        repetition_penalty: Penalty applied to repeated tokens.
    """
    if model_name == "LFM2-VL-1.6B":
        processor = processor_m
        model = model_m
    elif model_name == "LFM2-VL-450M":
        processor = processor_t
        model = model_t
    else:
        yield "Invalid model selected.", "Invalid model selected."
        return

    if image is None:
        yield "Please upload an image.", "Please upload an image."
        return

    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": text},
        ]
    }]
    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # NOTE(review): truncation is disabled, so max_length presumably has no
    # effect on this call - confirm before relying on it as an input limit.
    inputs = processor(
        text=[prompt_full],
        images=[image],
        return_tensors="pt",
        padding=True,
        truncation=False,
        max_length=MAX_INPUT_TOKEN_LENGTH
    ).to(device)

    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)

    # Forward the user-tunable decoding settings; previously they were
    # accepted by this function but never reached model.generate.
    # UI sliders may deliver numbers as floats, hence the explicit coercions.
    generation_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": int(max_new_tokens),
        "repetition_penalty": float(repetition_penalty),
    }
    if temperature and temperature > 0:
        generation_kwargs.update(
            do_sample=True,
            temperature=float(temperature),
            top_p=float(top_p),
            top_k=int(top_k),
        )
    else:
        # Sampling needs a strictly positive temperature; decode greedily instead.
        generation_kwargs["do_sample"] = False

    # Generation runs in a worker thread so tokens can be consumed as they arrive.
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)
        yield buffer, buffer

    # The stream is exhausted; wait for the worker thread to finish.
    thread.join()
# Example rows for image inference: each row is [query text, image path]
image_examples = [
    [prompt, image_path]
    for prompt, image_path in (
        ("Explain the movie shot in detail.", "images/4.png"),
        ("OCR the image in the same format to .md.", "images/5.jpg"),
        ("According to this diagram, where do severe droughts occur?", "images/1.png"),
        ("Could you describe this image?", "images/2.jpg"),
        ("Provide a description of this image.", "images/3.jpg"),
    )
]
# Stylesheet for the submit button (normal and hover state)
_SUBMIT_BUTTON_RULES = """
.submit-btn {
background-color: #2980b9 !important;
color: white !important;
}
.submit-btn:hover {
background-color: #3498db !important;
}
"""

# Stylesheet for the bordered output panel
_OUTPUT_PANEL_RULES = """\
.canvas-output {
border: 2px solid #4682B4;
border-radius: 10px;
padding: 20px;
}
"""

# Complete custom CSS for the app
css = _SUBMIT_BUTTON_RULES + _OUTPUT_PANEL_RULES
# Assemble the Gradio UI: a title, then one row holding an input column and an output column
with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
    gr.Markdown("# **LFM2-VL by [LiquidAI](https://huggingface.co/LiquidAI)**")

    with gr.Row():
        # First column: query, image, submit button, examples and decoding options
        with gr.Column():
            image_query = gr.Textbox(
                label="Query Input",
                placeholder="✦︎ Enter your query",
            )
            image_upload = gr.Image(type="pil", label="Image")
            image_submit = gr.Button("Submit", elem_classes="submit-btn")
            gr.Examples(
                examples=image_examples,
                inputs=[image_query, image_upload],
            )

            with gr.Accordion("Advanced options", open=False):
                max_new_tokens = gr.Slider(
                    label="Max new tokens",
                    minimum=1,
                    maximum=MAX_MAX_NEW_TOKENS,
                    step=1,
                    value=DEFAULT_MAX_NEW_TOKENS,
                )
                temperature = gr.Slider(
                    label="Temperature",
                    minimum=0.1,
                    maximum=4.0,
                    step=0.1,
                    value=0.6,
                )
                top_p = gr.Slider(
                    label="Top-p (nucleus sampling)",
                    minimum=0.05,
                    maximum=1.0,
                    step=0.05,
                    value=0.9,
                )
                top_k = gr.Slider(
                    label="Top-k",
                    minimum=1,
                    maximum=1000,
                    step=1,
                    value=50,
                )
                repetition_penalty = gr.Slider(
                    label="Repetition penalty",
                    minimum=1.0,
                    maximum=2.0,
                    step=0.05,
                    value=1.2,
                )

        # Second column: streamed output, model selector and model information
        with gr.Column():
            with gr.Column(elem_classes="canvas-output"):
                gr.Markdown("## Output")
                output = gr.Textbox(
                    label="Raw Output Stream",
                    interactive=False,
                    lines=2,
                    show_copy_button=True,
                )
                with gr.Accordion("(Result.md)", open=False):
                    markdown_output = gr.Markdown(label="(Result.md)")

            model_choice = gr.Dropdown(
                choices=["LFM2-VL-1.6B", "LFM2-VL-450M"],
                label="Select Model",
                value="LFM2-VL-1.6B",
            )
            gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/LFM2-VL-Demo/discussions)")
            gr.Markdown("> [LFM2‑VL](https://huggingface.co/collections/LiquidAI/lfm2-vl-68963bbc84a610f7638d5ffa) is [Liquid AI’s](https://huggingface.co/LiquidAI) first multimodal model series, featuring models with 450M and 1.6B parameters designed for efficient processing of both text and images at native resolutions up to 512×512, ideal for low-latency edge AI applications; leveraging a hybrid conv+attention LFM2 backbone and SigLIP2 NaFlex vision encoders, it delivers flexible, user-tunable inference with rapid speeds (2× faster than existing VLMs on GPU)")
            gr.Markdown("> Competitive accuracy, and dynamic image tokenization for scalable throughput, while supporting 32,768 text tokens and English language generation, and is best fine-tuned for targeted use cases using provided supervised fine-tuning tools, all released under the LFM Open License v1.0 for research and deployment scenarios not requiring safety-critical guarantees.")

    # Wire the submit button: the input order matches generate_image's parameter order
    image_submit.click(
        fn=generate_image,
        inputs=[
            model_choice,
            image_query,
            image_upload,
            max_new_tokens,
            temperature,
            top_p,
            top_k,
            repetition_penalty,
        ],
        outputs=[output, markdown_output],
    )
# Script entry point: enable the request queue (max_size=50) and launch the app.
# The stray trailing "|" that followed the launch call was not valid Python and has been removed.
if __name__ == "__main__":
    demo.queue(max_size=50).launch(
        share=True,
        mcp_server=True,
        ssr_mode=False,
        show_error=True,
    )