import os
import time
import threading
import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import (
    AutoModelForImageTextToText,
    AutoProcessor,
    TextIteratorStreamer,
)

# Constants for text generation
MAX_MAX_NEW_TOKENS = 4096
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
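# The cap can be overridden per deployment, e.g. `MAX_INPUT_TOKEN_LENGTH=8192 python app.py`.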

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load LFM2-VL-1.6B
MODEL_ID_M = "LiquidAI/LFM2-VL-1.6B"
processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
model_m = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID_M,
    trust_remote_code=True,
    torch_dtype="bfloat16",
).to(device).eval()

# Load LFM2-VL-450M
MODEL_ID_T = "LiquidAI/LFM2-VL-450M"
processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True)
model_t = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID_T,
    trust_remote_code=True,
    torch_dtype="bfloat16",
).to(device).eval()
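
# On Hugging Face Spaces, `@spaces.GPU` requests a GPU (ZeroGPU) for the duration
# of each decorated call; outside Spaces the decorator is effectively a no-op.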

@spaces.GPU
def generate_image(model_name: str, text: str, image: Image.Image,
                   max_new_tokens: int = 1024,
                   temperature: float = 0.6,
                   top_p: float = 0.9,
                   top_k: int = 50,
                   repetition_penalty: float = 1.2):
    """
    Generate responses using the selected model for image input.
    """
    if model_name == "LFM2-VL-1.6B":
        processor = processor_m
        model = model_m
    elif model_name == "LFM2-VL-450M":
        processor = processor_t
        model = model_t
    else:
        yield "Invalid model selected.", "Invalid model selected."
        return

    if image is None:
        yield "Please upload an image.", "Please upload an image."
        return

    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": text},
        ]
    }]
    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[prompt_full],
        images=[image],
        return_tensors="pt",
        padding=True,
        truncation=True,  # without truncation, max_length would have no effect
        max_length=MAX_INPUT_TOKEN_LENGTH,
    ).to(device)
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = {
        **inputs, "streamer": streamer, "max_new_tokens": max_new_tokens,
        "do_sample": True, "temperature": temperature, "top_p": top_p,
        "top_k": top_k, "repetition_penalty": repetition_penalty,
    }
    # Run generation in a background thread so decoded tokens can stream to the UI.
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)  # brief pause to smooth the streaming updates
        yield buffer, buffer
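
# For reference, a minimal non-streaming sketch of the same call (hypothetical,
# reusing `inputs` as built above):
#   output_ids = model_m.generate(**inputs, max_new_tokens=64)
#   print(processor_m.decode(output_ids[0], skip_special_tokens=True))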

# Define examples for image inference
image_examples = [
    ["Explain the movie shot in detail.", "images/4.png"],
    ["OCR the image in the same format to .md.", "images/5.jpg"],
    ["According to this diagram, where do severe droughts occur?", "images/1.png"],
    ["Could you describe this image?", "images/2.jpg"],
    ["Provide a description of this image.", "images/3.jpg"],
]
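# Note: these example prompts assume an `images/` directory is bundled alongside this script.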

# CSS for the submit button and the output panel
css = """
.submit-btn {
    background-color: #2980b9 !important;
    color: white !important;
}
.submit-btn:hover {
    background-color: #3498db !important;
}
.canvas-output {
    border: 2px solid #4682B4;
    border-radius: 10px;
    padding: 20px;
}
"""

# Create the Gradio Interface
with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
    gr.Markdown("# **LFM2-VL by [LiquidAI](https://huggingface.co/LiquidAI)**")
    with gr.Row():
        with gr.Column():
            image_query = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your query")
            image_upload = gr.Image(type="pil", label="Image")
            image_submit = gr.Button("Submit", elem_classes="submit-btn")
            gr.Examples(
                examples=image_examples,
                inputs=[image_query, image_upload]
            )

            with gr.Accordion("Advanced options", open=False):
                max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)

        with gr.Column():
            with gr.Column(elem_classes="canvas-output"):
                gr.Markdown("## Output")
                output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2, show_copy_button=True)
                with gr.Accordion("(Result.md)", open=False):                
                    markdown_output = gr.Markdown(label="(Result.md)")

            model_choice = gr.Dropdown(
                choices=["LFM2-VL-1.6B", "LFM2-VL-450M"],
                label="Select Model",
                value="LFM2-VL-1.6B"
            )

            gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/LFM2-VL-Demo/discussions)")
            gr.Markdown("> [LFM2‑VL](https://huggingface.co/collections/LiquidAI/lfm2-vl-68963bbc84a610f7638d5ffa) is [Liquid AI’s](https://huggingface.co/LiquidAI) first multimodal model series, featuring models with 450M and 1.6B parameters designed for efficient processing of both text and images at native resolutions up to 512×512, ideal for low-latency edge AI applications; leveraging a hybrid conv+attention LFM2 backbone and SigLIP2 NaFlex vision encoders, it delivers flexible, user-tunable inference with rapid speeds (2× faster than existing VLMs on GPU)")
            gr.Markdown("> Competitive accuracy, and dynamic image tokenization for scalable throughput, while supporting 32,768 text tokens and English language generation, and is best fine-tuned for targeted use cases using provided supervised fine-tuning tools, all released under the LFM Open License v1.0 for research and deployment scenarios not requiring safety-critical guarantees.")

    # Define the submit button action
    image_submit.click(fn=generate_image,
                       inputs=[
                           model_choice, image_query, image_upload,
                           max_new_tokens, temperature, top_p, top_k,
                           repetition_penalty
                       ],
                       outputs=[output, markdown_output])

if __name__ == "__main__":
    demo.queue(max_size=50).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)