import threading

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

# Load the vision-language model and tokenizer (Qwen-VL requires trust_remote_code)
image_model_id = "Qwen/Qwen-VL-Chat-Int4"
image_tokenizer = AutoTokenizer.from_pretrained(image_model_id, trust_remote_code=True)
image_model = AutoModelForCausalLM.from_pretrained(
    image_model_id, device_map="cuda", trust_remote_code=True
).eval()

# Load the code-generation model and tokenizer
code_model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
code_tokenizer = AutoTokenizer.from_pretrained(code_model_id, trust_remote_code=True)
code_tokenizer.pad_token_id = code_tokenizer.eos_token_id  # model defines no pad token
code_model = AutoModelForCausalLM.from_pretrained(
    code_model_id,
    torch_dtype=torch.float16,
    device_map="auto",
).eval()

# Events used to cancel an in-flight generation from the UI
stop_image_generation = threading.Event()
stop_code_generation = threading.Event()
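
# Note: setting these events only stops the UI streaming loops below; the
# background generate() threads keep decoding until max_new_tokens is reached.
# A minimal sketch of aborting generation itself, using transformers'
# StoppingCriteria API (not wired into the handlers below):
from transformers import StoppingCriteria, StoppingCriteriaList

class StopOnEvent(StoppingCriteria):
    """Return True (stop generating) as soon as the given Event is set."""
    def __init__(self, event: threading.Event):
        self.event = event

    def __call__(self, input_ids, scores, **kwargs) -> bool:
        return self.event.is_set()

# Usage sketch: add
#   stopping_criteria=StoppingCriteriaList([StopOnEvent(stop_code_generation)])
# to generation_kwargs so the worker thread exits promptly on stop.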
def generate_response_image(uploaded_image, user_prompt, temperature, top_p, max_new_tokens):
    stop_image_generation.clear()
    if uploaded_image is None:
        yield "Please upload an image first."
        return
    # Qwen-VL's prompt helper expects an image path, so persist the upload to disk
    temp_path = "/tmp/temp_image.png"
    uploaded_image.save(temp_path)
    image_sys_prompt = (
        "You are a helpful assistant that describes images very concisely. "
        "Provide a one-sentence summary of the image in less than 15 words. "
        "Use simple, direct language."
    )
    # Compose the multimodal prompt using the tokenizer's helper
    query_text = image_tokenizer.from_list_format([
        {"image": temp_path},
        {"text": f"<|system|>\n{image_sys_prompt}\n<|end|>"},
        {"text": f"<|user|>\n{user_prompt}\n<|end|>"},
        {"text": "<|assistant|>"},
    ])
    # Tokenize the prompt -> input_ids and attention_mask tensors on the GPU
    inputs = image_tokenizer(query_text, return_tensors="pt").to("cuda")
    streamer = TextIteratorStreamer(image_tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        temperature=temperature,
        top_p=top_p,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        use_cache=True,
        return_dict_in_generate=True,
    )
    # Generate on a background thread so tokens can be streamed as they decode
    thread = threading.Thread(target=image_model.generate, kwargs=generation_kwargs)
    thread.start()
    response = ""
    for new_text in streamer:
        if stop_image_generation.is_set():
            break
        response += new_text
        yield response
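
# Standalone usage sketch (outside Gradio; assumes a local "example.png" exists):
#   from PIL import Image
#   for partial in generate_response_image(
#           Image.open("example.png"), "Describe the picture", 0.7, 0.95, 100):
#       print(partial)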
def stop_image_generation_func():
    # Signal the streaming loop to stop, and clear the output textbox
    stop_image_generation.set()
    return ""
def generate_stream_local(prompt, temperature, top_p, max_new_tokens):
    stop_code_generation.clear()
    inputs = code_tokenizer(prompt, return_tensors="pt").to(code_model.device)
    streamer = TextIteratorStreamer(code_tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        temperature=temperature,
        top_p=top_p,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        use_cache=True,
        return_dict_in_generate=True,
    )
    # Generate on a background thread; the streamer yields text as tokens decode
    thread = threading.Thread(target=code_model.generate, kwargs=generation_kwargs)
    thread.start()
    for new_text in streamer:
        if stop_code_generation.is_set():
            break
        yield new_text
# --- Respond logic for Gradio ---
def respond(message, temperature, top_p, max_new_tokens):
    sys_prompt = (
        "You are an AI coding assistant. If the user input is too vague to generate accurate code "
        "(e.g., lacks programming language, method, or details), ask clarifying questions before attempting to write the code.\n"
        "Think silently first and write your reasoning inside <think>...</think>. Then provide your final user-facing answer."
    )
    full_prompt = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": message},
    ]
    prompt = code_tokenizer.apply_chat_template(full_prompt, tokenize=False, add_generation_prompt=True)
    response = ""
    for part in generate_stream_local(prompt, temperature, top_p, max_new_tokens):
        response += part
        yield response
    # Future work: separate the reasoning process from the final answer, e.g.:
    # if "</think>" in response:
    #     yield response.split("</think>")[-1].strip()
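
# A minimal sketch of that separation (hypothetical helper, not wired into the
# UI), assuming the model closes its reasoning with a </think> tag as the
# system prompt above requests:
def split_reasoning(text: str):
    """Return (reasoning, answer); reasoning is empty if no </think> is present."""
    if "</think>" in text:
        reasoning, _, answer = text.partition("</think>")
        return reasoning.replace("<think>", "").strip(), answer.strip()
    return "", text.strip()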
def stop_code_generation_func():
    # Signal the streaming loop to stop, and reset the Markdown output
    stop_code_generation.set()
    return "🧾 Generated Code Output"
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # 🖼️ Image Description tab
    with gr.Tab("🖼️ Image Description"):
        gr.Markdown("## 🧠 Qwen-VL: Vision-Language Streaming Chat with Image Upload")
        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                image_input = gr.Image(
                    type="pil",
                    label="📤 Upload Image",
                    height=480,
                    width=480,
                )
            with gr.Column(scale=1):
                prompt_input = gr.Textbox(
                    label="💬 Prompt",
                    placeholder="e.g. Describe the image content",
                    value="Describe the picture",
                    lines=2,
                )
                with gr.Row():
                    temperature = gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.7,
                        step=0.05,
                        label="🎲 Temperature",
                        info="Controls randomness. Higher = more creative.",
                    )
                    top_p = gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.95,
                        step=0.05,
                        label="📈 Top-p",
                        info="Cumulative probability for nucleus sampling.",
                    )
                    max_new_tokens = gr.Slider(
                        minimum=50,
                        maximum=1000,
                        value=500,
                        step=10,
                        label="📏 Max New Tokens",
                        info="Maximum length of generated output.",
                    )
                generate_btn = gr.Button("🚀 Generate Description", variant="primary")
                stop_btn = gr.Button("⏹️ Stop and Clear", variant="stop")
        output = gr.Textbox(
            label="📝 Streaming Response",
            placeholder="The model will respond here...",
            lines=10,
            interactive=False,
        )
        generate_btn.click(
            fn=generate_response_image,
            inputs=[image_input, prompt_input, temperature, top_p, max_new_tokens],
            outputs=output,
        )
        stop_btn.click(fn=stop_image_generation_func, outputs=output)
    # 💻 Code Generator tab
    with gr.Tab("💻 Code Generator"):
        gr.Markdown("## 🤖 DeepSeek-R1-Distill-Qwen: Code Generation from Natural Language")
        with gr.Row(equal_height=True):
            with gr.Column(scale=2):
                code_des = gr.Textbox(
                    label="🧾 Describe Your Code",
                    placeholder="e.g. Write a Python function to reverse a string",
                    lines=8,
                )
                generate_code_btn = gr.Button("🧠 Generate Code", variant="primary")
                stop_code_btn = gr.Button("⏹️ Stop and Clear", variant="stop")
            with gr.Column(scale=1):
                temperature_code = gr.Slider(
                    minimum=0.1,
                    maximum=1.5,
                    value=0.7,
                    step=0.05,
                    label="🎲 Temperature",
                    info="Higher = more creative code.",
                )
                top_p_code = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.95,
                    step=0.05,
                    label="📈 Top-p",
                    info="Top-p sampling filter.",
                )
                max_new_tokens_code = gr.Slider(
                    minimum=50,
                    maximum=2048,
                    value=1000,
                    step=10,
                    label="📏 Max New Tokens",
                    info="Maximum token length of generated code.",
                )
        output_code = gr.Markdown(
            value="🧾 Generated Code Output",
            label="🧾 Generated Code Output",
            show_label=True,
            visible=True,
            container=True,
            height=300,
            show_copy_button=True,
        )
        generate_code_btn.click(
            fn=respond,
            inputs=[code_des, temperature_code, top_p_code, max_new_tokens_code],
            outputs=output_code,
        )
        stop_code_btn.click(fn=stop_code_generation_func, outputs=output_code)
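
# Note: generator callbacks stream through Gradio's queue. Recent Gradio
# versions enable it by default; on older versions you may need
# demo.queue() before launch() for streaming output to update.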
demo.launch()