import threading

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

# Load the vision-language model and tokenizer
image_model_id = "Qwen/Qwen-VL-Chat-Int4"
image_tokenizer = AutoTokenizer.from_pretrained(image_model_id, trust_remote_code=True)
image_model = AutoModelForCausalLM.from_pretrained(
    image_model_id, device_map="cuda", trust_remote_code=True
).eval()

# Load the code-generation model and tokenizer
code_model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
code_tokenizer = AutoTokenizer.from_pretrained(code_model_id, trust_remote_code=True)
code_tokenizer.pad_token_id = code_tokenizer.eos_token_id
code_model = AutoModelForCausalLM.from_pretrained(
    code_model_id,
    torch_dtype=torch.float16,
    device_map="auto",
).eval()

# Events used to signal the streaming loops to stop
stop_image_generation = threading.Event()
stop_code_generation = threading.Event()


def generate_response_image(uploaded_image, user_prompt, temperature, top_p, max_new_tokens):
    stop_image_generation.clear()

    # Guard against clicking "Generate" before an image is uploaded
    if uploaded_image is None:
        yield "Please upload an image first."
        return

    temp_path = "/tmp/temp_image.png"
    uploaded_image.save(temp_path)

    image_sys_prompt = (
        "You are a helpful assistant that describes images very concisely. "
        "Provide a one-sentence summary of the image in less than 15 words. "
        "Use simple, direct language."
    )

    # Compose the prompt using the Qwen-VL tokenizer's helper
    query_text = image_tokenizer.from_list_format([
        {"image": temp_path},
        {"text": f"<|system|>\n{image_sys_prompt}\n<|end|>"},
        {"text": f"<|user|>\n{user_prompt}\n<|end|>"},
        {"text": "<|assistant|>"},
    ])

    # Tokenize the input text -> input_ids and attention_mask tensors
    inputs = image_tokenizer(query_text, return_tensors="pt").to("cuda")
    streamer = TextIteratorStreamer(image_tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        temperature=temperature,
        top_p=top_p,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        use_cache=True,
        return_dict_in_generate=True,
    )

    # Run generation in a background thread so tokens can be streamed to the UI
    thread = threading.Thread(target=image_model.generate, kwargs=generation_kwargs)
    thread.start()

    response = ""
    for new_text in streamer:
        # Note: breaking here only stops streaming to the UI; the generate()
        # thread keeps running until it finishes on its own.
        if stop_image_generation.is_set():
            break
        response += new_text
        yield response


def stop_image_generation_func():
    stop_image_generation.set()
    return ""


def generate_stream_local(prompt, temperature, top_p, max_new_tokens):
    stop_code_generation.clear()
    inputs = code_tokenizer(prompt, return_tensors="pt").to(code_model.device)
    streamer = TextIteratorStreamer(code_tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        temperature=temperature,
        top_p=top_p,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        use_cache=True,
        return_dict_in_generate=True,
    )

    thread = threading.Thread(target=code_model.generate, kwargs=generation_kwargs)
    thread.start()

    for new_text in streamer:
        if stop_code_generation.is_set():
            break
        yield new_text
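
# The respond() function below leaves separating the model's <think> reasoning
# from its user-facing answer as future work. A minimal sketch of one way to do
# it, assuming DeepSeek-R1's <think>...</think> tag convention (split_reasoning
# is a hypothetical helper added here for illustration, not part of the app):
def split_reasoning(text):
    """Split generated text into (reasoning, answer) at the closing </think> tag."""
    if "</think>" in text:
        reasoning, _, answer = text.partition("</think>")
        return reasoning.replace("<think>", "").strip(), answer.strip()
    # No closing tag yet (e.g., mid-stream): treat the whole text as reasoning.
    return text.strip(), ""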

# --- Respond logic for Gradio ---
def respond(message, temperature, top_p, max_new_tokens):
    sys_prompt = (
        "You are an AI coding assistant. If the user input is too vague to generate accurate code "
        "(e.g., lacks programming language, method, or details), ask clarifying questions before attempting to write the code.\n"
        "Think silently first and write your reasoning inside <think>...</think>. Then provide your final user-facing answer."
    )
    full_prompt = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": message},
    ]
    prompt = code_tokenizer.apply_chat_template(full_prompt, tokenize=False, add_generation_prompt=True)

    response = ""
    for part in generate_stream_local(prompt, temperature, top_p, max_new_tokens):
        response += part
        yield response
    # Future work: separate the reasoning process from the final answer,
    # e.g. with the split_reasoning sketch above.
    # if "</think>" in response:
    #     yield response.split("</think>")[-1].strip()


def stop_code_generation_func():
    stop_code_generation.set()
    return "🧾 Generated Code Output"


with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # 🖼️ Image Description Tab
    with gr.Tab("🖼️ Image Description"):
        gr.Markdown("## 🧠 Qwen-VL: Vision-Language Streaming Chat with Image Upload")

        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                image_input = gr.Image(
                    type="pil",
                    label="📤 Upload Image",
                    height=480,
                    width=480,
                )
            with gr.Column(scale=1):
                prompt_input = gr.Textbox(
                    label="💬 Prompt",
                    placeholder="e.g. Describe the image content",
                    value="Describe the picture",
                    lines=2,
                )
                with gr.Row():
                    temperature = gr.Slider(
                        minimum=0.1, maximum=1.0, value=0.7, step=0.05,
                        label="🎲 Temperature",
                        info="Controls randomness. Higher = more creative.",
                    )
                    top_p = gr.Slider(
                        minimum=0.1, maximum=1.0, value=0.95, step=0.05,
                        label="🔝 Top-p",
                        info="Cumulative probability for nucleus sampling.",
                    )
                    max_new_tokens = gr.Slider(
                        minimum=50, maximum=1000, value=500, step=10,
                        label="📏 Max New Tokens",
                        info="Maximum length of generated output.",
                    )
                generate_btn = gr.Button("🚀 Generate Description", variant="primary")
                stop_btn = gr.Button("⏹️ Stop and Clear", variant="stop")
                output = gr.Textbox(
                    label="📄 Streaming Response",
                    placeholder="The model will respond here...",
                    lines=10,
                    interactive=False,
                )

        generate_btn.click(
            fn=generate_response_image,
            inputs=[image_input, prompt_input, temperature, top_p, max_new_tokens],
            outputs=output,
        )
        stop_btn.click(fn=stop_image_generation_func, outputs=output)

    # 💻 Code Generator Tab
    with gr.Tab("💻 Code Generator"):
        gr.Markdown("## 🤖 DeepSeek-R1-Distill-Qwen: Code Generation from Natural Language")

        with gr.Row(equal_height=True):
            with gr.Column(scale=2):
                code_des = gr.Textbox(
                    label="🧾 Describe Your Code",
                    placeholder="e.g. Write a Python function to reverse a string",
                    lines=8,
                )
                generate_code_btn = gr.Button("🧠 Generate Code", variant="primary")
                stop_code_btn = gr.Button("⏹️ Stop and Clear", variant="stop")
            with gr.Column(scale=1):
                temperature_code = gr.Slider(
                    minimum=0.1, maximum=1.5, value=0.7, step=0.05,
                    label="🎲 Temperature",
                    info="Higher = more creative code.",
                )
                top_p_code = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
                    label="🔝 Top-p",
                    info="Top-p sampling filter.",
                )
                max_new_tokens_code = gr.Slider(
                    minimum=50, maximum=2048, value=1000, step=10,
                    label="📏 Max New Tokens",
                    info="Maximum token length of generated code.",
                )

        output_code = gr.Markdown(
            value="🧾 Generated Code Output",
            label="🧾 Generated Code Output",
            show_label=True,
            visible=True,
            container=True,
            height=300,
            show_copy_button=True,
        )

        generate_code_btn.click(
            fn=respond,
            inputs=[code_des, temperature_code, top_p_code, max_new_tokens_code],
            outputs=output_code,
        )
        stop_code_btn.click(fn=stop_code_generation_func, outputs=output_code)

demo.launch()
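
# A note on the stop buttons: setting the events only abandons the streaming
# loops above; each model.generate() call keeps running in its background
# thread until it finishes on its own. A minimal, untested sketch of a hard
# stop using transformers' StoppingCriteria API (EventStoppingCriteria is a
# name chosen here for illustration, not part of the original app):
#
#     from transformers import StoppingCriteria, StoppingCriteriaList
#
#     class EventStoppingCriteria(StoppingCriteria):
#         def __init__(self, event):
#             self.event = event
#
#         def __call__(self, input_ids, scores, **kwargs):
#             # Returning True aborts generation at the next decoding step.
#             return self.event.is_set()
#
#     # e.g., inside generate_stream_local(), add to generation_kwargs:
#     #     stopping_criteria=StoppingCriteriaList([EventStoppingCriteria(stop_code_generation)])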