Spaces:

ColdSlim
/

DermalCare

Sleeping

App Files Files Community

DermalCare / app.py

ColdSlim

Update app.py

ba6b39e verified 14 days ago

raw

history blame contribute delete

3.77 kB

	"""
	PetBull‑7B‑VL demo – ZeroGPU‑ready (Qwen2.5‑VL API)
	"""
	import os
	import spaces
	import torch
	import gradio as gr
	from PIL import Image
	from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
	from peft import PeftModel
	from qwen_vl_utils import process_vision_info # pip install qwen-vl-utils
	import transformers, accelerate, numpy as np

	# Log the versions of the core libraries once at startup.
	print(
	    "VERSIONS:",
	    transformers.__version__,
	    accelerate.__version__,
	    torch.__version__,
	    np.__version__,
	)
	# NOTE(review): this is set after `accelerate` has already been imported; what
	# (if anything) reads this variable is not visible in this file — confirm.
	os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true"

	# ---- Config ----
	BASE_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"  # base checkpoint id
	ADAPTER_REPO = "ColdSlim/PetBull-7B"  # LoRA adapter repo
	ADAPTER_REV = "master"  # adapter revision to load
	OFFLOAD_DIR = "offload"  # passed as offload_folder when loading the base model
	DTYPE = torch.float16  # dtype requested for the base model weights

	# ---- Processor (no GPU) ----
	processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)

	# ---- Base model ON CPU (do NOT touch CUDA here) ----
	# Everything is mapped to the CPU at import time; the move to CUDA happens
	# later, inside generate_answer.
	base = Qwen2_5_VLForConditionalGeneration.from_pretrained(
	    BASE_MODEL,
	    device_map={"": "cpu"},
	    torch_dtype=DTYPE,
	    low_cpu_mem_usage=True,
	    offload_folder=OFFLOAD_DIR,
	    trust_remote_code=True,
	)

	# ---- Attach LoRA ON CPU ----
	model = PeftModel.from_pretrained(
	    base,
	    ADAPTER_REPO,
	    device_map={"": "cpu"},
	    revision=ADAPTER_REV,
	)
	model = model.eval()  # switch to evaluation mode

	# Flipped to True by generate_answer after the first move of `model` to CUDA.
	_model_on_gpu = False

	# ---- Inference on GPU (ZeroGPU pattern) ----
	@spaces.GPU(duration=120)
	def generate_answer(image, question, temperature=0.7, top_p=0.95, max_tokens=256):
	    """
	    Answer a question about an image with the LoRA-adapted Qwen2.5-VL model.

	    Builds a single-turn chat prompt with the processor's chat template and
	    ``process_vision_info``, moves the model to CUDA on the first call, runs
	    sampled generation, and decodes only the newly generated tokens.

	    Args:
	        image: PIL image to discuss, or ``None`` to substitute a blank white
	            224x224 placeholder.
	        question: User text. ``None``, empty, or whitespace-only input falls
	            back to "Describe this image.".
	        temperature: Sampling temperature forwarded to ``model.generate``.
	        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.
	        max_tokens: Maximum number of new tokens. Coerced to an int of at
	            least 1, because UI sliders may deliver floats.

	    Returns:
	        The decoded reply as a string, with the prompt tokens removed.
	    """
	    global _model_on_gpu

	    if image is None:
	        # The photo is optional in the UI; use a neutral placeholder so the
	        # image+text message shape stays the same.
	        image = Image.new("RGB", (224, 224), color="white")

	    # Whitespace-only text is treated the same as no text at all.
	    prompt_text = (question or "").strip() or "Describe this image."

	    if not _model_on_gpu:
	        # Once-per-session move; the model is loaded on CPU at import time.
	        model.to("cuda")
	        _model_on_gpu = True

	    # Build chat messages in Qwen format
	    messages = [{
	        "role": "user",
	        "content": [
	            {"type": "image", "image": image},
	            {"type": "text", "text": prompt_text},
	        ],
	    }]

	    # Processor helpers
	    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
	    image_inputs, video_inputs = process_vision_info(messages)

	    # Pack tensors and move them to the GPU
	    inputs = processor(
	        text=[text],
	        images=image_inputs,
	        videos=video_inputs,
	        padding=True,
	        return_tensors="pt",
	    )
	    inputs = inputs.to("cuda")

	    with torch.no_grad():
	        out = model.generate(
	            **inputs,
	            # Slider values may arrive as floats; generate needs an integer count.
	            max_new_tokens=max(1, int(max_tokens)),
	            # Request sampling explicitly so temperature/top_p take effect
	            # regardless of the checkpoint's default generation config.
	            do_sample=True,
	            temperature=float(temperature),
	            top_p=float(top_p),
	        )

	    # Trim prompt tokens before decode (Qwen style)
	    trimmed = [
	        generated[len(prompt_ids):]
	        for prompt_ids, generated in zip(inputs["input_ids"], out)
	    ]
	    decoded = processor.batch_decode(
	        trimmed,
	        skip_special_tokens=True,
	        clean_up_tokenization_spaces=False,
	    )
	    return decoded[0]

	# ---- UI ----
	# Two-column layout: inputs and sampling controls on the left, the model's
	# reply on the right.
	with gr.Blocks(title="PetBull‑7B‑VL (ZeroGPU, Qwen2.5‑VL)") as demo:
	    gr.Markdown("## PetBull‑7B‑VL – Ask a Vet\nUpload a photo and/or type a question.")
	    with gr.Row():
	        with gr.Column():
	            img_in = gr.Image(label="Pet photo (optional)", type="pil")
	            txt_in = gr.Textbox(placeholder="Describe the issue…", lines=3)
	            ask = gr.Button("Ask PetBull")
	            # Sampling controls; their values are passed straight to generate_answer.
	            temp = gr.Slider(minimum=0.1, maximum=1.5, value=0.7, label="Temperature")
	            topp = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, label="Top‑p")
	            max_tok = gr.Slider(minimum=32, maximum=512, value=256, step=8, label="Max tokens")
	        with gr.Column():
	            answer = gr.Textbox(label="Assistant", lines=12, interactive=False)

	    # Component order here matches generate_answer's parameter order.
	    ask.click(
	        fn=generate_answer,
	        inputs=[img_in, txt_in, temp, topp, max_tok],
	        outputs=answer,
	    )

	# Enable request queuing, then start the server.
	demo.queue()
	demo.launch(show_api=False, share=True)