Spaces:

ColdSlim
/

DermalCare

Running on Zero

App Files Files Community

DermalCare / app.py

ColdSlim

Update app.py

c8b3c1b verified 17 days ago

raw

history blame

3.22 kB

	"""
	PetBull‑7B‑VL demo – ZeroGPU‑ready
	"""
	import os
	import torch
	import spaces # <-- NEW: import spaces for ZeroGPU
	import gradio as gr
	from PIL import Image
	from transformers import AutoProcessor, AutoModelForVision2Seq
	from peft import PeftModel

	# 0. Environment tweaks for Accelerate (unchanged)
	os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true"

	# 1. Config
	BASE_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
	ADAPTER_REPO = "ColdSlim/PetBull-7B"
	ADAPTER_REV = "master"
	OFFLOAD_DIR = "offload"

	dtype = torch.float16 # <-- use float16 for GPU
	# 2. Load processor
	processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)

	# 3. Load base model on CPU; stream shards to save RAM
	base = AutoModelForVision2Seq.from_pretrained(
	BASE_MODEL,
	torch_dtype=dtype,
	low_cpu_mem_usage=True,
	device_map={"": "cpu"},
	offload_folder=OFFLOAD_DIR,
	trust_remote_code=True,
	)

	# 4. Attach LoRA adapter on CPU
	model = PeftModel.from_pretrained(
	base,
	ADAPTER_REPO,
	revision=ADAPTER_REV,
	device_map={"": "cpu"},
	).eval()

	# Keep track of whether the model has been moved to GPU
	_model_on_gpu = False

	# 5. Inference helper – run on GPU when called
	@spaces.GPU # <-- NEW: request GPU for this function:contentReference[oaicite:3]{index=3}
	def generate_answer(image, question: str,
	temperature: float = 0.7,
	top_p: float = 0.95,
	max_tokens: int = 256):
	global _model_on_gpu
	# provide a placeholder image if none was uploaded
	if image is None:
	image = Image.new("RGB", (224, 224), color="white")

	# move model to GPU once
	if not _model_on_gpu:
	model.to("cuda")
	_model_on_gpu = True

	# prepare inputs on GPU
	inputs = processor(text=[question], images=[image],
	return_tensors="pt").to("cuda")

	with torch.no_grad():
	output_ids = model.generate(**inputs,
	max_new_tokens=max_tokens,
	temperature=temperature,
	top_p=top_p)
	# decode on CPU
	outputs = output_ids.to("cpu")
	return processor.batch_decode(outputs, skip_special_tokens=True)[0]

	# 6. Gradio UI (unchanged except for title)
	with gr.Blocks(title="PetBull‑7B‑VL (ZeroGPU)") as demo:
	gr.Markdown(
	"## PetBull‑7B‑VL – Ask a Vet\n"
	"Upload a photo and/or type a question."
	)
	with gr.Row():
	with gr.Column():
	img_in = gr.Image(type="pil", label="Pet photo (optional)")
	txt_in = gr.Textbox(lines=3, placeholder="Describe the issue…")
	ask = gr.Button("Ask PetBull")
	temp = gr.Slider(0.1, 1.5, 0.7, label="Temperature")
	topp = gr.Slider(0.1, 1.0, 0.95, label="Top‑p")
	max_tok = gr.Slider(32, 512, 256, step=8, label="Max tokens")
	with gr.Column():
	answer = gr.Textbox(lines=12, label="Assistant", interactive=False)

	ask.click(generate_answer,
	inputs=[img_in, txt_in, temp, topp, max_tok],
	outputs=answer)

	demo.queue().launch()