Spaces:

ColdSlim
/

DermalCare

Running on Zero

App Files Files Community

DermalCare / app.py

ColdSlim

Update app.py

79837da verified 16 days ago

raw

history blame

3.04 kB

	"""
	PetBull‑7B‑VL demo – ZeroGPU‑ready
	"""
	import os
	import torch
	import spaces
	import gradio as gr
	from PIL import Image
	from transformers import AutoProcessor, AutoModelForCausalLM
	from peft import PeftModel
	import transformers, accelerate, numpy as np

	print("VERSIONS:", transformers.__version__, accelerate.__version__, torch.__version__, np.__version__)

	# 0) Safer streaming for model shards
	os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true"

	# 1) Config
	BASE_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
	ADAPTER_REPO = "ColdSlim/PetBull-7B"
	ADAPTER_REV = "master"
	OFFLOAD_DIR = "offload"
	DTYPE = torch.float16

	# 2) Processor
	processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)

	# 3) Load base model ON CPU (no AutoConfig; rely on remote code)
	base = AutoModelForCausalLM.from_pretrained(
	BASE_MODEL,
	torch_dtype=DTYPE,
	low_cpu_mem_usage=True,
	device_map={"": "cpu"},
	offload_folder=OFFLOAD_DIR,
	trust_remote_code=True,
	)

	# 4) Attach LoRA ON CPU
	model = PeftModel.from_pretrained(
	base,
	ADAPTER_REPO,
	revision=ADAPTER_REV,
	device_map={"": "cpu"},
	).eval()

	_model_on_gpu = False # track once-per-session transfer

	# 5) Inference (request GPU only for this function)
	@spaces.GPU(duration=120)
	def generate_answer(
	image,
	question: str,
	temperature: float = 0.7,
	top_p: float = 0.95,
	max_tokens: int = 256,
	) -> str:
	global _model_on_gpu

	if image is None:
	image = Image.new("RGB", (224, 224), color="white")

	# Move model to GPU once (inside GPU-decorated function)
	if not _model_on_gpu:
	model.to("cuda")
	_model_on_gpu = True

	# Prepare inputs on GPU
	inputs = processor(text=[question], images=[image], return_tensors="pt")
	inputs = {k: v.to("cuda") if hasattr(v, "to") else v for k, v in inputs.items()}

	with torch.no_grad():
	output_ids = model.generate(
	**inputs,
	max_new_tokens=max_tokens,
	temperature=temperature,
	top_p=top_p,
	)

	outputs = output_ids.to("cpu")
	return processor.batch_decode(outputs, skip_special_tokens=True)[0]

	# 6) UI
	with gr.Blocks(title="PetBull‑7B‑VL (ZeroGPU)") as demo:
	gr.Markdown("## PetBull‑7B‑VL – Ask a Vet\nUpload a photo and/or type a question.")
	with gr.Row():
	with gr.Column():
	img_in = gr.Image(type="pil", label="Pet photo (optional)")
	txt_in = gr.Textbox(lines=3, placeholder="Describe the issue…")
	ask = gr.Button("Ask PetBull")
	temp = gr.Slider(0.1, 1.5, 0.7, label="Temperature")
	topp = gr.Slider(0.1, 1.0, 0.95, label="Top‑p")
	max_tok = gr.Slider(32, 512, 256, step=8, label="Max tokens")
	with gr.Column():
	answer = gr.Textbox(lines=12, label="Assistant", interactive=False)

	ask.click(
	generate_answer,
	inputs=[img_in, txt_in, temp, topp, max_tok],
	outputs=answer,
	)

	demo.queue().launch()