ColdSlim committed
Commit a9fcaee · verified · 1 Parent(s): 97a1db9

Update app.py

Files changed (1):
  1. app.py +72 -67
app.py CHANGED
@@ -1,102 +1,107 @@
  """
- PetBull-7B-VL demo Space
- ------------------------

  • Base model : Qwen/Qwen2.5-VL-7B-Instruct
- • LoRA adapter: ColdSlim/PetBull-7B (light-weight repo you just pushed)

- Put this file in your Space, add a `requirements.txt` with:
-     transformers>=4.41.0
-     peft>=0.11.0
-     accelerate
-     gradio>=4.33
-
- Then (optionally) switch the Space hardware to **GPU (shared)** in
- Settings → Hardware for much faster vision-language inference.
  """

- import torch, gradio as gr
  from PIL import Image
  from transformers import AutoProcessor, AutoModelForVision2Seq
  from peft import PeftModel
- from transformers import BitsAndBytesConfig

  # ---------------------------------------------------------------------
- # 1 Load base + LoRA (≈ 12 GB VRAM in bf16; falls back to CPU if needed)
  # ---------------------------------------------------------------------
  BASE_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
- ADAPTER_REPO = "ColdSlim/PetBull-7B"   # 👉 replace with your HF path if different
- ADAPTER_REV = "master"

- device = "cuda" if torch.cuda.is_available() else "cpu"
- dtype = torch.float16 if device == "cuda" else torch.float32

  processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)

- quant_cfg = BitsAndBytesConfig(
-     load_in_4bit=True,
-     bnb_4bit_compute_dtype=torch.float16,
-     bnb_4bit_use_double_quant=True,
-     bnb_4bit_quant_type="nf4"
  )

- base = AutoModelForVision2Seq.from_pretrained(
-     BASE_MODEL, device_map="auto", torch_dtype=torch.float16, quantization_config=quant_cfg, trust_remote_code=True)
- model = PeftModel.from_pretrained(base, ADAPTER_REPO, revision=ADAPTER_REV)
- model.to(device).eval()

  # ---------------------------------------------------------------------
- # 2 Inference helper
  # ---------------------------------------------------------------------
- def generate_answer(image: Image.Image | None,
-                     question: str,
-                     temperature: float = 0.7,
-                     top_p: float = 0.95,
-                     max_tokens: int = 512) -> str:
-     """
-     Runs one-shot VQA chat. Image is optional; if None we still obey the
-     prompt format required by Qwen-VL by inserting a blank white image.
-     """
      if image is None:
          image = Image.new("RGB", (224, 224), color="white")

-     inputs = processor(text=[question],
-                        images=[image],
-                        return_tensors="pt").to(device)
-
      with torch.no_grad():
-         output_ids = model.generate(**inputs,
-                                     max_new_tokens=max_tokens,
-                                     temperature=temperature,
-                                     top_p=top_p)
-     return processor.batch_decode(output_ids,
-                                   skip_special_tokens=True)[0]

  # ---------------------------------------------------------------------
- # 3 Gradio UI
  # ---------------------------------------------------------------------
- with gr.Blocks(title="PetBull-7B-VL – Ask a Vet Bot") as demo:
      gr.Markdown(
-         """
-         ## 🐾 PetBull-7B-VL
-         Upload a photo of your pet **and/or** ask a question.
-         The model will analyse the image (if provided) and give tailored advice.
-         """
      )

      with gr.Row():
-         with gr.Column(scale=1):
-             img_in = gr.Image(type="pil", label="Pet photo (optional)")
-             txt_in = gr.Textbox(lines=3, placeholder="Describe the issue or ask a question…")
-             run_btn = gr.Button("Ask PetBull")
-             temp_sl = gr.Slider(0.1, 1.5, 0.7, label="Temperature")
-             topp_sl = gr.Slider(0.1, 1.0, 0.95, label="Top-p")
-             max_sl = gr.Slider(32, 1024, 512, step=8, label="Max new tokens")
-         with gr.Column(scale=1):
-             answer = gr.Textbox(lines=12, label="Assistant", interactive=False)
-
-     run_btn.click(fn=generate_answer,
-                   inputs=[img_in, txt_in, temp_sl, topp_sl, max_sl],
-                   outputs=answer)

  demo.queue().launch()
  """
+ PetBull-7B-VL demo – CPU-only, 16 GB-friendly
+ --------------------------------------------

  • Base model : Qwen/Qwen2.5-VL-7B-Instruct
+ • LoRA adapter: ColdSlim/PetBull-7B (master branch)

+ This script:
+   ✓ loads in bfloat16 (saves ~25 % RAM vs FP16)
+   ✓ streams weights to avoid peak memory spikes
+   ✓ off-loads large tensors to disk when RAM is tight
  """

+ import os, torch, gradio as gr
  from PIL import Image
  from transformers import AutoProcessor, AutoModelForVision2Seq
  from peft import PeftModel

  # ---------------------------------------------------------------------
+ # 0 Env tweaks for Hugging Face Accelerate
+ # ---------------------------------------------------------------------
+ os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true"   # safer streaming
+
+ # ---------------------------------------------------------------------
+ # 1 Config
  # ---------------------------------------------------------------------
  BASE_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
+ ADAPTER_REPO = "ColdSlim/PetBull-7B"
+ ADAPTER_REV = "master"       # your model repo branch
+ OFFLOAD_DIR = "offload"      # folder on disk for big tensors

+ device = "cpu"               # force CPU
+ dtype = torch.bfloat16       # lighter than FP16 on modern CPUs

+ # ---------------------------------------------------------------------
+ # 2 Load processor (tiny)
+ # ---------------------------------------------------------------------
  processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)

+ # ---------------------------------------------------------------------
+ # 3 Load base model with memory-savvy flags
+ # ---------------------------------------------------------------------
+ base = AutoModelForVision2Seq.from_pretrained(
+     BASE_MODEL,
+     torch_dtype=dtype,
+     low_cpu_mem_usage=True,          # stream shards
+     device_map={"": "cpu"},          # everything on CPU
+     offload_folder=OFFLOAD_DIR,      # mmap big tensors to disk
+     trust_remote_code=True
  )

+ # ---------------------------------------------------------------------
+ # 4 Attach LoRA
+ # ---------------------------------------------------------------------
+ model = PeftModel.from_pretrained(
+     base,
+     ADAPTER_REPO,
+     revision=ADAPTER_REV,
+     device_map={"": "cpu"}
+ ).eval()

  # ---------------------------------------------------------------------
+ # 5 Inference helper
  # ---------------------------------------------------------------------
+ def generate_answer(
+     image: Image.Image | None,
+     question: str,
+     temperature: float = 0.7,
+     top_p: float = 0.95,
+     max_tokens: int = 256,           # keep small for RAM headroom
+ ) -> str:
      if image is None:
          image = Image.new("RGB", (224, 224), color="white")

+     inputs = processor(text=[question], images=[image], return_tensors="pt")

      with torch.no_grad():
+         output_ids = model.generate(
+             **inputs, max_new_tokens=max_tokens,
+             temperature=temperature, top_p=top_p
+         )
+     return processor.batch_decode(output_ids, skip_special_tokens=True)[0]

  # ---------------------------------------------------------------------
+ # 6 Gradio UI
  # ---------------------------------------------------------------------
+ with gr.Blocks(title="PetBull-7B-VL (CPU)") as demo:
      gr.Markdown(
+         "## 🐾 PetBull-7B-VL – Ask a Vet\n"
+         "Upload a photo and/or type a question."
      )

      with gr.Row():
+         with gr.Column():
+             img_in = gr.Image(type="pil", label="Pet photo (optional)")
+             txt_in = gr.Textbox(lines=3, placeholder="Describe the issue…")
+             ask = gr.Button("Ask PetBull")
+             temp = gr.Slider(0.1, 1.5, 0.7, label="Temperature")
+             topp = gr.Slider(0.1, 1.0, 0.95, label="Top-p")
+             max_tok = gr.Slider(32, 512, 256, step=8, label="Max tokens")
+         with gr.Column():
+             answer = gr.Textbox(lines=12, label="Assistant", interactive=False)
+
+     ask.click(generate_answer,
+               inputs=[img_in, txt_in, temp, topp, max_tok],
+               outputs=answer)

  demo.queue().launch()
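
A note on dependencies: this revision drops the `requirements.txt` hint from the old docstring, but the CPU path still needs the same packages, since `low_cpu_mem_usage`, `device_map` and `offload_folder` are all handled through `accelerate`. A minimal sketch, carrying over the pins listed in the removed docstring:

```text
# requirements.txt (sketch; pins taken from the old docstring)
transformers>=4.41.0
peft>=0.11.0
accelerate
gradio>=4.33
```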
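Two behaviours of `generate_answer` are worth flagging. Qwen2.5-VL expects the vision placeholder tokens that `processor.apply_chat_template` inserts into the prompt, so passing the raw question string may not bind the image to the text correctly; and `model.generate` ignores `temperature`/`top_p` unless `do_sample=True`, so the call as written decodes greedily. A minimal sketch of an alternative helper, reusing the `processor` and `model` objects defined above and assuming the stock Qwen2.5-VL chat template:

```python
import torch
from PIL import Image

def generate_answer_sketch(image, question, temperature=0.7, top_p=0.95, max_tokens=256):
    if image is None:
        image = Image.new("RGB", (224, 224), color="white")

    # Chat-formatted prompt so the processor inserts the image placeholder tokens.
    messages = [{
        "role": "user",
        "content": [{"type": "image"}, {"type": "text", "text": question}],
    }]
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(text=[prompt], images=[image], return_tensors="pt")

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=True,            # required for temperature/top_p to take effect
            temperature=temperature,
            top_p=top_p,
        )

    # Drop the echoed prompt tokens before decoding so only the answer is returned.
    answer_ids = output_ids[:, inputs["input_ids"].shape[1]:]
    return processor.batch_decode(answer_ids, skip_special_tokens=True)[0]
```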
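Since the adapter is only used for inference here, it could also be merged into the base weights once at startup; PEFT's `merge_and_unload()` removes the per-layer LoRA indirection, which helps a little on CPU. A sketch, assuming there is enough RAM for the one-off merge:

```python
# Optional: fold the LoRA weights into the base model after loading.
# Trades a temporary memory bump during the merge for slightly faster
# CPU inference and a plain transformers model afterwards.
model = PeftModel.from_pretrained(base, ADAPTER_REPO, revision=ADAPTER_REV)
model = model.merge_and_unload()
model.eval()
```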
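Two of the docstring's memory claims look optimistic. bfloat16 and float16 are both 2 bytes per parameter, so the saving is relative to FP32 rather than FP16; and with `device_map={"": "cpu"}` every module is pinned to CPU, so `offload_folder` is likely never consulted, since Accelerate only spills weights for modules mapped to `"disk"`. If disk offload is really needed on a 16 GB box, a hedged sketch (slow, and the `12GiB` budget is an illustrative assumption):

```python
# Let Accelerate place modules itself and spill whatever exceeds the CPU
# budget into OFFLOAD_DIR. Expect noticeably slower generation.
base = AutoModelForVision2Seq.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    device_map="auto",
    max_memory={"cpu": "12GiB"},
    offload_folder=OFFLOAD_DIR,
    trust_remote_code=True,
)
```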