ColdSlim committed
Commit a9fcaee · verified · 1 Parent(s): 97a1db9

Update app.py

Files changed (1):
  1. app.py +72 -67
app.py CHANGED
@@ -1,102 +1,107 @@
  """
- PetBull-7B-VL demo Space
- ------------------------

  • Base model : Qwen/Qwen2.5-VL-7B-Instruct
- • LoRA adapter: ColdSlim/PetBull-7B (light-weight repo you just pushed)

- Put this file in your Space, add a `requirements.txt` with:
-     transformers>=4.41.0
-     peft>=0.11.0
-     accelerate
-     gradio>=4.33
-
- Then (optionally) switch the Space hardware to **GPU (shared)** in
- Settings → Hardware for much faster vision-language inference.
  """

- import torch, gradio as gr
  from PIL import Image
  from transformers import AutoProcessor, AutoModelForVision2Seq
  from peft import PeftModel
- from transformers import BitsAndBytesConfig

  # ---------------------------------------------------------------------
- # 1 Load base + LoRA (≈ 12 GB VRAM in bf16; falls back to CPU if needed)
  # ---------------------------------------------------------------------
  BASE_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
- ADAPTER_REPO = "ColdSlim/PetBull-7B"   # 👉 replace with your HF path if different
- ADAPTER_REV = "master"

- device = "cuda" if torch.cuda.is_available() else "cpu"
- dtype = torch.float16 if device == "cuda" else torch.float32

  processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)

- quant_cfg = BitsAndBytesConfig(
-     load_in_4bit=True,
-     bnb_4bit_compute_dtype=torch.float16,
-     bnb_4bit_use_double_quant=True,
-     bnb_4bit_quant_type="nf4"
  )

- base = AutoModelForVision2Seq.from_pretrained(
-     BASE_MODEL, device_map="auto", torch_dtype=torch.float16, quantization_config=quant_cfg, trust_remote_code=True)
- model = PeftModel.from_pretrained(base, ADAPTER_REPO, revision=ADAPTER_REV)
- model.to(device).eval()

  # ---------------------------------------------------------------------
- # 2 Inference helper
  # ---------------------------------------------------------------------
- def generate_answer(image: Image.Image | None,
-                     question: str,
-                     temperature: float = 0.7,
-                     top_p: float = 0.95,
-                     max_tokens: int = 512) -> str:
-     """
-     Runs one-shot VQA chat. Image is optional; if None we still obey the
-     prompt format required by Qwen-VL by inserting a blank white image.
-     """
      if image is None:
          image = Image.new("RGB", (224, 224), color="white")

-     inputs = processor(text=[question],
-                        images=[image],
-                        return_tensors="pt").to(device)
-
      with torch.no_grad():
-         output_ids = model.generate(**inputs,
-                                     max_new_tokens=max_tokens,
-                                     temperature=temperature,
-                                     top_p=top_p)
-     return processor.batch_decode(output_ids,
-                                   skip_special_tokens=True)[0]

  # ---------------------------------------------------------------------
- # 3 Gradio UI
  # ---------------------------------------------------------------------
- with gr.Blocks(title="PetBull-7B-VL – Ask a Vet Bot") as demo:
      gr.Markdown(
-         """
-         ## 🐾 PetBull-7B-VL
-         Upload a photo of your pet **and/or** ask a question.
-         The model will analyse the image (if provided) and give tailored advice.
-         """
      )

      with gr.Row():
-         with gr.Column(scale=1):
-             img_in = gr.Image(type="pil", label="Pet photo (optional)")
-             txt_in = gr.Textbox(lines=3, placeholder="Describe the issue or ask a question…")
-             run_btn = gr.Button("Ask PetBull")
-             temp_sl = gr.Slider(0.1, 1.5, 0.7, label="Temperature")
-             topp_sl = gr.Slider(0.1, 1.0, 0.95, label="Top-p")
-             max_sl = gr.Slider(32, 1024, 512, step=8, label="Max new tokens")
-         with gr.Column(scale=1):
-             answer = gr.Textbox(lines=12, label="Assistant", interactive=False)
-
-     run_btn.click(fn=generate_answer,
-                   inputs=[img_in, txt_in, temp_sl, topp_sl, max_sl],
-                   outputs=answer)

  demo.queue().launch()
  """
+ PetBull-7B-VL demo – CPU-only, 16 GB-friendly
+ --------------------------------------------

  • Base model : Qwen/Qwen2.5-VL-7B-Instruct
+ • LoRA adapter: ColdSlim/PetBull-7B (master branch)

+ This script:
+   ✓ loads in bfloat16 (saves ~25 % RAM vs FP16)
+   ✓ streams weights to avoid peak memory spikes
+   ✓ off-loads large tensors to disk when RAM is tight
  """

+ import os, torch, gradio as gr
  from PIL import Image
  from transformers import AutoProcessor, AutoModelForVision2Seq
  from peft import PeftModel

  # ---------------------------------------------------------------------
+ # 0 Env tweaks for Hugging Face Accelerate
+ # ---------------------------------------------------------------------
+ os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true"   # safer streaming
+
+ # ---------------------------------------------------------------------
+ # 1 Config
  # ---------------------------------------------------------------------
  BASE_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
+ ADAPTER_REPO = "ColdSlim/PetBull-7B"
+ ADAPTER_REV = "master"       # your model repo branch
+ OFFLOAD_DIR = "offload"      # folder on disk for big tensors

+ device = "cpu"               # force CPU
+ dtype = torch.bfloat16       # lighter than FP16 on modern CPUs

+ # ---------------------------------------------------------------------
+ # 2 Load processor (tiny)
+ # ---------------------------------------------------------------------
  processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)

+ # ---------------------------------------------------------------------
+ # 3 Load base model with memory-savvy flags
+ # ---------------------------------------------------------------------
+ base = AutoModelForVision2Seq.from_pretrained(
+     BASE_MODEL,
+     torch_dtype=dtype,
+     low_cpu_mem_usage=True,          # stream shards
+     device_map={"": "cpu"},          # everything on CPU
+     offload_folder=OFFLOAD_DIR,      # mmap big tensors to disk
+     trust_remote_code=True
  )

+ # ---------------------------------------------------------------------
+ # 4 Attach LoRA
+ # ---------------------------------------------------------------------
+ model = PeftModel.from_pretrained(
+     base,
+     ADAPTER_REPO,
+     revision=ADAPTER_REV,
+     device_map={"": "cpu"}
+ ).eval()

  # ---------------------------------------------------------------------
+ # 5 Inference helper
  # ---------------------------------------------------------------------
+ def generate_answer(
+     image: Image.Image | None,
+     question: str,
+     temperature: float = 0.7,
+     top_p: float = 0.95,
+     max_tokens: int = 256,           # keep small for RAM headroom
+ ) -> str:
      if image is None:
          image = Image.new("RGB", (224, 224), color="white")

+     inputs = processor(text=[question], images=[image], return_tensors="pt")

      with torch.no_grad():
+         output_ids = model.generate(
+             **inputs, max_new_tokens=max_tokens,
+             temperature=temperature, top_p=top_p
+         )
+     return processor.batch_decode(output_ids, skip_special_tokens=True)[0]

  # ---------------------------------------------------------------------
+ # 6 Gradio UI
  # ---------------------------------------------------------------------
+ with gr.Blocks(title="PetBull-7B-VL (CPU)") as demo:
      gr.Markdown(
+         "## 🐾 PetBull-7B-VL – Ask a Vet\n"
+         "Upload a photo and/or type a question."
      )

      with gr.Row():
+         with gr.Column():
+             img_in = gr.Image(type="pil", label="Pet photo (optional)")
+             txt_in = gr.Textbox(lines=3, placeholder="Describe the issue…")
+             ask = gr.Button("Ask PetBull")
+             temp = gr.Slider(0.1, 1.5, 0.7, label="Temperature")
+             topp = gr.Slider(0.1, 1.0, 0.95, label="Top-p")
+             max_tok = gr.Slider(32, 512, 256, step=8, label="Max tokens")
+         with gr.Column():
+             answer = gr.Textbox(lines=12, label="Assistant", interactive=False)
+
+     ask.click(generate_answer,
+               inputs=[img_in, txt_in, temp, topp, max_tok],
+               outputs=answer)

  demo.queue().launch()
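
A note on dependencies: this revision drops the `requirements.txt` hint from the old docstring, but the CPU path still needs the same packages, since `low_cpu_mem_usage`, `device_map` and `offload_folder` are all handled through `accelerate`. A minimal sketch, carrying over the pins listed in the removed docstring:

```text
# requirements.txt (sketch; pins taken from the old docstring)
transformers>=4.41.0
peft>=0.11.0
accelerate
gradio>=4.33
```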
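Two behaviours of `generate_answer` are worth flagging. Qwen2.5-VL expects the vision placeholder tokens that `processor.apply_chat_template` inserts into the prompt, so passing the raw question string may not bind the image to the text correctly; and `model.generate` ignores `temperature`/`top_p` unless `do_sample=True`, so the call as written decodes greedily. A minimal sketch of an alternative helper, reusing the `processor` and `model` objects defined above and assuming the stock Qwen2.5-VL chat template:

```python
import torch
from PIL import Image

def generate_answer_sketch(image, question, temperature=0.7, top_p=0.95, max_tokens=256):
    if image is None:
        image = Image.new("RGB", (224, 224), color="white")

    # Chat-formatted prompt so the processor inserts the image placeholder tokens.
    messages = [{
        "role": "user",
        "content": [{"type": "image"}, {"type": "text", "text": question}],
    }]
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(text=[prompt], images=[image], return_tensors="pt")

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=True,            # required for temperature/top_p to take effect
            temperature=temperature,
            top_p=top_p,
        )

    # Drop the echoed prompt tokens before decoding so only the answer is returned.
    answer_ids = output_ids[:, inputs["input_ids"].shape[1]:]
    return processor.batch_decode(answer_ids, skip_special_tokens=True)[0]
```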
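Since the adapter is only used for inference here, it could also be merged into the base weights once at startup; PEFT's `merge_and_unload()` removes the per-layer LoRA indirection, which helps a little on CPU. A sketch, assuming there is enough RAM for the one-off merge:

```python
# Optional: fold the LoRA weights into the base model after loading.
# Trades a temporary memory bump during the merge for slightly faster
# CPU inference and a plain transformers model afterwards.
model = PeftModel.from_pretrained(base, ADAPTER_REPO, revision=ADAPTER_REV)
model = model.merge_and_unload()
model.eval()
```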
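Two of the docstring's memory claims look optimistic. bfloat16 and float16 are both 2 bytes per parameter, so the saving is relative to FP32 rather than FP16; and with `device_map={"": "cpu"}` every module is pinned to CPU, so `offload_folder` is likely never consulted, since Accelerate only spills weights for modules mapped to `"disk"`. If disk offload is really needed on a 16 GB box, a hedged sketch (slow, and the `12GiB` budget is an illustrative assumption):

```python
# Let Accelerate place modules itself and spill whatever exceeds the CPU
# budget into OFFLOAD_DIR. Expect noticeably slower generation.
base = AutoModelForVision2Seq.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    device_map="auto",
    max_memory={"cpu": "12GiB"},
    offload_folder=OFFLOAD_DIR,
    trust_remote_code=True,
)
```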