Kuberwastaken committed
Commit e8b90e6 · 1 Parent(s): 6e0e43b

Switching to SmolVLM 500M

Files changed (1):
  1. app.py +28 -11
app.py CHANGED
@@ -23,29 +23,46 @@ import re
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 def initialize_vision_model():
-    # Using BLIP for image captioning - lightweight but effective
-    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
-    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
-
+    model_id = "HuggingFaceTB/SmolVLM-500M-Instruct"
+    processor = AutoProcessor.from_pretrained(model_id)
+    model = AutoModelForVision2Seq.from_pretrained(
+        model_id,
+        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
+    )
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model = model.to(device)
     return {
         "processor": processor,
-        "model": model
+        "model": model,
+        "device": device
     }
 
-def analyze_image(image, vision_components):
+def analyze_image(image, vision_components, instruction="What do you see?"):
     processor = vision_components["processor"]
     model = vision_components["model"]
+    device = vision_components["device"]
     if isinstance(image, np.ndarray):
         image = Image.fromarray(image)
     try:
-        inputs = processor(image, return_tensors="pt")
+        # Prepare chat template
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image"},
+                    {"type": "text", "text": instruction}
+                ]
+            }
+        ]
+        text = processor.apply_chat_template(messages, add_generation_prompt=True)
+        inputs = processor(text, [image], return_tensors="pt", do_image_splitting=False).to(device)
         with torch.no_grad():
-            outputs = model.generate(**inputs, max_length=30)
-            caption = processor.decode(outputs[0], skip_special_tokens=True)
-            return caption if isinstance(caption, str) else ""
+            generated_ids = model.generate(**inputs, max_new_tokens=100)
+            output = processor.batch_decode(generated_ids[:, inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
+            return output[0].strip() if output else ""
     except Exception as e:
         print(f"Error in analyze_image: {str(e)}")
-        return ""  # Return empty string on error
+        return ""
 
 def initialize_llm():
     model_id = "meta-llama/Llama-3.2-1B-Instruct"