matteomarjanovic committed
Commit 166d9fd · Parent(s): 8749689

switch to groq for image description
Files changed (2):
  1. app.py (+29 -22)
  2. requirements.txt (+1 -1)
app.py CHANGED
```diff
@@ -6,7 +6,8 @@ import spaces #[uncomment to use ZeroGPU]
 from diffusers import DiffusionPipeline
 import torch
 import subprocess
-from transformers import IdeficsForVisionText2Text, AutoProcessor
+from groq import Groq
+import base64
 
 subprocess.run("rm -rf /data-nvme/zerogpu-offload/*", env={}, shell=True)
 
@@ -28,10 +29,9 @@ pipe.load_lora_weights(lora_path, weight_name=weigths_file)
 MAX_SEED = np.iinfo(np.int32).max
 MAX_IMAGE_SIZE = 1024
 
-# Load IDEFICS model for generate the prompt
-checkpoint = "HuggingFaceM4/idefics-9b"
-processor = AutoProcessor.from_pretrained(checkpoint)
-idefics_model = IdeficsForVisionText2Text.from_pretrained(checkpoint, torch_dtype=torch.bfloat16, device_map="auto")
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
 
 
 @spaces.GPU #[uncomment to use ZeroGPU]
@@ -68,22 +68,29 @@ def generate_description_fn(
     image,
     progress=gr.Progress(track_tqdm=True),
 ):
-    if randomize_seed:
-        seed = random.randint(0, MAX_SEED)
-
-    prompt = [
-        "https://images.unsplash.com/photo-1583160247711-2191776b4b91?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3542&q=80",
-    ]
-
-    generator = torch.Generator().manual_seed(seed)
-
-    inputs = processor(prompt, return_tensors="pt").to("cuda")
-    bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
-
-    generated_ids = idefics_model.generate(**inputs, max_new_tokens=10, bad_words_ids=bad_words_ids)
-    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
-
-    return generated_text[0]
+    base64_image = encode_image(image)
+
+    client = Groq()
+
+    chat_completion = client.chat.completions.create(
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What's in this image?"},
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/jpeg;base64,{base64_image}",
+                        },
+                    },
+                ],
+            }
+        ],
+        model="llama-3.2-11b-vision-preview",
+    )
+
+    return chat_completion.choices[0].message.content
 
 
 examples = [
@@ -105,7 +112,7 @@ with gr.Blocks(css=css) as demo:
     with gr.Row():
         with gr.Column(elem_id="col-input-image"):
             gr.Markdown(" # Drop your image here")
-            input_image = gr.Image()
+            input_image = gr.Image(type="filepath")
             generate_button = gr.Button("Generate", scale=0, variant="primary")
             generated_prompt = gr.Markdown("")
         with gr.Column(elem_id="col-container"):
```
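The description path no longer runs a local VLM: `gr.Image(type="filepath")` makes Gradio hand the callback a path string, `encode_image` turns that file into a base64 data URL, and the request goes to Groq's chat completions API. Below is a minimal standalone sketch of that flow, assuming the `groq` SDK is installed and `GROQ_API_KEY` is set in the environment (`Groq()` reads it by default); the wrapper name `describe_image` and the `example.jpg` test file are hypothetical, and the model id is taken from the commit, so it may need updating if Groq retires the preview model:

```python
import base64

from groq import Groq  # pip install groq


def encode_image(image_path):
    # Read the file Gradio saved locally and base64-encode it for inline transport.
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def describe_image(image_path):
    # Groq() picks up GROQ_API_KEY from the environment; no key appears in code.
    client = Groq()
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's in this image?"},
                    {
                        # Embed the image as a data URL instead of a public link,
                        # so the upload never has to be hosted anywhere.
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{encode_image(image_path)}"
                        },
                    },
                ],
            }
        ],
        model="llama-3.2-11b-vision-preview",  # vision model named in the commit
    )
    return chat_completion.choices[0].message.content


if __name__ == "__main__":
    print(describe_image("example.jpg"))  # hypothetical local test image
```

The data-URL approach trades request payload size for simplicity: the base64-encoded upload travels inside the API call itself, which suits a Space where user images are transient files.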
requirements.txt CHANGED
```diff
@@ -6,4 +6,4 @@ transformers
 xformers
 sentencepiece
 peft
-bitsandbytes
+groq
```
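With inference moved off-device, the only new runtime dependency is the `groq` SDK, while `bitsandbytes` drops out (presumably it was only needed for loading the local IDEFICS model). A quick pre-deploy sanity check, assuming the key is stored as the Space's `GROQ_API_KEY` secret and that the SDK's OpenAI-style `models.list()` surface is available:

```python
import os

from groq import Groq

# Fail fast if the Space secret isn't wired through to the environment.
assert os.environ.get("GROQ_API_KEY"), "set GROQ_API_KEY (e.g. as a Space secret)"

client = Groq()
print([m.id for m in client.models.list().data])  # confirms the key works
```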