prithivMLmods committed on
Commit
926d2ec
·
verified ·
1 Parent(s): f341af1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -14
app.py CHANGED
@@ -2,6 +2,7 @@ import os
2
  import random
3
  import uuid
4
  import json
 
5
  import time
6
  import asyncio
7
  from threading import Thread
@@ -19,11 +20,9 @@ from transformers import (
19
  AutoTokenizer,
20
  TextIteratorStreamer,
21
  )
22
- from transformers.image_utils import load_image
23
-
24
- import subprocess
25
- subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
26
 
 
 
27
 
28
  # Constants for text generation
29
  MAX_MAX_NEW_TOKENS = 2048
@@ -32,8 +31,8 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
32
 
33
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
34
 
35
- # Load typhoon
36
- MODEL_ID_M = "Qwen/Qwen2.5-VL-3B-Instruct"
37
  processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
38
  model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
39
  MODEL_ID_M,
@@ -42,7 +41,7 @@ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
42
  ).to(device).eval()
43
 
44
  # Load Space Thinker
45
- MODEL_ID_Z = "One-RL-to-See-Them-All/Orsta-32B-0326"
46
  processor_z = AutoProcessor.from_pretrained(MODEL_ID_Z, trust_remote_code=True)
47
  model_z = Qwen2_5_VLForConditionalGeneration.from_pretrained(
48
  MODEL_ID_Z,
@@ -50,7 +49,14 @@ model_z = Qwen2_5_VLForConditionalGeneration.from_pretrained(
50
  torch_dtype=torch.float16
51
  ).to(device).eval()
52
 
53
-
 
 
 
 
 
 
 
54
 
55
  def downsample_video(video_path):
56
  """
@@ -83,12 +89,15 @@ def generate_image(model_name: str, text: str, image: Image.Image,
83
  """
84
  Generates responses using the selected model for image input.
85
  """
86
- if model_name == "Qwen2.5-VL-3B":
87
  processor = processor_m
88
  model = model_m
89
- elif model_name == "Orsta-32B-0326":
90
  processor = processor_z
91
  model = model_z
 
 
 
92
  else:
93
  yield "Invalid model selected."
94
  return
@@ -133,12 +142,15 @@ def generate_video(model_name: str, text: str, video_path: str,
133
  """
134
  Generates responses using the selected model for video input.
135
  """
136
- if model_name == "Qwen2.5-VL-3B":
137
  processor = processor_m
138
  model = model_m
139
- elif model_name == "Orsta-32B-0326":
140
  processor = processor_z
141
  model = model_z
 
 
 
142
  else:
143
  yield "Invalid model selected."
144
  return
@@ -239,9 +251,9 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
239
  with gr.Column():
240
  output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
241
  model_choice = gr.Radio(
242
- choices=["Qwen2.5-VL-3B", "Orsta-32B-0326"],
243
  label="Select Model",
244
- value="Orsta-32B-0326"
245
  )
246
 
247
  image_submit.click(
 
2
  import random
3
  import uuid
4
  import json
5
+ import requests
6
  import time
7
  import asyncio
8
  from threading import Thread
 
20
  AutoTokenizer,
21
  TextIteratorStreamer,
22
  )
 
 
 
 
23
 
24
+ from transformers import Blip2Processor, Blip2ForConditionalGeneration
25
+ from transformers.image_utils import load_image
26
 
27
  # Constants for text generation
28
  MAX_MAX_NEW_TOKENS = 2048
 
31
 
32
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
33
 
34
+ # Load SkyCaptioner-V1
35
+ MODEL_ID_M = "Skywork/SkyCaptioner-V1"
36
  processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
37
  model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
38
  MODEL_ID_M,
 
41
  ).to(device).eval()
42
 
43
  # Load Space Thinker
44
+ MODEL_ID_Z = "remyxai/SpaceThinker-Qwen2.5VL-3B"
45
  processor_z = AutoProcessor.from_pretrained(MODEL_ID_Z, trust_remote_code=True)
46
  model_z = Qwen2_5_VLForConditionalGeneration.from_pretrained(
47
  MODEL_ID_Z,
 
49
  torch_dtype=torch.float16
50
  ).to(device).eval()
51
 
52
+ # Load blip2-opt-2.7b
53
+ MODEL_ID_K = "Salesforce/blip2-opt-2.7b"
54
+ processor_k = Blip2Processor.from_pretrained(MODEL_ID_K, trust_remote_code=True)
55
+ model_k = Blip2ForConditionalGeneration.from_pretrained(
56
+ MODEL_ID_K,
57
+ trust_remote_code=True,
58
+ torch_dtype=torch.float16
59
+ ).to(device).eval()
60
 
61
  def downsample_video(video_path):
62
  """
 
89
  """
90
  Generates responses using the selected model for image input.
91
  """
92
+ if model_name == "SkyCaptioner-V1":
93
  processor = processor_m
94
  model = model_m
95
+ elif model_name == "SpaceThinker-3B":
96
  processor = processor_z
97
  model = model_z
98
+ elif model_name == "blip2-opt-2.7b":
99
+ processor = processor_k
100
+ model = model_k
101
  else:
102
  yield "Invalid model selected."
103
  return
 
142
  """
143
  Generates responses using the selected model for video input.
144
  """
145
+ if model_name == "SkyCaptioner-V1":
146
  processor = processor_m
147
  model = model_m
148
+ elif model_name == "SpaceThinker-3B":
149
  processor = processor_z
150
  model = model_z
151
+ elif model_name == "blip2-opt-2.7b":
152
+ processor = processor_k
153
+ model = model_k
154
  else:
155
  yield "Invalid model selected."
156
  return
 
251
  with gr.Column():
252
  output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
253
  model_choice = gr.Radio(
254
+ choices=["SkyCaptioner-V1", "SpaceThinker-3B", "blip2-opt-2.7b"],
255
  label="Select Model",
256
+ value="SkyCaptioner-V1"
257
  )
258
 
259
  image_submit.click(