prithivMLmods committed on
Commit
faf747c
·
verified ·
1 Parent(s): 00bd3c9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -15
app.py CHANGED
@@ -16,8 +16,7 @@ import cv2
16
 
17
  from transformers import (
18
  Qwen2_5_VLForConditionalGeneration,
19
- Qwen2VLForConditionalGeneration,
20
- AutoModel,
21
  AutoProcessor,
22
  AutoTokenizer,
23
  TextIteratorStreamer,
@@ -57,15 +56,13 @@ model_k = Qwen2VLForConditionalGeneration.from_pretrained(
57
  torch_dtype=torch.float16
58
  ).to(device).eval()
59
 
60
- # Load llama-nemoretriever-colembed-1b-v1
61
- MODEL_ID_Y = "nvidia/llama-nemoretriever-colembed-1b-v1"
62
- #processor_y = AutoProcessor.from_pretrained(MODEL_ID_Y, trust_remote_code=True)
63
- model_y = AutoModel.from_pretrained(
64
  MODEL_ID_Y,
65
  trust_remote_code=True,
66
- torch_dtype=torch.float16,
67
- attn_implementation="flash_attention_2",
68
- revision='1f0fdea7f5b19532a750be109b19072d719b8177'
69
  ).to(device).eval()
70
 
71
 
@@ -109,8 +106,8 @@ def generate_image(model_name: str, text: str, image: Image.Image,
109
  elif model_name == "coreOCR-7B-050325-preview":
110
  processor = processor_k
111
  model = model_k
112
- elif model_name == "llama-nemoretriever-colembed-1b-v1":
113
- #processor = processor_y
114
  model = model_y
115
  else:
116
  yield "Invalid model selected."
@@ -166,8 +163,8 @@ def generate_video(model_name: str, text: str, video_path: str,
166
  elif model_name == "coreOCR-7B-050325-preview":
167
  processor = processor_k
168
  model = model_k
169
- elif model_name == "llama-nemoretriever-colembed-1b-v1":
170
- #processor = processor_y
171
  model = model_y
172
  else:
173
  yield "Invalid model selected."
@@ -270,7 +267,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
270
  with gr.Column():
271
  output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
272
  model_choice = gr.Radio(
273
- choices=["SkyCaptioner-V1", "SpaceThinker-3B", "coreOCR-7B-050325-preview", "llama-nemoretriever-colembed-1b-v1"],
274
  label="Select Model",
275
  value="SkyCaptioner-V1"
276
  )
@@ -278,7 +275,8 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
278
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/VisionScope-R2/discussions)")
279
  gr.Markdown("> [SkyCaptioner-V1](https://huggingface.co/Skywork/SkyCaptioner-V1): structural video captioning model designed to generate high-quality, structural descriptions for video data. It integrates specialized sub-expert models.")
280
  gr.Markdown("> [SpaceThinker-Qwen2.5VL-3B](https://huggingface.co/remyxai/SpaceThinker-Qwen2.5VL-3B): thinking/reasoning multimodal/vision-language model (VLM) trained to enhance spatial reasoning.")
281
- gr.Markdown("> [coreOCR-7B-050325-preview](https://huggingface.co/prithivMLmods/coreOCR-7B-050325-preview): model is a fine-tuned version of qwen/qwen2-vl-7b, optimized for document-level optical character recognition (ocr), long-context vision-language understanding.")
 
282
 
283
  image_submit.click(
284
  fn=generate_image,
 
16
 
17
  from transformers import (
18
  Qwen2_5_VLForConditionalGeneration,
19
+ Qwen2VLForConditionalGeneration,
 
20
  AutoProcessor,
21
  AutoTokenizer,
22
  TextIteratorStreamer,
 
56
  torch_dtype=torch.float16
57
  ).to(device).eval()
58
 
59
+ # Load remyxai/SpaceOm
60
+ MODEL_ID_Y = "remyxai/SpaceOm"
61
+ processor_y = AutoProcessor.from_pretrained(MODEL_ID_Y, trust_remote_code=True)
62
+ model_y = Qwen2_5_VLForConditionalGeneration.from_pretrained(
63
  MODEL_ID_Y,
64
  trust_remote_code=True,
65
+ torch_dtype=torch.float16
 
 
66
  ).to(device).eval()
67
 
68
 
 
106
  elif model_name == "coreOCR-7B-050325-preview":
107
  processor = processor_k
108
  model = model_k
109
+ elif model_name == "SpaceOm-3B":
110
+ processor = processor_y
111
  model = model_y
112
  else:
113
  yield "Invalid model selected."
 
163
  elif model_name == "coreOCR-7B-050325-preview":
164
  processor = processor_k
165
  model = model_k
166
+ elif model_name == "SpaceOm-3B":
167
+ processor = processor_y
168
  model = model_y
169
  else:
170
  yield "Invalid model selected."
 
267
  with gr.Column():
268
  output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
269
  model_choice = gr.Radio(
270
+ choices=["SkyCaptioner-V1", "SpaceThinker-3B", "coreOCR-7B-050325-preview", "SpaceOm-3B"],
271
  label="Select Model",
272
  value="SkyCaptioner-V1"
273
  )
 
275
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/VisionScope-R2/discussions)")
276
  gr.Markdown("> [SkyCaptioner-V1](https://huggingface.co/Skywork/SkyCaptioner-V1): structural video captioning model designed to generate high-quality, structural descriptions for video data. It integrates specialized sub-expert models.")
277
  gr.Markdown("> [SpaceThinker-Qwen2.5VL-3B](https://huggingface.co/remyxai/SpaceThinker-Qwen2.5VL-3B): thinking/reasoning multimodal/vision-language model (VLM) trained to enhance spatial reasoning.")
278
+ gr.Markdown("> [coreOCR-7B-050325-preview](https://huggingface.co/prithivMLmods/coreOCR-7B-050325-preview): model is a fine-tuned version of qwen/qwen2-vl-7b, optimized for document-level optical character recognition (ocr), long-context vision-language understanding.")
279
+ gr.Markdown("> [SpaceOm](https://huggingface.co/remyxai/SpaceOm): SpaceOm, the reasoning traces in the spacethinker dataset average ~200 'thinking' tokens, so now included longer reasoning traces in the training data to help the model use more tokens in reasoning.")
280
 
281
  image_submit.click(
282
  fn=generate_image,