prithivMLmods committed
Commit 4334bf2 (verified) · Parent: 907a254

Update app.py

Files changed (1): app.py (+10, −9)
app.py CHANGED
@@ -30,8 +30,8 @@ import ast
 import html
 
 # Constants for text generation
-MAX_MAX_NEW_TOKENS = 2048
-DEFAULT_MAX_NEW_TOKENS = 1024
+MAX_MAX_NEW_TOKENS = 8192
+DEFAULT_MAX_NEW_TOKENS = 4096
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
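This hunk quadruples the generation ceilings. Such constants typically act as bounds on the max_new_tokens value handed to generate(); a minimal sketch of the clamping this implies, where clamp_new_tokens is an assumed illustrative helper, not code from app.py:

# Hypothetical helper (not in app.py): clamp a user-requested token budget
# to the constants defined above before passing it to model.generate().
MAX_MAX_NEW_TOKENS = 8192
DEFAULT_MAX_NEW_TOKENS = 4096

def clamp_new_tokens(requested=None):
    """Return a max_new_tokens value that never exceeds the app's ceiling."""
    if requested is None:
        return DEFAULT_MAX_NEW_TOKENS
    return max(1, min(int(requested), MAX_MAX_NEW_TOKENS))

assert clamp_new_tokens() == 4096          # no slider value -> new default
assert clamp_new_tokens(100_000) == 8192   # oversized request -> new ceiling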
@@ -78,8 +78,8 @@ model_x = AutoModelForVision2Seq.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-# Nemesis-VLMer-7B-0818
-MODEL_ID_N = "prithivMLmods/Nemesis-VLMer-7B-0818"
+# Thyme-RL
+MODEL_ID_N = "Kwai-Keye/Thyme-RL"
 processor_n = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
 model_n = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_N,
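This hunk swaps the N-slot model from Nemesis-VLMer-7B-0818 to Kwai-Keye/Thyme-RL while keeping the same Qwen2.5-VL loading path. A minimal standalone sketch of driving this processor/model pair, assuming the standard Qwen2.5-VL chat-template flow in transformers; the prompt, image path, dtype, and token budget are illustrative:

import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

MODEL_ID_N = "Kwai-Keye/Thyme-RL"
processor = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_N, torch_dtype=torch.float16, device_map="auto"
).eval()

# One image plus one instruction, mirroring the app's image-inference path.
image = Image.open("images/8.png")  # example asset referenced later in this diff
messages = [{"role": "user", "content": [
    {"type": "image"},
    {"type": "text", "text": "Describe the image!"},
]}]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=[prompt], images=[image], return_tensors="pt").to(model.device)

with torch.inference_mode():
    out = model.generate(**inputs, max_new_tokens=4096)

# Strip the prompt tokens and decode only the newly generated ones.
reply = processor.batch_decode(
    out[:, inputs["input_ids"].shape[-1]:], skip_special_tokens=True
)[0]
print(reply)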
@@ -153,7 +153,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     elif model_name == "Typhoon-OCR-7B":
         processor = processor_l
         model = model_l
-    elif model_name == "Nemesis-VLMer-7B":
+    elif model_name == "Thyme-RL":
         processor = processor_n
         model = model_n
     else:
@@ -234,7 +234,7 @@ def generate_video(model_name: str, text: str, video_path: str,
     elif model_name == "Typhoon-OCR-7B":
         processor = processor_l
         model = model_l
-    elif model_name == "Nemesis-VLMer-7B":
+    elif model_name == "Thyme-RL":
         processor = processor_n
         model = model_n
     else:
@@ -299,6 +299,7 @@ def generate_video(model_name: str, text: str, video_path: str,
 # Define examples for image and video inference
 image_examples = [
     ["Reconstruct the doc [table] as it is.", "images/0.png"],
+    ["Describe the image!", "images/8.png"],
     ["OCR the image", "images/2.jpg"],
     ["Convert this page to docling", "images/1.png"],
     ["Convert this page to docling", "images/3.png"],
@@ -367,7 +368,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     formatted_output = gr.Markdown(label="(Result.md)")
 
     model_choice = gr.Radio(
-        choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "Typhoon-OCR-7B", "SmolDocling-256M-preview", "Nemesis-VLMer-7B"],
+        choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "Thyme-RL", "Typhoon-OCR-7B", "SmolDocling-256M-preview"],
         label="Select Model",
         value="Nanonets-OCR-s"
     )
@@ -377,7 +378,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     gr.Markdown("> [SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview): SmolDocling is a multimodal Image-Text-to-Text model designed for efficient document conversion. It retains Docling's most popular features while ensuring full compatibility with Docling through seamless support for DoclingDocuments.")
     gr.Markdown("> [MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR): MonkeyOCR adopts a Structure-Recognition-Relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.")
     gr.Markdown("> [Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b): A bilingual document parsing model built specifically for real-world documents in Thai and English inspired by models like olmOCR based on Qwen2.5-VL-Instruction. Extracts and interprets embedded text (e.g., chart labels, captions) in Thai or English.")
-    gr.Markdown("> [Nemesis-VLMer-7B-0818](https://huggingface.co/prithivMLmods/Nemesis-VLMer-7B-0818): The Nemesis-VLMer-7B-0818 model is a fine-tuned version of Qwen2.5-VL-7B-Instruct, optimized for Reasoning, Content Analysis, and Visual Question Answering. Built on top of the Qwen2.5-VL architecture, this model enhances multimodal comprehension capabilities ")
+    gr.Markdown("> [Thyme-RL](https://huggingface.co/Kwai-Keye/Thyme-RL): Thyme: Think Beyond Images. Thyme transcends traditional 'thinking with images' paradigms by autonomously generating and executing diverse image processing and computational operations through executable code, significantly enhancing performance on high-resolution perception and complex reasoning tasks.")
     gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
 
     image_submit.click(
@@ -393,4 +394,4 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     )
 
 if __name__ == "__main__":
-    demo.queue(max_size=40).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
+    demo.queue(max_size=50).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
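The final hunk raises the request queue cap from 40 to 50: in Gradio, queue(max_size=...) bounds how many events may wait in line, and further submissions are turned away once the queue is full. A minimal sketch of the same launch configuration with a placeholder UI standing in for the app body (mcp_server=True requires the gradio[mcp] extra):

import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("placeholder UI")  # the real app builds its tabs and controls here

if __name__ == "__main__":
    # Same flags as the commit: cap the pending queue at 50, expose a public
    # share link, serve as an MCP server, disable SSR, and surface errors in the UI.
    demo.queue(max_size=50).launch(
        share=True, mcp_server=True, ssr_mode=False, show_error=True
    )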