Update app.py
app.py CHANGED
@@ -30,8 +30,8 @@ import ast
 import html
 
 # Constants for text generation
-MAX_MAX_NEW_TOKENS =
-DEFAULT_MAX_NEW_TOKENS =
+MAX_MAX_NEW_TOKENS = 8192
+DEFAULT_MAX_NEW_TOKENS = 4096
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
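The old ceilings are truncated in this view; the new values land at 8192 and 4096. As a minimal sketch of how such constants are typically consumed, assuming a Gradio slider feeds max_new_tokens (the widget itself is outside this diff's context):

import gradio as gr

MAX_MAX_NEW_TOKENS = 8192        # new ceiling from this commit
DEFAULT_MAX_NEW_TOKENS = 4096    # new default from this commit

# Hypothetical slider; app.py's actual control is not shown in the diff.
max_new_tokens = gr.Slider(
    label="Max new tokens",
    minimum=1,
    maximum=MAX_MAX_NEW_TOKENS,
    step=1,
    value=DEFAULT_MAX_NEW_TOKENS,
)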
@@ -78,8 +78,8 @@ model_x = AutoModelForVision2Seq.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-#
-MODEL_ID_N = "
+# Thyme-RL
+MODEL_ID_N = "Kwai-Keye/Thyme-RL"
 processor_n = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
 model_n = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_N,
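For context, a minimal sketch of driving the newly registered pair. This assumes the standard Transformers Qwen2.5-VL chat-template flow and reuses processor_n, model_n, device, and DEFAULT_MAX_NEW_TOKENS from the file above; the prompt and image are placeholders, not code from app.py:

from PIL import Image

image = Image.open("images/8.png")  # one of the Space's bundled examples
messages = [{"role": "user", "content": [
    {"type": "image"},
    {"type": "text", "text": "Describe the image!"},
]}]
prompt = processor_n.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor_n(text=[prompt], images=[image], return_tensors="pt").to(device)
output_ids = model_n.generate(**inputs, max_new_tokens=DEFAULT_MAX_NEW_TOKENS)
print(processor_n.batch_decode(output_ids, skip_special_tokens=True)[0])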
@@ -153,7 +153,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     elif model_name == "Typhoon-OCR-7B":
         processor = processor_l
         model = model_l
-    elif model_name == "
+    elif model_name == "Thyme-RL":
         processor = processor_n
         model = model_n
     else:
@@ -234,7 +234,7 @@ def generate_video(model_name: str, text: str, video_path: str,
     elif model_name == "Typhoon-OCR-7B":
         processor = processor_l
         model = model_l
-    elif model_name == "
+    elif model_name == "Thyme-RL":
         processor = processor_n
         model = model_n
     else:
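Both generate_image and generate_video now carry the same new branch. As a hedged aside, not something this commit does: a module-level registry would keep the two dispatch chains from drifting apart as models are added.

# Hypothetical alternative to the duplicated elif chains (not in this commit).
MODEL_REGISTRY = {
    "Typhoon-OCR-7B": (processor_l, model_l),
    "Thyme-RL": (processor_n, model_n),
}

def resolve_model(model_name: str):
    # Return the (processor, model) pair, mirroring the elif/else behavior.
    if model_name not in MODEL_REGISTRY:
        raise ValueError(f"Unknown model: {model_name}")
    return MODEL_REGISTRY[model_name]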
@@ -299,6 +299,7 @@ def generate_video(model_name: str, text: str, video_path: str,
 # Define examples for image and video inference
 image_examples = [
     ["Reconstruct the doc [table] as it is.", "images/0.png"],
+    ["Describe the image!", "images/8.png"],
     ["OCR the image", "images/2.jpg"],
     ["Convert this page to docling", "images/1.png"],
     ["Convert this page to docling", "images/3.png"],
@@ -367,7 +368,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
         formatted_output = gr.Markdown(label="(Result.md)")
 
         model_choice = gr.Radio(
-            choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "Typhoon-OCR-7B", "SmolDocling-256M-preview"],
+            choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "Thyme-RL", "Typhoon-OCR-7B", "SmolDocling-256M-preview"],
             label="Select Model",
             value="Nanonets-OCR-s"
         )
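With "Thyme-RL" added to the choices, the same string must reach the dispatch branches above. The actual .click argument lists are truncated in this diff; a typical wiring, with every component name other than model_choice, formatted_output, image_submit, and generate_image assumed:

image_submit.click(
    fn=generate_image,
    inputs=[model_choice, image_query, image_upload, max_new_tokens],  # names assumed
    outputs=[raw_output, formatted_output],                            # raw_output assumed
)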
@@ -377,7 +378,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     gr.Markdown("> [SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview): SmolDocling is a multimodal Image-Text-to-Text model designed for efficient document conversion. It retains Docling's most popular features while ensuring full compatibility with Docling through seamless support for DoclingDocuments.")
     gr.Markdown("> [MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR): MonkeyOCR adopts a Structure-Recognition-Relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.")
     gr.Markdown("> [Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b): A bilingual document parsing model built specifically for real-world documents in Thai and English inspired by models like olmOCR based on Qwen2.5-VL-Instruction. Extracts and interprets embedded text (e.g., chart labels, captions) in Thai or English.")
-    gr.Markdown("> [
+    gr.Markdown("> [Thyme-RL](https://huggingface.co/Kwai-Keye/Thyme-RL): Thyme: Think Beyond Images. Thyme transcends traditional ``thinking with images'' paradigms by autonomously generating and executing diverse image processing and computational operations through executable code, significantly enhancing performance on high-resolution perception and complex reasoning tasks.")
     gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
 
     image_submit.click(
@@ -393,4 +394,4 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     )
 
 if __name__ == "__main__":
-    demo.queue(max_size=
+    demo.queue(max_size=50).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
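For reference, the new launch line annotated flag by flag (flag semantics per recent Gradio releases; mcp_server requires Gradio 5.x):

demo.queue(max_size=50).launch(  # hold at most 50 pending requests in the queue
    share=True,        # create a temporary public share link
    mcp_server=True,   # also expose the app's endpoints as MCP tools
    ssr_mode=False,    # disable server-side rendering of the frontend
    show_error=True,   # surface Python exceptions in the browser UI
)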