Update app.py
app.py CHANGED
@@ -30,8 +30,8 @@ import ast
 import html
 
 # Constants for text generation
-MAX_MAX_NEW_TOKENS =
-DEFAULT_MAX_NEW_TOKENS =
+MAX_MAX_NEW_TOKENS = 8192
+DEFAULT_MAX_NEW_TOKENS = 4096
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
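The old ceilings are truncated in this view; the new values land at 8192 and 4096. As a minimal sketch of how such constants are typically consumed, assuming a Gradio slider feeds max_new_tokens (the widget itself is outside this diff's context):

import gradio as gr

MAX_MAX_NEW_TOKENS = 8192        # new ceiling from this commit
DEFAULT_MAX_NEW_TOKENS = 4096    # new default from this commit

# Hypothetical slider; app.py's actual control is not shown in the diff.
max_new_tokens = gr.Slider(
    label="Max new tokens",
    minimum=1,
    maximum=MAX_MAX_NEW_TOKENS,
    step=1,
    value=DEFAULT_MAX_NEW_TOKENS,
)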
@@ -78,8 +78,8 @@ model_x = AutoModelForVision2Seq.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-#
-MODEL_ID_N = "
+# Thyme-RL
+MODEL_ID_N = "Kwai-Keye/Thyme-RL"
 processor_n = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
 model_n = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_N,
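For context, a minimal sketch of driving the newly registered pair. This assumes the standard Transformers Qwen2.5-VL chat-template flow and reuses processor_n, model_n, device, and DEFAULT_MAX_NEW_TOKENS from the file above; the prompt and image are placeholders, not code from app.py:

from PIL import Image

image = Image.open("images/8.png")  # one of the Space's bundled examples
messages = [{"role": "user", "content": [
    {"type": "image"},
    {"type": "text", "text": "Describe the image!"},
]}]
prompt = processor_n.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor_n(text=[prompt], images=[image], return_tensors="pt").to(device)
output_ids = model_n.generate(**inputs, max_new_tokens=DEFAULT_MAX_NEW_TOKENS)
print(processor_n.batch_decode(output_ids, skip_special_tokens=True)[0])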
@@ -153,7 +153,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     elif model_name == "Typhoon-OCR-7B":
         processor = processor_l
         model = model_l
-    elif model_name == "
+    elif model_name == "Thyme-RL":
         processor = processor_n
         model = model_n
     else:
@@ -234,7 +234,7 @@ def generate_video(model_name: str, text: str, video_path: str,
     elif model_name == "Typhoon-OCR-7B":
         processor = processor_l
         model = model_l
-    elif model_name == "
+    elif model_name == "Thyme-RL":
         processor = processor_n
         model = model_n
     else:
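Both generate_image and generate_video now carry the same new branch. As a hedged aside, not something this commit does: a module-level registry would keep the two dispatch chains from drifting apart as models are added.

# Hypothetical alternative to the duplicated elif chains (not in this commit).
MODEL_REGISTRY = {
    "Typhoon-OCR-7B": (processor_l, model_l),
    "Thyme-RL": (processor_n, model_n),
}

def resolve_model(model_name: str):
    # Return the (processor, model) pair, mirroring the elif/else behavior.
    if model_name not in MODEL_REGISTRY:
        raise ValueError(f"Unknown model: {model_name}")
    return MODEL_REGISTRY[model_name]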
@@ -299,6 +299,7 @@ def generate_video(model_name: str, text: str, video_path: str,
 # Define examples for image and video inference
 image_examples = [
     ["Reconstruct the doc [table] as it is.", "images/0.png"],
+    ["Describe the image!", "images/8.png"],
     ["OCR the image", "images/2.jpg"],
     ["Convert this page to docling", "images/1.png"],
     ["Convert this page to docling", "images/3.png"],
@@ -367,7 +368,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
         formatted_output = gr.Markdown(label="(Result.md)")
 
         model_choice = gr.Radio(
-            choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "Typhoon-OCR-7B", "SmolDocling-256M-preview"],
+            choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "Thyme-RL", "Typhoon-OCR-7B", "SmolDocling-256M-preview"],
             label="Select Model",
             value="Nanonets-OCR-s"
         )
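With "Thyme-RL" added to the choices, the same string must reach the dispatch branches above. The actual .click argument lists are truncated in this diff; a typical wiring, with every component name other than model_choice, formatted_output, image_submit, and generate_image assumed:

image_submit.click(
    fn=generate_image,
    inputs=[model_choice, image_query, image_upload, max_new_tokens],  # names assumed
    outputs=[raw_output, formatted_output],                            # raw_output assumed
)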
@@ -377,7 +378,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     gr.Markdown("> [SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview): SmolDocling is a multimodal Image-Text-to-Text model designed for efficient document conversion. It retains Docling's most popular features while ensuring full compatibility with Docling through seamless support for DoclingDocuments.")
     gr.Markdown("> [MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR): MonkeyOCR adopts a Structure-Recognition-Relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.")
     gr.Markdown("> [Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b): A bilingual document parsing model built specifically for real-world documents in Thai and English inspired by models like olmOCR based on Qwen2.5-VL-Instruction. Extracts and interprets embedded text (e.g., chart labels, captions) in Thai or English.")
-    gr.Markdown("> [
+    gr.Markdown("> [Thyme-RL](https://huggingface.co/Kwai-Keye/Thyme-RL): Thyme: Think Beyond Images. Thyme transcends traditional ``thinking with images'' paradigms by autonomously generating and executing diverse image processing and computational operations through executable code, significantly enhancing performance on high-resolution perception and complex reasoning tasks.")
     gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
 
     image_submit.click(
@@ -393,4 +394,4 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     )
 
 if __name__ == "__main__":
-    demo.queue(max_size=
+    demo.queue(max_size=50).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
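For reference, the new launch line annotated flag by flag (flag semantics per recent Gradio releases; mcp_server requires Gradio 5.x):

demo.queue(max_size=50).launch(  # hold at most 50 pending requests in the queue
    share=True,        # create a temporary public share link
    mcp_server=True,   # also expose the app's endpoints as MCP tools
    ssr_mode=False,    # disable server-side rendering of the frontend
    show_error=True,   # surface Python exceptions in the browser UI
)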