Spaces:
Running
Running
Synced repo using 'sync_with_huggingface' GitHub Action
Browse files
- gradio_app.py +12 -5
- requirements.txt +1 -1
gradio_app.py
CHANGED
@@ -119,11 +119,13 @@ with gr.Blocks(title="Marker") as demo:
|
|
119 |
page_range_txt = gr.Textbox(label="Page range to parse, comma separated like 0,5-10,20", value=f"")
|
120 |
output_format_dd = gr.Dropdown(label="Output format", choices=["markdown", "json", "html"], value="markdown")
|
121 |
|
|
|
122 |
force_ocr_ckb = gr.Checkbox(label="Force OCR", value=True, info="Force OCR on all pages")
|
123 |
show_blocks_ckb = gr.Checkbox(label="Show Blocks", info="Display detected blocks, only when output is JSON", value=False, interactive=False)
|
124 |
debug_ckb = gr.Checkbox(label="Debug", value=False, info="Show debug information")
|
125 |
-
use_llm_ckb = gr.Checkbox(label="Use LLM", value=False, info="Use LLM for higher quality processing")
|
126 |
strip_existing_ocr_ckb = gr.Checkbox(label="Strip existing OCR", value=False, info="Strip existing OCR text from the PDF and re-OCR.")
|
|
|
|
|
127 |
run_marker_btn = gr.Button("Run Marker", interactive=False)
|
128 |
with gr.Column():
|
129 |
result_md = gr.Markdown(label="Result markdown", visible=False)
|
@@ -191,7 +193,7 @@ with gr.Blocks(title="Marker") as demo:
|
|
191 |
)
|
192 |
|
193 |
# Run Marker
|
194 |
-
def run_marker_img(filename, page_range, force_ocr, output_format, show_blocks, debug, use_llm, strip_existing_ocr):
|
195 |
"""
|
196 |
Run marker on the given PDF file and return processed results in multiple formats.
|
197 |
|
@@ -209,7 +211,10 @@ with gr.Blocks(title="Marker") as demo:
|
|
209 |
Defaults to False.
|
210 |
strip_existing_ocr (bool, optional): If True, strip embedded OCR text and re-run OCR.
|
211 |
Defaults to False.
|
212 |
-
|
|
|
|
|
|
|
213 |
Returns:
|
214 |
tuple:
|
215 |
- markdown_result (str): Markdown output string.
|
@@ -226,7 +231,9 @@ with gr.Blocks(title="Marker") as demo:
|
|
226 |
"debug": debug,
|
227 |
"output_dir": settings.DEBUG_DATA_FOLDER if debug else None,
|
228 |
"use_llm": use_llm,
|
229 |
-
"strip_existing_ocr": strip_existing_ocr
|
|
|
|
|
230 |
}
|
231 |
config_parser = ConfigParser(cli_options)
|
232 |
rendered = convert_pdf(
|
@@ -310,7 +317,7 @@ with gr.Blocks(title="Marker") as demo:
|
|
310 |
|
311 |
run_marker_btn.click(
|
312 |
fn=run_marker_img,
|
313 |
-
inputs=[in_file, page_range_txt, force_ocr_ckb, output_format_dd, show_blocks_ckb, debug_ckb, use_llm_ckb, strip_existing_ocr_ckb],
|
314 |
outputs=[result_md, result_json, result_html, debug_img_pdf, debug_img_layout, in_img]
|
315 |
)
|
316 |
|
|
|
119 |
page_range_txt = gr.Textbox(label="Page range to parse, comma separated like 0,5-10,20", value=f"")
|
120 |
output_format_dd = gr.Dropdown(label="Output format", choices=["markdown", "json", "html"], value="markdown")
|
121 |
|
122 |
+
use_llm_ckb = gr.Checkbox(label="Use LLM", value=False, info="Use LLM for higher quality processing")
|
123 |
force_ocr_ckb = gr.Checkbox(label="Force OCR", value=True, info="Force OCR on all pages")
|
124 |
show_blocks_ckb = gr.Checkbox(label="Show Blocks", info="Display detected blocks, only when output is JSON", value=False, interactive=False)
|
125 |
debug_ckb = gr.Checkbox(label="Debug", value=False, info="Show debug information")
|
|
|
126 |
strip_existing_ocr_ckb = gr.Checkbox(label="Strip existing OCR", value=False, info="Strip existing OCR text from the PDF and re-OCR.")
|
127 |
+
format_lines_ckb = gr.Checkbox(label="Format lines", value=False, info="Format lines in the document with OCR model")
|
128 |
+
disable_ocr_math_ckb = gr.Checkbox(label="Disable math", value=False, info="Disable math in OCR output - no inline math")
|
129 |
run_marker_btn = gr.Button("Run Marker", interactive=False)
|
130 |
with gr.Column():
|
131 |
result_md = gr.Markdown(label="Result markdown", visible=False)
|
|
|
193 |
)
|
194 |
|
195 |
# Run Marker
|
196 |
+
def run_marker_img(filename, page_range, force_ocr, output_format, show_blocks, debug, use_llm, strip_existing_ocr, format_lines, disable_ocr_math):
|
197 |
"""
|
198 |
Run marker on the given PDF file and return processed results in multiple formats.
|
199 |
|
|
|
211 |
Defaults to False.
|
212 |
strip_existing_ocr (bool, optional): If True, strip embedded OCR text and re-run OCR.
|
213 |
Defaults to False.
|
214 |
+
format_lines (bool, optional): If True, format lines in the document with OCR model.
|
215 |
+
Defaults to False.
|
216 |
+
disable_ocr_math (bool, optional): If True, disable math in OCR output - no inline math.
|
217 |
+
Defaults to False.
|
218 |
Returns:
|
219 |
tuple:
|
220 |
- markdown_result (str): Markdown output string.
|
|
|
231 |
"debug": debug,
|
232 |
"output_dir": settings.DEBUG_DATA_FOLDER if debug else None,
|
233 |
"use_llm": use_llm,
|
234 |
+
"strip_existing_ocr": strip_existing_ocr,
|
235 |
+
"format_lines": format_lines,
|
236 |
+
"disable_ocr_math": disable_ocr_math,
|
237 |
}
|
238 |
config_parser = ConfigParser(cli_options)
|
239 |
rendered = convert_pdf(
|
|
|
317 |
|
318 |
run_marker_btn.click(
|
319 |
fn=run_marker_img,
|
320 |
+
inputs=[in_file, page_range_txt, force_ocr_ckb, output_format_dd, show_blocks_ckb, debug_ckb, use_llm_ckb, strip_existing_ocr_ckb, format_lines_ckb, disable_ocr_math_ckb],
|
321 |
outputs=[result_md, result_json, result_html, debug_img_pdf, debug_img_layout, in_img]
|
322 |
)
|
323 |
|
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
torch==2.5.1
|
2 |
-
marker-pdf[full]==1.7.
|
3 |
gradio[mcp]==5.28.0
|
4 |
huggingface-hub==0.28.1
|
5 |
|
|
|
1 |
torch==2.5.1
|
2 |
+
marker-pdf[full]==1.7.1
|
3 |
gradio[mcp]==5.28.0
|
4 |
huggingface-hub==0.28.1
|
5 |
|