File size: 2,959 Bytes
d660e96
005e1a9
d660e96
 
005e1a9
 
d660e96
005e1a9
d660e96
5155dfc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
005e1a9
5155dfc
d660e96
 
5155dfc
 
 
 
005e1a9
 
5155dfc
005e1a9
 
 
 
 
 
 
 
 
 
 
5155dfc
005e1a9
d660e96
005e1a9
5155dfc
005e1a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d660e96
005e1a9
 
 
 
 
d660e96
 
005e1a9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import gradio as gr
from docling.document_converter import DocumentConverter
import tempfile
import os
import shutil
import requests

# Single module-level converter, built once at import time and reused by
# every convert_input() call.
converter = DocumentConverter()

# def convert_input(pdf_file, pdf_url):
#     temp_pdf_path = None

#     try:
#         if pdf_file:
#             with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
#                 tmp.write(pdf_file.read())
#                 temp_pdf_path = tmp.name
#                 source = temp_pdf_path
#         elif pdf_url:
#             source = pdf_url
#         else:
#             raise gr.Error("Please upload a PDF or provide a URL.")

#         doc = converter.convert(source).document
#         markdown = doc.export_to_markdown()

#         # Save markdown to a file
#         with tempfile.NamedTemporaryFile(delete=False, suffix=".md", mode="w", encoding="utf-8") as tmp_md:
#             tmp_md.write(markdown)
#             markdown_path = tmp_md.name

#         return pdf_file.name if pdf_file else None, markdown, markdown_path

#     except Exception as e:
#         return None, f"Error: {str(e)}", None
#     finally:
#         if temp_pdf_path and os.path.exists(temp_pdf_path):
#             os.remove(temp_pdf_path)

def convert_input(pdf_file, pdf_url):
    """Convert an uploaded PDF, or a PDF reachable by URL, to Markdown.

    An uploaded file takes precedence over the URL when both are supplied.

    Args:
        pdf_file: Value of the upload component, or ``None`` when nothing was
            uploaded. Either an object exposing the local path as ``.name``
            or a plain path string.
        pdf_url: Text of the URL box; consulted only when ``pdf_file`` is
            ``None``. Surrounding whitespace is ignored.

    Returns:
        A ``(pdf_preview_path, markdown, markdown_path)`` tuple.
        ``pdf_preview_path`` is the local PDF path (``None`` for URL input),
        ``markdown`` is the converted text, and ``markdown_path`` is a
        temporary ``.md`` file holding that text. On any failure the tuple is
        ``(None, "Error: <reason>", None)``.
    """
    if pdf_file is not None:
        # Accept both a file-like wrapper carrying ``.name`` and a bare path.
        source = getattr(pdf_file, "name", pdf_file)
        pdf_preview_path = source
    else:
        source = (pdf_url or "").strip()
        pdf_preview_path = None  # No local file to preview
        if not source:
            # Report missing input the same way as every other failure
            # instead of raising an exception only to catch it again below.
            return None, "Error: Please upload a PDF or provide a URL.", None

    try:
        doc = converter.convert(source).document
        markdown = doc.export_to_markdown()

        # delete=False: the file must outlive this function because its path
        # is returned to the caller.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".md", mode="w", encoding="utf-8"
        ) as tmp_md:
            tmp_md.write(markdown)
            markdown_path = tmp_md.name
    except Exception as e:
        # UI boundary: surface the failure as text rather than crashing.
        return None, f"Error: {e}", None

    return pdf_preview_path, markdown, markdown_path


# Gradio UI: inputs (upload or URL) on the first row, a convert button, then
# the PDF/Markdown file outputs and the rendered Markdown.
with gr.Blocks() as demo:
    gr.Markdown("# 📄 PDF to Markdown with `docling`")
    gr.Markdown("Upload a PDF or enter a URL (e.g., from arXiv). View Markdown and download it.")

    with gr.Row():
        pdf_input = gr.File(label="📤 Upload PDF", file_types=[".pdf"])
        url_input = gr.Textbox(label="🌐 Or enter PDF URL (e.g., https://arxiv.org/pdf/...)")

    convert_btn = gr.Button("Convert to Markdown")

    with gr.Row():
        pdf_preview = gr.File(label="📘 PDF Preview", interactive=False)
        md_file = gr.File(label="📝 Download Markdown", interactive=False)

    md_output = gr.Markdown(label="📄 Markdown Output")

    # The output order must match the tuple returned by convert_input:
    # (pdf_preview_path, markdown, markdown_path).
    convert_btn.click(
        fn=convert_input,
        inputs=[pdf_input, url_input],
        outputs=[pdf_preview, md_output, md_file]
    )

if __name__ == "__main__":
    demo.launch()