ma7583 committed on
Commit
005e1a9
·
verified ·
1 Parent(s): 45f218f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -21
app.py CHANGED
@@ -1,33 +1,63 @@
1
  import gradio as gr
2
- from docling.parsers.pdf_parser import PDFParser
3
  import tempfile
4
  import os
 
 
5
 
6
- def pdf_to_markdown(pdf_file):
7
- if pdf_file is None:
8
- return "No file uploaded."
9
 
10
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
11
- tmp.write(pdf_file.read())
12
- tmp_path = tmp.name
13
 
14
  try:
15
- parser = PDFParser(tmp_path)
16
- doc = parser.parse()
17
- markdown = doc.to_markdown()
18
- return markdown
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  except Exception as e:
20
- return f"Error parsing PDF: {str(e)}"
21
  finally:
22
- os.remove(tmp_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
- iface = gr.Interface(
25
- fn=pdf_to_markdown,
26
- inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
27
- outputs=gr.Markdown(label="Markdown Output"),
28
- title="PDF to Markdown with docling",
29
- description="Upload a PDF file. This app parses it using `docling` and displays the Markdown version."
30
- )
31
 
32
  if __name__ == "__main__":
33
- iface.launch()
 
1
  import gradio as gr
2
+ from docling.document_converter import DocumentConverter
3
  import tempfile
4
  import os
5
+ import shutil
6
+ import requests
7
 
8
+ converter = DocumentConverter()
 
 
9
 
10
+ def convert_input(pdf_file, pdf_url):
11
+ temp_pdf_path = None
 
12
 
13
  try:
14
+ if pdf_file:
15
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
16
+ tmp.write(pdf_file.read())
17
+ temp_pdf_path = tmp.name
18
+ source = temp_pdf_path
19
+ elif pdf_url:
20
+ source = pdf_url
21
+ else:
22
+ raise gr.Error("Please upload a PDF or provide a URL.")
23
+
24
+ doc = converter.convert(source).document
25
+ markdown = doc.export_to_markdown()
26
+
27
+ # Save markdown to a file
28
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".md", mode="w", encoding="utf-8") as tmp_md:
29
+ tmp_md.write(markdown)
30
+ markdown_path = tmp_md.name
31
+
32
+ return pdf_file.name if pdf_file else None, markdown, markdown_path
33
+
34
  except Exception as e:
35
+ return None, f"Error: {str(e)}", None
36
  finally:
37
+ if temp_pdf_path and os.path.exists(temp_pdf_path):
38
+ os.remove(temp_pdf_path)
39
+
40
+ with gr.Blocks() as demo:
41
+ gr.Markdown("# 📄 PDF to Markdown with `docling`")
42
+ gr.Markdown("Upload a PDF or enter a URL (e.g., from arXiv). View Markdown and download it.")
43
+
44
+ with gr.Row():
45
+ pdf_input = gr.File(label="📤 Upload PDF", file_types=[".pdf"])
46
+ url_input = gr.Textbox(label="🌐 Or enter PDF URL (e.g., https://arxiv.org/pdf/...)")
47
+
48
+ convert_btn = gr.Button("Convert to Markdown")
49
+
50
+ with gr.Row():
51
+ pdf_preview = gr.File(label="📘 PDF Preview", interactive=False)
52
+ md_file = gr.File(label="📝 Download Markdown", interactive=False)
53
+
54
+ md_output = gr.Markdown(label="📄 Markdown Output")
55
 
56
+ convert_btn.click(
57
+ fn=convert_input,
58
+ inputs=[pdf_input, url_input],
59
+ outputs=[pdf_preview, md_output, md_file]
60
+ )
 
 
61
 
62
  if __name__ == "__main__":
63
+ demo.launch()