leonarb committed on
Commit
a67d3a2
·
verified ·
1 Parent(s): 001bc7b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -31
app.py CHANGED
@@ -60,39 +60,35 @@ def ocr_page(pdf_path, page_num):
60
  return decoded[0] if decoded else ""
61
 
62
  def convert_pdf_to_epub(pdf_file, title, author, language):
63
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
64
- tmp_pdf.write(pdf_file.read())
65
- tmp_pdf_path = tmp_pdf.name
 
66
 
 
67
  reader = PdfReader(tmp_pdf_path)
68
- num_pages = len(reader.pages)
69
-
70
- book = epub.EpubBook()
71
- book.set_title(title)
72
- book.add_author(author)
73
- book.set_language(language)
74
-
75
- # Set cover from page 1
76
- cover_image_b64 = render_pdf_to_base64png(tmp_pdf_path, 1, target_longest_image_dim=1024)
77
- cover_image_bytes = base64.b64decode(cover_image_b64)
78
- book.set_cover("cover.jpg", cover_image_bytes)
79
-
80
- # Add OCR'd pages as chapters
81
- for i in range(num_pages):
82
- text = ocr_page(tmp_pdf_path, i)
83
- chapter = epub.EpubHtml(title=f"Page {i+1}", file_name=f"page_{i+1}.xhtml", lang=language)
84
- chapter.content = f"<h1>Page {i+1}</h1><p>{text}</p>"
85
- book.add_item(chapter)
86
- book.spine.append(chapter)
87
-
88
- # Finalize EPUB
89
- book.add_item(epub.EpubNcx())
90
- book.add_item(epub.EpubNav())
91
- epub_path = os.path.join(tempfile.gettempdir(), "output.epub")
92
- epub.write_epub(epub_path, book, {})
93
-
94
- with open(epub_path, "rb") as f:
95
- return epub_path, f.read()
96
 
97
  def interface_fn(pdf, title, author, language):
98
  epub_path, _ = convert_pdf_to_epub(pdf, title, author, language)
 
60
  return decoded[0] if decoded else ""
61
 
62
  def convert_pdf_to_epub(pdf_file, title, author, language):
63
+ # Save the uploaded file to a temporary path
64
+ tmp_pdf_path = "/tmp/uploaded.pdf"
65
+ with open(tmp_pdf_path, "wb") as f:
66
+ f.write(pdf_file.read()) # This ensures the file isn't empty
67
 
68
+ # Now it's safe to read it
69
  reader = PdfReader(tmp_pdf_path)
70
+
71
+ # Extract the first page for the cover (if needed)
72
+ first_page = reader.pages[0]
73
+ cover_path = "/tmp/cover.jpg"
74
+ images = convert_from_path(tmp_pdf_path, first_page=1, last_page=1)
75
+ images[0].save(cover_path, "JPEG")
76
+
77
+ # Run OCR and get text from olmocr
78
+ ocr_text = olmocr.process(tmp_pdf_path)
79
+
80
+ # Use metadata
81
+ epub_path = "/tmp/output.epub"
82
+ create_epub_from_text(
83
+ text=ocr_text,
84
+ output_path=epub_path,
85
+ title=title,
86
+ author=author,
87
+ language=language,
88
+ cover_image=cover_path
89
+ )
90
+
91
+ return epub_path, cover_path
 
 
 
 
 
 
92
 
93
  def interface_fn(pdf, title, author, language):
94
  epub_path, _ = convert_pdf_to_epub(pdf, title, author, language)