leonarb committed on
Commit
10b8e9d
·
verified ·
1 Parent(s): 9157a09

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -9
app.py CHANGED
@@ -14,6 +14,7 @@ from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
14
  from olmocr.data.renderpdf import render_pdf_to_base64png
15
  from olmocr.prompts import build_finetuning_prompt
16
  from olmocr.prompts.anchor import get_anchor_text
 
17
 
18
  # Set Hugging Face and Torch cache to a guaranteed-writable location
19
  cache_dir = "/tmp/huggingface_cache"
@@ -29,6 +30,9 @@ model = Qwen2VLForConditionalGeneration.from_pretrained(
29
  ).eval().to(device)
30
  processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
31
 
 
 
 
32
  def ocr_page(pdf_path, page_num):
33
  image_b64 = render_pdf_to_base64png(pdf_path, page_num + 1, target_longest_image_dim=1024)
34
  anchor_text = get_anchor_text(pdf_path, page_num + 1, pdf_engine="pdfreport", target_length=4000)
@@ -89,22 +93,19 @@ def create_epub_from_text(text, output_path, title, author, language, cover_imag
89
  epub.write_epub(output_path, book)
90
 
91
  def convert_pdf_to_epub(pdf_file, title, author, language):
92
- # Save the uploaded file to a temporary path
93
- tmp_pdf_path = pdf_file.name # Use the actual temp file path from Gradio
94
 
95
- # Now it's safe to read it
96
  reader = PdfReader(tmp_pdf_path)
97
-
98
- # Extract the first page for the cover (if needed)
99
- first_page = reader.pages[0]
100
  cover_path = "/tmp/cover.jpg"
101
  images = convert_from_path(tmp_pdf_path, first_page=1, last_page=1)
102
  images[0].save(cover_path, "JPEG")
103
 
104
- # Run OCR and get text from olmocr
105
- ocr_text = olmocr.process(tmp_pdf_path)
 
106
 
107
- # Use metadata
108
  epub_path = "/tmp/output.epub"
109
  create_epub_from_text(
110
  text=ocr_text,
 
14
  from olmocr.data.renderpdf import render_pdf_to_base64png
15
  from olmocr.prompts import build_finetuning_prompt
16
  from olmocr.prompts.anchor import get_anchor_text
17
+ from olmocr.pipeline import PDFToTextOCR # ✅ Import the OCR pipeline
18
 
19
  # Set Hugging Face and Torch cache to a guaranteed-writable location
20
  cache_dir = "/tmp/huggingface_cache"
 
30
  ).eval().to(device)
31
  processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
32
 
33
+ # Initialize olmocr OCR pipeline
34
+ ocr_pipeline = PDFToTextOCR()
35
+
36
  def ocr_page(pdf_path, page_num):
37
  image_b64 = render_pdf_to_base64png(pdf_path, page_num + 1, target_longest_image_dim=1024)
38
  anchor_text = get_anchor_text(pdf_path, page_num + 1, pdf_engine="pdfreport", target_length=4000)
 
93
  epub.write_epub(output_path, book)
94
 
95
  def convert_pdf_to_epub(pdf_file, title, author, language):
96
+ tmp_pdf_path = pdf_file.name
 
97
 
98
+ # Read the first page for cover
99
  reader = PdfReader(tmp_pdf_path)
 
 
 
100
  cover_path = "/tmp/cover.jpg"
101
  images = convert_from_path(tmp_pdf_path, first_page=1, last_page=1)
102
  images[0].save(cover_path, "JPEG")
103
 
104
+ # Run OCR using olmocr pipeline
105
+ ocr_result = ocr_pipeline(tmp_pdf_path)
106
+ ocr_text = "\n\n".join([page.text for page in ocr_result.pages])
107
 
108
+ # Create EPUB
109
  epub_path = "/tmp/output.epub"
110
  create_epub_from_text(
111
  text=ocr_text,