leonarb committed on
Commit
89a1632
·
verified ·
1 Parent(s): 10b8e9d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -13
app.py CHANGED
@@ -11,17 +11,19 @@ from ebooklib import epub
11
  from pdf2image import convert_from_path
12
  from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
13
 
14
- from olmocr.data.renderpdf import render_pdf_to_base64png
15
- from olmocr.prompts import build_finetuning_prompt
16
- from olmocr.prompts.anchor import get_anchor_text
17
- from olmocr.pipeline import PDFToTextOCR # ✅ Import the OCR pipeline
18
-
19
- # Set Hugging Face and Torch cache to a guaranteed-writable location
20
  cache_dir = "/tmp/huggingface_cache"
21
  os.environ["HF_HOME"] = cache_dir
22
  os.environ["TORCH_HOME"] = cache_dir
 
23
  os.makedirs(cache_dir, exist_ok=True)
24
 
 
 
 
 
 
 
25
  # Load model and processor
26
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
27
  model = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -30,8 +32,8 @@ model = Qwen2VLForConditionalGeneration.from_pretrained(
30
  ).eval().to(device)
31
  processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
32
 
33
- # Initialize olmocr OCR pipeline
34
- ocr_pipeline = PDFToTextOCR()
35
 
36
  def ocr_page(pdf_path, page_num):
37
  image_b64 = render_pdf_to_base64png(pdf_path, page_num + 1, target_longest_image_dim=1024)
@@ -95,17 +97,17 @@ def create_epub_from_text(text, output_path, title, author, language, cover_imag
95
  def convert_pdf_to_epub(pdf_file, title, author, language):
96
  tmp_pdf_path = pdf_file.name
97
 
98
- # Read the first page for cover
99
  reader = PdfReader(tmp_pdf_path)
 
100
  cover_path = "/tmp/cover.jpg"
101
  images = convert_from_path(tmp_pdf_path, first_page=1, last_page=1)
102
  images[0].save(cover_path, "JPEG")
103
 
104
- # Run OCR using olmocr pipeline
105
- ocr_result = ocr_pipeline(tmp_pdf_path)
106
- ocr_text = "\n\n".join([page.text for page in ocr_result.pages])
107
 
108
- # Create EPUB
109
  epub_path = "/tmp/output.epub"
110
  create_epub_from_text(
111
  text=ocr_text,
 
11
  from pdf2image import convert_from_path
12
  from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
13
 
14
+ # Set cache and log paths
 
 
 
 
 
15
  cache_dir = "/tmp/huggingface_cache"
16
  os.environ["HF_HOME"] = cache_dir
17
  os.environ["TORCH_HOME"] = cache_dir
18
+ os.environ["OLMOCR_LOG_PATH"] = "/tmp/olmocr-pipeline-debug.log"
19
  os.makedirs(cache_dir, exist_ok=True)
20
 
21
+ # Import olmocr pipeline after setting log path
22
+ from olmocr.pipeline import PDFToTextOCR
23
+ from olmocr.data.renderpdf import render_pdf_to_base64png
24
+ from olmocr.prompts import build_finetuning_prompt
25
+ from olmocr.prompts.anchor import get_anchor_text
26
+
27
  # Load model and processor
28
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
29
  model = Qwen2VLForConditionalGeneration.from_pretrained(
 
32
  ).eval().to(device)
33
  processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
34
 
35
+ # Load OCR pipeline
36
+ olmocr = PDFToTextOCR(model=model, processor=processor)
37
 
38
  def ocr_page(pdf_path, page_num):
39
  image_b64 = render_pdf_to_base64png(pdf_path, page_num + 1, target_longest_image_dim=1024)
 
97
  def convert_pdf_to_epub(pdf_file, title, author, language):
98
  tmp_pdf_path = pdf_file.name
99
 
100
+ # Read PDF to get cover
101
  reader = PdfReader(tmp_pdf_path)
102
+ first_page = reader.pages[0]
103
  cover_path = "/tmp/cover.jpg"
104
  images = convert_from_path(tmp_pdf_path, first_page=1, last_page=1)
105
  images[0].save(cover_path, "JPEG")
106
 
107
+ # Run OCR
108
+ ocr_text = olmocr.process(tmp_pdf_path)
 
109
 
110
+ # Write EPUB
111
  epub_path = "/tmp/output.epub"
112
  create_epub_from_text(
113
  text=ocr_text,