Spaces:

leonarb
/

olmocr-demo

Running

App Files Community

leonarb committed on May 6

Commit

89a1632

verified ·

1 Parent(s): 10b8e9d

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -13

app.py CHANGED Viewed

@@ -11,17 +11,19 @@ from ebooklib import epub
 from pdf2image import convert_from_path
 from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
-from olmocr.data.renderpdf import render_pdf_to_base64png
-from olmocr.prompts import build_finetuning_prompt
-from olmocr.prompts.anchor import get_anchor_text
-from olmocr.pipeline import PDFToTextOCR  # ✅ Import the OCR pipeline
-# Set Hugging Face and Torch cache to a guaranteed-writable location
 cache_dir = "/tmp/huggingface_cache"
 os.environ["HF_HOME"] = cache_dir
 os.environ["TORCH_HOME"] = cache_dir
 os.makedirs(cache_dir, exist_ok=True)
 # Load model and processor
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -30,8 +32,8 @@ model = Qwen2VLForConditionalGeneration.from_pretrained(
 ).eval().to(device)
 processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
-# Initialize olmocr OCR pipeline
-ocr_pipeline = PDFToTextOCR()
 def ocr_page(pdf_path, page_num):
     image_b64 = render_pdf_to_base64png(pdf_path, page_num + 1, target_longest_image_dim=1024)
@@ -95,17 +97,17 @@ def create_epub_from_text(text, output_path, title, author, language, cover_imag
 def convert_pdf_to_epub(pdf_file, title, author, language):
     tmp_pdf_path = pdf_file.name
-    # Read the first page for cover
     reader = PdfReader(tmp_pdf_path)
     cover_path = "/tmp/cover.jpg"
     images = convert_from_path(tmp_pdf_path, first_page=1, last_page=1)
     images[0].save(cover_path, "JPEG")
-    # Run OCR using olmocr pipeline
-    ocr_result = ocr_pipeline(tmp_pdf_path)
-    ocr_text = "\n\n".join([page.text for page in ocr_result.pages])
-    # Create EPUB
     epub_path = "/tmp/output.epub"
     create_epub_from_text(
         text=ocr_text,

 from pdf2image import convert_from_path
 from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
+# Set cache and log paths
 cache_dir = "/tmp/huggingface_cache"
 os.environ["HF_HOME"] = cache_dir
 os.environ["TORCH_HOME"] = cache_dir
+os.environ["OLMOCR_LOG_PATH"] = "/tmp/olmocr-pipeline-debug.log"
 os.makedirs(cache_dir, exist_ok=True)
+# Import olmocr pipeline after setting log path
+from olmocr.pipeline import PDFToTextOCR
+from olmocr.data.renderpdf import render_pdf_to_base64png
+from olmocr.prompts import build_finetuning_prompt
+from olmocr.prompts.anchor import get_anchor_text
 # Load model and processor
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model = Qwen2VLForConditionalGeneration.from_pretrained(
 ).eval().to(device)
 processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
+# Load OCR pipeline
+olmocr = PDFToTextOCR(model=model, processor=processor)
 def ocr_page(pdf_path, page_num):
     image_b64 = render_pdf_to_base64png(pdf_path, page_num + 1, target_longest_image_dim=1024)
 def convert_pdf_to_epub(pdf_file, title, author, language):
     tmp_pdf_path = pdf_file.name
+    # Read PDF to get cover
     reader = PdfReader(tmp_pdf_path)
+    first_page = reader.pages[0]
     cover_path = "/tmp/cover.jpg"
     images = convert_from_path(tmp_pdf_path, first_page=1, last_page=1)
     images[0].save(cover_path, "JPEG")
+    # Run OCR
+    ocr_text = olmocr.process(tmp_pdf_path)
+    # Write EPUB
     epub_path = "/tmp/output.epub"
     create_epub_from_text(
         text=ocr_text,