import os import torch import base64 from io import BytesIO from PIL import Image import gradio as gr from ebooklib import epub from transformers import AutoProcessor, Qwen2VLForConditionalGeneration from olmocr.data.renderpdf import render_pdf_to_base64png from olmocr.prompts import build_finetuning_prompt from olmocr.prompts.anchor import get_anchor_text from PyPDF2 import PdfReader # Set a writable cache directory for HF os.environ['HF_HOME'] = '/tmp/.cache/huggingface' # Load processor and model processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct") model = Qwen2VLForConditionalGeneration.from_pretrained( "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16 ).eval() device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model.to(device) def extract_text_from_page(pdf_path, page_num): # Render image image_base64 = render_pdf_to_base64png(pdf_path, page_num, target_longest_image_dim=1024) image = Image.open(BytesIO(base64.b64decode(image_base64))) # Prompt and input anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport", target_length=4000) prompt = build_finetuning_prompt(anchor_text) messages = [ { "role": "user", "content": [ {"type": "text", "text": prompt}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}, ], } ] text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) inputs = processor(text=[text], images=[image], return_tensors="pt", padding=True) inputs = {k: v.to(device) for k, v in inputs.items()} with torch.no_grad(): output = model.generate( **inputs, temperature=0.8, max_new_tokens=256, num_return_sequences=1, do_sample=True, ) prompt_len = inputs["input_ids"].shape[1] new_tokens = output[:, prompt_len:] decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0] return decoded, image_base64 if page_num == 1 else None def process_pdf(file, title="Extracted PDF", author="olmOCR", language="en"): file_path = file.name reader = PdfReader(file_path) num_pages = len(reader.pages) all_text = [] cover_image_data = None for page in range(1, num_pages + 1): text, cover_image = extract_text_from_page(file_path, page) all_text.append(f"
{text}
") if cover_image and not cover_image_data: cover_image_data = cover_image # base64 # Build EPUB book = epub.EpubBook() book.set_identifier("id123456") book.set_title(title) book.set_language(language) book.add_author(author) # Add cover image if cover_image_data: cover_bytes = base64.b64decode(cover_image_data) book.set_cover("cover.jpg", cover_bytes) # Create chapter with all text chapter = epub.EpubHtml(title=title, file_name="chap1.xhtml", lang=language) chapter.content = f"