import gradio as gr from transformers import AutoProcessor, AutoModelForCausalLM from pdf2image import convert_from_path import base64 import io from PIL import Image # Load the OCR model and processor from Hugging Face processor = AutoProcessor.from_pretrained("allenai/olmOCR-7B-0225-preview") model = AutoModelForCausalLM.from_pretrained("allenai/olmOCR-7B-0225-preview") def process_pdf(pdf_file): """ Process the uploaded PDF file, extract text from each page, and generate HTML to display each page's image and text with copy buttons. """ # Check if a PDF file was uploaded if pdf_file is None: return "

Please upload a PDF file.

" # Convert PDF to images try: pages = convert_from_path(pdf_file.name) except Exception as e: return f"

Error converting PDF to images: {str(e)}

" # Start building the HTML output html = '

' # Process each page for i, page in enumerate(pages): # Convert the page image to base64 for embedding in HTML buffered = io.BytesIO() page.save(buffered, format="PNG") img_str = base64.b64encode(buffered.getvalue()).decode() img_data = f"data:image/png;base64,{img_str}" # Extract text from the page using the OCR model try: inputs = processor(text="Extract the text from this image.", images=page, return_tensors="pt") outputs = model.generate(**inputs) text = processor.decode(outputs[0], skip_special_tokens=True) except Exception as e: text = f"Error extracting text: {str(e)}" # Generate HTML for this page's section textarea_id = f"text{i+1}" html += f'''

Page {i+1}

Page {i+1}

''' # Close the pages div and add JavaScript for copy functionality html += '

' html += ''' ''' return html # Define the Gradio interface with gr.Blocks(title="PDF Text Extractor") as demo: gr.Markdown("# PDF Text Extractor") gr.Markdown("Upload a PDF file and click 'Extract Text' to see each page's image and extracted text.") with gr.Row(): pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"]) submit_btn = gr.Button("Extract Text") output_html = gr.HTML() submit_btn.click(fn=process_pdf, inputs=pdf_input, outputs=output_html) # Launch the interface demo.launch()