File size: 4,909 Bytes
ab9d843 ab366bd ab9d843 ab366bd ab9d843 ab366bd ab9d843 ab366bd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
import os
from pathlib import Path
import fitz # PyMuPDF for PDF handling
from PIL import Image
import pytesseract # For OCR
from transformers import BlipProcessor, BlipForConditionalGeneration # For image captioning
import io
import torch
import gradio as gr
# Create output directory
# All rendered page images (page_N.png) and the analysis report are written here.
OUTPUT_DIR = Path("outputs")
OUTPUT_DIR.mkdir(exist_ok=True)  # idempotent: reuses the directory on reruns
def pdf_to_images(pdf_path):
    """
    Render each page of a PDF to a PNG image on disk.

    Pages are scaled so the longest side is ~2000 px — high enough
    resolution for OCR while keeping file sizes reasonable.

    Args:
        pdf_path: Path to the PDF file to render.

    Returns:
        List of (image_path, PIL.Image) tuples, one per page, in page
        order. Returns an empty list if the PDF cannot be opened or a
        page cannot be rendered.
    """
    try:
        pdf_document = fitz.open(pdf_path)
    except Exception as e:
        print(f"Error converting PDF to images: {str(e)}")
        return []
    try:
        images = []
        for page_num in range(len(pdf_document)):
            page = pdf_document[page_num]
            # Zoom so the longest page side renders at roughly 2000 px.
            rect = page.rect
            zoom = 2000 / max(rect.width, rect.height)
            mat = fitz.Matrix(zoom, zoom)
            # alpha=False guarantees 3-byte RGB samples; with an alpha
            # channel present, Image.frombytes("RGB", ...) would fail
            # because the buffer holds 4 bytes per pixel.
            pix = page.get_pixmap(matrix=mat, alpha=False)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            image_path = OUTPUT_DIR / f"page_{page_num + 1}.png"
            img.save(image_path, "PNG")
            images.append((image_path, img))
        return images
    except Exception as e:
        print(f"Error converting PDF to images: {str(e)}")
        return []
    finally:
        # Close even when a page fails to render; the original leaked
        # the document handle on any rendering error.
        pdf_document.close()
def extract_text_from_image(image):
    """
    Run Tesseract OCR over a PIL image and return the recognized text.

    Args:
        image: PIL.Image to scan.

    Returns:
        The OCR output with surrounding whitespace removed, or an empty
        string when OCR fails (e.g. tesseract binary missing).
    """
    try:
        recognized = pytesseract.image_to_string(image)
    except Exception as e:
        print(f"Error during OCR: {str(e)}")
        return ""
    return recognized.strip()
def analyze_image(image_path):
    """
    Generate a natural-language caption for an image using BLIP.

    Args:
        image_path: Path to the image file to caption.

    Returns:
        The generated caption string, or a fallback message when the
        image cannot be analyzed.
    """
    try:
        # Load the BLIP processor/model once and memoize them on the
        # function object: the original reloaded the full pretrained
        # weights on every call (once per PDF page), which dominated
        # total runtime.
        if not hasattr(analyze_image, "_blip"):
            analyze_image._blip = (
                BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base"),
                BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base"),
            )
        processor, model = analyze_image._blip
        # Force RGB so grayscale/palette images are accepted uniformly.
        image = Image.open(image_path).convert('RGB')
        inputs = processor(image, return_tensors="pt")
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            outputs = model.generate(**inputs)
        return processor.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        print(f"Error during image analysis: {str(e)}")
        return "Image content could not be analyzed."
def process_pdf(pdf_path, output_txt_path):
    """
    Drive the full pipeline: render the PDF to images, OCR each page,
    caption each page image, and write a combined report.

    Args:
        pdf_path: Path to the input PDF.
        output_txt_path: Where the plain-text report is written.
    """
    print("Converting PDF to images...")
    pages = pdf_to_images(pdf_path)
    if not pages:
        print("No images were generated from the PDF.")
        return
    separator = "=" * 50
    with open(output_txt_path, 'w', encoding='utf-8') as report:
        report.write(f"Analysis of {os.path.basename(pdf_path)}\n")
        report.write(separator + "\n\n")
        # Page numbers in the report are 1-based.
        for page_num, (image_path, image) in enumerate(pages, 1):
            print(f"Processing page {page_num}...")
            report.write(f"Page {page_num}\n" + "-" * 30 + "\n\n")
            ocr_text = extract_text_from_image(image)
            if ocr_text:
                report.write("Extracted Text:\n" + ocr_text + "\n\n")
            else:
                report.write("No text could be extracted from this page.\n\n")
            report.write("Image Description:\n")
            report.write(f"{analyze_image(image_path)}\n")
            report.write("\n" + separator + "\n\n")
    print(f"Processing complete. Results saved to {output_txt_path}")
def process_uploaded_pdf(pdf_file):
    """
    Gradio callback: run the analysis pipeline on an uploaded PDF and
    return the report text for display.

    Args:
        pdf_file: The uploaded file from gr.File — a tempfile wrapper
            with a .name attribute, or (in newer Gradio versions) a
            plain filepath string. None when nothing was uploaded.

    Returns:
        The report contents, or a human-readable error message.
    """
    if pdf_file is None:
        return "Please upload a PDF file."
    # Support both the tempfile-wrapper and plain-string upload shapes.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    output_txt = OUTPUT_DIR / "analysis_results.txt"
    # Drop stale results so a failed run can't return a previous report.
    output_txt.unlink(missing_ok=True)
    process_pdf(pdf_path, output_txt)
    # process_pdf returns without writing anything when no pages could
    # be rendered; the original crashed here with FileNotFoundError.
    if not output_txt.exists():
        return "The PDF could not be processed: no pages were rendered."
    with open(output_txt, 'r', encoding='utf-8') as f:
        return f.read()
# Create Gradio interface
# Wires the pipeline into a one-input / one-output web UI: a PDF upload
# widget feeding process_uploaded_pdf, whose string result fills the textbox.
interface = gr.Interface(
    fn=process_uploaded_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.Textbox(label="Analysis Results"),
    title="PDF Analyzer",
    description="Upload a PDF file to extract text and analyze images."
)
interface.launch()  # starts the local web server and blocks until it stops