File size: 5,489 Bytes
ab9d843 ab366bd ab9d843 ab366bd ab9d843 ab366bd 36ada58 ab9d843 36ada58 ab9d843 36ada58 ab9d843 36ada58 ab9d843 36ada58 ab9d843 36ada58 ab9d843 36ada58 ab9d843 36ada58 ab9d843 36ada58 ab9d843 36ada58 ab9d843 36ada58 ab9d843 36ada58 ab9d843 ab366bd 36ada58 ab366bd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 |
import os
from pathlib import Path
import fitz # PyMuPDF for PDF handling
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration # For image captioning
import torch
import gradio as gr
# Directory that receives the rendered page images and the analysis report.
# It is created at import time so later writes never hit a missing folder.
OUTPUT_DIR = Path("outputs")
os.makedirs(OUTPUT_DIR, exist_ok=True)
def generate_page_image(pdf_path, page_num):
    """
    Render a single PDF page to a PNG file for later image analysis.

    Args:
        pdf_path: Path of the PDF file to open.
        page_num: Zero-based index of the page to render.

    Returns:
        The path of the written PNG (``OUTPUT_DIR / "page_<n>.png"`` where
        ``n`` is ``page_num + 1``), or ``None`` if anything went wrong. Errors
        are reported on stdout rather than raised.
    """
    try:
        # The context manager closes the document even when rendering fails,
        # so a broken page cannot leak the open file handle.
        with fitz.open(pdf_path) as pdf_document:
            page = pdf_document[page_num]

            # Scale so the longest side of the page is about 2000 pixels,
            # whatever the page's native size is.
            rect = page.rect
            zoom = 2000 / max(rect.width, rect.height)

            # alpha=False keeps the samples at three bytes per pixel, which
            # is what the "RGB" mode passed to Pillow below expects.
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

        image_path = OUTPUT_DIR / f"page_{page_num + 1}.png"
        img.save(image_path, "PNG")
        return image_path
    except Exception as e:
        # Best effort: the caller treats None as "no image available".
        print(f"Error generating image for page {page_num + 1}: {str(e)}")
        return None
def extract_text_from_pdf(pdf_path, page_num):
    """
    Extract the plain text of a single PDF page.

    Args:
        pdf_path: Path of the PDF file to open.
        page_num: Zero-based index of the page to read.

    Returns:
        The page text with leading and trailing whitespace stripped, or an
        empty string if the page could not be read. Errors are reported on
        stdout rather than raised.
    """
    try:
        # The context manager closes the document even if the page lookup or
        # the text extraction raises.
        with fitz.open(pdf_path) as pdf_document:
            text = pdf_document[page_num].get_text("text")
        return text.strip()
    except Exception as e:
        # Best effort: the caller treats "" as "no text on this page".
        print(f"Error extracting text from page {page_num + 1}: {str(e)}")
        return ""
# Checkpoint used for captioning, and a process-wide cache for its loaded
# processor/model pair so the weights are read only once.
_BLIP_MODEL_NAME = "Salesforce/blip-image-captioning-base"
_BLIP_CACHE = {}


def _get_blip():
    """Return the shared (processor, model) pair, loading it on first use."""
    if "pair" not in _BLIP_CACHE:
        processor = BlipProcessor.from_pretrained(_BLIP_MODEL_NAME)
        model = BlipForConditionalGeneration.from_pretrained(_BLIP_MODEL_NAME)
        _BLIP_CACHE["pair"] = (processor, model)
    return _BLIP_CACHE["pair"]


def analyze_image(image_path):
    """
    Describe an image with a caption generated by the BLIP model.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        The generated caption, or the fixed message
        "Image content could not be analyzed." if loading the image, loading
        the model, or generating the caption fails. Errors are reported on
        stdout rather than raised.
    """
    try:
        # Read the image first: a bad path then fails before the model load.
        # convert() returns an independent copy, so the file can be closed.
        with Image.open(image_path) as img:
            image = img.convert('RGB')

        # Reuse the cached model instead of reloading it for every page.
        processor, model = _get_blip()
        inputs = processor(image, return_tensors="pt")

        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            outputs = model.generate(**inputs)
        return processor.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        print(f"Error during image analysis: {str(e)}")
        return "Image content could not be analyzed."
def process_pdf(pdf_path, output_txt_path):
    """
    Analyze every page of a PDF and write a plain-text report.

    For each page the report contains the extracted text (or a note that none
    was found) followed by a caption of the rendered page image.

    Args:
        pdf_path: Path of the PDF file to analyze.
        output_txt_path: Path of the UTF-8 text report to write. It is only
            created when the PDF has at least one page.

    Returns:
        None. Progress and errors are reported on stdout, and no exception is
        propagated to the caller.
    """
    try:
        # Only the page count is needed here. The per-page helpers reopen the
        # file themselves, so close the document as soon as possible.
        with fitz.open(pdf_path) as pdf_document:
            num_pages = len(pdf_document)

        if num_pages == 0:
            print("The PDF is empty.")
            return

        with open(output_txt_path, 'w', encoding='utf-8') as f:
            f.write(f"Analysis of {os.path.basename(pdf_path)}\n")
            f.write("=" * 50 + "\n\n")

            for page_num in range(num_pages):
                print(f"Processing page {page_num + 1}...")

                # Page header
                f.write(f"Page {page_num + 1}\n")
                f.write("-" * 30 + "\n\n")

                # Text section
                text = extract_text_from_pdf(pdf_path, page_num)
                if text:
                    f.write("Extracted Text:\n")
                    f.write(text)
                    f.write("\n\n")
                else:
                    f.write("No text could be extracted from this page.\n\n")

                # Image section: caption the rendered page, or explain why
                # there is none when rendering failed.
                image_path = generate_page_image(pdf_path, page_num)
                if image_path:
                    description = analyze_image(image_path)
                else:
                    description = "Could not generate image for analysis."
                f.write("Image Description:\n")
                f.write(f"{description}\n")
                f.write("\n" + "=" * 50 + "\n\n")

        print(f"Processing complete. Results saved to {output_txt_path}")
    except Exception as e:
        print(f"Error processing PDF: {str(e)}")
def process_uploaded_pdf(pdf_file):
    """
    Gradio callback: analyze an uploaded PDF and return the report text.

    Args:
        pdf_file: The upload handed over by the file component, either an
            object exposing the file path as ``.name`` or a plain path.
            ``None`` means nothing was uploaded.

    Returns:
        The contents of the generated report, or a short message when no file
        was uploaded or no report could be produced.
    """
    if pdf_file is None:
        return "Please upload a PDF file."

    # Accept both a tempfile-like object with .name and a bare path.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    output_txt = OUTPUT_DIR / "analysis_results.txt"

    # process_pdf reports failures on stdout and may not write a report at
    # all, so drop any report left by an earlier upload. Otherwise stale
    # results would be shown for the new file.
    try:
        output_txt.unlink()
    except FileNotFoundError:
        pass

    process_pdf(pdf_path, output_txt)

    if not output_txt.exists():
        return "The PDF could not be processed (it may be empty or unreadable)."

    with open(output_txt, 'r', encoding='utf-8') as f:
        return f.read()
# Build the Gradio UI: a single file upload feeds process_uploaded_pdf and the
# returned report text is shown in a textbox.
interface = gr.Interface(
    fn=process_uploaded_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.Textbox(label="Analysis Results"),
    title="PDF Analyzer",
    description="Upload a PDF file to extract text directly and analyze images.",
)

# Start the web app.
interface.launch()