Spaces:

amit01Xindus
/

new-project

Sleeping

App Files Files Community

new-project / pdf_json.py

amit01Xindus

Upload 8 files

96c003e verified 2 months ago

raw

history blame contribute delete

21 kB

	import os
	import base64
	import json
	import requests
	from typing import Dict, List, Any, Optional
	import fitz # PyMuPDF
	from PIL import Image
	import io
	import re
	from dataclasses import dataclass, asdict
	from pathlib import Path
	from datetime import datetime

	@dataclass
	class TextBlock:
	    """One piece of page text together with its bounding box and font styling."""

	    # The extracted text content.
	    text: str
	    # Bounding-box origin (x, y) and extent (width, height).
	    x: float
	    y: float
	    width: float
	    height: float
	    # Font size and font name reported for this text.
	    font_size: float
	    font_name: str
	    # Styling flags; both default to "not set".
	    is_bold: bool = False
	    is_italic: bool = False
	    # Identifier assigned by whoever creates the block; empty by default.
	    block_id: str = ""

	    def to_dict(self) -> Dict[str, Any]:
	        """Return every field of this block as a plain dictionary."""
	        field_map: Dict[str, Any] = asdict(self)
	        return field_map

	@dataclass
	class ImageData:
	    """An image taken from a page, stored as base64 text plus its placement."""

	    # Position of the image in the order it was collected.
	    index: int
	    # Image bytes encoded as a base64 string.
	    base64_data: str
	    # Bounding box of the image on the page.
	    bbox: tuple
	    # Displayed width and height of the image.
	    width: float
	    height: float
	    # Encoding of the bytes held in base64_data.
	    format: str = "PNG"

	    def to_dict(self) -> Dict[str, Any]:
	        """Return every field of this image record as a plain dictionary."""
	        field_map: Dict[str, Any] = asdict(self)
	        return field_map

	@dataclass
	class TableData:
	    """A table found on a page: its bounding box, cell text and dimensions."""

	    # Bounding box of the table on the page.
	    bbox: tuple
	    # Cell text, one inner list per row.
	    data: List[List[str]]
	    # Number of rows and columns recorded for the table.
	    rows: int
	    columns: int

	    def to_dict(self) -> Dict[str, Any]:
	        """Return every field of this table record as a plain dictionary."""
	        field_map: Dict[str, Any] = asdict(self)
	        return field_map

	@dataclass
	class PageData:
	    """Everything collected for one page: text blocks, images, tables and counts."""

	    # Page number as stored by the creator of this record.
	    page_number: int
	    # Extracted content, one list per content kind.
	    text_blocks: List[TextBlock]
	    images: List[ImageData]
	    tables: List[TableData]
	    # Page dimensions.
	    page_width: float
	    page_height: float
	    # Text statistics for the page; zero unless supplied.
	    word_count: int = 0
	    character_count: int = 0

	    def to_dict(self) -> Dict[str, Any]:
	        """Return the page as a dictionary, serializing each nested item via its own to_dict()."""
	        serialized_blocks = [entry.to_dict() for entry in self.text_blocks]
	        serialized_images = [entry.to_dict() for entry in self.images]
	        serialized_tables = [entry.to_dict() for entry in self.tables]
	        return dict(
	            page_number=self.page_number,
	            text_blocks=serialized_blocks,
	            images=serialized_images,
	            tables=serialized_tables,
	            page_width=self.page_width,
	            page_height=self.page_height,
	            word_count=self.word_count,
	            character_count=self.character_count,
	        )

	class PDFToJSONConverter:
	def __init__(self, huggingface_token: str = None):
	self.hf_token = huggingface_token
	self.hf_headers = {
	"Authorization": f"Bearer {huggingface_token}" if huggingface_token else None
	}
	self.models = {
	"document_layout": "microsoft/layoutlm-base-uncased",
	"table_detection": "microsoft/table-transformer-detection",
	"ocr": "microsoft/trocr-base-printed",
	"math_detection": "facebook/detr-resnet-50"
	}
	self.hf_inference_url = "https://api-inference.huggingface.co/models"

	def pdf_to_base64(self, pdf_path: str) -> str:
	    """Read the file at ``pdf_path`` and return its bytes as a base64 string.

	    Args:
	        pdf_path: Path of the file to encode.

	    Returns:
	        The file content, base64-encoded and decoded to text as UTF-8.

	    Raises:
	        Exception: If the file cannot be read or encoded. The underlying
	            error is chained as ``__cause__`` so callers can inspect it.
	    """
	    try:
	        with open(pdf_path, "rb") as pdf_file:
	            raw_bytes = pdf_file.read()
	        return base64.b64encode(raw_bytes).decode('utf-8')
	    except Exception as e:
	        # Keep the generic wrapper callers already catch, but chain the
	        # original error explicitly instead of hiding it.
	        raise Exception(f"Error converting PDF to base64: {str(e)}") from e

	def extract_pdf_content(self, pdf_path: str) -> Dict[str, Any]:
	    """Extract text, images and tables from every page of a PDF.

	    A page that fails to process does not abort the run: it is logged and
	    recorded as an empty page so page numbering stays contiguous.

	    Args:
	        pdf_path: Path of the PDF file to read.

	    Returns:
	        A dictionary with three keys: ``document_info`` (filename, size,
	        timestamp, converter version), ``document_statistics`` (totals over
	        the successfully processed pages) and ``pages`` (one dictionary per
	        page, produced by ``PageData.to_dict``).

	    Raises:
	        Exception: For any failure, including a missing file or a document
	            with no pages. The underlying error is chained as ``__cause__``.
	    """
	    # A4 in points; stands in for pages whose real size could not be read.
	    fallback_width, fallback_height = 595, 842
	    doc = None
	    try:
	        if not os.path.exists(pdf_path):
	            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

	        # fitz.open constructs a Document and raises on failure, so there
	        # is no None result to guard against.
	        doc = fitz.open(pdf_path)

	        if doc.page_count == 0:
	            raise ValueError("PDF document has no pages")

	        print(f"📄 PDF opened successfully: {doc.page_count} pages")

	        pages_data = []
	        document_stats = {
	            "total_pages": doc.page_count,
	            "total_words": 0,
	            "total_characters": 0,
	            "total_images": 0,
	            "total_tables": 0
	        }

	        for page_num in range(doc.page_count):
	            try:
	                page_data = self._build_page_data(doc, page_num)
	            except Exception as e:
	                print(f"❌ Error processing page {page_num + 1}: {e}")
	                # Create empty page data for failed pages
	                page_data = PageData(
	                    page_number=page_num + 1,
	                    text_blocks=[],
	                    images=[],
	                    tables=[],
	                    page_width=fallback_width,
	                    page_height=fallback_height,
	                    word_count=0,
	                    character_count=0
	                )
	            else:
	                # Only successfully processed pages contribute to the totals.
	                document_stats["total_words"] += page_data.word_count
	                document_stats["total_characters"] += page_data.character_count
	                document_stats["total_images"] += len(page_data.images)
	                document_stats["total_tables"] += len(page_data.tables)
	            pages_data.append(page_data)

	        return {
	            "document_info": {
	                "filename": os.path.basename(pdf_path),
	                "file_size": os.path.getsize(pdf_path),
	                "conversion_timestamp": self._get_current_timestamp(),
	                "converter_version": "1.0.0"
	            },
	            "document_statistics": document_stats,
	            "pages": [page.to_dict() for page in pages_data]
	        }

	    except Exception as e:
	        # Same generic wrapper as before, now with the cause chained.
	        raise Exception(f"Error extracting PDF content: {str(e)}") from e
	    finally:
	        if doc is not None:
	            try:
	                doc.close()
	                print("✅ PDF document closed successfully")
	            except Exception as e:
	                print(f"⚠️ Error closing PDF document: {e}")

	def _build_page_data(self, doc, page_num: int) -> PageData:
	    """Extract text, images and tables for the page at zero-based index ``page_num``."""
	    page = doc[page_num]
	    print(f"🔄 Processing page {page_num + 1}/{doc.page_count}")

	    # Prefer the detailed "dict" extraction; fall back to plain blocks if it fails.
	    try:
	        page_dict = page.get_text("dict")
	        text_blocks = self._extract_text_blocks_from_dict(page_dict, page_num)
	    except Exception as e:
	        print(f"⚠️ Dict method failed for page {page_num + 1}, falling back to simple text extraction: {e}")
	        text_blocks = self._extract_text_blocks_simple(page, page_num)

	    images = self._extract_images_safely(page, doc, page_num)
	    tables = self._detect_tables_safely(page)
	    page_rect = page.rect

	    # Word and character counts are taken over all block texts joined by spaces.
	    page_text = " ".join(block.text for block in text_blocks)

	    return PageData(
	        page_number=page_num + 1,
	        text_blocks=text_blocks,
	        images=images,
	        tables=tables,
	        page_width=page_rect.width,
	        page_height=page_rect.height,
	        word_count=len(page_text.split()),
	        character_count=len(page_text)
	    )

	def _extract_text_blocks_from_dict(self, page_dict: dict, page_num: int) -> List[TextBlock]:
	    """Build one TextBlock per non-empty span of a page dictionary.

	    Args:
	        page_dict: Page dictionary with a ``blocks`` list whose text blocks
	            hold ``lines``, each holding ``spans``.
	        page_num: Page index used as the ``p…`` part of each block id.

	    Returns:
	        The text blocks in block / line / span order. Blocks without a
	        ``lines`` entry and spans whose text is blank are skipped.
	    """
	    text_blocks = []

	    for block_idx, block in enumerate(page_dict.get("blocks", [])):
	        if "lines" not in block:
	            continue

	        for line_idx, line in enumerate(block["lines"]):
	            for span_idx, span in enumerate(line["spans"]):
	                text_content = span.get("text", "").strip()
	                if not text_content:
	                    continue

	                bbox = span["bbox"]
	                font_name = span.get("font", "Arial")
	                font_name_lower = font_name.lower()
	                flags = span.get("flags", 0)

	                # The flag tests are coerced to bool: a bare `flags & 16`
	                # yields 0 or 16, which would otherwise be stored in the
	                # bool-typed fields and written to the JSON as a number.
	                is_bold = "bold" in font_name_lower or bool(flags & 16)
	                is_italic = "italic" in font_name_lower or bool(flags & 2)

	                text_blocks.append(TextBlock(
	                    text=text_content,
	                    x=round(bbox[0], 2),
	                    y=round(bbox[1], 2),
	                    width=round(bbox[2] - bbox[0], 2),
	                    height=round(bbox[3] - bbox[1], 2),
	                    font_size=round(span.get("size", 12), 2),
	                    font_name=font_name,
	                    is_bold=is_bold,
	                    is_italic=is_italic,
	                    block_id=f"p{page_num}-b{block_idx}-l{line_idx}-s{span_idx}"
	                ))

	    return text_blocks

	def _extract_text_blocks_simple(self, page, page_num: int) -> List[TextBlock]:
	    """Fallback extraction: one TextBlock per non-blank line of each text block.

	    Font details are not available on this path, so every block is given
	    size 12.0, font "Arial" and no bold/italic styling. Any error is
	    logged and whatever was collected up to that point is returned.
	    """
	    collected = []
	    try:
	        for block_idx, raw_block in enumerate(page.get_text("blocks")):
	            # Element 6 is the block type; only type 0 is handled as text.
	            if raw_block[6] != 0:
	                continue
	            content = raw_block[4].strip()
	            if not content:
	                continue

	            x0, y0, x1, y1 = raw_block[0], raw_block[1], raw_block[2], raw_block[3]
	            rows = content.split('\n')
	            # The block height is shared evenly among its lines to place each one.
	            row_height = (y1 - y0) / max(len(rows), 1)

	            for line_idx, row in enumerate(rows):
	                stripped = row.strip()
	                if not stripped:
	                    continue
	                collected.append(TextBlock(
	                    text=stripped,
	                    x=round(x0, 2),
	                    y=round(y0 + (line_idx * row_height), 2),
	                    width=round(x1 - x0, 2),
	                    height=round(row_height, 2),
	                    font_size=12.0,
	                    font_name="Arial",
	                    is_bold=False,
	                    is_italic=False,
	                    block_id=f"p{page_num}-simple-b{block_idx}-l{line_idx}"
	                ))
	    except Exception as e:
	        print(f"⚠️ Simple text block extraction failed: {e}")

	    return collected

	def _extract_images_safely(self, page, doc, page_num) -> List[ImageData]:
	    """Extract the images placed on a page as base64-encoded PNG data.

	    Failures are logged and never raised: an error on one image skips
	    only that image, and an error listing the images yields what was
	    collected so far.

	    Args:
	        page: The page to scan for images.
	        doc: The document that owns ``page``; used to load each image.
	        page_num: Zero-based page index, used only in log messages.

	    Returns:
	        One ImageData per image that has a rectangle on the page.
	    """
	    images = []
	    try:
	        image_list = page.get_images(full=True)
	        for img_index, img_info in enumerate(image_list):
	            try:
	                xref = img_info[0]

	                # Images with no rectangle on the page are skipped; when
	                # there are several rectangles only the first is recorded.
	                img_rects = list(page.get_image_rects(xref))
	                if not img_rects:
	                    continue

	                bbox = img_rects[0]

	                pix = fitz.Pixmap(doc, xref)
	                if pix.n - pix.alpha >= 4:
	                    # Four or more colour components (e.g. CMYK) cannot be
	                    # written as PNG directly; convert to RGB instead of
	                    # dropping the image.
	                    pix = fitz.Pixmap(fitz.csRGB, pix)

	                img_data = pix.tobytes("png")
	                img_base64 = base64.b64encode(img_data).decode()

	                images.append(ImageData(
	                    index=img_index,
	                    base64_data=img_base64,
	                    bbox=(round(bbox.x0, 2), round(bbox.y0, 2),
	                          round(bbox.x1, 2), round(bbox.y1, 2)),
	                    width=round(bbox.x1 - bbox.x0, 2),
	                    height=round(bbox.y1 - bbox.y0, 2),
	                    format="PNG"
	                ))
	                # Drop the reference so the pixmap can be freed promptly.
	                pix = None
	            except Exception as e:
	                print(f"⚠️ Error extracting image {img_index} on page {page_num+1}: {e}")
	                continue
	    except Exception as e:
	        print(f"⚠️ General error in image extraction for page {page_num+1}: {e}")
	    return images

	def _detect_tables_safely(self, page) -> List[TableData]:
	    """Find the tables on a page and return their cleaned cell text.

	    Failures are logged and never raised: an error on one table skips
	    only that table, and an error in detection yields what was collected
	    so far.

	    Args:
	        page: The page to search with ``find_tables()``.

	    Returns:
	        One TableData per table that has at least one non-empty row.
	        Cells are stripped strings; empty cells become "" and rows with
	        no content are dropped.
	    """
	    tables = []
	    try:
	        tabs = page.find_tables()
	        for tab_index, tab in enumerate(tabs):
	            try:
	                table_data = tab.extract()
	                if not table_data:
	                    continue

	                # Clean table data, keeping only rows with some content.
	                cleaned_data = []
	                for row in table_data:
	                    cleaned_row = [str(cell).strip() if cell else "" for cell in row]
	                    if any(cleaned_row):
	                        cleaned_data.append(cleaned_row)

	                if not cleaned_data:
	                    continue

	                # The table bbox is a 4-item sequence (x0, y0, x1, y1).
	                # Unpacking works for a plain tuple as well as a rectangle
	                # object, whereas attribute access (.x0) fails on a tuple
	                # and caused every table to be skipped.
	                x0, y0, x1, y1 = tab.bbox

	                tables.append(TableData(
	                    bbox=(round(x0, 2), round(y0, 2),
	                          round(x1, 2), round(y1, 2)),
	                    data=cleaned_data,
	                    rows=len(cleaned_data),
	                    columns=max(len(row) for row in cleaned_data)
	                ))
	            except Exception as e:
	                print(f"⚠️ Error extracting table {tab_index}: {e}")
	                continue
	    except Exception as e:
	        print(f"⚠️ General error in table detection: {e}")
	    return tables

	def convert_to_json(self, pdf_content: Dict[str, Any], output_path: Optional[str] = None,
	                    pretty_print: bool = True, include_base64_images: bool = True) -> str:
	    """Serialize extracted PDF content to a JSON string and optionally save it.

	    The input mapping is never modified: the ``conversion_options`` entry
	    and any image placeholders are written to copies.

	    Args:
	        pdf_content: Extracted content; must contain ``pages`` (each with
	            an ``images`` list) when ``include_base64_images`` is False.
	        output_path: File to write the JSON to, or None to skip saving.
	            Parent directories are created as needed. A failure to save
	            is logged, not raised.
	        pretty_print: Indent the JSON by two spaces when True.
	        include_base64_images: When False, each image's ``base64_data`` is
	            replaced with a short placeholder to reduce output size.

	    Returns:
	        The JSON document as a string.

	    Raises:
	        Exception: If the content cannot be converted. The underlying
	            error is chained as ``__cause__``.
	    """
	    print("🔄 Converting to JSON format...")

	    try:
	        # Shallow copy so the added key does not show up in the caller's dict.
	        json_content = pdf_content.copy()

	        json_content["conversion_options"] = {
	            "pretty_print": pretty_print,
	            "include_base64_images": include_base64_images,
	            "json_schema_version": "1.0"
	        }

	        if not include_base64_images:
	            placeholder = "[Base64 data removed - set include_base64_images=True to include]"
	            # Rebuild the page and image dicts instead of assigning into
	            # them: the shallow copy still shares them with the caller,
	            # whose image data must stay intact.
	            json_content["pages"] = [
	                {
	                    **page,
	                    "images": [
	                        {**image, "base64_data": placeholder}
	                        for image in page["images"]
	                    ],
	                }
	                for page in json_content["pages"]
	            ]

	        json_string = json.dumps(
	            json_content,
	            indent=2 if pretty_print else None,
	            ensure_ascii=False,
	        )

	        # Saving is best-effort: the JSON string is returned either way.
	        if output_path:
	            try:
	                Path(output_path).parent.mkdir(parents=True, exist_ok=True)
	                with open(output_path, 'w', encoding='utf-8') as f:
	                    f.write(json_string)
	                print(f"✅ JSON saved to: {output_path}")
	                print(f"📊 File size: {len(json_string):,} characters")
	            except Exception as e:
	                print(f"⚠️ Error saving JSON to {output_path}: {e}")

	        return json_string

	    except Exception as e:
	        raise Exception(f"Error converting to JSON: {str(e)}") from e

	def create_json_summary(self, pdf_content: Dict[str, Any]) -> Dict[str, Any]:
	    """Build a compact overview of extracted PDF content.

	    Args:
	        pdf_content: Extracted content with optional ``document_info``,
	            ``document_statistics`` and ``pages`` entries.

	    Returns:
	        A dictionary holding the document info and statistics unchanged,
	        plus ``page_summaries``: per page, the item counts, dimensions and
	        a text sample built from the first three text blocks. The sample
	        is cut to 200 characters and ends in "..." only when it was cut.
	    """
	    summary = {
	        "document_info": pdf_content.get("document_info", {}),
	        "document_statistics": pdf_content.get("document_statistics", {}),
	        "page_summaries": []
	    }

	    for page in pdf_content.get("pages", []):
	        preview = " ".join(block["text"] for block in page["text_blocks"][:3])
	        # Mark the sample as truncated only when text was actually cut off.
	        if len(preview) > 200:
	            sample_text = preview[:200] + "..."
	        else:
	            sample_text = preview

	        summary["page_summaries"].append({
	            "page_number": page["page_number"],
	            "text_blocks_count": len(page["text_blocks"]),
	            "images_count": len(page["images"]),
	            "tables_count": len(page["tables"]),
	            "word_count": page["word_count"],
	            "character_count": page["character_count"],
	            "page_dimensions": {
	                "width": page["page_width"],
	                "height": page["page_height"]
	            },
	            "sample_text": sample_text
	        })

	    return summary

	def _get_current_timestamp(self) -> str:
	"""Get current timestamp as string"""
	return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

	def process_pdf_to_json(self, pdf_path: str, output_path: Optional[str] = None,
	                        pretty_print: bool = True, include_base64_images: bool = True,
	                        create_summary: bool = False, use_hf_models: bool = False) -> str:
	    """Extract a PDF, convert it to JSON and optionally write a summary file.

	    Args:
	        pdf_path: Path of the PDF to process.
	        output_path: File to write the JSON to, or None to skip saving.
	        pretty_print: Indent the main JSON output when True.
	        include_base64_images: Keep image data in the main JSON when True.
	        create_summary: When True and ``output_path`` is given, also write
	            a summary next to it, named after ``output_path`` with its
	            extension replaced by ``_summary.json``.
	        use_hf_models: Request Hugging Face enhancement. It only prints a
	            notice at present, and only when a token was configured.

	    Returns:
	        The main JSON document as a string.

	    Raises:
	        FileNotFoundError: If ``pdf_path`` does not exist.
	        Exception: If extraction or conversion fails.
	    """
	    print(f"🚀 Processing PDF to JSON: {pdf_path}")

	    if not os.path.exists(pdf_path):
	        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

	    print("📄 Extracting PDF content...")
	    pdf_content = self.extract_pdf_content(pdf_path)

	    if use_hf_models and self.hf_token:
	        print("🤖 Attempting to enhance with Hugging Face models...")
	        print("Note: Hugging Face model integration requires further implementation.")

	    print("🔄 Converting to JSON...")
	    json_content = self.convert_to_json(
	        pdf_content,
	        output_path,
	        pretty_print,
	        include_base64_images
	    )

	    # Create summary file if requested
	    if create_summary and output_path:
	        # Derive the name from the path without its extension. A plain
	        # str.replace('.json', ...) returns the same path when there is no
	        # ".json" in it, which made the summary overwrite the main output,
	        # and it also rewrote ".json" inside directory names.
	        path_root, _extension = os.path.splitext(output_path)
	        summary_path = f"{path_root}_summary.json"
	        summary_data = self.create_json_summary(pdf_content)
	        summary_json = json.dumps(summary_data, indent=2, ensure_ascii=False)

	        # Saving the summary is best-effort; a failure is only logged.
	        try:
	            with open(summary_path, 'w', encoding='utf-8') as f:
	                f.write(summary_json)
	            print(f"✅ Summary JSON saved to: {summary_path}")
	        except Exception as e:
	            print(f"⚠️ Error saving summary: {e}")

	    print("✅ Processing complete!")
	    return json_content

	def main():
	    """Run a sample conversion of ``new-pdf.pdf`` into ``converted_document.json``."""
	    # Optional Hugging Face token, read from the HF_API_TOKEN environment variable.
	    converter = PDFToJSONConverter(huggingface_token=os.environ.get("HF_API_TOKEN"))

	    pdf_path = "new-pdf.pdf"  # Change this to your PDF file path
	    output_path = "converted_document.json"  # Output JSON file path

	    conversion_options = {
	        "pretty_print": True,  # Format JSON with indentation
	        "include_base64_images": True,  # Include image data (set False to reduce file size)
	        "create_summary": True,  # Create additional summary file
	        "use_hf_models": False,  # Set to True if you want to use HuggingFace models
	    }

	    try:
	        json_content = converter.process_pdf_to_json(
	            pdf_path=pdf_path,
	            output_path=output_path,
	            **conversion_options
	        )

	        print(f"✅ Successfully converted '{pdf_path}' to '{output_path}'")
	        print(f"📊 JSON length: {len(json_content):,} characters")
	        print(f"📄 Open '{output_path}' to view the structured JSON data!")

	        # Show at most the first 500 characters as a preview.
	        print("\n📋 JSON Preview (first 500 characters):")
	        print("-" * 50)
	        if len(json_content) > 500:
	            preview = json_content[:500] + "..."
	        else:
	            preview = json_content
	        print(preview)

	    except FileNotFoundError as missing:
	        print(f"❌ Error: {missing}")
	        print("Please ensure the PDF file exists at the specified path.")
	    except Exception as failure:
	        print(f"❌ An unexpected error occurred: {str(failure)}")
	        import traceback
	        traceback.print_exc()

	# Run the sample conversion only when this file is executed as a script, not when it is imported.
	if __name__ == "__main__":
	main()