Spaces:

amit01Xindus
/

new-project

Sleeping

App Files Files Community

new-project / pdf_html.py

amit01Xindus

Upload 8 files

96c003e verified 2 months ago

raw

history blame contribute delete

25.1 kB

	import os
	import base64
	import json
	import requests
	from typing import Dict, List, Any
	import fitz # PyMuPDF
	from PIL import Image
	import io
	import re
	from dataclasses import dataclass
	from pathlib import Path
	from datetime import datetime

	@dataclass
	class TextBlock:
	    """One positioned run of text taken from a PDF page.

	    Instances are built by the converter's text extractors and read again
	    when the HTML is rendered.
	    """

	    # The text itself (the extractors strip surrounding whitespace).
	    text: str
	    # Left and top edge of the bounding box.
	    x: float
	    y: float
	    # Extent of the bounding box.
	    width: float
	    height: float
	    # Font size and font name as reported for this run.
	    font_size: float
	    font_name: str
	    # Styling hints; both default to "plain".
	    is_bold: bool = False
	    is_italic: bool = False
	    # Identifier assigned by the extractor; empty when none was given.
	    block_id: str = ""

	class PDFToHTMLConverter:
	    """Convert a PDF file into one self-contained HTML document.

	    Page content (text runs, embedded images, tables) is read with PyMuPDF
	    (``fitz``) and rendered as HTML with inline CSS and base64 images.

	    The Hugging Face token, model names and endpoint are stored on the
	    instance, but no method of this class calls the inference API.
	    """

	    # Page size used when a page fails to process, and as the minimum size
	    # reported in the page header (595 x 842 is A4 in PDF points).
	    _DEFAULT_PAGE_WIDTH = 595
	    _DEFAULT_PAGE_HEIGHT = 842

	    # Bits of the PyMuPDF span "flags" value that mark italic / bold text.
	    _FLAG_ITALIC = 2
	    _FLAG_BOLD = 16

	    # Symbol -> replacement table applied by enhance_math_symbols().
	    # Entries that map a symbol to itself are no-ops; they are kept so the
	    # list of recognised symbols stays in one place.
	    _MATH_REPLACEMENTS = {
	        '±': '±', '×': '×', '÷': '÷', '∑': '∑',
	        '∏': '∏', '√': '√', '∞': '∞', '∫': '∫',
	        '∂': '∂', '∆': 'Δ', '∇': '∇', '∈': '∈',
	        '∉': '∉', '⊂': '⊂', '⊃': '⊃', '⊆': '&sube;',
	        '⊇': '&supe;', '∪': '∪', '∩': '∩', '≤': '≤',
	        '≥': '≥', '≠': '≠', '≡': '&equiv;', '≈': '≈',
	        '∝': '&prop;', '∴': '&there4;',
	        'α': 'α', 'β': 'β', 'γ': 'γ', 'δ': 'δ',
	        'ε': 'ε', 'ζ': 'ζ', 'η': 'η', 'θ': 'θ',
	        'ι': 'ι', 'κ': 'κ', 'λ': 'λ', 'μ': 'μ',
	        'ν': 'ν', 'ξ': 'ξ', 'π': 'π', 'ρ': 'ρ', 'σ': 'σ',
	        'τ': 'τ', 'υ': 'υ', 'φ': 'φ', 'χ': 'χ',
	        'ψ': 'ψ', 'ω': 'ω',
	        '½': '½', '⅓': '&frac13;', '¼': '¼', '⅔': '&frac23;',
	        '¾': '¾', '⅛': '&frac18;', '²': '²', '³': '³',
	        '¹': '¹', '°': '°',
	    }

	    # Text containing any of these gets the "math-symbol" CSS class.
	    _MATH_MARKERS = ('α', 'β', 'γ', '∑', '∫')

	    # Static start of every generated document: <head>, CSS and the opening
	    # of the document container (closed again in convert_to_html).
	    _HTML_HEAD = """<!DOCTYPE html>
	<html lang="en">
	<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0">
	<title>PDF Document</title>
	<style>
	* {
	    box-sizing: border-box;
	    margin: 0;
	    padding: 0;
	}

	body {
	    font-family: 'Times New Roman', Times, serif;
	    background-color: #f5f5f5;
	    padding: 20px;
	    line-height: 1.2;
	    color: #000000;
	}

	.document-container {
	    max-width: 1200px;
	    margin: 0 auto;
	    background-color: white;
	    box-shadow: 0 4px 12px rgba(0,0,0,0.1);
	    border: 1px solid #ddd;
	}

	.page-wrapper {
	    background-color: white;
	    margin: 0;
	    padding: 40px;
	    border-bottom: 2px solid #000;
	    position: relative;
	    min-height: 800px;
	    page-break-after: always;
	    overflow: visible;
	}

	.page-header {
	    background-color: #f8f8f8;
	    padding: 10px 15px;
	    margin: -40px -40px 30px -40px;
	    border-bottom: 2px solid #000;
	    font-weight: bold;
	    color: #000;
	    font-size: 14px;
	    text-align: center;
	}

	.content-layer {
	    position: relative;
	    width: 100%;
	    height: 100%;
	}

	.text-content {
	    position: relative;
	    z-index: 10;
	    line-height: 1.4;
	}

	.text-block {
	    margin-bottom: 8px;
	    font-family: 'Times New Roman', Times, serif;
	    color: #000;
	    word-wrap: break-word;
	    overflow-wrap: break-word;
	}

	.text-block.inline {
	    display: inline;
	    margin-bottom: 0;
	    margin-right: 5px;
	}

	.text-group {
	    margin-bottom: 12px;
	    line-height: 1.3;
	}

	.bold {
	    font-weight: bold;
	}

	.italic {
	    font-style: italic;
	}

	.table-container {
	    margin: 20px 0;
	    background-color: white;
	    overflow: auto;
	    z-index: 20;
	    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
	}

	.table {
	    width: 100%;
	    border-collapse: collapse;
	    border: 2px solid #000;
	    font-family: 'Times New Roman', Times, serif;
	    font-size: 12px;
	    color: #000;
	    background-color: white;
	    margin: 0;
	}

	.table td, .table th {
	    border: 1px solid #000;
	    padding: 8px 12px;
	    text-align: left;
	    vertical-align: top;
	    background-color: white;
	    font-family: 'Times New Roman', Times, serif;
	    word-wrap: break-word;
	    min-width: 60px;
	}

	.table th {
	    background-color: #f0f0f0;
	    font-weight: bold;
	    text-align: center;
	}

	.table tr:nth-child(even) td {
	    background-color: #f9f9f9;
	}

	.table tr:hover td {
	    background-color: #f0f0f0;
	}

	.image-container {
	    margin: 15px 0;
	    border: 1px solid #ccc;
	    background-color: white;
	    text-align: center;
	    overflow: hidden;
	    z-index: 5;
	}

	.image {
	    max-width: 100%;
	    height: auto;
	    display: block;
	    margin: 0 auto;
	}

	.math-symbol {
	    font-family: 'Times New Roman', serif;
	}

	.document-info {
	    background-color: #f8f8f8;
	    padding: 15px;
	    border: 1px solid #ccc;
	    margin-bottom: 20px;
	    text-align: center;
	    font-family: 'Times New Roman', Times, serif;
	}

	@media print {
	    body {
	        background-color: white;
	        padding: 0;
	    }
	    .page-wrapper {
	        border: none;
	        box-shadow: none;
	        margin: 0;
	        page-break-after: always;
	    }
	    .document-info {
	        display: none;
	    }
	    .table {
	        border: 2px solid #000 !important;
	    }
	    .table td, .table th {
	        border: 1px solid #000 !important;
	    }
	}
	</style>
	</head>
	<body>
	<div class="document-container">"""

	    def __init__(self, huggingface_token: str = None):
	        """Store the optional Hugging Face token and the model configuration.

	        Args:
	            huggingface_token: API token; when falsy, no Authorization header
	                is prepared.
	        """
	        self.hf_token = huggingface_token
	        # Only prepare an Authorization header when a token exists; a header
	        # whose value is None is not a valid HTTP header.
	        self.hf_headers = (
	            {"Authorization": f"Bearer {huggingface_token}"}
	            if huggingface_token
	            else {}
	        )
	        self.models = {
	            "document_layout": "microsoft/layoutlm-base-uncased",
	            "table_detection": "microsoft/table-transformer-detection",
	            "ocr": "microsoft/trocr-base-printed",
	            "math_detection": "facebook/detr-resnet-50",
	        }
	        self.hf_inference_url = "https://api-inference.huggingface.co/models"

	    def pdf_to_base64(self, pdf_path: str) -> str:
	        """Return the file at ``pdf_path`` encoded as a base64 string.

	        Raises:
	            Exception: if the file cannot be read; the original error is
	                attached as the cause.
	        """
	        try:
	            with open(pdf_path, "rb") as pdf_file:
	                return base64.b64encode(pdf_file.read()).decode("utf-8")
	        except Exception as e:
	            raise Exception(f"Error converting PDF to base64: {str(e)}") from e

	    def extract_pdf_content(self, pdf_path: str) -> Dict[str, Any]:
	        """Read every page of the PDF and collect its text, images and tables.

	        A page that fails to process is reported and replaced by an empty
	        placeholder, so the result always has one entry per page.

	        Returns:
	            ``{"pages": [...], "total_pages": int}``; each page entry has the
	            keys ``page_number``, ``text_blocks``, ``images``, ``tables``,
	            ``page_width`` and ``page_height``.

	        Raises:
	            Exception: if the file is missing, cannot be opened or has no
	                pages; the original error is attached as the cause.
	        """
	        doc = None
	        try:
	            if not os.path.exists(pdf_path):
	                raise FileNotFoundError(f"PDF file not found: {pdf_path}")

	            doc = fitz.open(pdf_path)
	            total_pages = doc.page_count
	            if total_pages == 0:
	                raise ValueError("PDF document has no pages")

	            print(f"📄 PDF opened successfully: {total_pages} pages")

	            pages_content = [
	                self._extract_page(doc, page_num, total_pages)
	                for page_num in range(total_pages)
	            ]
	            return {"pages": pages_content, "total_pages": total_pages}

	        except Exception as e:
	            raise Exception(f"Error extracting PDF content: {str(e)}") from e
	        finally:
	            if doc is not None:
	                try:
	                    doc.close()
	                    print("✅ PDF document closed successfully")
	                except Exception as e:
	                    print(f"⚠️ Error closing PDF document: {e}")

	    def _extract_page(self, doc, page_num: int, total_pages: int) -> Dict[str, Any]:
	        """Extract one page; on any failure return an empty placeholder entry."""
	        try:
	            page = doc[page_num]
	            print(f"🔄 Processing page {page_num + 1}/{total_pages}")

	            try:
	                text_blocks = self._extract_text_blocks_from_dict(
	                    page.get_text("dict"), page_num
	                )
	            except Exception as e:
	                print(f"⚠️ Dict method failed for page {page_num + 1}, falling back to simple text extraction: {e}")
	                text_blocks = self._extract_text_blocks_simple(page, page_num)

	            images = self._extract_images_safely(page, doc, page_num)
	            tables = self._detect_tables_safely(page)
	            page_rect = page.rect

	            return {
	                "page_number": page_num + 1,
	                "text_blocks": text_blocks,
	                "images": images,
	                "tables": tables,
	                "page_width": page_rect.width,
	                "page_height": page_rect.height,
	            }
	        except Exception as e:
	            print(f"❌ Error processing page {page_num + 1}: {e}")
	            return {
	                "page_number": page_num + 1,
	                "text_blocks": [],
	                "images": [],
	                "tables": [],
	                "page_width": self._DEFAULT_PAGE_WIDTH,
	                "page_height": self._DEFAULT_PAGE_HEIGHT,
	            }

	    def _extract_text_blocks_from_dict(self, page_dict: dict, page_num: int) -> List["TextBlock"]:
	        """Build one TextBlock per non-empty span of a ``get_text("dict")`` result.

	        Blocks without a ``"lines"`` entry are skipped. Bold / italic are
	        taken from the font name or from the span flags.
	        """
	        text_blocks = []

	        for block_idx, block in enumerate(page_dict.get("blocks", [])):
	            if "lines" not in block:
	                continue

	            for line_idx, line in enumerate(block["lines"]):
	                for span_idx, span in enumerate(line.get("spans", [])):
	                    text_content = span.get("text", "").strip()
	                    if not text_content:
	                        continue

	                    x0, y0, x1, y1 = span["bbox"][:4]
	                    font_lower = span.get("font", "").lower()
	                    flags = span.get("flags", 0)

	                    text_blocks.append(TextBlock(
	                        text=text_content,
	                        x=x0,
	                        y=y0,
	                        width=x1 - x0,
	                        height=y1 - y0,
	                        font_size=span.get("size", 12),
	                        font_name=span.get("font", "Arial"),
	                        # bool(): a bare "flags & 16" would store an int.
	                        is_bold="bold" in font_lower or bool(flags & self._FLAG_BOLD),
	                        is_italic="italic" in font_lower or bool(flags & self._FLAG_ITALIC),
	                        block_id=f"p{page_num}-b{block_idx}-l{line_idx}-s{span_idx}",
	                    ))

	        return text_blocks

	    def _extract_text_blocks_simple(self, page, page_num: int) -> List["TextBlock"]:
	        """Fallback extractor based on ``page.get_text("blocks")``.

	        Each text block is split into lines; the block height is divided
	        evenly between them. Font information is not available here, so
	        12pt Arial, non-bold, non-italic is assumed. Errors are reported and
	        whatever was collected so far is returned.
	        """
	        text_blocks = []
	        try:
	            for block_idx, block in enumerate(page.get_text("blocks")):
	                # Only entries whose type field (index 6) is 0 are processed.
	                if block[6] != 0:
	                    continue
	                text = block[4].strip()
	                if not text:
	                    continue

	                x0, y0, x1, y1 = block[0], block[1], block[2], block[3]
	                lines = text.split('\n')
	                line_height = (y1 - y0) / max(len(lines), 1)

	                for line_idx, line in enumerate(lines):
	                    stripped = line.strip()
	                    if not stripped:
	                        continue
	                    text_blocks.append(TextBlock(
	                        text=stripped,
	                        x=x0,
	                        y=y0 + (line_idx * line_height),
	                        width=x1 - x0,
	                        height=line_height,
	                        font_size=12,
	                        font_name="Arial",
	                        is_bold=False,
	                        is_italic=False,
	                        block_id=f"p{page_num}-simple-b{block_idx}-l{line_idx}",
	                    ))
	        except Exception as e:
	            print(f"⚠️ Simple text block extraction failed: {e}")

	        return text_blocks

	    def _extract_images_safely(self, page, doc, page_num) -> List[Dict]:
	        """Return the page's images as base64-encoded PNG data.

	        Each entry has ``index``, ``data`` (base64 PNG) and ``bbox``
	        (x0, y0, x1, y1 of the first placement on the page). Images with no
	        placement rectangle are skipped; a failing image is reported and
	        skipped without aborting the page.
	        """
	        images = []
	        try:
	            for img_index, img_info in enumerate(page.get_images(full=True)):
	                try:
	                    xref = img_info[0]

	                    img_rects = list(page.get_image_rects(xref))
	                    if not img_rects:
	                        continue
	                    bbox = img_rects[0]

	                    pix = fitz.Pixmap(doc, xref)
	                    if pix.n - pix.alpha >= 4:
	                        # Four or more colour channels (e.g. CMYK) cannot
	                        # be saved as PNG; convert to RGB first instead of
	                        # silently dropping the image.
	                        pix = fitz.Pixmap(fitz.csRGB, pix)

	                    img_base64 = base64.b64encode(pix.tobytes("png")).decode()
	                    pix = None  # release the pixmap early

	                    images.append({
	                        "index": img_index,
	                        "data": img_base64,
	                        "bbox": (bbox.x0, bbox.y0, bbox.x1, bbox.y1),
	                    })
	                except Exception as e:
	                    print(f"⚠️ Error extracting image {img_index} on page {page_num+1}: {e}")
	                    continue
	        except Exception as e:
	            print(f"⚠️ General error in image extraction for page {page_num+1}: {e}")
	        return images

	    def _detect_tables_safely(self, page) -> List[Dict]:
	        """Return the tables found by ``page.find_tables()``.

	        Each entry has ``bbox`` (x0, y0, x1, y1) and ``data`` (rows of
	        stripped cell strings; rows that are entirely empty are dropped).
	        Failures are reported and the affected table is skipped.
	        """
	        tables = []
	        try:
	            tabs = page.find_tables()
	            for tab_index, tab in enumerate(tabs):
	                try:
	                    table_data = tab.extract()
	                    if not table_data:
	                        continue

	                    cleaned_data = []
	                    for row in table_data:
	                        cleaned_row = [str(cell).strip() if cell else "" for cell in row]
	                        if any(cleaned_row):
	                            cleaned_data.append(cleaned_row)

	                    if cleaned_data:
	                        tables.append({
	                            # Table.bbox is a plain (x0, y0, x1, y1) tuple in
	                            # PyMuPDF, so ".x0" attribute access fails;
	                            # tuple() accepts both a tuple and a Rect.
	                            "bbox": tuple(tab.bbox)[:4],
	                            "data": cleaned_data,
	                        })
	                except Exception as e:
	                    print(f"⚠️ Error extracting table {tab_index}: {e}")
	                    continue
	        except Exception as e:
	            print(f"⚠️ General error in table detection: {e}")
	        return tables

	    def enhance_math_symbols(self, text: str) -> str:
	        """Apply the symbol replacement table to ``text`` and return the result.

	        Some symbols become HTML entities (for example ``&sube;``), so this
	        must run after HTML escaping, never before it.
	        """
	        for symbol, html_entity in self._MATH_REPLACEMENTS.items():
	            text = text.replace(symbol, html_entity)
	        return text

	    def _generate_html_table(self, table_data, header_rows: int = 1) -> str:
	        """Render rows of cell values as an HTML table.

	        NOTE(review): convert_to_html called this helper but the class had no
	        definition for it; this implementation uses the ``table-container``
	        and ``table`` CSS classes from the page stylesheet. Confirm it
	        matches the intended markup.

	        Args:
	            table_data: list of rows, each a list of cell values.
	            header_rows: number of leading rows rendered with ``<th>``.

	        Returns:
	            The table markup, or an empty string when there are no rows.
	        """
	        from html import escape

	        if not table_data:
	            return ""

	        parts = ['<div class="table-container">', '<table class="table">']
	        for row_idx, row in enumerate(table_data):
	            tag = "th" if row_idx < header_rows else "td"
	            cells = "".join(
	                "<{0}>{1}</{0}>".format(
	                    tag,
	                    self.enhance_math_symbols(
	                        escape("" if cell is None else str(cell), quote=False)
	                    ),
	                )
	                for cell in row
	            )
	            parts.append(f"<tr>{cells}</tr>")
	        parts.append("</table>")
	        parts.append("</div>")
	        return "\n".join(parts)

	    def _group_overlapping_text(self, text_blocks) -> List[List[Any]]:
	        """Group text blocks that sit on the same visual line.

	        NOTE(review): convert_to_html called this helper but the class had no
	        definition for it. The rule below is inferred from how the caller
	        uses the result (multi-block groups are rendered inline, ordered by
	        x) and should be confirmed.

	        Blocks are visited top to bottom. A block joins the current group
	        when its vertical centre is not below the bottom edge of the block
	        that started the group. Each group is returned ordered left to right.
	        The input list is not modified.
	        """
	        groups = []
	        anchor_bottom = None
	        for block in sorted(text_blocks, key=lambda b: (b.y, b.x)):
	            centre = block.y + block.height / 2
	            if groups and centre <= anchor_bottom:
	                groups[-1].append(block)
	            else:
	                groups.append([block])
	                anchor_bottom = block.y + block.height
	        return [sorted(group, key=lambda b: b.x) for group in groups]

	    @staticmethod
	    def _css_font_family(font_name: str) -> str:
	        """Map a PDF font name to a CSS font-family list (Times by default)."""
	        lowered = font_name.lower()
	        if 'arial' in lowered:
	            return "Arial, sans-serif"
	        if 'helvetica' in lowered:
	            return "Helvetica, Arial, sans-serif"
	        if 'courier' in lowered:
	            return "'Courier New', monospace"
	        return "'Times New Roman', Times, serif"

	    def _render_text_block(self, block, tag: str, extra_classes=()) -> str:
	        """Render one text block as ``<tag class=... style=...>text</tag>``."""
	        from html import escape

	        # Escape first, then insert entities: escaping afterwards would turn
	        # "&sube;" into "&amp;sube;".
	        enhanced_text = self.enhance_math_symbols(escape(block.text, quote=False))

	        css_classes = ["text-block", *extra_classes]
	        if block.is_bold:
	            css_classes.append("bold")
	        if block.is_italic:
	            css_classes.append("italic")
	        if any(s in enhanced_text for s in self._MATH_MARKERS):
	            css_classes.append("math-symbol")

	        font_family = self._css_font_family(block.font_name)
	        font_size = max(block.font_size * 0.9, 10)

	        return f"""
	<{tag} class="{' '.join(css_classes)}" style="font-size: {font_size}px; font-family: {font_family};">
	{enhanced_text}
	</{tag}>"""

	    def _render_page(self, page: Dict[str, Any]) -> List[str]:
	        """Return the HTML fragments for one page entry, in output order."""
	        page_number = page["page_number"]
	        images = page["images"]
	        tables = page["tables"]
	        text_blocks = page["text_blocks"]
	        page_width = max(page["page_width"], self._DEFAULT_PAGE_WIDTH)
	        page_height = max(page["page_height"], self._DEFAULT_PAGE_HEIGHT)

	        parts = [f"""
	<div class="page-wrapper">
	<div class="page-header">
	Page {page_number} ({page_width:.0f}×{page_height:.0f}px) - Tables: {len(tables)}, Images: {len(images)}, Text Blocks: {len(text_blocks)}
	</div>
	<div class="content-layer">"""]

	        # Images first.
	        for img in images:
	            parts.append(f"""
	<div class="image-container">
	<img class="image" src="data:image/png;base64,{img['data']}"
	alt="Page {page_number} Image {img['index']}">
	</div>""")

	        # Then tables.
	        for table_idx, table in enumerate(tables):
	            print(f"🔄 Generating HTML for table {table_idx} (source: {table.get('source', 'unknown')})")
	            parts.append(self._generate_html_table(
	                table["data"],
	                header_rows=table.get("header_rows", 1),
	            ))

	        # Finally text: a lone block becomes a <div>; blocks sharing a group
	        # are rendered inline, left to right, inside one wrapper.
	        parts.append(' <div class="text-content">')
	        for group in self._group_overlapping_text(text_blocks):
	            if len(group) == 1:
	                block = group[0]
	                if block.text.strip():
	                    parts.append(self._render_text_block(block, "div"))
	                continue

	            parts.append(' <div class="text-group">')
	            # sorted() instead of list.sort(): do not reorder caller data.
	            for block in sorted(group, key=lambda b: b.x):
	                if block.text.strip():
	                    parts.append(self._render_text_block(block, "span", ("inline",)))
	            parts.append(' </div>')

	        # Close text-content, content-layer and page-wrapper.
	        parts.append(""" </div>
	</div>
	</div>""")
	        return parts

	    def convert_to_html(self, pdf_content: Dict[str, Any], output_path: str = None) -> str:
	        """Render extracted PDF content as a complete HTML document.

	        Args:
	            pdf_content: mapping with a ``"pages"`` list (entries as produced
	                by extract_pdf_content) and optionally ``"total_pages"``.
	            output_path: when given, the HTML is also written there (parent
	                directories are created). A write failure is reported but
	                not raised.

	        Returns:
	            The HTML document as a string.
	        """
	        html_content = [self._HTML_HEAD]

	        html_content.append(f"""
	<div class="document-info">
	<h1>PDF Document Conversion</h1>
	<p><strong>Total Pages:</strong> {pdf_content.get('total_pages', 'Unknown')}</p>
	<p><strong>Converted on:</strong> {self._get_current_timestamp()}</p>
	</div>""")

	        for page in pdf_content["pages"]:
	            html_content.extend(self._render_page(page))

	        html_content.append(" </div>")  # closes document-container
	        html_content.append("""
	</body>
	</html>""")
	        final_html = "\n".join(html_content)

	        if output_path:
	            try:
	                Path(output_path).parent.mkdir(parents=True, exist_ok=True)
	                with open(output_path, 'w', encoding='utf-8') as f:
	                    f.write(final_html)
	                print(f"✅ HTML saved to: {output_path}")
	            except Exception as e:
	                print(f"⚠️ Error saving HTML to {output_path}: {e}")

	        return final_html

	    def _get_current_timestamp(self) -> str:
	        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
	        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

	    def process_pdf(self, pdf_path: str, output_path: str = None, use_hf_models: bool = False) -> str:
	        """Extract ``pdf_path`` and convert it to HTML in one call.

	        Args:
	            pdf_path: path of the PDF to convert.
	            output_path: optional path the HTML is written to.
	            use_hf_models: when true and a token is configured, a notice is
	                printed; no model is actually called yet.

	        Returns:
	            The generated HTML document.

	        Raises:
	            FileNotFoundError: if ``pdf_path`` does not exist.
	            Exception: if content extraction fails.
	        """
	        print(f"🚀 Processing PDF: {pdf_path}")

	        if not os.path.exists(pdf_path):
	            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

	        print("📄 Extracting PDF content...")
	        pdf_content = self.extract_pdf_content(pdf_path)

	        if use_hf_models and self.hf_token:
	            # Placeholder: the model integration is not implemented.
	            print("🤖 Attempting to enhance with Hugging Face models...")
	            print("Note: Hugging Face model integration requires further implementation.")

	        print("🔄 Converting to HTML...")
	        html_content = self.convert_to_html(pdf_content, output_path)

	        print("✅ Processing complete!")
	        return html_content

	def main(pdf_path: str = "new-pdf.pdf", output_path: str = "sample_converted.html"):
	    """Convert one PDF to HTML and report the outcome on stdout.

	    The Hugging Face token is read from the ``HF_API_TOKEN`` environment
	    variable and passed to the converter; model enhancement stays disabled.

	    Args:
	        pdf_path: PDF to convert (defaults to the previously hard-coded file).
	        output_path: where the HTML is written (defaults to the previously
	            hard-coded file).
	    """
	    HF_TOKEN = os.getenv("HF_API_TOKEN")
	    converter = PDFToHTMLConverter(huggingface_token=HF_TOKEN)

	    try:
	        # The returned HTML is not needed here; it is written to output_path.
	        converter.process_pdf(
	            pdf_path=pdf_path,
	            output_path=output_path,
	            use_hf_models=False,
	        )
	    except FileNotFoundError as e:
	        print(f"❌ Error: {e}")
	        print("Please ensure the PDF file exists at the specified path.")
	    except Exception as e:
	        print(f"❌ An unexpected error occurred: {str(e)}")
	        import traceback
	        traceback.print_exc()
	    else:
	        print(f"✅ Successfully converted '{pdf_path}' to '{output_path}'")
	        print(f"🌐 Open '{output_path}' in your web browser to view the result!")


	# Run the conversion only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()