# new-project / pdf_html.py
# amit01Xindus's picture
# Upload 8 files
# 96c003e verified
import base64
import html
import io
import json
import os
import re
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any

import fitz # PyMuPDF
import requests
from PIL import Image
@dataclass
class TextBlock:
    """One positioned run of text taken from a PDF page.

    The converter fills ``x``/``y``/``width``/``height`` from the bounding
    box reported by the PDF library and uses the font fields to choose CSS
    styling when the block is rendered.
    """

    # The text content itself (already stripped by the extractors).
    text: str

    # Geometry: top-left corner plus extent of the bounding box.
    x: float
    y: float
    width: float
    height: float

    # Typography as reported for the run (or defaults in the fallback path).
    font_size: float
    font_name: str
    is_bold: bool = False
    is_italic: bool = False

    # Identifier assembled by the extractor from page/block/line/span indices.
    block_id: str = ""
class PDFToHTMLConverter:
    """Convert a PDF file into a single self-contained HTML document.

    The pipeline is: ``extract_pdf_content`` (text runs, images and tables
    per page, via PyMuPDF) followed by ``convert_to_html`` (one HTML page
    with embedded CSS and base64 images).  ``process_pdf`` chains the two.
    Progress and non-fatal problems are reported with ``print``.
    """

    # Symbol -> HTML entity table used by ``enhance_math_symbols``.  Keys are
    # written as escapes because several symbols have look-alikes (e.g.
    # U+2206 INCREMENT vs. U+0394 GREEK DELTA) and the table must match the
    # exact code points.
    _MATH_ENTITIES = {
        # Operators and relations
        "\u00b1": "&plusmn;",   # plus-minus
        "\u00d7": "&times;",    # multiplication sign
        "\u00f7": "&divide;",   # division sign
        "\u2211": "&sum;",      # n-ary summation
        "\u220f": "&prod;",     # n-ary product
        "\u221a": "&radic;",    # square root
        "\u221e": "&infin;",    # infinity
        "\u222b": "&int;",      # integral
        "\u2202": "&part;",     # partial differential
        "\u2206": "&Delta;",    # increment
        "\u2207": "&nabla;",    # nabla
        "\u2208": "&isin;",     # element of
        "\u2209": "&notin;",    # not an element of
        "\u2282": "&sub;",      # subset of
        "\u2283": "&sup;",      # superset of
        "\u2286": "&sube;",     # subset of or equal to
        "\u2287": "&supe;",     # superset of or equal to
        "\u222a": "&cup;",      # union
        "\u2229": "&cap;",      # intersection
        "\u2264": "&le;",       # less-than or equal to
        "\u2265": "&ge;",       # greater-than or equal to
        "\u2260": "&ne;",       # not equal to
        "\u2261": "&equiv;",    # identical to
        "\u2248": "&asymp;",    # almost equal to
        "\u221d": "&prop;",     # proportional to
        "\u2234": "&there4;",   # therefore
        # Lower-case Greek letters
        "\u03b1": "&alpha;",
        "\u03b2": "&beta;",
        "\u03b3": "&gamma;",
        "\u03b4": "&delta;",
        "\u03b5": "&epsilon;",
        "\u03b6": "&zeta;",
        "\u03b7": "&eta;",
        "\u03b8": "&theta;",
        "\u03b9": "&iota;",
        "\u03ba": "&kappa;",
        "\u03bb": "&lambda;",
        "\u03bc": "&mu;",
        "\u03bd": "&nu;",
        "\u03be": "&xi;",
        "\u03c0": "&pi;",
        "\u03c1": "&rho;",
        "\u03c3": "&sigma;",
        "\u03c4": "&tau;",
        "\u03c5": "&upsilon;",
        "\u03c6": "&phi;",
        "\u03c7": "&chi;",
        "\u03c8": "&psi;",
        "\u03c9": "&omega;",
        # Fractions, superscripts, degree
        "\u00bd": "&frac12;",
        "\u2153": "&frac13;",
        "\u00bc": "&frac14;",
        "\u2154": "&frac23;",
        "\u00be": "&frac34;",
        "\u215b": "&frac18;",
        "\u00b2": "&sup2;",
        "\u00b3": "&sup3;",
        "\u00b9": "&sup1;",
        "\u00b0": "&deg;",
    }
    # Every key is a single code point, so one translate() pass replaces the
    # chain of str.replace() calls.
    _MATH_TRANSLATION = str.maketrans(_MATH_ENTITIES)

    # A block whose rendered text contains one of these gets the
    # "math-symbol" CSS class.
    _MATH_CLASS_MARKERS = ('&alpha;', '&beta;', '&gamma;', '&sum;', '&int;')

    # (substring of the lower-cased PDF font name, CSS font-family), checked
    # in order; anything else falls back to _DEFAULT_FONT_FAMILY.
    _FONT_FAMILIES = (
        ("arial", "Arial, sans-serif"),
        ("helvetica", "Helvetica, Arial, sans-serif"),
        ("courier", "'Courier New', monospace"),
    )
    _DEFAULT_FONT_FAMILY = "'Times New Roman', Times, serif"

    # Page size substituted when a page cannot be processed, and the minimum
    # size shown in the page header.
    _FALLBACK_PAGE_WIDTH = 595
    _FALLBACK_PAGE_HEIGHT = 842

    # Static document prologue: <head>, stylesheet, and the opening of the
    # document container.  Emitted verbatim as the first fragment.
    _HTML_HEAD = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>PDF Document</title>
<style>
* {
box-sizing: border-box;
margin: 0;
padding: 0;
}
body {
font-family: 'Times New Roman', Times, serif;
background-color: #f5f5f5;
padding: 20px;
line-height: 1.2;
color: #000000;
}
.document-container {
max-width: 1200px;
margin: 0 auto;
background-color: white;
box-shadow: 0 4px 12px rgba(0,0,0,0.1);
border: 1px solid #ddd;
}
.page-wrapper {
background-color: white;
margin: 0;
padding: 40px;
border-bottom: 2px solid #000;
position: relative;
min-height: 800px;
page-break-after: always;
overflow: visible;
}
.page-header {
background-color: #f8f8f8;
padding: 10px 15px;
margin: -40px -40px 30px -40px;
border-bottom: 2px solid #000;
font-weight: bold;
color: #000;
font-size: 14px;
text-align: center;
}
.content-layer {
position: relative;
width: 100%;
height: 100%;
}
.text-content {
position: relative;
z-index: 10;
line-height: 1.4;
}
.text-block {
margin-bottom: 8px;
font-family: 'Times New Roman', Times, serif;
color: #000;
word-wrap: break-word;
overflow-wrap: break-word;
}
.text-block.inline {
display: inline;
margin-bottom: 0;
margin-right: 5px;
}
.text-group {
margin-bottom: 12px;
line-height: 1.3;
}
.bold {
font-weight: bold;
}
.italic {
font-style: italic;
}
.table-container {
margin: 20px 0;
background-color: white;
overflow: auto;
z-index: 20;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.table {
width: 100%;
border-collapse: collapse;
border: 2px solid #000;
font-family: 'Times New Roman', Times, serif;
font-size: 12px;
color: #000;
background-color: white;
margin: 0;
}
.table td, .table th {
border: 1px solid #000;
padding: 8px 12px;
text-align: left;
vertical-align: top;
background-color: white;
font-family: 'Times New Roman', Times, serif;
word-wrap: break-word;
min-width: 60px;
}
.table th {
background-color: #f0f0f0;
font-weight: bold;
text-align: center;
}
.table tr:nth-child(even) td {
background-color: #f9f9f9;
}
.table tr:hover td {
background-color: #f0f0f0;
}
.image-container {
margin: 15px 0;
border: 1px solid #ccc;
background-color: white;
text-align: center;
overflow: hidden;
z-index: 5;
}
.image {
max-width: 100%;
height: auto;
display: block;
margin: 0 auto;
}
.math-symbol {
font-family: 'Times New Roman', serif;
}
.document-info {
background-color: #f8f8f8;
padding: 15px;
border: 1px solid #ccc;
margin-bottom: 20px;
text-align: center;
font-family: 'Times New Roman', Times, serif;
}
@media print {
body {
background-color: white;
padding: 0;
}
.page-wrapper {
border: none;
box-shadow: none;
margin: 0;
page-break-after: always;
}
.document-info {
display: none;
}
.table {
border: 2px solid #000 !important;
}
.table td, .table th {
border: 1px solid #000 !important;
}
}
</style>
</head>
<body>
<div class="document-container">"""

    def __init__(self, huggingface_token: str = None):
        """Store the optional Hugging Face token and model configuration.

        Args:
            huggingface_token: API token; ``None`` disables the (currently
                unimplemented) Hugging Face enhancement step.
        """
        self.hf_token = huggingface_token
        # NOTE(review): without a token the header value is None rather than
        # the key being absent; kept as-is because the attribute is public.
        self.hf_headers = {
            "Authorization": f"Bearer {huggingface_token}" if huggingface_token else None
        }
        self.models = {
            "document_layout": "microsoft/layoutlm-base-uncased",
            "table_detection": "microsoft/table-transformer-detection",
            "ocr": "microsoft/trocr-base-printed",
            "math_detection": "facebook/detr-resnet-50"
        }
        self.hf_inference_url = "https://api-inference.huggingface.co/models"

    def pdf_to_base64(self, pdf_path: str) -> str:
        """Return the contents of ``pdf_path`` as a base64-encoded string.

        Raises:
            Exception: if the file cannot be read; the original error is
                attached as ``__cause__``.
        """
        try:
            with open(pdf_path, "rb") as pdf_file:
                return base64.b64encode(pdf_file.read()).decode('utf-8')
        except Exception as e:
            raise Exception(f"Error converting PDF to base64: {str(e)}") from e

    def extract_pdf_content(self, pdf_path: str) -> Dict[str, Any]:
        """Extract text blocks, images and tables from every page of a PDF.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            ``{"pages": [...], "total_pages": int}`` where each page entry
            has the keys ``page_number`` (1-based), ``text_blocks``,
            ``images``, ``tables``, ``page_width`` and ``page_height``.

        Raises:
            Exception: on any failure (missing file, unreadable or empty
                document); the original error is attached as ``__cause__``.
        """
        doc = None
        try:
            if not os.path.exists(pdf_path):
                raise FileNotFoundError(f"PDF file not found: {pdf_path}")
            doc = fitz.open(pdf_path)
            if doc is None:
                raise RuntimeError("Failed to open PDF document")
            if doc.page_count == 0:
                raise ValueError("PDF document has no pages")
            print(f"📄 PDF opened successfully: {doc.page_count} pages")
            pages_content = [
                self._extract_page(doc, page_num)
                for page_num in range(doc.page_count)
            ]
            return {
                "pages": pages_content,
                "total_pages": doc.page_count
            }
        except Exception as e:
            raise Exception(f"Error extracting PDF content: {str(e)}") from e
        finally:
            if doc is not None:
                try:
                    doc.close()
                    print("✅ PDF document closed successfully")
                except Exception as e:
                    print(f"⚠️ Error closing PDF document: {e}")

    def _extract_page(self, doc, page_num: int) -> Dict[str, Any]:
        """Extract one page; on failure return an empty page entry instead of raising."""
        try:
            page = doc[page_num]
            print(f"🔄 Processing page {page_num + 1}/{doc.page_count}")
            try:
                page_dict = page.get_text("dict")
                text_blocks = self._extract_text_blocks_from_dict(page_dict, page_num)
            except Exception as e:
                print(f"⚠️ Dict method failed for page {page_num + 1}, falling back to simple text extraction: {e}")
                text_blocks = self._extract_text_blocks_simple(page, page_num)
            images = self._extract_images_safely(page, doc, page_num)
            tables = self._detect_tables_safely(page)
            page_rect = page.rect
            return {
                "page_number": page_num + 1,
                "text_blocks": text_blocks,
                "images": images,
                "tables": tables,
                "page_width": page_rect.width,
                "page_height": page_rect.height
            }
        except Exception as e:
            # Best effort: keep the page slot so page numbering stays intact.
            print(f"❌ Error processing page {page_num + 1}: {e}")
            return {
                "page_number": page_num + 1,
                "text_blocks": [],
                "images": [],
                "tables": [],
                "page_width": self._FALLBACK_PAGE_WIDTH,
                "page_height": self._FALLBACK_PAGE_HEIGHT
            }

    def _extract_text_blocks_from_dict(self, page_dict: dict, page_num: int) -> List["TextBlock"]:
        """Build one ``TextBlock`` per non-empty span of a ``get_text("dict")`` result.

        Blocks without a ``"lines"`` key are skipped.  Bold/italic are
        detected from the font name or from the span flag bits (16 and 2).
        """
        text_blocks = []
        for block_idx, block in enumerate(page_dict.get("blocks", [])):
            if "lines" not in block:
                continue
            for line_idx, line in enumerate(block["lines"]):
                for span_idx, span in enumerate(line.get("spans", [])):
                    text_content = span.get("text", "").strip()
                    if not text_content:
                        continue
                    x0, y0, x1, y1 = span["bbox"][:4]
                    font_name = span.get("font", "Arial")
                    font_lower = span.get("font", "").lower()
                    flags = span.get("flags", 0)
                    text_blocks.append(TextBlock(
                        text=text_content,
                        x=x0,
                        y=y0,
                        width=x1 - x0,
                        height=y1 - y0,
                        font_size=span.get("size", 12),
                        font_name=font_name,
                        # bool(): the raw expression could yield the int 16 / 2.
                        is_bold=bool("bold" in font_lower or flags & 16),
                        is_italic=bool("italic" in font_lower or flags & 2),
                        block_id=f"p{page_num}-b{block_idx}-l{line_idx}-s{span_idx}"
                    ))
        return text_blocks

    def _extract_text_blocks_simple(self, page, page_num: int) -> List["TextBlock"]:
        """Fallback extraction from ``page.get_text("blocks")``.

        Each text block (type field ``0``) is split on newlines and its
        height is divided evenly between the lines; font information is not
        available here, so 12pt Arial is assumed.  Errors are printed and
        whatever was collected so far is returned.
        """
        text_blocks = []
        try:
            for block_idx, block in enumerate(page.get_text("blocks")):
                if block[6] != 0:
                    continue
                text = block[4].strip()
                if not text:
                    continue
                x0, y0, x1, y1 = block[0], block[1], block[2], block[3]
                lines = text.split('\n')
                line_height = (y1 - y0) / max(len(lines), 1)
                for line_idx, line in enumerate(lines):
                    stripped = line.strip()
                    if not stripped:
                        continue
                    text_blocks.append(TextBlock(
                        text=stripped,
                        x=x0,
                        y=y0 + (line_idx * line_height),
                        width=x1 - x0,
                        height=line_height,
                        font_size=12,
                        font_name="Arial",
                        is_bold=False,
                        is_italic=False,
                        block_id=f"p{page_num}-simple-b{block_idx}-l{line_idx}"
                    ))
        except Exception as e:
            print(f"⚠️ Simple text block extraction failed: {e}")
        return text_blocks

    def _extract_images_safely(self, page, doc, page_num) -> List[Dict]:
        """Return the page's images as base64 PNG data with their bounding boxes.

        Each entry is ``{"index", "data", "bbox"}``.  Images without a
        placement rectangle are skipped; a failure on one image is printed
        and does not affect the others.
        """
        images = []
        try:
            for img_index, img_info in enumerate(page.get_images(full=True)):
                pix = None
                try:
                    xref = img_info[0]
                    img_rects = list(page.get_image_rects(xref))
                    if not img_rects:
                        continue
                    bbox = img_rects[0]
                    pix = fitz.Pixmap(doc, xref)
                    if pix.n - pix.alpha >= 4:
                        # CMYK cannot be written as PNG; convert to RGB
                        # instead of silently dropping the image.
                        pix = fitz.Pixmap(fitz.csRGB, pix)
                    img_data = pix.tobytes("png")
                    images.append({
                        "index": img_index,
                        "data": base64.b64encode(img_data).decode(),
                        "bbox": (bbox.x0, bbox.y0, bbox.x1, bbox.y1)
                    })
                except Exception as e:
                    print(f"⚠️ Error extracting image {img_index} on page {page_num+1}: {e}")
                finally:
                    pix = None  # drop the pixmap reference promptly
        except Exception as e:
            print(f"⚠️ General error in image extraction for page {page_num+1}: {e}")
        return images

    def _detect_tables_safely(self, page) -> List[Dict]:
        """Return the tables found by ``page.find_tables()``.

        Each entry is ``{"bbox": (x0, y0, x1, y1), "data": rows}`` where
        cells are stripped strings, empty cells become ``""`` and rows that
        are entirely empty are removed.  Errors are printed, not raised.
        """
        tables = []
        try:
            found = page.find_tables()
            # TableFinder exposes its results as ``.tables``; accept a plain
            # iterable as well.
            for tab_index, tab in enumerate(getattr(found, "tables", found)):
                try:
                    table_data = tab.extract()
                    if not table_data:
                        continue
                    cleaned_data = []
                    for row in table_data:
                        cleaned_row = [str(cell).strip() if cell else "" for cell in row]
                        if any(cleaned_row):
                            cleaned_data.append(cleaned_row)
                    if cleaned_data:
                        tables.append({
                            # Table.bbox is a plain (x0, y0, x1, y1) tuple,
                            # not a Rect, so attribute access (.x0) fails and
                            # the table would be dropped by the except below.
                            "bbox": tuple(tab.bbox),
                            "data": cleaned_data
                        })
                except Exception as e:
                    print(f"⚠️ Error extracting table {tab_index}: {e}")
        except Exception as e:
            print(f"⚠️ General error in table detection: {e}")
        return tables

    def enhance_math_symbols(self, text: str) -> str:
        """Replace math symbols, Greek letters and fractions with HTML entities.

        Characters not listed in ``_MATH_ENTITIES`` are returned unchanged.
        """
        return text.translate(self._MATH_TRANSLATION)

    def _escape_text(self, text: str) -> str:
        """Escape ``text`` for use as HTML element content.

        Markup characters are escaped first so that the ``&...;`` entities
        inserted by ``enhance_math_symbols`` afterwards stay intact.
        """
        return self.enhance_math_symbols(html.escape(text, quote=False))

    def _group_overlapping_text(self, text_blocks: List["TextBlock"]) -> List[List["TextBlock"]]:
        """Group text blocks that share a line, in reading order.

        NOTE(review): ``convert_to_html`` called this method but the class
        did not define it; this implementation is reconstructed from the
        call site (a list of groups, single-block groups rendered as
        paragraphs, larger groups rendered inline left-to-right).

        Blocks are visited top-to-bottom; a block joins the current group
        when its vertical midpoint lies inside the vertical extent of the
        block that started the group.  Each group is sorted by ``x``.
        """
        groups: List[List[Any]] = []
        line_top = line_bottom = 0.0
        for block in sorted(text_blocks, key=lambda b: (b.y, b.x)):
            midpoint = block.y + block.height / 2
            if groups and line_top <= midpoint <= line_bottom:
                groups[-1].append(block)
            else:
                groups.append([block])
                line_top, line_bottom = block.y, block.y + block.height
        for group in groups:
            group.sort(key=lambda b: b.x)
        return groups

    def _generate_html_table(self, table_data: List[List[Any]], header_rows: int = 1) -> str:
        """Render rows of cells as an HTML table inside a ``table-container`` div.

        NOTE(review): ``convert_to_html`` called this method but the class
        did not define it; this implementation is reconstructed from the
        call site and the ``.table`` / ``.table-container`` CSS rules.

        Args:
            table_data: Rows of cell values; ``None`` cells render empty and
                short rows are padded to the widest row.
            header_rows: Number of leading rows rendered with ``<th>``.

        Returns:
            The table markup, or ``""`` when ``table_data`` is empty.
        """
        if not table_data:
            return ""
        column_count = max(len(row) for row in table_data)
        header_count = header_rows or 0
        parts = ['<div class="table-container">', '<table class="table">']
        for row_idx, row in enumerate(table_data):
            tag = "th" if row_idx < header_count else "td"
            padded = list(row) + [""] * (column_count - len(row))
            cells = []
            for cell in padded:
                cell_text = "" if cell is None else str(cell)
                cells.append(f"<{tag}>{self._escape_text(cell_text)}</{tag}>")
            parts.append("<tr>" + "".join(cells) + "</tr>")
        parts.append("</table>")
        parts.append("</div>")
        return "\n".join(parts)

    def _render_text_block(self, block, tag: str, inline: bool) -> str:
        """Render one text block as a styled ``<div>`` or ``<span>`` fragment."""
        rendered_text = self._escape_text(block.text)
        css_classes = ["text-block"]
        if inline:
            css_classes.append("inline")
        if block.is_bold:
            css_classes.append("bold")
        if block.is_italic:
            css_classes.append("italic")
        if any(marker in rendered_text for marker in self._MATH_CLASS_MARKERS):
            css_classes.append("math-symbol")

        font_lower = block.font_name.lower()
        font_family = self._DEFAULT_FONT_FAMILY
        for needle, family in self._FONT_FAMILIES:
            if needle in font_lower:
                font_family = family
                break

        # Slightly shrink PDF point sizes for screen, but never below 10px.
        font_size = max(block.font_size * 0.9, 10)
        class_attr = ' '.join(css_classes)
        return (
            f'\n<{tag} class="{class_attr}" style="font-size: {font_size}px; font-family: {font_family};">\n'
            f"{rendered_text}\n"
            f"</{tag}>"
        )

    def _render_page(self, page: Dict[str, Any]) -> List[str]:
        """Return the HTML fragments for one page: header, images, tables, then text."""
        page_number = page["page_number"]
        page_width = max(page["page_width"], self._FALLBACK_PAGE_WIDTH)
        page_height = max(page["page_height"], self._FALLBACK_PAGE_HEIGHT)
        images = page["images"]
        tables = page["tables"]
        text_blocks = page["text_blocks"]

        fragments = [f"""
<div class="page-wrapper">
<div class="page-header">
Page {page_number} ({page_width:.0f}×{page_height:.0f}px) - Tables: {len(tables)}, Images: {len(images)}, Text Blocks: {len(text_blocks)}
</div>
<div class="content-layer">"""]

        # Images first
        for img in images:
            fragments.append(f"""
<div class="image-container">
<img class="image" src="data:image/png;base64,{img['data']}"
alt="Page {page_number} Image {img['index']}">
</div>""")

        # Tables
        for table_idx, table in enumerate(tables):
            print(f"🔄 Generating HTML for table {table_idx} (source: {table.get('source', 'unknown')})")
            fragments.append(self._generate_html_table(
                table["data"],
                header_rows=table.get("header_rows", 1)
            ))

        # Text: lone blocks become paragraphs, same-line groups become
        # inline spans ordered left to right.
        fragments.append(' <div class="text-content">')
        for group in self._group_overlapping_text(text_blocks):
            if len(group) == 1:
                if group[0].text.strip():
                    fragments.append(self._render_text_block(group[0], "div", inline=False))
                continue
            fragments.append(' <div class="text-group">')
            for block in sorted(group, key=lambda b: b.x):
                if block.text.strip():
                    fragments.append(self._render_text_block(block, "span", inline=True))
            fragments.append(' </div>')
        fragments.append(""" </div>
</div>
</div>""")
        return fragments

    def convert_to_html(self, pdf_content: Dict[str, Any], output_path: str = None) -> str:
        """Render extracted PDF content as one HTML document.

        Args:
            pdf_content: Mapping with a ``"pages"`` list (entries shaped as
                produced by ``extract_pdf_content``) and optionally
                ``"total_pages"``.
            output_path: When given, the HTML is also written there as
                UTF-8 (parent directories are created).  A write failure is
                printed and does not prevent the HTML from being returned.

        Returns:
            The complete HTML document as a string.
        """
        html_content = [self._HTML_HEAD]
        html_content.append(f"""
<div class="document-info">
<h1>PDF Document Conversion</h1>
<p><strong>Total Pages:</strong> {pdf_content.get('total_pages', 'Unknown')}</p>
<p><strong>Converted on:</strong> {self._get_current_timestamp()}</p>
</div>""")
        for page in pdf_content["pages"]:
            html_content.extend(self._render_page(page))
        html_content.append(" </div>")
        html_content.append("""
</body>
</html>""")
        final_html = "\n".join(html_content)

        if output_path:
            try:
                Path(output_path).parent.mkdir(parents=True, exist_ok=True)
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(final_html)
                print(f"✅ HTML saved to: {output_path}")
            except Exception as e:
                print(f"⚠️ Error saving HTML to {output_path}: {e}")
        return final_html

    def _get_current_timestamp(self) -> str:
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def process_pdf(self, pdf_path: str, output_path: str = None, use_hf_models: bool = False) -> str:
        """Extract ``pdf_path`` and convert it to HTML.

        Args:
            pdf_path: PDF file to convert.
            output_path: Optional path the HTML is written to.
            use_hf_models: Reserved for Hugging Face enhancement; currently
                only prints a notice when a token is configured.

        Returns:
            The generated HTML document.

        Raises:
            FileNotFoundError: if ``pdf_path`` does not exist.
            Exception: if extraction fails.
        """
        print(f"🚀 Processing PDF: {pdf_path}")
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")
        print("📄 Extracting PDF content...")
        pdf_content = self.extract_pdf_content(pdf_path)
        if use_hf_models and self.hf_token:
            print("🤖 Attempting to enhance with Hugging Face models...")
            print("Note: Hugging Face model integration requires further implementation.")
        print("🔄 Converting to HTML...")
        html_content = self.convert_to_html(pdf_content, output_path)
        print("✅ Processing complete!")
        return html_content
def main():
    """Convert the sample PDF to HTML and report the outcome on stdout.

    The Hugging Face token is read from the ``HF_API_TOKEN`` environment
    variable; input and output paths are fixed.  Errors are printed (with a
    traceback for unexpected ones) rather than propagated.
    """
    hf_token = os.getenv("HF_API_TOKEN")
    converter = PDFToHTMLConverter(huggingface_token=hf_token)
    pdf_path = "new-pdf.pdf"
    output_path = "sample_converted.html"
    try:
        # The returned HTML is not needed here; it is written to output_path.
        converter.process_pdf(
            pdf_path=pdf_path,
            output_path=output_path,
            use_hf_models=False
        )
        print(f"✅ Successfully converted '{pdf_path}' to '{output_path}'")
        print(f"🌐 Open '{output_path}' in your web browser to view the result!")
    except FileNotFoundError as e:
        print(f"❌ Error: {e}")
        print("Please ensure the PDF file exists at the specified path.")
    except Exception as e:
        print(f"❌ An unexpected error occurred: {str(e)}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()