# PDF-to-HTML converter script (Hugging Face Space).
# NOTE: the original file was recovered from a web scrape; the Space status
# badge text ("Spaces: Sleeping") that preceded the code has been removed.
# Standard library
import base64
import io
import json
import os
import re
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List

# Third-party
import requests
import fitz  # PyMuPDF
from PIL import Image
@dataclass
class TextBlock:
    """A single positioned span of text extracted from a PDF page.

    Coordinates are in PDF points with the origin at the top-left of the
    page. The rest of the converter constructs instances with keyword
    arguments, so the @dataclass decorator (previously missing) is required
    to generate the corresponding __init__.
    """

    text: str
    x: float           # left edge
    y: float           # top edge
    width: float
    height: float
    font_size: float
    font_name: str
    is_bold: bool = False
    is_italic: bool = False
    block_id: str = ""  # stable identifier, e.g. "p0-b1-l2-s3"
class PDFToHTMLConverter:
    """Converts a PDF file into a standalone HTML document.

    Text, images and tables are extracted with PyMuPDF; optional Hugging
    Face inference models can be configured for layout enhancement
    (currently a stub in :meth:`process_pdf`).
    """

    def __init__(self, huggingface_token: str = None):
        """
        Args:
            huggingface_token: optional Hugging Face API token. When omitted,
                no Authorization header is prepared (previously the header
                dict carried an explicit ``None`` value).
        """
        self.hf_token = huggingface_token
        # Only attach the Authorization header when a token was supplied.
        self.hf_headers = (
            {"Authorization": f"Bearer {huggingface_token}"}
            if huggingface_token
            else {}
        )
        # Model ids that would be used if HF enhancement were implemented.
        self.models = {
            "document_layout": "microsoft/layoutlm-base-uncased",
            "table_detection": "microsoft/table-transformer-detection",
            "ocr": "microsoft/trocr-base-printed",
            "math_detection": "facebook/detr-resnet-50"
        }
        self.hf_inference_url = "https://api-inference.huggingface.co/models"
def pdf_to_base64(self, pdf_path: str) -> str: | |
try: | |
with open(pdf_path, "rb") as pdf_file: | |
return base64.b64encode(pdf_file.read()).decode('utf-8') | |
except Exception as e: | |
raise Exception(f"Error converting PDF to base64: {str(e)}") | |
    def extract_pdf_content(self, pdf_path: str) -> Dict[str, Any]:
        """Extract text blocks, images and tables from every page of a PDF.

        Returns:
            Dict with "pages" (list of per-page dicts carrying text_blocks,
            images, tables and page dimensions) and "total_pages".

        Raises:
            Exception: wrapping any fatal extraction error (missing file,
            unopenable or empty document).
        """
        doc = None
        try:
            if not os.path.exists(pdf_path):
                raise FileNotFoundError(f"PDF file not found: {pdf_path}")
            doc = fitz.open(pdf_path)
            if doc is None:
                raise RuntimeError("Failed to open PDF document")
            if doc.page_count == 0:
                raise ValueError("PDF document has no pages")
            # NOTE(review): the "π"/"β" prefixes in prints throughout are
            # mojibake of emoji from a bad encoding round-trip.
            print(f"π PDF opened successfully: {doc.page_count} pages")
            pages_content = []
            for page_num in range(doc.page_count):
                try:
                    page = doc[page_num]
                    print(f"π Processing page {page_num + 1}/{doc.page_count}")
                    text_blocks = []
                    # Prefer the rich "dict" extraction (fonts, bboxes,
                    # spans); fall back to plain block extraction on failure.
                    try:
                        page_dict = page.get_text("dict")
                        text_blocks = self._extract_text_blocks_from_dict(page_dict, page_num)
                    except Exception as e:
                        print(f"β οΈ Dict method failed for page {page_num + 1}, falling back to simple text extraction: {e}")
                        text_blocks = self._extract_text_blocks_simple(page, page_num)
                    images = self._extract_images_safely(page, doc, page_num)
                    tables = self._detect_tables_safely(page)
                    page_rect = page.rect
                    pages_content.append({
                        "page_number": page_num + 1,
                        "text_blocks": text_blocks,
                        "images": images,
                        "tables": tables,
                        "page_width": page_rect.width,
                        "page_height": page_rect.height
                    })
                except Exception as e:
                    # Keep going on per-page failure: emit an empty A4-sized
                    # placeholder (595x842 pt) so page numbering stays intact.
                    print(f"β Error processing page {page_num + 1}: {e}")
                    pages_content.append({
                        "page_number": page_num + 1,
                        "text_blocks": [],
                        "images": [],
                        "tables": [],
                        "page_width": 595,
                        "page_height": 842
                    })
            result = {
                "pages": pages_content,
                "total_pages": doc.page_count
            }
            return result
        except Exception as e:
            raise Exception(f"Error extracting PDF content: {str(e)}")
        finally:
            # Always close the document, even on error paths.
            if doc is not None:
                try:
                    doc.close()
                    print("β PDF document closed successfully")
                except Exception as e:
                    print(f"β οΈ Error closing PDF document: {e}")
def _extract_text_blocks_from_dict(self, page_dict: dict, page_num: int) -> List[TextBlock]: | |
text_blocks = [] | |
for block_idx, block in enumerate(page_dict.get("blocks", [])): | |
if "lines" not in block: | |
continue | |
for line_idx, line in enumerate(block["lines"]): | |
for span_idx, span in enumerate(line["spans"]): | |
text_content = span.get("text", "").strip() | |
if text_content: | |
bbox = span["bbox"] | |
font_info = { | |
"size": span.get("size", 12), | |
"font": span.get("font", "Arial"), | |
"is_bold": "bold" in span.get("font", "").lower() or span.get("flags", 0) & 16, | |
"is_italic": "italic" in span.get("font", "").lower() or span.get("flags", 0) & 2 | |
} | |
text_block = TextBlock( | |
text=text_content, | |
x=bbox[0], | |
y=bbox[1], | |
width=bbox[2] - bbox[0], | |
height=bbox[3] - bbox[1], | |
font_size=font_info["size"], | |
font_name=font_info["font"], | |
is_bold=font_info["is_bold"], | |
is_italic=font_info["is_italic"], | |
block_id=f"p{page_num}-b{block_idx}-l{line_idx}-s{span_idx}" | |
) | |
text_blocks.append(text_block) | |
return text_blocks | |
def _extract_text_blocks_simple(self, page, page_num: int) -> List[TextBlock]: | |
text_blocks = [] | |
try: | |
blocks_data = page.get_text("blocks") | |
for block_idx, block in enumerate(blocks_data): | |
if block[6] == 0: | |
text = block[4].strip() | |
if text: | |
x0, y0, x1, y1 = block[0], block[1], block[2], block[3] | |
lines = text.split('\n') | |
line_height = (y1 - y0) / max(len(lines), 1) | |
for line_idx, line in enumerate(lines): | |
if line.strip(): | |
text_block = TextBlock( | |
text=line.strip(), | |
x=x0, | |
y=y0 + (line_idx * line_height), | |
width=x1 - x0, | |
height=line_height, | |
font_size=12, | |
font_name="Arial", | |
is_bold=False, | |
is_italic=False, | |
block_id=f"p{page_num}-simple-b{block_idx}-l{line_idx}" | |
) | |
text_blocks.append(text_block) | |
except Exception as e: | |
print(f"β οΈ Simple text block extraction failed: {e}") | |
return text_blocks | |
def _extract_images_safely(self, page, doc, page_num) -> List[Dict]: | |
images = [] | |
try: | |
image_list = page.get_images(full=True) | |
for img_index, img_info in enumerate(image_list): | |
try: | |
xref = img_info[0] | |
img_rects = [r for r in page.get_image_rects(xref)] | |
if not img_rects: | |
continue | |
bbox = img_rects[0] | |
pix = fitz.Pixmap(doc, xref) | |
if pix.n - pix.alpha < 4: | |
img_data = pix.tobytes("png") | |
img_base64 = base64.b64encode(img_data).decode() | |
images.append({ | |
"index": img_index, | |
"data": img_base64, | |
"bbox": (bbox.x0, bbox.y0, bbox.x1, bbox.y1) | |
}) | |
pix = None | |
except Exception as e: | |
print(f"β οΈ Error extracting image {img_index} on page {page_num+1}: {e}") | |
continue | |
except Exception as e: | |
print(f"β οΈ General error in image extraction for page {page_num+1}: {e}") | |
return images | |
def _detect_tables_safely(self, page) -> List[Dict]: | |
tables = [] | |
try: | |
tabs = page.find_tables() | |
for tab_index, tab in enumerate(tabs): | |
try: | |
table_data = tab.extract() | |
if table_data: | |
cleaned_data = [] | |
for row in table_data: | |
cleaned_row = [str(cell).strip() if cell else "" for cell in row] | |
if any(cleaned_row): | |
cleaned_data.append(cleaned_row) | |
if cleaned_data: | |
tables.append({ | |
"bbox": (tab.bbox.x0, tab.bbox.y0, tab.bbox.x1, tab.bbox.y1), | |
"data": cleaned_data | |
}) | |
except Exception as e: | |
print(f"β οΈ Error extracting table {tab_index}: {e}") | |
continue | |
except Exception as e: | |
print(f"β οΈ General error in table detection: {e}") | |
return tables | |
def enhance_math_symbols(self, text: str) -> str: | |
math_replacements = { | |
'Β±': '±', 'Γ': '×', 'Γ·': '÷', 'β': '∑', | |
'β': '∏', 'β': '√', 'β': '∞', 'β«': '∫', | |
'β': '∂', 'β': 'Δ', 'β': '∇', 'β': '∈', | |
'β': '∉', 'β': '⊂', 'β': '⊃', 'β': '⊆', | |
'β': '⊇', 'βͺ': '∪', 'β©': '∩', 'β€': '≤', | |
'β₯': '≥', 'β ': '≠', 'β‘': '≡', 'β': '≈', | |
'β': '∝', 'β΄': '∴', | |
'Ξ±': 'α', 'Ξ²': 'β', 'Ξ³': 'γ', 'Ξ΄': 'δ', | |
'Ξ΅': 'ε', 'ΞΆ': 'ζ', 'Ξ·': 'η', 'ΞΈ': 'θ', | |
'ΞΉ': 'ι', 'ΞΊ': 'κ', 'Ξ»': 'λ', 'ΞΌ': 'μ', | |
'Ξ½': 'ν', 'ΞΎ': 'ξ', 'Ο': 'π', 'Ο': 'ρ', 'Ο': 'σ', | |
'Ο': 'τ', 'Ο ': 'υ', 'Ο': 'φ', 'Ο': 'χ', | |
'Ο': 'ψ', 'Ο': 'ω', | |
'Β½': '½', 'β ': '⅓', 'ΒΌ': '¼', 'β ': '⅔', | |
'ΒΎ': '¾', 'β ': '⅛', 'Β²': '²', 'Β³': '³', | |
'ΒΉ': '¹', 'Β°': '°' | |
} | |
for symbol, html_entity in math_replacements.items(): | |
text = text.replace(symbol, html_entity) | |
return text | |
def convert_to_html(self, pdf_content: Dict[str, Any], output_path: str = None) -> str: | |
html_content = [] | |
html_content.append("""<!DOCTYPE html> | |
<html lang="en"> | |
<head> | |
<meta charset="UTF-8"> | |
<meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
<title>PDF Document</title> | |
<style> | |
* { | |
box-sizing: border-box; | |
margin: 0; | |
padding: 0; | |
} | |
body { | |
font-family: 'Times New Roman', Times, serif; | |
background-color: #f5f5f5; | |
padding: 20px; | |
line-height: 1.2; | |
color: #000000; | |
} | |
.document-container { | |
max-width: 1200px; | |
margin: 0 auto; | |
background-color: white; | |
box-shadow: 0 4px 12px rgba(0,0,0,0.1); | |
border: 1px solid #ddd; | |
} | |
.page-wrapper { | |
background-color: white; | |
margin: 0; | |
padding: 40px; | |
border-bottom: 2px solid #000; | |
position: relative; | |
min-height: 800px; | |
page-break-after: always; | |
overflow: visible; | |
} | |
.page-header { | |
background-color: #f8f8f8; | |
padding: 10px 15px; | |
margin: -40px -40px 30px -40px; | |
border-bottom: 2px solid #000; | |
font-weight: bold; | |
color: #000; | |
font-size: 14px; | |
text-align: center; | |
} | |
.content-layer { | |
position: relative; | |
width: 100%; | |
height: 100%; | |
} | |
.text-content { | |
position: relative; | |
z-index: 10; | |
line-height: 1.4; | |
} | |
.text-block { | |
margin-bottom: 8px; | |
font-family: 'Times New Roman', Times, serif; | |
color: #000; | |
word-wrap: break-word; | |
overflow-wrap: break-word; | |
} | |
.text-block.inline { | |
display: inline; | |
margin-bottom: 0; | |
margin-right: 5px; | |
} | |
.text-group { | |
margin-bottom: 12px; | |
line-height: 1.3; | |
} | |
.bold { | |
font-weight: bold; | |
} | |
.italic { | |
font-style: italic; | |
} | |
.table-container { | |
margin: 20px 0; | |
background-color: white; | |
overflow: auto; | |
z-index: 20; | |
box-shadow: 0 2px 4px rgba(0,0,0,0.1); | |
} | |
.table { | |
width: 100%; | |
border-collapse: collapse; | |
border: 2px solid #000; | |
font-family: 'Times New Roman', Times, serif; | |
font-size: 12px; | |
color: #000; | |
background-color: white; | |
margin: 0; | |
} | |
.table td, .table th { | |
border: 1px solid #000; | |
padding: 8px 12px; | |
text-align: left; | |
vertical-align: top; | |
background-color: white; | |
font-family: 'Times New Roman', Times, serif; | |
word-wrap: break-word; | |
min-width: 60px; | |
} | |
.table th { | |
background-color: #f0f0f0; | |
font-weight: bold; | |
text-align: center; | |
} | |
.table tr:nth-child(even) td { | |
background-color: #f9f9f9; | |
} | |
.table tr:hover td { | |
background-color: #f0f0f0; | |
} | |
.image-container { | |
margin: 15px 0; | |
border: 1px solid #ccc; | |
background-color: white; | |
text-align: center; | |
overflow: hidden; | |
z-index: 5; | |
} | |
.image { | |
max-width: 100%; | |
height: auto; | |
display: block; | |
margin: 0 auto; | |
} | |
.math-symbol { | |
font-family: 'Times New Roman', serif; | |
} | |
.document-info { | |
background-color: #f8f8f8; | |
padding: 15px; | |
border: 1px solid #ccc; | |
margin-bottom: 20px; | |
text-align: center; | |
font-family: 'Times New Roman', Times, serif; | |
} | |
@media print { | |
body { | |
background-color: white; | |
padding: 0; | |
} | |
.page-wrapper { | |
border: none; | |
box-shadow: none; | |
margin: 0; | |
page-break-after: always; | |
} | |
.document-info { | |
display: none; | |
} | |
.table { | |
border: 2px solid #000 !important; | |
} | |
.table td, .table th { | |
border: 1px solid #000 !important; | |
} | |
} | |
</style> | |
</head> | |
<body> | |
<div class="document-container">""") | |
html_content.append(f""" | |
<div class="document-info"> | |
<h1>PDF Document Conversion</h1> | |
<p><strong>Total Pages:</strong> {pdf_content.get('total_pages', 'Unknown')}</p> | |
<p><strong>Converted on:</strong> {self._get_current_timestamp()}</p> | |
</div>""") | |
for page in pdf_content["pages"]: | |
page_width = max(page["page_width"], 595) | |
page_height = max(page["page_height"], 842) | |
html_content.append(f""" | |
<div class="page-wrapper"> | |
<div class="page-header"> | |
Page {page["page_number"]} ({page_width:.0f}Γ{page_height:.0f}px) - Tables: {len(page["tables"])}, Images: {len(page["images"])}, Text Blocks: {len(page["text_blocks"])} | |
</div> | |
<div class="content-layer">""") | |
# Add images first | |
for img in page["images"]: | |
html_content.append(f""" | |
<div class="image-container"> | |
<img class="image" src="data:image/png;base64,{img['data']}" | |
alt="Page {page['page_number']} Image {img['index']}"> | |
</div>""") | |
# Add tables with improved generation | |
for table_idx, table in enumerate(page["tables"]): | |
print(f"π Generating HTML for table {table_idx} (source: {table.get('source', 'unknown')})") | |
html_content.append(self._generate_html_table( | |
table["data"], | |
header_rows=table.get("header_rows", 1) | |
)) | |
# Add text content (non-overlapping groups) | |
text_groups = self._group_overlapping_text(page["text_blocks"]) | |
html_content.append(' <div class="text-content">') | |
for group in text_groups: | |
if len(group) == 1: | |
block = group[0] | |
if block.text.strip(): | |
enhanced_text = self.enhance_math_symbols(block.text) | |
enhanced_text = enhanced_text.replace('<', '<').replace('>', '>') | |
css_classes = ["text-block"] | |
if block.is_bold: | |
css_classes.append("bold") | |
if block.is_italic: | |
css_classes.append("italic") | |
if any(s in enhanced_text for s in ['α', 'β', 'γ', '∑', '∫']): | |
css_classes.append("math-symbol") | |
font_family = "'Times New Roman', Times, serif" | |
if 'arial' in block.font_name.lower(): | |
font_family = "Arial, sans-serif" | |
elif 'helvetica' in block.font_name.lower(): | |
font_family = "Helvetica, Arial, sans-serif" | |
elif 'courier' in block.font_name.lower(): | |
font_family = "'Courier New', monospace" | |
font_size = max(block.font_size * 0.9, 10) | |
html_content.append(f""" | |
<div class="{' '.join(css_classes)}" style="font-size: {font_size}px; font-family: {font_family};"> | |
{enhanced_text} | |
</div>""") | |
else: | |
group.sort(key=lambda b: b.x) | |
html_content.append(' <div class="text-group">') | |
for block in group: | |
if block.text.strip(): | |
enhanced_text = self.enhance_math_symbols(block.text) | |
enhanced_text = enhanced_text.replace('<', '<').replace('>', '>') | |
css_classes = ["text-block", "inline"] | |
if block.is_bold: | |
css_classes.append("bold") | |
if block.is_italic: | |
css_classes.append("italic") | |
if any(s in enhanced_text for s in ['α', 'β', 'γ', '∑', '∫']): | |
css_classes.append("math-symbol") | |
font_family = "'Times New Roman', Times, serif" | |
if 'arial' in block.font_name.lower(): | |
font_family = "Arial, sans-serif" | |
elif 'helvetica' in block.font_name.lower(): | |
font_family = "Helvetica, Arial, sans-serif" | |
elif 'courier' in block.font_name.lower(): | |
font_family = "'Courier New', monospace" | |
font_size = max(block.font_size * 0.9, 10) | |
html_content.append(f""" | |
<span class="{' '.join(css_classes)}" style="font-size: {font_size}px; font-family: {font_family};"> | |
{enhanced_text} | |
</span>""") | |
html_content.append(' </div>') | |
html_content.append(""" </div> | |
</div> | |
</div>""") | |
html_content.append(" </div>") | |
html_content.append(""" | |
</body> | |
</html>""") | |
final_html = "\n".join(html_content) | |
if output_path: | |
try: | |
Path(output_path).parent.mkdir(parents=True, exist_ok=True) | |
with open(output_path, 'w', encoding='utf-8') as f: | |
f.write(final_html) | |
print(f"β HTML saved to: {output_path}") | |
except Exception as e: | |
print(f"β οΈ Error saving HTML to {output_path}: {e}") | |
return final_html | |
def _get_current_timestamp(self) -> str: | |
return datetime.now().strftime("%Y-%m-%d %H:%M:%S") | |
def process_pdf(self, pdf_path: str, output_path: str = None, use_hf_models: bool = False) -> str: | |
print(f"π Processing PDF: {pdf_path}") | |
if not os.path.exists(pdf_path): | |
raise FileNotFoundError(f"PDF file not found: {pdf_path}") | |
print("π Extracting PDF content...") | |
pdf_content = self.extract_pdf_content(pdf_path) | |
if use_hf_models and self.hf_token: | |
print("π€ Attempting to enhance with Hugging Face models...") | |
try: | |
print("Note: Hugging Face model integration requires further implementation.") | |
except Exception as e: | |
print(f"β οΈ Hugging Face enhancement failed: {e}") | |
print("π Converting to HTML...") | |
html_content = self.convert_to_html(pdf_content, output_path) | |
print("β Processing complete!") | |
return html_content | |
def main():
    """CLI entry point: convert a hard-coded sample PDF to HTML."""
    converter = PDFToHTMLConverter(huggingface_token=os.getenv("HF_API_TOKEN"))
    pdf_path = "new-pdf.pdf"
    output_path = "sample_converted.html"
    try:
        converter.process_pdf(
            pdf_path=pdf_path,
            output_path=output_path,
            use_hf_models=False
        )
        print(f"β Successfully converted '{pdf_path}' to '{output_path}'")
        print(f"π Open '{output_path}' in your web browser to view the result!")
    except FileNotFoundError as e:
        print(f"β Error: {e}")
        print("Please ensure the PDF file exists at the specified path.")
    except Exception as e:
        print(f"β An unexpected error occurred: {str(e)}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()