# Source: Hugging Face Space "amit01Xindus/new-project" (status: Sleeping)
# File size: 25,080 Bytes
# Revision: 96c003e

import os
import base64
import json
import requests
from typing import Dict, List, Any
import fitz  # PyMuPDF
from PIL import Image
import io
import re
from dataclasses import dataclass
from pathlib import Path
from datetime import datetime

@dataclass
class TextBlock:
    """A piece of text extracted from a PDF page together with its position and font styling."""
    text: str  # text content (the extractors in this file store it stripped)
    x: float  # first bounding-box coordinate (x0) of the text
    y: float  # second bounding-box coordinate (y0) of the text
    width: float  # bounding-box width (x1 - x0)
    height: float  # bounding-box height (y1 - y0), or the estimated line height in the simple extractor
    font_size: float  # font size reported for the span; the simple extractor uses 12
    font_name: str  # font name reported for the span; the simple extractor uses "Arial"
    is_bold: bool = False  # True when the font name or span flags indicate bold
    is_italic: bool = False  # True when the font name or span flags indicate italic
    block_id: str = ""  # identifier such as "p{page}-b{block}-l{line}-s{span}"

class PDFToHTMLConverter:
    def __init__(self, huggingface_token: str = None):
        self.hf_token = huggingface_token
        self.hf_headers = {
            "Authorization": f"Bearer {huggingface_token}" if huggingface_token else None
        }
        self.models = {
            "document_layout": "microsoft/layoutlm-base-uncased",
            "table_detection": "microsoft/table-transformer-detection",
            "ocr": "microsoft/trocr-base-printed",
            "math_detection": "facebook/detr-resnet-50"
        }
        self.hf_inference_url = "https://api-inference.huggingface.co/models"

    def pdf_to_base64(self, pdf_path: str) -> str:
        try:
            with open(pdf_path, "rb") as pdf_file:
                return base64.b64encode(pdf_file.read()).decode('utf-8')
        except Exception as e:
            raise Exception(f"Error converting PDF to base64: {str(e)}")

    def extract_pdf_content(self, pdf_path: str) -> Dict[str, Any]:
        doc = None
        try:
            if not os.path.exists(pdf_path):
                raise FileNotFoundError(f"PDF file not found: {pdf_path}")

            doc = fitz.open(pdf_path)

            if doc is None:
                raise RuntimeError("Failed to open PDF document")

            if doc.page_count == 0:
                raise ValueError("PDF document has no pages")

            print(f"📄 PDF opened successfully: {doc.page_count} pages")

            pages_content = []

            for page_num in range(doc.page_count):
                try:
                    page = doc[page_num]
                    print(f"🔄 Processing page {page_num + 1}/{doc.page_count}")

                    text_blocks = []
                    try:
                        page_dict = page.get_text("dict")
                        text_blocks = self._extract_text_blocks_from_dict(page_dict, page_num)
                    except Exception as e:
                        print(f"⚠️ Dict method failed for page {page_num + 1}, falling back to simple text extraction: {e}")
                        text_blocks = self._extract_text_blocks_simple(page, page_num)

                    images = self._extract_images_safely(page, doc, page_num)
                    tables = self._detect_tables_safely(page)

                    page_rect = page.rect

                    pages_content.append({
                        "page_number": page_num + 1,
                        "text_blocks": text_blocks,
                        "images": images,
                        "tables": tables,
                        "page_width": page_rect.width,
                        "page_height": page_rect.height
                    })

                except Exception as e:
                    print(f"❌ Error processing page {page_num + 1}: {e}")
                    pages_content.append({
                        "page_number": page_num + 1,
                        "text_blocks": [],
                        "images": [],
                        "tables": [],
                        "page_width": 595,
                        "page_height": 842
                    })

            result = {
                "pages": pages_content,
                "total_pages": doc.page_count
            }
            return result

        except Exception as e:
            raise Exception(f"Error extracting PDF content: {str(e)}")
        finally:
            if doc is not None:
                try:
                    doc.close()
                    print("✅ PDF document closed successfully")
                except Exception as e:
                    print(f"⚠️ Error closing PDF document: {e}")

    def _extract_text_blocks_from_dict(self, page_dict: dict, page_num: int) -> List[TextBlock]:
        text_blocks = []
        
        for block_idx, block in enumerate(page_dict.get("blocks", [])):
            if "lines" not in block:
                continue
                
            for line_idx, line in enumerate(block["lines"]):
                for span_idx, span in enumerate(line["spans"]):
                    text_content = span.get("text", "").strip()
                    if text_content:
                        bbox = span["bbox"]
                        font_info = {
                            "size": span.get("size", 12),
                            "font": span.get("font", "Arial"),
                            "is_bold": "bold" in span.get("font", "").lower() or span.get("flags", 0) & 16,
                            "is_italic": "italic" in span.get("font", "").lower() or span.get("flags", 0) & 2
                        }
                        
                        text_block = TextBlock(
                            text=text_content,
                            x=bbox[0],
                            y=bbox[1],
                            width=bbox[2] - bbox[0],
                            height=bbox[3] - bbox[1],
                            font_size=font_info["size"],
                            font_name=font_info["font"],
                            is_bold=font_info["is_bold"],
                            is_italic=font_info["is_italic"],
                            block_id=f"p{page_num}-b{block_idx}-l{line_idx}-s{span_idx}"
                        )
                        text_blocks.append(text_block)
        
        return text_blocks

    def _extract_text_blocks_simple(self, page, page_num: int) -> List[TextBlock]:
        text_blocks = []
        try:
            blocks_data = page.get_text("blocks")
            for block_idx, block in enumerate(blocks_data):
                if block[6] == 0:
                    text = block[4].strip()
                    if text:
                        x0, y0, x1, y1 = block[0], block[1], block[2], block[3]
                        
                        lines = text.split('\n')
                        line_height = (y1 - y0) / max(len(lines), 1)
                        
                        for line_idx, line in enumerate(lines):
                            if line.strip():
                                text_block = TextBlock(
                                    text=line.strip(),
                                    x=x0,
                                    y=y0 + (line_idx * line_height),
                                    width=x1 - x0,
                                    height=line_height,
                                    font_size=12,
                                    font_name="Arial",
                                    is_bold=False,
                                    is_italic=False,
                                    block_id=f"p{page_num}-simple-b{block_idx}-l{line_idx}"
                                )
                                text_blocks.append(text_block)
        except Exception as e:
            print(f"⚠️ Simple text block extraction failed: {e}")
        
        return text_blocks

    def _extract_images_safely(self, page, doc, page_num) -> List[Dict]:
        images = []
        try:
            image_list = page.get_images(full=True)
            for img_index, img_info in enumerate(image_list):
                try:
                    xref = img_info[0]
                    
                    img_rects = [r for r in page.get_image_rects(xref)]
                    if not img_rects:
                        continue
                    
                    bbox = img_rects[0]
                    
                    pix = fitz.Pixmap(doc, xref)
                    if pix.n - pix.alpha < 4:
                        img_data = pix.tobytes("png")
                        img_base64 = base64.b64encode(img_data).decode()

                        images.append({
                            "index": img_index,
                            "data": img_base64,
                            "bbox": (bbox.x0, bbox.y0, bbox.x1, bbox.y1)
                        })
                    pix = None
                except Exception as e:
                    print(f"⚠️ Error extracting image {img_index} on page {page_num+1}: {e}")
                    continue
        except Exception as e:
            print(f"⚠️ General error in image extraction for page {page_num+1}: {e}")
        return images

    def _detect_tables_safely(self, page) -> List[Dict]:
        tables = []
        try:
            tabs = page.find_tables()
            for tab_index, tab in enumerate(tabs):
                try:
                    table_data = tab.extract()
                    if table_data:
                        cleaned_data = []
                        for row in table_data:
                            cleaned_row = [str(cell).strip() if cell else "" for cell in row]
                            if any(cleaned_row):
                                cleaned_data.append(cleaned_row)
                        
                        if cleaned_data:
                            tables.append({
                                "bbox": (tab.bbox.x0, tab.bbox.y0, tab.bbox.x1, tab.bbox.y1),
                                "data": cleaned_data
                            })
                except Exception as e:
                    print(f"⚠️ Error extracting table {tab_index}: {e}")
                    continue
        except Exception as e:
            print(f"⚠️ General error in table detection: {e}")
        return tables

    def enhance_math_symbols(self, text: str) -> str:
        math_replacements = {
            '±': '&plusmn;', '×': '&times;', '÷': '&divide;', '∑': '&sum;',
            '∏': '&prod;', '√': '&radic;', '∞': '&infin;', '∫': '&int;',
            '∂': '&part;', '∆': '&Delta;', '∇': '&nabla;', '∈': '&isin;',
            '∉': '&notin;', '⊂': '&sub;', '⊃': '&sup;', '⊆': '&sube;',
            '⊇': '&supe;', '∪': '&cup;', '∩': '&cap;', '≤': '&le;',
            '≥': '&ge;', '≠': '&ne;', '≡': '&equiv;', '≈': '&asymp;',
            '∝': '&prop;', '∴': '&there4;',
            'α': '&alpha;', 'β': '&beta;', 'γ': '&gamma;', 'δ': '&delta;',
            'ε': '&epsilon;', 'ζ': '&zeta;', 'η': '&eta;', 'θ': '&theta;',
            'ι': '&iota;', 'κ': '&kappa;', 'λ': '&lambda;', 'μ': '&mu;',
            'ν': '&nu;', 'ξ': '&xi;', 'π': '&pi;', 'ρ': '&rho;', 'σ': '&sigma;',
            'τ': '&tau;', 'υ': '&upsilon;', 'φ': '&phi;', 'χ': '&chi;',
            'ψ': '&psi;', 'ω': '&omega;',
            '½': '&frac12;', '⅓': '&frac13;', '¼': '&frac14;', '⅔': '&frac23;',
            '¾': '&frac34;', '⅛': '&frac18;', '²': '&sup2;', '³': '&sup3;',
            '¹': '&sup1;', '°': '&deg;'
        }

        for symbol, html_entity in math_replacements.items():
            text = text.replace(symbol, html_entity)
        
        return text

    def convert_to_html(self, pdf_content: Dict[str, Any], output_path: str = None) -> str:
            html_content = []
            html_content.append("""<!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>PDF Document</title>
        <style>
            * {
                box-sizing: border-box;
                margin: 0;
                padding: 0;
            }
            
            body {
                font-family: 'Times New Roman', Times, serif;
                background-color: #f5f5f5;
                padding: 20px;
                line-height: 1.2;
                color: #000000;
            }
            
            .document-container {
                max-width: 1200px;
                margin: 0 auto;
                background-color: white;
                box-shadow: 0 4px 12px rgba(0,0,0,0.1);
                border: 1px solid #ddd;
            }
            
            .page-wrapper {
                background-color: white;
                margin: 0;
                padding: 40px;
                border-bottom: 2px solid #000;
                position: relative;
                min-height: 800px;
                page-break-after: always;
                overflow: visible;
            }
            
            .page-header {
                background-color: #f8f8f8;
                padding: 10px 15px;
                margin: -40px -40px 30px -40px;
                border-bottom: 2px solid #000;
                font-weight: bold;
                color: #000;
                font-size: 14px;
                text-align: center;
            }
            
            .content-layer {
                position: relative;
                width: 100%;
                height: 100%;
            }
            
            .text-content {
                position: relative;
                z-index: 10;
                line-height: 1.4;
            }
            
            .text-block {
                margin-bottom: 8px;
                font-family: 'Times New Roman', Times, serif;
                color: #000;
                word-wrap: break-word;
                overflow-wrap: break-word;
            }
            
            .text-block.inline {
                display: inline;
                margin-bottom: 0;
                margin-right: 5px;
            }
            
            .text-group {
                margin-bottom: 12px;
                line-height: 1.3;
            }
            
            .bold { 
                font-weight: bold; 
            }
            
            .italic { 
                font-style: italic; 
            }
            
            .table-container {
                margin: 20px 0;
                background-color: white;
                overflow: auto;
                z-index: 20;
                box-shadow: 0 2px 4px rgba(0,0,0,0.1);
            }
            
            .table {
                width: 100%;
                border-collapse: collapse;
                border: 2px solid #000;
                font-family: 'Times New Roman', Times, serif;
                font-size: 12px;
                color: #000;
                background-color: white;
                margin: 0;
            }
            
            .table td, .table th {
                border: 1px solid #000;
                padding: 8px 12px;
                text-align: left;
                vertical-align: top;
                background-color: white;
                font-family: 'Times New Roman', Times, serif;
                word-wrap: break-word;
                min-width: 60px;
            }
            
            .table th {
                background-color: #f0f0f0;
                font-weight: bold;
                text-align: center;
            }
            
            .table tr:nth-child(even) td {
                background-color: #f9f9f9;
            }
            
            .table tr:hover td {
                background-color: #f0f0f0;
            }
            
            .image-container {
                margin: 15px 0;
                border: 1px solid #ccc;
                background-color: white;
                text-align: center;
                overflow: hidden;
                z-index: 5;
            }
            
            .image {
                max-width: 100%;
                height: auto;
                display: block;
                margin: 0 auto;
            }
            
            .math-symbol {
                font-family: 'Times New Roman', serif;
            }
            
            .document-info {
                background-color: #f8f8f8;
                padding: 15px;
                border: 1px solid #ccc;
                margin-bottom: 20px;
                text-align: center;
                font-family: 'Times New Roman', Times, serif;
            }
            
            @media print {
                body { 
                    background-color: white; 
                    padding: 0;
                }
                .page-wrapper { 
                    border: none; 
                    box-shadow: none; 
                    margin: 0;
                    page-break-after: always;
                }
                .document-info {
                    display: none;
                }
                .table {
                    border: 2px solid #000 !important;
                }
                .table td, .table th {
                    border: 1px solid #000 !important;
                }
            }
        </style>
    </head>
    <body>
        <div class="document-container">""")

            html_content.append(f"""
            <div class="document-info">
                <h1>PDF Document Conversion</h1>
                <p><strong>Total Pages:</strong> {pdf_content.get('total_pages', 'Unknown')}</p>
                <p><strong>Converted on:</strong> {self._get_current_timestamp()}</p>
            </div>""")

            for page in pdf_content["pages"]:
                page_width = max(page["page_width"], 595)
                page_height = max(page["page_height"], 842)

                html_content.append(f"""
            <div class="page-wrapper">
                <div class="page-header">
                    Page {page["page_number"]} ({page_width:.0f}×{page_height:.0f}px) - Tables: {len(page["tables"])}, Images: {len(page["images"])}, Text Blocks: {len(page["text_blocks"])}
                </div>
                <div class="content-layer">""")

                # Add images first
                for img in page["images"]:
                    html_content.append(f"""
                    <div class="image-container">
                        <img class="image" src="data:image/png;base64,{img['data']}"
                            alt="Page {page['page_number']} Image {img['index']}">
                    </div>""")

                # Add tables with improved generation
                for table_idx, table in enumerate(page["tables"]):
                    print(f"🔄 Generating HTML for table {table_idx} (source: {table.get('source', 'unknown')})")
                    html_content.append(self._generate_html_table(
                        table["data"],
                        header_rows=table.get("header_rows", 1)
                    ))

                # Add text content (non-overlapping groups)
                text_groups = self._group_overlapping_text(page["text_blocks"])
                
                html_content.append('                <div class="text-content">')
                
                for group in text_groups:
                    if len(group) == 1:
                        block = group[0]
                        if block.text.strip():
                            enhanced_text = self.enhance_math_symbols(block.text)
                            enhanced_text = enhanced_text.replace('<', '&lt;').replace('>', '&gt;')

                            css_classes = ["text-block"]
                            if block.is_bold:
                                css_classes.append("bold")
                            if block.is_italic:
                                css_classes.append("italic")
                            if any(s in enhanced_text for s in ['&alpha;', '&beta;', '&gamma;', '&sum;', '&int;']):
                                css_classes.append("math-symbol")

                            font_family = "'Times New Roman', Times, serif"
                            if 'arial' in block.font_name.lower():
                                font_family = "Arial, sans-serif"
                            elif 'helvetica' in block.font_name.lower():
                                font_family = "Helvetica, Arial, sans-serif"
                            elif 'courier' in block.font_name.lower():
                                font_family = "'Courier New', monospace"

                            font_size = max(block.font_size * 0.9, 10)

                            html_content.append(f"""
                        <div class="{' '.join(css_classes)}" style="font-size: {font_size}px; font-family: {font_family};">
                            {enhanced_text}
                        </div>""")
                    else:
                        group.sort(key=lambda b: b.x)
                        html_content.append('                    <div class="text-group">')
                        
                        for block in group:
                            if block.text.strip():
                                enhanced_text = self.enhance_math_symbols(block.text)
                                enhanced_text = enhanced_text.replace('<', '&lt;').replace('>', '&gt;')

                                css_classes = ["text-block", "inline"]
                                if block.is_bold:
                                    css_classes.append("bold")
                                if block.is_italic:
                                    css_classes.append("italic")
                                if any(s in enhanced_text for s in ['&alpha;', '&beta;', '&gamma;', '&sum;', '&int;']):
                                    css_classes.append("math-symbol")

                                font_family = "'Times New Roman', Times, serif"
                                if 'arial' in block.font_name.lower():
                                    font_family = "Arial, sans-serif"
                                elif 'helvetica' in block.font_name.lower():
                                    font_family = "Helvetica, Arial, sans-serif"
                                elif 'courier' in block.font_name.lower():
                                    font_family = "'Courier New', monospace"

                                font_size = max(block.font_size * 0.9, 10)

                                html_content.append(f"""
                            <span class="{' '.join(css_classes)}" style="font-size: {font_size}px; font-family: {font_family};">
                                {enhanced_text}
                            </span>""")
                        
                        html_content.append('                    </div>')

                html_content.append("""                </div>
                </div>
            </div>""")

            html_content.append("    </div>")
            html_content.append("""
    </body>
    </html>""")
            final_html = "\n".join(html_content)

            if output_path:
                try:
                    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
                    with open(output_path, 'w', encoding='utf-8') as f:
                        f.write(final_html)
                    print(f"✅ HTML saved to: {output_path}")
                except Exception as e:
                    print(f"⚠️ Error saving HTML to {output_path}: {e}")

            return final_html

    def _get_current_timestamp(self) -> str:
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def process_pdf(self, pdf_path: str, output_path: str = None, use_hf_models: bool = False) -> str:
        print(f"🚀 Processing PDF: {pdf_path}")

        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        print("📄 Extracting PDF content...")
        pdf_content = self.extract_pdf_content(pdf_path)

        if use_hf_models and self.hf_token:
            print("🤖 Attempting to enhance with Hugging Face models...")
            try:
                print("Note: Hugging Face model integration requires further implementation.")
            except Exception as e:
                print(f"⚠️ Hugging Face enhancement failed: {e}")

        print("🔄 Converting to HTML...")
        html_content = self.convert_to_html(pdf_content, output_path)

        print("✅ Processing complete!")
        return html_content

def main():
    """Convert the hard-coded sample PDF to HTML and report the outcome on stdout.

    The Hugging Face token is read from the ``HF_API_TOKEN`` environment
    variable; model enhancement is switched off for this run.
    """
    source_pdf = "new-pdf.pdf"
    target_html = "sample_converted.html"

    api_token = os.environ.get("HF_API_TOKEN")
    converter = PDFToHTMLConverter(huggingface_token=api_token)

    try:
        converter.process_pdf(
            pdf_path=source_pdf,
            output_path=target_html,
            use_hf_models=False,
        )

        print(f"✅ Successfully converted '{source_pdf}' to '{target_html}'")
        print(f"🌐 Open '{target_html}' in your web browser to view the result!")

    except FileNotFoundError as missing:
        print(f"❌ Error: {missing}")
        print("Please ensure the PDF file exists at the specified path.")
    except Exception as failure:
        print(f"❌ An unexpected error occurred: {str(failure)}")
        # Imported here because it is only needed on the failure path.
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()