import os
import shutil
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Optional, Dict, Any, Union

import magic
from docling.document_converter import DocumentConverter

from .types import ParsedDocument, DocumentMetadata
from .exceptions import UnsupportedFormatError, ParseError


class DocumentParser:
    """
    A multiformat document parser using Docling.

    Each input file is copied into a private temporary directory under a
    name carrying a recognised extension, converted with a
    ``DocumentConverter``, and returned as a ``ParsedDocument``.
    """

    # MIME type -> short format name used for ``DocumentMetadata.file_type``
    # and for the extension given to the temporary copy.
    SUPPORTED_FORMATS = {
        'application/pdf': 'pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
        'text/plain': 'txt',
        'text/html': 'html',
        'text/markdown': 'md',
        # Add common variations
        'application/x-pdf': 'pdf',
        'application/acrobat': 'pdf',
        'application/msword': 'docx',
        'text/x-markdown': 'md',
        'text/x-html': 'html',
    }

    # File extension (lower-case, with leading dot) -> canonical MIME type.
    EXTENSION_TO_MIME = {
        '.pdf': 'application/pdf',
        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        '.txt': 'text/plain',
        '.html': 'text/html',
        '.htm': 'text/html',
        '.md': 'text/markdown',
        '.markdown': 'text/markdown',
    }

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Create a parser.

        Args:
            config: Optional settings dictionary; stored on ``self.config``
                (an empty dict when omitted).
        """
        self.config = config or {}
        self.converter = DocumentConverter()
        # Create a temporary directory for processing files
        self.temp_dir = Path(tempfile.mkdtemp(prefix="dockling_"))

    def __del__(self):
        """Cleanup temporary directory on object destruction"""
        # ``temp_dir`` is missing if ``__init__`` failed before creating it.
        if hasattr(self, 'temp_dir') and self.temp_dir.exists():
            shutil.rmtree(self.temp_dir, ignore_errors=True)

    def _validate_and_copy_file(self, file_path: Union[str, Path]) -> Path:
        """
        Validate file and copy to temporary location with correct extension.

        The format is taken from the file extension when it is listed in
        ``EXTENSION_TO_MIME``; otherwise the MIME type is sniffed with
        ``magic`` and mapped to an extension through ``SUPPORTED_FORMATS``.

        Args:
            file_path: Path to the document file.

        Returns:
            Path of the copy inside ``self.temp_dir``.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            UnsupportedFormatError: If neither the extension nor the sniffed
                MIME type is supported.
        """
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Try to determine format from extension first
        extension = file_path.suffix.lower()
        mime_type = self.EXTENSION_TO_MIME.get(extension)

        # If extension not recognized, use magic
        if not mime_type:
            mime_type = magic.from_file(str(file_path), mime=True)
            if mime_type not in self.SUPPORTED_FORMATS:
                # Sorted so the message is stable from run to run.
                supported = ', '.join(sorted(set(self.SUPPORTED_FORMATS.values())))
                raise UnsupportedFormatError(
                    f"Unsupported file format: {mime_type}. "
                    f"Supported formats are: {supported}"
                )
            extension = f".{self.SUPPORTED_FORMATS[mime_type]}"

        # Copy file to temp directory with correct extension.
        # NOTE: the name is fixed per extension, so overlapping parse() calls
        # on the same instance would write to the same temporary path.
        temp_file = self.temp_dir / f"doc{extension}"
        shutil.copy2(file_path, temp_file)
        return temp_file

    def parse(self, file_path: Union[str, Path]) -> ParsedDocument:
        """
        Parse a document file and return structured content

        Args:
            file_path: Path to the document file

        Returns:
            ParsedDocument object containing parsed content and metadata

        Raises:
            UnsupportedFormatError: If the file format is not supported
            ParseError: If parsing fails for any other reason (including a
                missing input file)
        """
        # Initialised up front so the ``finally`` clause can tell whether a
        # temporary copy was ever created.
        temp_file: Optional[Path] = None
        try:
            source_path = Path(file_path)

            # Validate and prepare file
            temp_file = self._validate_and_copy_file(source_path)

            # Take size and timestamps from the original file: the temporary
            # copy's st_ctime only records when the copy was made.
            stats = source_path.stat()

            mime_type = magic.from_file(str(temp_file), mime=True)
            if mime_type not in self.SUPPORTED_FORMATS:
                # Content sniffing can disagree with an extension that was
                # already accepted; fall back to the canonical MIME type for
                # the temp file's suffix, which always has an entry in
                # EXTENSION_TO_MIME.
                mime_type = self.EXTENSION_TO_MIME[temp_file.suffix]

            metadata = DocumentMetadata(
                filename=source_path.name,  # Use original filename
                file_type=self.SUPPORTED_FORMATS[mime_type],
                size_bytes=stats.st_size,
                created_at=datetime.fromtimestamp(stats.st_ctime),
                modified_at=datetime.fromtimestamp(stats.st_mtime),
                mime_type=mime_type,
            )

            try:
                # Parse document using Docling
                result = self.converter.convert(str(temp_file))
                doc = result.document

                # Extract content using proper methods
                try:
                    content = doc.export_to_text()
                except Exception as e:
                    raise ParseError(
                        f"Failed to extract text content: {str(e)}"
                    ) from e

                # Extract structured content; attributes the document object
                # does not provide default to empty containers.
                structured_content = {
                    'sections': getattr(doc, 'sections', []),
                    'paragraphs': getattr(doc, 'paragraphs', []),
                    'entities': getattr(doc, 'entities', {}),
                    'metadata': getattr(doc, 'metadata', {}),
                }

                # Get raw text if available. Best effort: any failure of the
                # layout-aware export falls back to the plain content.
                try:
                    raw_text = doc.export_to_text(include_layout=True)
                except Exception:
                    raw_text = content

                # Update metadata with document-specific information
                doc_metadata = getattr(doc, 'metadata', None)
                if doc_metadata:
                    metadata.title = doc_metadata.get('title')
                    metadata.author = doc_metadata.get('author')
                    metadata.pages = doc_metadata.get('pages')
                    metadata.extra.update(doc_metadata)

                return ParsedDocument(
                    content=content,
                    metadata=metadata,
                    raw_text=raw_text,
                    structured_content=structured_content,
                    confidence_score=getattr(doc, 'confidence', 1.0),
                )
            except Exception as e:
                raise ParseError(f"Failed to parse document: {str(e)}") from e

        except (ParseError, UnsupportedFormatError):
            # Already one of the documented error types: propagate unchanged
            # instead of re-wrapping.
            raise
        except Exception as e:
            raise ParseError(str(e)) from e
        finally:
            # Cleanup temporary files (best effort; the whole directory is
            # removed again in __del__).
            if temp_file is not None:
                try:
                    temp_file.unlink()
                except OSError:
                    pass

    def supports_format(self, mime_type: str) -> bool:
        """Check if a given MIME type is supported"""
        return mime_type in self.SUPPORTED_FORMATS