# Scraped page header: hellorahulk - "Fix Docling import and usage with DocumentConverter" - commit 5c197b6 - 3.57 kB
import os
from pathlib import Path
from typing import Optional, Dict, Any, Union
import magic
from docling.document_converter import DocumentConverter
from datetime import datetime
from .types import ParsedDocument, DocumentMetadata
from .exceptions import UnsupportedFormatError, ParseError
class DocumentParser:
    """
    A multiformat document parser using Docling.

    The file's MIME type is detected with ``magic`` and checked against
    ``SUPPORTED_FORMATS``; supported files are converted with a Docling
    ``DocumentConverter`` and returned as a ``ParsedDocument``.
    """

    # Maps a detected MIME type to the short file-type label stored in the
    # resulting DocumentMetadata.
    SUPPORTED_FORMATS = {
        'application/pdf': 'pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
        'text/plain': 'txt',
        'text/html': 'html',
        'text/markdown': 'md',
    }

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Create a parser.

        Args:
            config: Optional settings, stored on ``self.config`` (an empty
                dict when omitted). Nothing in this class reads it yet.
        """
        self.config = config or {}
        self.converter = DocumentConverter()

    def parse(self, file_path: Union[str, Path]) -> "ParsedDocument":
        """
        Parse a document file and return structured content.

        Args:
            file_path: Path to the document file

        Returns:
            ParsedDocument object containing parsed content and metadata

        Raises:
            FileNotFoundError: If the path does not exist
            UnsupportedFormatError: If the file format is not supported
            ParseError: If parsing fails
        """
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        mime_type = magic.from_file(str(file_path), mime=True)
        if mime_type not in self.SUPPORTED_FORMATS:
            raise UnsupportedFormatError(f"Unsupported file format: {mime_type}")

        try:
            # Get file metadata
            stats = file_path.stat()
            metadata = DocumentMetadata(
                filename=file_path.name,
                file_type=self.SUPPORTED_FORMATS[mime_type],
                size_bytes=stats.st_size,
                # NOTE: st_ctime is the creation time only on Windows; on
                # Unix it is the time of the last metadata change.
                created_at=datetime.fromtimestamp(stats.st_ctime),
                modified_at=datetime.fromtimestamp(stats.st_mtime),
                mime_type=mime_type,
            )

            # Parse document using Docling
            result = self.converter.convert(str(file_path))
            doc = result.document

            # Extract content and structure. Attributes the converted
            # document does not provide fall back to empty defaults.
            content = self._extract_text(doc)
            doc_metadata = getattr(doc, 'metadata', {})
            structured_content = {
                'sections': getattr(doc, 'sections', []),
                'paragraphs': getattr(doc, 'paragraphs', []),
                'entities': getattr(doc, 'entities', {}),
                'metadata': doc_metadata,
            }

            # Update metadata with document-specific information
            if doc_metadata:
                metadata.title = doc_metadata.get('title')
                metadata.author = doc_metadata.get('author')
                metadata.pages = doc_metadata.get('pages')
                metadata.extra.update(doc_metadata)

            return ParsedDocument(
                content=content,
                metadata=metadata,
                raw_text=getattr(doc, 'raw_text', None),
                structured_content=structured_content,
                confidence_score=getattr(doc, 'confidence', 1.0),
            )
        except Exception as e:
            raise ParseError(f"Failed to parse document: {str(e)}") from e

    @staticmethod
    def _extract_text(doc: Any) -> str:
        """
        Return the textual content of a converted document.

        A ``text`` attribute is used when the object has one. Otherwise the
        first available exporter method (``export_to_markdown``, then
        ``export_to_text``) is called; this is how the document returned by
        Docling's ``DocumentConverter`` is expected to expose its content.

        Raises:
            AttributeError: If the object offers none of these.
        """
        text = getattr(doc, 'text', None)
        if text is not None:
            return text
        for exporter_name in ('export_to_markdown', 'export_to_text'):
            exporter = getattr(doc, exporter_name, None)
            if callable(exporter):
                return exporter()
        raise AttributeError(
            "Converted document has no 'text' attribute and no "
            "export_to_markdown()/export_to_text() method"
        )

    def supports_format(self, mime_type: str) -> bool:
        """Check if a given MIME type is supported"""
        return mime_type in self.SUPPORTED_FORMATS