Spaces:

hellorahulk
/

docling_free

Running

App Files Files Community

docling_free / dockling_parser /parser.py

hellorahulk

Improve error handling and file processing

fdbfd73 5 months ago

raw

history blame contribute delete

6.34 kB

	import os
	from pathlib import Path
	from typing import Optional, Dict, Any, Union
	import magic
	from docling.document_converter import DocumentConverter
	from datetime import datetime
	import shutil
	import tempfile

	from .types import ParsedDocument, DocumentMetadata
	from .exceptions import UnsupportedFormatError, ParseError

	class DocumentParser:
	    """
	    A multiformat document parser using Docling.

	    Each input file is copied into a per-instance scratch directory under a
	    canonical extension, converted with ``DocumentConverter``, and returned
	    as a ``ParsedDocument`` holding the extracted text and file metadata.
	    """

	    # MIME type -> short format name stored in ``DocumentMetadata.file_type``
	    # and used as the extension of the scratch copy.
	    SUPPORTED_FORMATS = {
	        'application/pdf': 'pdf',
	        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
	        'text/plain': 'txt',
	        'text/html': 'html',
	        'text/markdown': 'md',
	        # Add common variations
	        'application/x-pdf': 'pdf',
	        'application/acrobat': 'pdf',
	        # NOTE(review): application/msword is the legacy .doc type; it is
	        # staged as .docx here - confirm the converter accepts such files.
	        'application/msword': 'docx',
	        'text/x-markdown': 'md',
	        'text/x-html': 'html'
	    }

	    # Lower-cased file extension -> MIME type. Every value is a key of
	    # SUPPORTED_FORMATS, and every SUPPORTED_FORMATS value (with a leading
	    # dot) is a key here; _detect_mime_type relies on that.
	    EXTENSION_TO_MIME = {
	        '.pdf': 'application/pdf',
	        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
	        '.txt': 'text/plain',
	        '.html': 'text/html',
	        '.htm': 'text/html',
	        '.md': 'text/markdown',
	        '.markdown': 'text/markdown'
	    }

	    def __init__(self, config: Optional[Dict[str, Any]] = None):
	        """
	        Create a parser with its own converter and scratch directory.

	        Args:
	            config: Optional settings dictionary, stored on the instance
	                unchanged (an empty dict when omitted or falsy).
	        """
	        self.config = config or {}
	        self.converter = DocumentConverter()
	        # Create a temporary directory for processing files
	        self.temp_dir = Path(tempfile.mkdtemp(prefix="dockling_"))

	    def __del__(self):
	        """Cleanup temporary directory on object destruction"""
	        # __init__ may have failed before temp_dir was assigned, so do not
	        # assume the attribute exists.
	        temp_dir = getattr(self, 'temp_dir', None)
	        if temp_dir is not None:
	            shutil.rmtree(temp_dir, ignore_errors=True)

	    def _validate_and_copy_file(self, file_path: Union[str, Path]) -> Path:
	        """
	        Validate file and copy to temporary location with correct extension

	        The extension is trusted when it appears in EXTENSION_TO_MIME;
	        otherwise the content is sniffed with libmagic and the extension is
	        derived from the detected MIME type.

	        Args:
	            file_path: Path to the document file

	        Returns:
	            Path of the copy inside ``self.temp_dir`` (named ``doc<ext>``)

	        Raises:
	            FileNotFoundError: If ``file_path`` does not exist
	            UnsupportedFormatError: If neither the extension nor the sniffed
	                MIME type is supported
	        """
	        file_path = Path(file_path)
	        if not file_path.exists():
	            raise FileNotFoundError(f"File not found: {file_path}")

	        # Try to determine format from extension first
	        extension = file_path.suffix.lower()

	        # If extension not recognized, use magic
	        if extension not in self.EXTENSION_TO_MIME:
	            mime_type = magic.from_file(str(file_path), mime=True)
	            if mime_type not in self.SUPPORTED_FORMATS:
	                # Sorted so the message is identical from run to run.
	                supported = ', '.join(sorted(set(self.SUPPORTED_FORMATS.values())))
	                raise UnsupportedFormatError(
	                    f"Unsupported file format: {mime_type}. "
	                    f"Supported formats are: {supported}"
	                )
	            extension = f".{self.SUPPORTED_FORMATS[mime_type]}"

	        # Copy file to temp directory with correct extension
	        temp_file = self.temp_dir / f"doc{extension}"
	        shutil.copy2(file_path, temp_file)
	        return temp_file

	    def _detect_mime_type(self, temp_file: Path) -> str:
	        """
	        Return a MIME type for the staged copy that is a SUPPORTED_FORMATS key.

	        The sniffed type is preferred. Content sniffing is not guaranteed to
	        agree with an extension that was already accepted, so when the
	        sniffed type is not in SUPPORTED_FORMATS the type implied by the
	        staged file's extension is used instead of failing with a KeyError.
	        """
	        detected = magic.from_file(str(temp_file), mime=True)
	        if detected in self.SUPPORTED_FORMATS:
	            return detected
	        return self.EXTENSION_TO_MIME[temp_file.suffix.lower()]

	    def parse(self, file_path: Union[str, Path]) -> ParsedDocument:
	        """
	        Parse a document file and return structured content

	        Args:
	            file_path: Path to the document file

	        Returns:
	            ParsedDocument object containing parsed content and metadata

	        Raises:
	            ParseError: On any failure. A missing file or an unsupported
	                format is also reported as ParseError; the underlying
	                exception is attached as ``__cause__``.
	        """
	        temp_file = None
	        try:
	            # Validate and prepare file
	            temp_file = self._validate_and_copy_file(file_path)

	            # Get file metadata from the caller's file, so the timestamps
	            # describe the document rather than the scratch copy that was
	            # created a moment ago.
	            stats = Path(file_path).stat()
	            mime_type = self._detect_mime_type(temp_file)

	            metadata = DocumentMetadata(
	                filename=Path(file_path).name,  # Use original filename
	                file_type=self.SUPPORTED_FORMATS[mime_type],
	                size_bytes=stats.st_size,
	                created_at=datetime.fromtimestamp(stats.st_ctime),
	                modified_at=datetime.fromtimestamp(stats.st_mtime),
	                mime_type=mime_type
	            )

	            try:
	                # Parse document using Docling
	                result = self.converter.convert(str(temp_file))
	                doc = result.document

	                # Extract content using proper methods
	                try:
	                    content = doc.export_to_text()
	                except Exception as e:
	                    raise ParseError(f"Failed to extract text content: {str(e)}") from e

	                # Extract structured content; attributes the document
	                # object does not expose fall back to empty containers.
	                structured_content = {
	                    'sections': getattr(doc, 'sections', []),
	                    'paragraphs': getattr(doc, 'paragraphs', []),
	                    'entities': getattr(doc, 'entities', {}),
	                    'metadata': getattr(doc, 'metadata', {})
	                }

	                # Get raw text if available; otherwise reuse the plain
	                # export. Only ordinary errors trigger the fallback, so
	                # KeyboardInterrupt/SystemExit still propagate.
	                try:
	                    raw_text = doc.export_to_text(include_layout=True)
	                except Exception:
	                    raw_text = content

	                # Update metadata with document-specific information
	                doc_metadata = getattr(doc, 'metadata', None)
	                if doc_metadata:
	                    metadata.title = doc_metadata.get('title')
	                    metadata.author = doc_metadata.get('author')
	                    metadata.pages = doc_metadata.get('pages')
	                    metadata.extra.update(doc_metadata)

	                return ParsedDocument(
	                    content=content,
	                    metadata=metadata,
	                    raw_text=raw_text,
	                    structured_content=structured_content,
	                    confidence_score=getattr(doc, 'confidence', 1.0)
	                )

	            except Exception as e:
	                raise ParseError(f"Failed to parse document: {str(e)}") from e

	        except Exception as e:
	            # Single exception type for callers; the cause keeps the
	            # original traceback available for debugging.
	            raise ParseError(str(e)) from e

	        finally:
	            # Cleanup temporary files (best effort: a copy that is already
	            # gone or cannot be removed must not mask the real outcome).
	            if temp_file is not None:
	                try:
	                    temp_file.unlink()
	                except OSError:
	                    pass

	    def supports_format(self, mime_type: str) -> bool:
	        """Check if a given MIME type is supported"""
	        return mime_type in self.SUPPORTED_FORMATS