from pathlib import Path from typing import List, Optional, Dict, Union import numpy as np import pandas as pd from paddleocr import PaddleOCR from PIL import Image class TextRecognizer: """ A class for performing OCR on detected tables using PaddleOCR. Attributes: models_dir (Path): Directory containing OCR model files """ def __init__(self, models_dir: Optional[Union[str, Path]] = None) -> None: """ Initialize the TextRecognizer with model directory. Args: models_dir: Directory containing OCR model files """ self.models_dir = Path(models_dir) if models_dir else Path(__file__).parent / 'paddleocr_models' self._setup_model_dirs() self.model = PaddleOCR( use_angle_cls=False, lang='en', det_model_dir=str(self.models_dir / 'det'), rec_model_dir=str(self.models_dir / 'rec') ) def _setup_model_dirs(self) -> None: """Create necessary directories for model files.""" (self.models_dir / 'det').mkdir(parents=True, exist_ok=True) (self.models_dir / 'rec').mkdir(parents=True, exist_ok=True) def recognize( self, image_path: Union[str, Path], table_boxes: Optional[np.ndarray] = None, padding: tuple = (0, 0) ) -> List[pd.DataFrame]: """ Perform OCR on the image within specified table regions. Args: image_path: Path to the input image table_boxes: Array of table bounding box coordinates padding: Padding to add around table regions (x, y) Returns: List of DataFrames containing extracted text and positions """ with Image.open(image_path) as img: img_array = np.array(img.convert('RGB')) if table_boxes is not None and len(table_boxes) == 1: pad_x, pad_y = padding box = table_boxes[0] img_array = img_array[ max(box[1]-pad_y, 0):box[3]+pad_y, max(box[0]-pad_x, 0):box[2]+pad_x ] ocr_result = self.model.ocr(img_array) if table_boxes is not None and len(table_boxes) > 1: return self._process_multiple_tables(ocr_result[0], table_boxes) return self._process_single_table(ocr_result[0]) def _process_multiple_tables( self, ocr_data: List, table_boxes: np.ndarray ) -> List[pd.DataFrame]: """Process OCR results for multiple tables.""" result: Dict[int, List] = {} for item in ocr_data: bbox = np.array(item[0]).astype(int) word = item[1][0] bbox = [bbox[:,0].min(), bbox[:,1].min(), bbox[:,0].max(), bbox[:,1].max()] for idx, table_box in enumerate(table_boxes): if (bbox[0] >= table_box[0] and bbox[1] >= table_box[1] and bbox[0] <= table_box[2] and bbox[1] <= table_box[3]): if idx not in result: result[idx] = [] result[idx].append((word, bbox)) return [ pd.DataFrame( sorted(table_data, key=lambda x: (x[1][1], x[1][0])), columns=['text', 'boundingBox'] ) for table_data in result.values() ] def _process_single_table(self, ocr_data: List) -> List[pd.DataFrame]: """Process OCR results for a single table.""" processed_data = [ (item[1][0], [ np.array(item[0])[:,0].min(), np.array(item[0])[:,1].min(), np.array(item[0])[:,0].max(), np.array(item[0])[:,1].max() ]) for item in ocr_data ] return [pd.DataFrame( sorted(processed_data, key=lambda x: (x[1][1], x[1][0])), columns=['text', 'boundingBox'] )]