File size: 4,065 Bytes
a7b8c18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
from pathlib import Path
from typing import List, Optional, Dict, Union
import numpy as np
import pandas as pd
from paddleocr import PaddleOCR
from PIL import Image

class TextRecognizer:
    """
    Run PaddleOCR over an image and group the recognized text by table.

    Attributes:
        models_dir (Path): Directory holding the ``det`` and ``rec`` model
            sub-directories that are handed to PaddleOCR.
        model (PaddleOCR): The configured OCR engine.
    """

    # Column layout shared by every DataFrame this class returns.
    _COLUMNS = ['text', 'boundingBox']

    def __init__(self, models_dir: Optional[Union[str, Path]] = None) -> None:
        """
        Initialize the TextRecognizer with model directory.

        Args:
            models_dir: Directory containing OCR model files. When omitted
                (or falsy), ``paddleocr_models`` next to this file is used.
        """
        self.models_dir = Path(models_dir) if models_dir else Path(__file__).parent / 'paddleocr_models'
        self._setup_model_dirs()

        self.model = PaddleOCR(
            use_angle_cls=False,
            lang='en',
            det_model_dir=str(self.models_dir / 'det'),
            rec_model_dir=str(self.models_dir / 'rec')
        )

    def _setup_model_dirs(self) -> None:
        """Create the ``det`` and ``rec`` model directories if missing."""
        for sub_dir in ('det', 'rec'):
            (self.models_dir / sub_dir).mkdir(parents=True, exist_ok=True)

    def recognize(
        self,
        image_path: Union[str, Path],
        table_boxes: Optional[np.ndarray] = None,
        padding: tuple = (0, 0)
    ) -> List[pd.DataFrame]:
        """
        Perform OCR on the image within specified table regions.

        With exactly one table box the image is cropped to that box (plus
        padding) before OCR, so the returned coordinates are relative to the
        cropped region. With several boxes the whole image is OCR'd and each
        text item is assigned to the table(s) containing it. With no boxes
        the whole image is treated as a single table.

        Args:
            image_path: Path to the input image
            table_boxes: Table bounding boxes as (x_min, y_min, x_max, y_max)
            padding: Padding to add around the table region (x, y); only
                applied in the single-table case

        Returns:
            List of DataFrames with columns ``text`` and ``boundingBox``.
            One DataFrame per table box when several boxes are given,
            otherwise a single-element list.
        """
        with Image.open(image_path) as img:
            img_array = np.array(img.convert('RGB'))

        n_tables = 0 if table_boxes is None else len(table_boxes)

        if n_tables == 1:
            # Slice indices must be integers; detectors commonly emit floats.
            pad_x, pad_y = (int(p) for p in padding)
            x_min, y_min, x_max, y_max = (int(v) for v in table_boxes[0][:4])
            img_array = img_array[
                max(y_min - pad_y, 0):y_max + pad_y,
                max(x_min - pad_x, 0):x_max + pad_x
            ]

        ocr_result = self.model.ocr(img_array)
        # PaddleOCR reports "no text found" as None for the image (and the
        # outer list may be empty); treat both as an empty detection list.
        ocr_data = ocr_result[0] if ocr_result else None

        if n_tables > 1:
            return self._process_multiple_tables(ocr_data, table_boxes)
        return self._process_single_table(ocr_data)

    @staticmethod
    def _parse_item(item, as_int: bool = False) -> tuple:
        """
        Convert one OCR item into ``(text, [x_min, y_min, x_max, y_max])``.

        Args:
            item: OCR entry whose first element is the list of polygon
                points and whose second element starts with the text.
            as_int: Cast the polygon coordinates to int before reducing.
        """
        points = np.array(item[0])
        if as_int:
            points = points.astype(int)
        xs, ys = points[:, 0], points[:, 1]
        return item[1][0], [xs.min(), ys.min(), xs.max(), ys.max()]

    @classmethod
    def _to_frame(cls, rows: List[tuple]) -> pd.DataFrame:
        """Build a DataFrame from rows ordered top-to-bottom, left-to-right."""
        return pd.DataFrame(
            sorted(rows, key=lambda row: (row[1][1], row[1][0])),
            columns=cls._COLUMNS
        )

    def _process_multiple_tables(
        self,
        ocr_data: List,
        table_boxes: np.ndarray
    ) -> List[pd.DataFrame]:
        """
        Process OCR results for multiple tables.

        A text item belongs to a table when the top-left corner of its
        bounding box lies inside the table box (edges inclusive).

        Args:
            ocr_data: OCR items for the image, or None when nothing was found
            table_boxes: Table bounding boxes as (x_min, y_min, x_max, y_max)

        Returns:
            Exactly one DataFrame per entry of ``table_boxes``, in the same
            order; a table without text yields an empty DataFrame so that
            results can be matched to boxes by position.
        """
        if ocr_data is None:
            ocr_data = []

        grouped: List[List[tuple]] = [[] for _ in table_boxes]

        for item in ocr_data:
            word, bbox = self._parse_item(item, as_int=True)
            left, top = bbox[0], bbox[1]

            for idx, table_box in enumerate(table_boxes):
                if (table_box[0] <= left <= table_box[2]
                        and table_box[1] <= top <= table_box[3]):
                    grouped[idx].append((word, bbox))

        return [self._to_frame(rows) for rows in grouped]

    def _process_single_table(self, ocr_data: List) -> List[pd.DataFrame]:
        """
        Process OCR results for a single table.

        Args:
            ocr_data: OCR items for the image, or None when nothing was found

        Returns:
            Single-element list holding the DataFrame of all text items
            (empty when there is no text).
        """
        if ocr_data is None:
            ocr_data = []

        rows = [self._parse_item(item) for item in ocr_data]
        return [self._to_frame(rows)]