File size: 3,147 Bytes
42a09ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import re
import json
from PIL import Image
import pytesseract

class HostList:
    """OCR helper that reads a "host list" image and extracts host/MID/TID records.

    The image is converted to text with pytesseract; the text is then scanned
    for ``label: value`` lines, which are grouped three at a time into
    ``{"host": ..., "mid": ..., "tid": ...}`` records.

    Attributes:
        is_debug: When true, diagnostic output is printed to stdout.
        hlstyle_config: Command-line style options passed to pytesseract when
            reading a host list image.
    """

    # Field names assigned, in order, to consecutive "label: value" lines.
    _RECORD_FIELDS = ('host', 'mid', 'tid')

    # Entries of Tesseract's orientation/script detection (OSD) report.
    _ROTATE_RE = re.compile(r'(?<=Rotate: )\d+')
    # The report gives the script as a name (e.g. "Script: Latin"), not a number.
    _SCRIPT_RE = re.compile(r'(?<=Script: )\w+')

    _ALNUM_RE = re.compile(r'[a-zA-Z0-9]+')
    _NON_WORD_RE = re.compile(r'\W')

    def __init__(self, is_debug: bool = False) -> None:
        """Create a reader.

        Args:
            is_debug: Print diagnostic output to stdout while processing.
        """
        self.is_debug = is_debug

        # Host List Style (hlstyle) configuration for pytesseract
        # - psm means page segmentation mode (Ref. https://pyimagesearch.com/2021/11/15/tesseract-page-segmentation-modes-psms-explained-how-to-improve-your-ocr-accuracy/)
        # - slashed-zero recognition relies on custom traineddata: https://github.com/ReceiptManager/receipt-parser-server/tree/master/tessdata
        self.hlstyle_config = r'--psm 6 --tessdata-dir ./tessdata -l eng_slashed_zeros'

    def get_orientation(self, image: "Image.Image") -> "tuple[str | None, str | None]":
        """Detect the rotation angle and script of *image*.

        Args:
            image: Image handed to ``pytesseract.image_to_osd``.

        Returns:
            ``(angle, script)`` taken from the ``Rotate:`` and ``Script:``
            entries of the OSD report. Both are strings exactly as they appear
            in the report; an entry that is missing yields ``None``.
        """
        osd = pytesseract.image_to_osd(image)

        rotate_match = self._ROTATE_RE.search(osd)
        script_match = self._SCRIPT_RE.search(osd)
        # The patterns have no capture groups, so the value is group(0).
        angle = rotate_match.group(0) if rotate_match else None
        script = script_match.group(0) if script_match else None

        if self.is_debug:
            print("---------------------------------")
            print(f"angle : {angle}")
            print(f"script : {script}")

        return (angle, script)

    def post_processes(self, result: str) -> str:
        """Extract host/MID/TID records from OCR text.

        Every line containing a colon is considered; the text between its first
        and second colon is the value. Values without any ASCII letter or digit
        are ignored. The remaining values are assigned to ``host``, ``mid`` and
        ``tid`` in turn, and each completed triple becomes one record. A
        trailing incomplete triple is discarded.

        Args:
            result: Raw OCR text.

        Returns:
            *result*, a separator line, and the records as indented JSON of the
            form ``{"data": [{"host": ..., "mid": ..., "tid": ...}, ...]}``.
        """
        records = []
        current = {}

        if self.is_debug:
            print("---------------------------------")
            print("post-processes:\n")

        for line in result.splitlines():
            if ':' not in line:
                continue

            # Only the segment right after the first colon is used.
            infos = line.split(':')[1]
            if self.is_debug:
                print(infos)

            if not self._ALNUM_RE.search(infos):
                continue

            # The number of fields already collected selects the next one.
            field = self._RECORD_FIELDS[len(current)]
            if field == 'host':
                # Host keeps every word character of the value.
                current[field] = self._NON_WORD_RE.sub('', infos)
            else:
                # MID/TID keep the longest space-separated token.
                current[field] = max(infos.split(' '), key=len)

            if len(current) == len(self._RECORD_FIELDS):
                if self.is_debug:
                    print(json.dumps(current))
                records.append(current)
                current = {}

        data = {'data': records}
        if self.is_debug:
            print(json.dumps(data))

        return f'{result}\n-------------------\n{json.dumps(data, indent=2)}'

    def process_image(self, image: "Image.Image") -> str:
        """Run OCR on *image* and return the text followed by the parsed records.

        Args:
            image: Image handed to ``pytesseract.image_to_string`` together
                with ``self.hlstyle_config``.

        Returns:
            The string produced by :meth:`post_processes` for the OCR text.
        """
        text = pytesseract.image_to_string(image, config=self.hlstyle_config)
        return self.post_processes(text)