import json
import re
from typing import Optional, Tuple

import pytesseract
from PIL import Image


class HostList:
    """OCR a "host list" image and extract host / mid / tid records.

    The image is read with pytesseract using a fixed configuration, and
    the recognised text is then scanned line by line for ``label: value``
    pairs, which are grouped three at a time into records.
    """

    # Order in which the values of one record are expected to appear in
    # the OCR text. The first usable value becomes 'host', the next 'mid',
    # the next 'tid'; then a new record starts.
    _FIELD_ORDER = ('host', 'mid', 'tid')

    def __init__(self, is_debug=False) -> None:
        """Create a reader.

        Args:
            is_debug: When true, intermediate values are printed to stdout.
        """
        self.is_debug = is_debug

        # Host List Style (hlstyle) configuration for pytesseract
        # - psm means page segmentation mode
        #   (Ref. https://pyimagesearch.com/2021/11/15/tesseract-page-segmentation-modes-psms-explained-how-to-improve-your-ocr-accuracy/)
        # - slashed-zero issue fixed with custom traineddata:
        #   https://github.com/ReceiptManager/receipt-parser-server/tree/master/tessdata
        self.hlstyle_config = r'--psm 6 --tessdata-dir ./tessdata -l eng_slashed_zeros'

    def get_orientation(
        self, image: Image.Image
    ) -> Tuple[Optional[str], Optional[str]]:
        """Run orientation/script detection on ``image``.

        Args:
            image: The image passed to ``pytesseract.image_to_osd``.

        Returns:
            A ``(angle, script)`` tuple. Each element is the text that
            follows ``Rotate: `` / ``Script: `` in the OSD report, or
            ``None`` when that entry is not found.
        """
        osd = pytesseract.image_to_osd(image)

        rotate_match = re.search(r'(?<=Rotate: )\d+', osd)
        # \w+ rather than \d+: the script is typically reported by name
        # (e.g. "Latin"); \w+ still accepts a purely numeric value.
        script_match = re.search(r'(?<=Script: )\w+', osd)

        # The patterns have no capture groups, so the matched text is
        # group(0); group(1) would raise IndexError.
        angle = rotate_match.group(0) if rotate_match else None
        script = script_match.group(0) if script_match else None

        if self.is_debug:
            print("---------------------------------")
            print(f"angle : {angle}")
            print(f"script : {script}")

        return (angle, script)

    def post_processes(self, result: str) -> str:
        """Extract host/mid/tid records from OCR text.

        Only lines containing ``:`` are considered, and only the text
        between the first and second colon is used as the value. Values
        without any ASCII letter or digit are ignored. Usable values are
        assigned in order to ``host``, ``mid`` and ``tid``; once all three
        are present the record is stored and a new one begins. A trailing
        incomplete record is discarded.

        Args:
            result: Raw text produced by OCR.

        Returns:
            ``result`` followed by a separator line and the extracted
            records as indented JSON (``{"data": [...]}``).
        """
        if self.is_debug:
            print("---------------------------------")
            print("post-processes:\n")

        records = []
        current = {}

        for line in result.splitlines():
            if ':' not in line:
                continue

            value = line.split(':')[1]
            if self.is_debug:
                print(value)

            # Skip values that are only punctuation / whitespace.
            if not re.search(r'[a-zA-Z0-9]', value):
                continue

            # len(current) is the number of fields already filled, which
            # is also the index of the next expected field.
            field = self._FIELD_ORDER[len(current)]
            if field == 'host':
                # Host keeps every word character, dropping spaces and
                # punctuation.
                current[field] = re.sub(r'\W', '', value)
            else:
                # mid / tid: keep the longest space-separated token.
                current[field] = max(value.split(' '), key=len)

            if len(current) == len(self._FIELD_ORDER):
                if self.is_debug:
                    print(json.dumps(current))
                records.append(current)
                current = {}

        data = {'data': records}
        if self.is_debug:
            print(json.dumps(data))

        return f'{result}\n-------------------\n{json.dumps(data, indent=2)}'

    def process_image(self, image: Image.Image) -> str:
        """OCR ``image`` and return the text with the extracted records.

        Args:
            image: The image passed to ``pytesseract.image_to_string``.

        Returns:
            The string produced by ``post_processes`` for the OCR text.
        """
        text = pytesseract.image_to_string(image, config=self.hlstyle_config)
        return self.post_processes(text)