Spaces:
Sleeping
Sleeping
import re | |
import json | |
from PIL import Image | |
import pytesseract | |
class HostList:
    """OCR helper that reads a "host list" image and extracts host/mid/tid records.

    The image is transcribed with pytesseract and the resulting text is scanned
    for ``label: value`` lines.  Successive values are grouped, in order, into
    records with the keys ``host``, ``mid`` and ``tid``.
    """

    # Order in which successive "label: value" lines are assigned to a record.
    _FIELD_ORDER = ("host", "mid", "tid")

    # Patterns for the "Rotate: <degrees>" and "Script: <name>" lines of
    # Tesseract's orientation/script detection (OSD) report.
    _ROTATE_RE = re.compile(r"(?<=Rotate: )\d+")
    _SCRIPT_RE = re.compile(r"(?<=Script: )\w+")

    _ALNUM_RE = re.compile(r"[a-zA-Z0-9]")
    _NON_WORD_RE = re.compile(r"\W")

    def __init__(self, is_debug=False) -> None:
        """Create the helper.

        Args:
            is_debug: When true, intermediate values are printed to stdout.
        """
        self.is_debug = is_debug

        # Host List Style (hlstyle) configuration for pytesseract
        # - psm means page segmentation (Ref. https://pyimagesearch.com/2021/11/15/tesseract-page-segmentation-modes-psms-explained-how-to-improve-your-ocr-accuracy/)
        # - fixed slashed zero issue with custom traineddata: https://github.com/ReceiptManager/receipt-parser-server/tree/master/tessdata
        self.hlstyle_config = r'--psm 6 --tessdata-dir ./tessdata -l eng_slashed_zeros'

    @staticmethod
    def _parse_osd(osd: str) -> "tuple[str | None, str | None]":
        """Pull the rotation angle and script name out of an OSD report string.

        Each element of the returned ``(angle, script)`` tuple is the matched
        text, or ``None`` when the corresponding line is absent.
        """
        rotate_match = HostList._ROTATE_RE.search(osd)
        script_match = HostList._SCRIPT_RE.search(osd)
        # group(0) is the whole match; the patterns define no capture groups,
        # so asking for group(1) would raise IndexError.
        angle = rotate_match.group(0) if rotate_match else None
        script = script_match.group(0) if script_match else None
        return angle, script

    def get_orientation(self, image: "Image.Image"):
        """Detect the orientation and script of ``image`` via Tesseract OSD.

        Args:
            image: The image to analyse.

        Returns:
            A tuple ``(angle, script)`` of strings; either element is ``None``
            when it could not be found in the OSD output.
        """
        osd = pytesseract.image_to_osd(image)
        angle, script = self._parse_osd(osd)

        if self.is_debug:
            print("---------------------------------")
            print(f"angle : {angle}")
            print(f"script : {script}")

        return (angle, script)

    def post_processes(self, result: str):
        """Group the ``label: value`` lines of OCR text into host/mid/tid records.

        Only lines containing a colon whose value (the text between the first
        and second colon) holds at least one ASCII letter or digit are used.
        The first such value becomes ``host`` (with every non-word character
        removed), the second ``mid`` and the third ``tid`` (each taken as the
        longest space-separated token); then a new record starts.  A trailing
        incomplete record is discarded.

        Args:
            result: Raw text produced by the OCR step.

        Returns:
            ``result`` followed by a separator line and the records rendered
            as indented JSON of the form ``{"data": [...]}``.
        """
        if self.is_debug:
            print("---------------------------------")
            print("post-processes:\n")

        records = []
        current = {}
        for line in result.splitlines():
            if ':' not in line:
                continue

            value = line.split(':')[1]
            if self.is_debug:
                print(value)

            # Skip values that hold nothing but punctuation/whitespace.
            if not self._ALNUM_RE.search(value):
                continue

            # The number of fields already filled selects the next field.
            field = self._FIELD_ORDER[len(current)]
            if field == 'host':
                current[field] = self._NON_WORD_RE.sub('', value)
            else:
                # Longest token wins; this drops padding and short OCR noise.
                current[field] = max(value.split(' '), key=len)

            if len(current) == len(self._FIELD_ORDER):
                if self.is_debug:
                    print(json.dumps(current))
                records.append(current)
                current = {}

        data = {'data': records}
        if self.is_debug:
            print(json.dumps(data))

        return f'{result}\n-------------------\n{json.dumps(data, indent=2)}'

    def process_image(self, image: "Image.Image"):
        """Run OCR on ``image`` and return the text with the extracted records.

        Args:
            image: The host-list image to transcribe.

        Returns:
            The string produced by :meth:`post_processes` for the OCR text.
        """
        text = pytesseract.image_to_string(image, config=self.hlstyle_config)
        return self.post_processes(text)