# Snapshot of revision 42a09ad (file size: 3,147 bytes).
import re
import json
from PIL import Image
import pytesseract
class HostList:
    """OCR a "host list" image with Tesseract and extract host/mid/tid records.

    The recognised text is scanned line by line; every ``label: value`` line
    whose value contains at least one ASCII letter or digit feeds the next
    field of the current record, in the fixed order ``host`` -> ``mid`` ->
    ``tid``. A record is emitted once all three fields are filled.

    NOTE(review): ``mid``/``tid`` presumably stand for merchant and terminal
    identifiers -- confirm against the documents this is used on.
    """

    # Order in which consecutive "label: value" lines are assigned to a record.
    _RECORD_FIELDS = ("host", "mid", "tid")

    def __init__(self, is_debug=False) -> None:
        """Store the debug flag and the Tesseract configuration string.

        Args:
            is_debug: When true, intermediate results are printed to stdout.
        """
        self.is_debug = is_debug
        # Host List Style (hlstyle) configuration for pytesseract
        # - psm means page segmentation (Ref. https://pyimagesearch.com/2021/11/15/tesseract-page-segmentation-modes-psms-explained-how-to-improve-your-ocr-accuracy/)
        # - fixed slashed zero issue with custom traineddata: https://github.com/ReceiptManager/receipt-parser-server/tree/master/tessdata
        self.hlstyle_config = r'--psm 6 --tessdata-dir ./tessdata -l eng_slashed_zeros'

    @staticmethod
    def _parse_osd(osd: str) -> tuple:
        """Extract the ``Rotate`` and ``Script`` values from Tesseract OSD text.

        Args:
            osd: Text returned by ``pytesseract.image_to_osd``.

        Returns:
            ``(angle, script)`` where each item is the matched string, or
            ``None`` when the corresponding field is absent.
        """
        # The look-behind includes the trailing ": " so that the separate
        # "Script confidence: ..." line can never match the script pattern.
        rotate_match = re.search(r'(?<=Rotate: )\d+', osd)
        # The script is reported as a name (e.g. "Latin"), so match word
        # characters rather than digits only.
        script_match = re.search(r'(?<=Script: )\w+', osd)
        # Neither pattern has a capture group: group(0) is the whole match.
        angle = rotate_match.group(0) if rotate_match else None
        script = script_match.group(0) if script_match else None
        return (angle, script)

    def get_orientation(self, image: "Image.Image"):
        """Detect the rotation angle and script of ``image`` via Tesseract OSD.

        Args:
            image: Image handed to ``pytesseract.image_to_osd``.

        Returns:
            ``(angle, script)``; ``angle`` is the digit string following
            ``Rotate:`` and ``script`` the word following ``Script:``. Either
            is ``None`` when the field is missing from the OSD output.
        """
        osd = pytesseract.image_to_osd(image)
        angle, script = self._parse_osd(osd)

        if self.is_debug:
            print("---------------------------------")
            print(f"angle : {angle}")
            print(f"script : {script}")

        return (angle, script)

    def post_processes(self, result: str):
        """Group ``label: value`` lines of OCR text into host/mid/tid records.

        Args:
            result: Raw multi-line text produced by the OCR step.

        Returns:
            ``result`` followed by a dashed separator line and the extracted
            records as indented JSON of the form ``{"data": [...]}``. A
            trailing record that is missing fields is dropped.
        """
        records = []
        current = {}

        if self.is_debug:
            print("---------------------------------")
            print("post-processes:\n")

        for line in result.splitlines():
            if ':' not in line:
                continue

            # Only the text between the first and the second colon is used.
            value = line.split(':')[1]
            if self.is_debug:
                print(value)

            # Skip values with no ASCII letter or digit (e.g. OCR noise).
            if not re.search(r'[a-zA-Z0-9]', value):
                continue

            # The number of fields already filled selects the next field.
            field = self._RECORD_FIELDS[len(current)]
            if field == 'host':
                # Host name: drop every non-word character.
                current[field] = re.sub(r'\W', '', value)
            else:
                # mid / tid: keep the longest space-separated token.
                current[field] = max(value.split(' '), key=len)

            if len(current) == len(self._RECORD_FIELDS):
                records.append(current)
                if self.is_debug:
                    print(json.dumps(current))
                current = {}

        data = {'data': records}
        if self.is_debug:
            print(json.dumps(data))

        return f'{result}\n-------------------\n{json.dumps(data, indent=2)}'

    def process_image(self, image: "Image.Image"):
        """Run OCR on ``image`` and return the post-processed report string.

        Args:
            image: Image handed to ``pytesseract.image_to_string`` together
                with the host-list configuration.

        Returns:
            The string produced by ``post_processes`` for the recognised text.
        """
        text = pytesseract.image_to_string(image, config=self.hlstyle_config)
        return self.post_processes(text)