Commit
·
858fd76
0
Parent(s):
Duplicate from pierreguillou/Inference-APP-Document-Understanding-at-linelevel-v1
Browse files- .gitattributes +34 -0
- README.md +16 -0
- app.py +171 -0
- files/README.md +0 -0
- files/blank.pdf +0 -0
- files/blank.png +0 -0
- files/example.pdf +0 -0
- files/functions.py +805 -0
- files/languages_iso.csv +184 -0
- files/languages_tesseract.csv +127 -0
- files/template.pdf +0 -0
- files/wo_content.png +0 -0
- packages.txt +2 -0
- requirements.txt +9 -0
.gitattributes
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Inference APP for Document Understanding at line level (v1)
|
3 |
+
emoji: 🐢
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: yellow
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 3.18.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
models:
|
11 |
+
- >-
|
12 |
+
pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-linelevel-ml384
|
13 |
+
duplicated_from: pierreguillou/Inference-APP-Document-Understanding-at-linelevel-v1
|
14 |
+
---
|
15 |
+
|
16 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import gradio as gr
|
3 |
+
import re
|
4 |
+
import string
|
5 |
+
|
6 |
+
from operator import itemgetter
|
7 |
+
import collections
|
8 |
+
|
9 |
+
import pypdf
|
10 |
+
from pypdf import PdfReader
|
11 |
+
from pypdf.errors import PdfReadError
|
12 |
+
|
13 |
+
import pdf2image
|
14 |
+
from pdf2image import convert_from_path
|
15 |
+
import langdetect
|
16 |
+
from langdetect import detect_langs
|
17 |
+
|
18 |
+
import pandas as pd
|
19 |
+
import numpy as np
|
20 |
+
import random
|
21 |
+
import tempfile
|
22 |
+
import itertools
|
23 |
+
|
24 |
+
from matplotlib import font_manager
|
25 |
+
from PIL import Image, ImageDraw, ImageFont
|
26 |
+
import cv2
|
27 |
+
|
28 |
+
## files
|
29 |
+
|
30 |
+
import sys
|
31 |
+
sys.path.insert(0, 'files/')
|
32 |
+
|
33 |
+
import functions
|
34 |
+
from functions import *
|
35 |
+
|
36 |
+
# update pip
|
37 |
+
os.system('python -m pip install --upgrade pip')
|
38 |
+
|
39 |
+
# model
|
40 |
+
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
41 |
+
|
42 |
+
import torch
|
43 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
44 |
+
|
45 |
+
model_id = "pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-linelevel-ml384"
|
46 |
+
|
47 |
+
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
48 |
+
model = AutoModelForTokenClassification.from_pretrained(model_id);
|
49 |
+
model.to(device);
|
50 |
+
|
51 |
+
# APP outputs
|
52 |
+
def app_outputs(uploaded_pdf):
    """Run the whole inference pipeline on an uploaded PDF and build the UI outputs.

    The PDF is converted to page images, the text lines are extracted and
    labeled by the model, and the labeled page images and per-page data are
    saved to disk (PNG and CSV files).

    Returns:
        A flat tuple ``(msg, *image_files, *images, *csv_files, *dataframes)``
        with exactly ``max_imgboxes`` entries in each of the four groups, i.e.
        the layout ``[output_msg] + fileboxes + imgboxes + csvboxes + dfboxes``.
        When the PDF could not be read, every slot holds a blank placeholder
        and the dataframe slots are ``None``.
    """
    filename, msg, images = pdf_to_images(uploaded_pdf)
    num_images = len(images)

    if not msg.startswith("Error with the PDF"):

        # Extraction of image data (text and bounding boxes)
        dataset, lines, row_indexes, par_boxes, line_boxes = extraction_data_from_image(images)
        # prepare our data in the format of the model
        encoded_dataset = dataset.map(prepare_inference_features, batched=True, batch_size=64, remove_columns=dataset.column_names)
        custom_encoded_dataset = CustomDataset(encoded_dataset, tokenizer)
        # Get predictions (token level)
        outputs, images_ids_list, chunk_ids, input_ids, bboxes = predictions_token_level(images, custom_encoded_dataset)
        # Get predictions (line level)
        probs_bbox, bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df = predictions_line_level(dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes)
        # Get labeled images with lines bounding boxes
        images = get_labeled_images(dataset, images_ids_list, bboxes_list_dict, probs_dict_dict)

        # save every labeled page image next to the app
        img_files = list()
        for i in range(num_images):
            if filename != "files/blank.png":
                img_file = f"img_{i}_" + filename.replace(".pdf", ".png")
            else:
                img_file = filename.replace(".pdf", ".png")
            images[i].save(img_file)
            img_files.append(img_file)

        # pad with blanks, or truncate, so that exactly max_imgboxes pages remain
        if num_images < max_imgboxes:
            img_files += [image_blank]*(max_imgboxes - num_images)
            images += [Image.open(image_blank)]*(max_imgboxes - num_images)
            for count in range(max_imgboxes - num_images):
                df[num_images + count] = pd.DataFrame()
        else:
            img_files = img_files[:max_imgboxes]
            images = images[:max_imgboxes]
            df = dict(itertools.islice(df.items(), max_imgboxes))

        # save one CSV file per displayed page
        csv_files = list()
        for i in range(max_imgboxes):
            csv_file = f"csv_{i}_" + filename.replace(".pdf", ".csv")
            csv_files.append(gr.File.update(value=csv_file, visible=True))
            df[i].to_csv(csv_file, encoding="utf-8", index=False)

    else:
        # The PDF could not be read: fill every output slot with a placeholder.
        img_files = [image_blank] * max_imgboxes
        images = [Image.open(image_blank) for _ in range(max_imgboxes)]
        csv_file = "csv_wo_content.csv"
        # one empty CSV is enough, every CSV slot points to the same file
        pd.DataFrame().to_csv(csv_file, encoding="utf-8", index=False)
        csv_files = [gr.File.update(value=csv_file, visible=True) for _ in range(max_imgboxes)]
        df = {i: None for i in range(max_imgboxes)}

    # One entry per output component, whatever the value of max_imgboxes.
    slots = range(max_imgboxes)
    return (
        msg,
        *[img_files[i] for i in slots],
        *[images[i] for i in slots],
        *[csv_files[i] for i in slots],
        *[df[i] for i in slots],
    )
|
105 |
+
|
106 |
+
# gradio APP
|
107 |
+
with gr.Blocks(title="Inference APP for Document Understanding at line level (v1)", css=".gradio-container") as demo:
|
108 |
+
gr.HTML("""
|
109 |
+
<div style="font-family:'Times New Roman', 'Serif'; font-size:26pt; font-weight:bold; text-align:center;"><h1>Inference APP for Document Understanding at line level (v1)</h1></div>
|
110 |
+
<div style="margin-top: 40px"><p><b>[ Check as well the <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v1" target="_blank">Inference APP for Document Understanding at PARAGRAPH level</a>! ]</b></p></div>
|
111 |
+
<div><p>(02/12/2023) This Inference APP uses the <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-linelevel-ml384" target="_blank">model LiLT base combined with XLM-RoBERTa base and finetuned on the dataset DocLayNet base at line level</a> (chunk size of 384 tokens).</p></div>
|
112 |
+
<div><p><a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://arxiv.org/abs/2202.13669" target="_blank">LiLT (Language-Independent Layout Transformer)</a> is a Document Understanding model that uses both layout and text in order to detect labels of bounding boxes. Combined with the model <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/xlm-roberta-base" target="_blank">XML-RoBERTa base</a>, this finetuned model has the capacity to <b>understand any language</b>. Finetuned on the dataset <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a>, it can <b>classifly any bounding box (and its OCR text) to 11 labels</b> (Caption, Footnote, Formula, List-item, Page-footer, Page-header, Picture, Section-header, Table, Text, Title).</p></div>
|
113 |
+
<div><p>It relies on an external OCR engine to get words and bounding boxes from the document image. Thus, let's run in this APP an OCR engine (<a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/madmaze/pytesseract#python-tesseract" target="_blank">PyTesseract</a>) to get the bounding boxes, then run LiLT (already fine-tuned on the dataset DocLayNet base at line level) on the individual tokens and then, visualize the result at line level!</p></div>
|
114 |
+
<div><p><b>It allows to get all pages of any PDF (of any language) with bounding boxes labeled at line level and the associated dataframes with labeled data (bounding boxes, texts, labels) :-)</b></p></div>
|
115 |
+
<div><p>However, the inference time per page can be high when running the model on CPU due to the number of line predictions to be made. Therefore, to avoid running this APP for too long, <b>only the first 2 pages are processed by this APP</b>. If you want to increase this limit, you can either clone this APP in Hugging Face Space (or run its <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/Gradio_inference_on_LiLT_model_finetuned_on_DocLayNet_base_in_any_language_at_levellines_ml384.ipynb" target="_blank">notebook</a> on your own plateform) and change the value of the parameter <code>max_imgboxes</code>, or run the inference notebook "<a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/inference_on_LiLT_model_finetuned_on_DocLayNet_base_in_any_language_at_levellines_ml384.ipynb" target="_blank">Document AI | Inference at line level with a Document Understanding model (LiLT fine-tuned on DocLayNet dataset)</a>" on your own platform as it does not have this limit.</p></div>
|
116 |
+
<div style="margin-top: 20px"><p>More information about the DocLayNet datasets, the finetuning of the model and this APP in the following blog posts:</p>
|
117 |
+
<ul><li>(02/16/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-inference-app-and-fine-tuning-notebook-for-document-understanding-at-paragraph-level-c18d16e53cf8" target="_blank">Document AI | Inference APP and fine-tuning notebook for Document Understanding at paragraph level</a></li><li>(02/14/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-inference-app-for-document-understanding-at-line-level-a35bbfa98893" target="_blank">Document AI | Inference APP for Document Understanding at line level</a></li><li>(02/10/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-document-understanding-model-at-line-level-with-lilt-tesseract-and-doclaynet-dataset-347107a643b8" target="_blank">Document AI | Document Understanding model at line level with LiLT, Tesseract and DocLayNet dataset</a></li><li>(01/31/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-doclaynet-image-viewer-app-3ac54c19956" target="_blank">Document AI | DocLayNet image viewer APP</a></li><li>(01/27/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></li></ul></div>
|
118 |
+
""")
|
119 |
+
with gr.Row():
|
120 |
+
pdf_file = gr.File(label="PDF")
|
121 |
+
with gr.Row():
|
122 |
+
submit_btn = gr.Button(f"Display first {max_imgboxes} labeled PDF pages")
|
123 |
+
reset_btn = gr.Button(value="Clear")
|
124 |
+
with gr.Row():
|
125 |
+
output_msg = gr.Textbox(label="Output message")
|
126 |
+
with gr.Row():
|
127 |
+
fileboxes = []
|
128 |
+
for num_page in range(max_imgboxes):
|
129 |
+
file_path = gr.File(visible=True, label=f"Image file of the PDF page n°{num_page}")
|
130 |
+
fileboxes.append(file_path)
|
131 |
+
with gr.Row():
|
132 |
+
imgboxes = []
|
133 |
+
for num_page in range(max_imgboxes):
|
134 |
+
img = gr.Image(type="pil", label=f"Image of the PDF page n°{num_page}")
|
135 |
+
imgboxes.append(img)
|
136 |
+
with gr.Row():
|
137 |
+
csvboxes = []
|
138 |
+
for num_page in range(max_imgboxes):
|
139 |
+
csv = gr.File(visible=True, label=f"CSV file at line level (page {num_page})")
|
140 |
+
csvboxes.append(csv)
|
141 |
+
with gr.Row():
|
142 |
+
dfboxes = []
|
143 |
+
for num_page in range(max_imgboxes):
|
144 |
+
df = gr.Dataframe(
|
145 |
+
headers=["bounding boxes", "texts", "labels"],
|
146 |
+
datatype=["str", "str", "str"],
|
147 |
+
col_count=(3, "fixed"),
|
148 |
+
visible=True,
|
149 |
+
label=f"Data of page {num_page}",
|
150 |
+
type="pandas",
|
151 |
+
wrap=True
|
152 |
+
)
|
153 |
+
dfboxes.append(df)
|
154 |
+
|
155 |
+
outputboxes = [output_msg] + fileboxes + imgboxes + csvboxes + dfboxes
|
156 |
+
submit_btn.click(app_outputs, inputs=[pdf_file], outputs=outputboxes)
|
157 |
+
reset_btn.click(
|
158 |
+
lambda: [pdf_file.update(value=None), output_msg.update(value=None)] + [filebox.update(value=None) for filebox in fileboxes] + [imgbox.update(value=None) for imgbox in imgboxes] + [csvbox.update(value=None) for csvbox in csvboxes] + [dfbox.update(value=None) for dfbox in dfboxes],
|
159 |
+
inputs=[],
|
160 |
+
outputs=[pdf_file, output_msg] + fileboxes + imgboxes + csvboxes + dfboxes,
|
161 |
+
)
|
162 |
+
|
163 |
+
gr.Examples(
|
164 |
+
[["files/example.pdf"]],
|
165 |
+
[pdf_file],
|
166 |
+
outputboxes,
|
167 |
+
fn=app_outputs,
|
168 |
+
cache_examples=True,
|
169 |
+
)
|
170 |
+
|
171 |
+
demo.launch()
|
files/README.md
ADDED
File without changes
|
files/blank.pdf
ADDED
Binary file (1.15 kB). View file
|
|
files/blank.png
ADDED
![]() |
files/example.pdf
ADDED
Binary file (343 kB). View file
|
|
files/functions.py
ADDED
@@ -0,0 +1,805 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import gradio as gr
|
3 |
+
import re
|
4 |
+
import string
|
5 |
+
import torch
|
6 |
+
|
7 |
+
from operator import itemgetter
|
8 |
+
import collections
|
9 |
+
|
10 |
+
import pypdf
|
11 |
+
from pypdf import PdfReader
|
12 |
+
from pypdf.errors import PdfReadError
|
13 |
+
|
14 |
+
import pdf2image
|
15 |
+
from pdf2image import convert_from_path
|
16 |
+
import langdetect
|
17 |
+
from langdetect import detect_langs
|
18 |
+
|
19 |
+
import pandas as pd
|
20 |
+
import numpy as np
|
21 |
+
import random
|
22 |
+
import tempfile
|
23 |
+
import itertools
|
24 |
+
|
25 |
+
from matplotlib import font_manager
|
26 |
+
from PIL import Image, ImageDraw, ImageFont
|
27 |
+
import cv2
|
28 |
+
|
29 |
+
# Tesseract
|
30 |
+
print(os.popen(f'cat /etc/debian_version').read())
|
31 |
+
print(os.popen(f'cat /etc/issue').read())
|
32 |
+
print(os.popen(f'apt search tesseract').read())
|
33 |
+
import pytesseract
|
34 |
+
|
35 |
+
## Key parameters
|
36 |
+
|
37 |
+
# categories colors
|
38 |
+
label2color = {
|
39 |
+
'Caption': 'brown',
|
40 |
+
'Footnote': 'orange',
|
41 |
+
'Formula': 'gray',
|
42 |
+
'List-item': 'yellow',
|
43 |
+
'Page-footer': 'red',
|
44 |
+
'Page-header': 'red',
|
45 |
+
'Picture': 'violet',
|
46 |
+
'Section-header': 'orange',
|
47 |
+
'Table': 'green',
|
48 |
+
'Text': 'blue',
|
49 |
+
'Title': 'pink'
|
50 |
+
}
|
51 |
+
|
52 |
+
# bounding boxes start and end of a sequence
|
53 |
+
cls_box = [0, 0, 0, 0]
|
54 |
+
sep_box = cls_box
|
55 |
+
|
56 |
+
# model
|
57 |
+
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
58 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
59 |
+
|
60 |
+
model_id = "pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-linelevel-ml384"
|
61 |
+
|
62 |
+
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
63 |
+
model = AutoModelForTokenClassification.from_pretrained(model_id);
|
64 |
+
model.to(device);
|
65 |
+
|
66 |
+
# get labels
|
67 |
+
id2label = model.config.id2label
|
68 |
+
label2id = model.config.label2id
|
69 |
+
num_labels = len(id2label)
|
70 |
+
|
71 |
+
# (tokenization) The maximum length of a feature (sequence)
|
72 |
+
# The chunk size is read from the model name: the first supported size
# found in model_id wins (384 is checked before 512).
for _chunk_size in (384, 512):
    if str(_chunk_size) in model_id:
        max_length = _chunk_size
        break
else:
    # no supported size in the model name: max_length stays undefined
    print("Error with max_length of chunks!")
|
78 |
+
|
79 |
+
# (tokenization) overlap
|
80 |
+
doc_stride = 128 # The authorized overlap between two part of the context when splitting it is needed.
|
81 |
+
|
82 |
+
# max PDF page images that will be displayed
|
83 |
+
max_imgboxes = 2
|
84 |
+
examples_dir = 'files/'
|
85 |
+
image_wo_content = examples_dir + "wo_content.png" # image without content
|
86 |
+
pdf_blank = examples_dir + "blank.pdf" # blank PDF
|
87 |
+
image_blank = examples_dir + "blank.png" # blank image
|
88 |
+
|
89 |
+
## get langdetect2Tesseract dictionary
|
90 |
+
t = "files/languages_tesseract.csv"
l = "files/languages_iso.csv"

df_t = pd.read_csv(t)
df_l = pd.read_csv(l)

# Language names of both files are compared lowercased, trimmed and without punctuation.
_punctuation_table = str.maketrans('', '', string.punctuation)

langs_t = df_t["Language"].to_list()
langs_t = [lang_t.lower().strip().translate(_punctuation_table) for lang_t in langs_t]
langs_l = df_l["Language"].to_list()
langs_l = [lang_l.lower().strip().translate(_punctuation_table) for lang_l in langs_l]
langscode_t = df_t["LangCode"].to_list()
langscode_l = df_l["LangCode"].to_list()

# The Tesseract entry "Chinese - Simplified" is matched with the ISO entry "chinese".
_chinese_simplified = "Chinese - Simplified".lower().strip().translate(_punctuation_table)

# Tesseract language code -> ISO language code, for the names present in both files
Tesseract2langdetect, langdetect2Tesseract = dict(), dict()
for lang_t, langcode_t in zip(langs_t, langscode_t):
    if lang_t == _chinese_simplified:
        lang_t = "chinese"
    try:
        index = langs_l.index(lang_t)
    except ValueError:
        # this Tesseract language has no entry in the ISO list: leave it unmapped
        continue
    langcode_l = langscode_l[index]
    Tesseract2langdetect[langcode_t] = langcode_l

# reverse mapping: ISO language code -> Tesseract language code
langdetect2Tesseract = {v:k for k,v in Tesseract2langdetect.items()}
|
114 |
+
|
115 |
+
## General
|
116 |
+
|
117 |
+
# get text and bounding boxes from an image
|
118 |
+
# https://stackoverflow.com/questions/61347755/how-can-i-get-line-coordinates-that-readed-by-tesseract
|
119 |
+
# https://medium.com/geekculture/tesseract-ocr-understanding-the-contents-of-documents-beyond-their-text-a98704b7c655
|
120 |
+
def get_data(results, factor, conf_min=0):
    """Group OCR words into text lines and compute line and paragraph boxes.

    Args:
        results: dict of parallel lists with the keys 'block_num', 'par_num',
            'line_num', 'left', 'top', 'width', 'height', 'conf' and 'text',
            one entry per OCR item (presumably the dict output of
            pytesseract.image_to_data; confirm against the caller).
        factor: divisor applied to every box coordinate before truncating it to int.
        conf_min: items whose confidence is below this value are ignored.

    Returns:
        A tuple (lines, row_indexes, par_boxes, line_boxes):
        - lines: text of each line (its words joined with a space), in reading order;
        - row_indexes: one row number per paragraph, the row of its first line;
        - par_boxes: one [xmin, ymin, xmax, ymax] box per paragraph;
        - line_boxes: one [xmin, ymin, xmax, ymax] box per line.
    """
    # block_num -> par_num -> line_num -> list of (text, left, top, width, height)
    data = {}
    columns = ('block_num', 'par_num', 'line_num', 'left', 'top', 'width', 'height', 'conf', 'text')
    for block_num, par_num, line_num, left, top, width, height, conf, text in zip(*(results[column] for column in columns)):
        if text == '' or text.isspace():
            continue
        if not conf >= conf_min:
            continue
        word = (text, left, top, width, height)
        data.setdefault(block_num, {}).setdefault(par_num, {}).setdefault(line_num, []).append(word)

    # flatten the blocks: one list of lines (each a list of words) per paragraph
    paragraphs = [list(par.values()) for block in data.values() for par in block.values()]

    def scaled(xmin, ymin, xmax, ymax):
        # box coordinates divided by factor and truncated to int
        return [int(xmin/factor), int(ymin/factor), int(xmax/factor), int(ymax/factor)]

    lines, row_indexes = list(), list()
    par_boxes, line_boxes = list(), list()
    row_index = 0
    for par_lines in paragraphs:
        row_indexes.append(row_index)
        xmins, ymins, xmaxs, ymaxs = list(), list(), list(), list()
        for words in par_lines:
            lines.append(' '.join([word[0] for word in words]))
            # a line box goes from the top-left corner of its first word
            # to the bottom-right corner of its last word
            first, last = words[0], words[-1]
            xmin, ymin = first[1], first[2]
            xmax, ymax = (last[1] + last[3]), (last[2] + last[4])
            line_boxes.append(scaled(xmin, ymin, xmax, ymax))
            xmins.append(xmin)
            ymins.append(ymin)
            xmaxs.append(xmax)
            ymaxs.append(ymax)
        # a paragraph box is the union of the boxes of its lines
        par_boxes.append(scaled(min(xmins), min(ymins), max(xmaxs), max(ymaxs)))
        # one row per line plus one extra row per paragraph (numbering kept as is;
        # the extra row looks reserved for a paragraph separator - TODO confirm)
        row_index += len(par_lines) + 1

    return lines, row_indexes, par_boxes, line_boxes
|
200 |
+
|
201 |
+
# rescale image to get 300dpi
|
202 |
+
def set_image_dpi_resize(image):
    """
    Downscale an image to a width of at most 1024 pixels and save it as a PNG
    file with a dpi of 300 (images narrower than 1024 pixels keep their size).
    :param image: An image exposing ``size`` and ``resize`` (a PIL image)
    :return: The resize factor (at most 1) and the path of the saved PNG file.
        The file is not deleted by this function.
    """
    length_x, width_y = image.size
    factor = min(1, float(1024.0 / length_x))
    size = int(factor * length_x), int(factor * width_y)
    image_resize = image.resize(size, Image.Resampling.LANCZOS)
    # Only a unique file name is needed: the handle is closed right away so that
    # no file descriptor is leaked (delete=False keeps the file on disk).
    with tempfile.NamedTemporaryFile(delete=False, suffix='1.png') as temp_file:
        temp_filename = temp_file.name
    image_resize.save(temp_filename, dpi=(300, 300))
    return factor, temp_filename
|
216 |
+
|
217 |
+
# it is important that each bounding box should be in (upper left, lower right) format.
|
218 |
+
# source: https://github.com/NielsRogge/Transformers-Tutorials/issues/129
|
219 |
+
def upperleft_to_lowerright(bbox):
    """Return the box as [x0, y0, x1, y1] with x0 <= x1 and y0 <= y1.

    The coordinates of an axis are swapped when they come in reverse order.
    """
    left, top, right, bottom = tuple(bbox)
    if right < left:
        left, right = right, left
    if bottom < top:
        top, bottom = bottom, top
    return [left, top, right, bottom]
|
228 |
+
|
229 |
+
# convert bounding boxes from (left, top, width, height) format to (left, top, left+width, top+height) format.
|
230 |
+
def convert_box(bbox):
    """Turn a (left, top, width, height) box into [left, top, right, bottom]."""
    left, top, box_width, box_height = tuple(bbox)
    right = left + box_width
    bottom = top + box_height
    return [left, top, right, bottom]
|
233 |
+
|
234 |
+
# LiLT model gets bounding boxes normalized to a 1000x1000 coordinate space
|
235 |
+
def normalize_box(bbox, width, height):
    """Rescale a box given in pixels to the 0-1000 range on both axes.

    x coordinates are divided by ``width`` and y coordinates by ``height``;
    the results are truncated to int.
    """
    x0, y0, x1, y1 = bbox[0], bbox[1], bbox[2], bbox[3]
    scaled = (
        1000 * (x0 / width),
        1000 * (y0 / height),
        1000 * (x1 / width),
        1000 * (y1 / height),
    )
    return [int(value) for value in scaled]
|
242 |
+
|
243 |
+
# convert a box from the 1000x1000 normalized space of the LiLT model back to image pixels
|
244 |
+
def denormalize_box(bbox, width, height):
    """Rescale a box given in the 0-1000 range back to pixel coordinates.

    x coordinates are multiplied by ``width`` and y coordinates by ``height``;
    the results are truncated to int.
    """
    x0, y0, x1, y1 = bbox[0], bbox[1], bbox[2], bbox[3]
    scaled = (
        width * (x0 / 1000),
        height * (y0 / 1000),
        width * (x1 / 1000),
        height * (y1 / 1000),
    )
    return [int(value) for value in scaled]
|
251 |
+
|
252 |
+
# get back original size
|
253 |
+
def original_box(box, original_width, original_height, coco_width, coco_height):
    """Rescale a box from a (coco_width x coco_height) image to the original image size.

    Each coordinate is divided by the matching coco dimension, multiplied by
    the matching original dimension and truncated to int.
    """
    x0, y0, x1, y1 = box[0], box[1], box[2], box[3]
    scaled = (
        original_width * (x0 / coco_width),
        original_height * (y0 / coco_height),
        original_width * (x1 / coco_width),
        original_height * (y1 / coco_height),
    )
    return [int(value) for value in scaled]
|
260 |
+
|
261 |
+
def get_blocks(bboxes_block, categories, texts):
    """Merge the items that share the same block box.

    Args:
        bboxes_block: block box of each item.
        categories: category id of each item (a key of ``id2label``).
        texts: text of each item.

    Returns:
        (boxes, categories, texts) with one entry per run of identical
        consecutive block boxes. The category is the one of the first item
        with that box; the texts of all the items with that box are joined
        with a space for the labels Text, Caption and Footnote, and with a
        newline for every other label.
    """
    # labels whose lines are glued with a space instead of a newline
    space_joined_labels = ("Text", "Caption", "Footnote")

    # Pass 1: each time the box changes, record all the positions holding the new box
    members_by_start = dict()
    unique_boxes = list()
    previous_box = list()
    for position, current_box in enumerate(bboxes_block):
        if current_box != previous_box:
            members_by_start[position] = [
                idx for idx, candidate in enumerate(bboxes_block) if candidate == current_box
            ]
            unique_boxes.append(current_box)
        previous_box = current_box

    # Pass 2: category and merged text of each recorded box
    merged_categories = list()
    merged_texts = list()
    for current_box in unique_boxes:
        # members are looked up through the first position of the box
        members = members_by_start[bboxes_block.index(current_box)]

        category = np.array(categories, dtype=object)[members].tolist()[0]
        merged_categories.append(category)

        pieces = np.array(texts, dtype=object)[members].tolist()
        pieces = [piece.replace("\n","").strip() for piece in pieces]
        separator = ' ' if id2label[category] in space_joined_labels else '\n'
        merged_texts.append(separator.join(pieces))

    return unique_boxes, merged_categories, merged_texts
|
288 |
+
|
289 |
+
# function to sort bounding boxes
|
290 |
+
def get_sorted_boxes(bboxes):
    """Return the boxes ordered from page top to bottom, then left to right.

    Boxes are compared on item 1 (y) first and on item 0 (x) when the y
    values are equal. The sort is stable, so boxes sharing both values keep
    their input order.

    Args:
        bboxes: Iterable of indexable boxes with at least two items, [x, y, ...].

    Returns:
        A new list holding the same box objects in sorted order. The boxes
        themselves are never copied or converted, whether or not ties exist.
    """
    # One stable two-key sort replaces the former "sort by y, then re-sort
    # every group of equal y through a numpy round-trip" procedure, which
    # produced the same ordering at a much higher cost.
    return sorted(bboxes, key=itemgetter(1, 0))
|
308 |
+
|
309 |
+
# sort data from y = 0 to end of page (and after, x=0 to end of page when necessary)
|
310 |
+
def sort_data(bboxes, categories, texts):
    """Sort boxes top-to-bottom then left-to-right, keeping categories and texts aligned.

    The ordering is the one used by ``get_sorted_boxes``: by y (item 1), then
    by x (item 0), stable for boxes equal on both.

    Args:
        bboxes: List of boxes [x, y, ...].
        categories: One category per box, same order as ``bboxes``.
        texts: One text per box, same order as ``bboxes``.

    Returns:
        A tuple ``(sorted_bboxes, sorted_categories, sorted_texts)`` of lists.
    """
    # Sort positions rather than boxes. Looking every sorted box up again with
    # list.index() returns the FIRST equal box, so lines sharing an identical
    # box all received the category and text of the first of them.
    order = sorted(range(len(bboxes)), key=lambda pos: (bboxes[pos][1], bboxes[pos][0]))

    sorted_bboxes = [bboxes[pos] for pos in order]
    sorted_categories = [categories[pos] for pos in order]
    sorted_texts = [texts[pos] for pos in order]

    return sorted_bboxes, sorted_categories, sorted_texts
|
318 |
+
|
319 |
+
# sort data from y = 0 to end of page (and after, x=0 to end of page when necessary)
|
320 |
+
def sort_data_wo_labels(bboxes, texts):
    """Sort boxes top-to-bottom then left-to-right, keeping texts aligned.

    The ordering is the one used by ``get_sorted_boxes``: by y (item 1), then
    by x (item 0), stable for boxes equal on both.

    Args:
        bboxes: List of boxes [x, y, ...].
        texts: One text per box, same order as ``bboxes``.

    Returns:
        A tuple ``(sorted_bboxes, sorted_texts)`` of lists.
    """
    # Sort positions rather than boxes. list.index() on a sorted box returns
    # the FIRST equal box, which duplicated the first text (and dropped the
    # others) whenever two lines had an identical box.
    order = sorted(range(len(bboxes)), key=lambda pos: (bboxes[pos][1], bboxes[pos][0]))

    sorted_bboxes = [bboxes[pos] for pos in order]
    sorted_texts = [texts[pos] for pos in order]

    return sorted_bboxes, sorted_texts
|
327 |
+
|
328 |
+
## PDF processing
|
329 |
+
|
330 |
+
# get filename and images of PDF pages
|
331 |
+
def pdf_to_images(uploaded_pdf):
    """Convert an uploaded PDF into page images.

    Args:
        uploaded_pdf: Uploaded file object exposing its path as ``.name``, or None.

    Returns:
        A tuple ``(filename, msg, images)``:
        - no upload, or a file ``PdfReader`` rejects with ``PdfReadError``:
          the blank PDF name, "Invalid PDF file." and the blank image;
        - conversion failure: the uploaded file name, an error message and
          the "without content" image;
        - success: the uploaded file name, a confirmation message and the
          page images (``last_page`` is capped at ``max_imgboxes``).
    """

    def _blank_result():
        # Fallback shared by "nothing uploaded" and "not a readable PDF".
        return pdf_blank.replace(examples_dir, ""), "Invalid PDF file.", [Image.open(image_blank)]

    # Check if None object
    if uploaded_pdf is None:
        return _blank_result()

    # path to the uploaded PDF
    path_to_file = uploaded_pdf.name
    filename = path_to_file.replace("/tmp/", "")

    # Parse the file once only to validate it; the reader itself is not used.
    try:
        PdfReader(path_to_file)
    except PdfReadError:
        return _blank_result()

    try:
        images = convert_from_path(path_to_file, last_page=max_imgboxes)
    except Exception:
        # Best effort: any conversion error is reported to the user instead of
        # raised. `Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        msg = f'Error with the PDF "{filename}": it was not converted into images.'
        images = [Image.open(image_wo_content)]
    else:
        num_imgs = len(images)
        msg = f'The PDF "{filename}" was converted into {num_imgs} images.'

    return filename, msg, images
|
361 |
+
|
362 |
+
# Extraction of image data (text and bounding boxes)
|
363 |
+
def extraction_data_from_image(images):
    """Run Tesseract OCR on every page image and collect line-level data.

    Each page is resized by ``set_image_dpi_resize``, converted to a binary
    grayscale image, OCR'ed once to detect its languages, then OCR'ed again
    with the detected languages to obtain the data passed to ``get_data``.

    Args:
        images: Sequence of page images (PIL), one per page.

    Returns:
        A tuple ``(dataset, lines, row_indexes, par_boxes, line_boxes)``.
        ``dataset`` is a ``datasets.Dataset`` with the columns "images_ids",
        "images", "page_no", "num_pages", "texts" and "bboxes_line" (one row
        per page). The four dicts are keyed by page index and hold what
        ``get_data`` returned for that page. With no image, the dataset has
        no row and the dicts are empty.

    Raises:
        Exception: Any error raised while processing a page is re-raised
            after the error message has been printed. Previously the function
            failed afterwards with an unrelated ``UnboundLocalError`` because
            ``dataset`` was never assigned.
    """
    num_imgs = len(images)

    # https://pyimagesearch.com/2021/11/15/tesseract-page-segmentation-modes-psms-explained-how-to-improve-your-ocr-accuracy/
    custom_config = r'--oem 3 --psm 3 -l eng'  # default config PyTesseract: --oem 3 --psm 3 -l eng+deu+fra+jpn+por+spa+rus+hin+chi_sim
    lines, row_indexes, par_boxes, line_boxes = {}, {}, {}, {}
    images_ids_list, lines_list, line_boxes_list = [], [], []
    images_list, page_no_list, num_pages_list = [], [], []

    try:
        for i, image in enumerate(images):
            # image preprocessing
            # https://docs.opencv.org/3.0-beta/doc/py_tutorials/py_imgproc/py_thresholding/py_thresholding.html
            factor, path_to_img = set_image_dpi_resize(image.copy())  # Rescaling to 300dpi
            with Image.open(path_to_img) as resized:
                img = np.array(resized, dtype='uint8')  # convert PIL to cv2
            # NOTE(review): PIL arrays are presumably RGB while this constant
            # assumes BGR. Kept as is to leave the OCR input unchanged; confirm.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # gray scale image
            _, img = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY)

            # OCR PyTesseract | get langs of page
            # NOTE(review): from the second page on, `custom_config` still holds
            # the languages detected for the previous page. Confirm intended.
            txt = pytesseract.image_to_string(img, config=custom_config)
            txt = txt.strip().lower()
            txt = re.sub(r" +", " ", txt)  # multiple space
            txt = re.sub(r"(\n\s*)+\n+", "\n", txt)  # multiple line
            try:
                langs_string = '+'.join(langdetect2Tesseract[lang.lang] for lang in detect_langs(txt))
            except Exception:
                # Detection failed or the language has no Tesseract mapping.
                langs_string = "eng"
            langs_string += '+osd'
            custom_config = f'--oem 3 --psm 3 -l {langs_string}'  # default config PyTesseract: --oem 3 --psm 3

            # OCR PyTesseract | get data
            page_data = pytesseract.image_to_data(img, config=custom_config, output_type=pytesseract.Output.DICT)

            lines[i], row_indexes[i], par_boxes[i], line_boxes[i] = get_data(page_data, factor, conf_min=0)
            lines_list.append(lines[i])
            line_boxes_list.append(line_boxes[i])
            images_ids_list.append(i)
            images_list.append(image)
            page_no_list.append(i)
            num_pages_list.append(num_imgs)

    except Exception:
        print("There was an error within the extraction of PDF text by the OCR!")
        # Surface the real cause instead of failing later on an unbound name.
        raise

    # Local import: the module-level name `Dataset` is bound to the torch class.
    from datasets import Dataset
    dataset = Dataset.from_dict({"images_ids": images_ids_list, "images": images_list, "page_no": page_no_list, "num_pages": num_pages_list, "texts": lines_list, "bboxes_line": line_boxes_list})

    return dataset, lines, row_indexes, par_boxes, line_boxes
|
422 |
+
|
423 |
+
## Inference
|
424 |
+
|
425 |
+
def prepare_inference_features(example, cls_box=cls_box, sep_box=sep_box):
    """Tokenize the OCR lines of each page into overlapping, padded chunks.

    For every page, the line boxes are converted with
    ``upperleft_to_lowerright`` and ``normalize_box`` and sorted from top to
    bottom together with their texts. Each line box is then repeated once per
    token of the line. The joined page text is tokenized with
    ``padding="max_length"``, ``stride=doc_stride`` and
    ``return_overflowing_tokens=True``, so a long page yields several chunks.

    Args:
        example: Mapping with the keys "images_ids", "images", "bboxes_line"
            and "texts", holding either a batch (lists) or a single page.
        cls_box: Box placed at the first position of every chunk.
        sep_box: Box placed after the chunk tokens and used as padding.

    Returns:
        A dict of equally long lists with one entry per chunk: "images_ids",
        "chunk_ids", "input_ids", "attention_mask" and "normalized_bboxes".
    """
    images_ids_list, chunks_ids_list, input_ids_list, attention_mask_list, bb_list = [], [], [], [], []

    # get batch
    batch_images_ids = example["images_ids"]
    batch_images = example["images"]
    batch_bboxes_line = example["bboxes_line"]
    batch_texts = example["texts"]

    # add a dimension if not a batch but only one image
    if not isinstance(batch_images_ids, list):
        batch_images_ids = [batch_images_ids]
        batch_images = [batch_images]
        batch_bboxes_line = [batch_bboxes_line]
        batch_texts = [batch_texts]

    # Sizes are read only once `batch_images` is guaranteed to be a list:
    # reading them before the wrapping above iterated over a single image.
    batch_width = [image.size[0] for image in batch_images]
    batch_height = [image.size[1] for image in batch_images]

    # process all images of the batch
    for image_id, boxes, texts, width, height in zip(batch_images_ids, batch_bboxes_line, batch_texts, batch_width, batch_height):

        # add a dimension if the page holds a single line
        if not isinstance(texts, list):
            texts, boxes = [texts], [boxes]

        # convert and normalize the line boxes
        normalize_bboxes_line = [normalize_box(upperleft_to_lowerright(box), width, height) for box in boxes]

        # sort boxes with texts
        # we want sorted lists from top to bottom of the image
        boxes, texts = sort_data_wo_labels(normalize_bboxes_line, texts)

        # number of boxes must be the same as the number of tokens
        bboxes_list = []
        for box, text in zip(boxes, texts):
            bboxes_list.extend([box] * len(tokenizer.tokenize(text)))

        # use of return_overflowing_tokens=True / stride=doc_stride
        # to get parts of image with overlap
        # source: https://huggingface.co/course/chapter6/3b?fw=tf#handling-long-contexts
        encodings = tokenizer(" ".join(texts),
                              truncation=True,
                              padding="max_length",
                              max_length=max_length,
                              stride=doc_stride,
                              return_overflowing_tokens=True,
                              return_offsets_mapping=True
                              )

        encodings.pop("overflow_to_sample_mapping")
        offset_mapping = encodings.pop("offset_mapping")  # one entry per chunk

        # get the boxes of every chunk
        sequence_length_prev = 0
        for i in range(len(offset_mapping)):
            # chunk length without the 2 special tokens (<s> and </s>)
            sequence_length = len(encodings.input_ids[i]) - 2
            if i == 0:
                start = 0
            else:
                # consecutive chunks share `doc_stride` tokens
                start += sequence_length_prev - doc_stride
            end = start + sequence_length
            sequence_length_prev = sequence_length

            # boxes of this chunk, framed by the special-token boxes
            bb = [cls_box] + bboxes_list[start:end] + [sep_box]

            # the last chunk can hold fewer than max_length boxes:
            # pad with [sep_box] to match the padded input ids
            if len(bb) < max_length:
                bb = bb + [sep_box] * (max_length - len(bb))

            # append results
            input_ids_list.append(encodings["input_ids"][i])
            attention_mask_list.append(encodings["attention_mask"][i])
            bb_list.append(bb)
            images_ids_list.append(image_id)
            chunks_ids_list.append(i)

    return {
        "images_ids": images_ids_list,
        "chunk_ids": chunks_ids_list,
        "input_ids": input_ids_list,
        "attention_mask": attention_mask_list,
        "normalized_bboxes": bb_list,
    }
|
517 |
+
|
518 |
+
from torch.utils.data import Dataset
|
519 |
+
|
520 |
+
class CustomDataset(Dataset):
    """Torch dataset exposing the encoded chunks under the keys used at inference.

    Args:
        dataset: Indexable collection of encoded chunks (mappings).
        tokenizer: Tokenizer stored on the instance; not used by this class.
    """

    def __init__(self, dataset, tokenizer):
        self.dataset = dataset
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of encoded chunks."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return chunk ``idx`` with "normalized_bboxes" exposed as "bbox"."""
        row = self.dataset[idx]
        return {
            "images_ids": row["images_ids"],
            "chunk_ids": row["chunk_ids"],
            "input_ids": row["input_ids"],
            "attention_mask": row["attention_mask"],
            "bbox": row["normalized_bboxes"],
        }
|
539 |
+
|
540 |
+
import torch.nn.functional as F
|
541 |
+
|
542 |
+
# get predictions at token level
|
543 |
+
def predictions_token_level(images, custom_encoded_dataset):
    """Run the model on every encoded chunk and gather the results per image.

    Args:
        images: Page images; only their number is checked.
        custom_encoded_dataset: Iterable of chunks with the keys "images_ids",
            "chunk_ids", "input_ids", "attention_mask" and "bbox".

    Returns:
        ``(outputs, images_ids_list, chunk_ids, input_ids, bboxes)`` where
        ``images_ids_list`` lists the image ids in order of first appearance
        and the four dicts map an image id to one entry per chunk (softmax of
        the squeezed logits, chunk id, input-id tensor, box tensor).
        Returns None, after printing a message, when ``images`` is empty.
    """
    if len(images) == 0:
        print("An error occurred while getting predictions!")
        return None

    outputs, chunk_ids, input_ids, bboxes = {}, {}, {}, {}
    images_ids_list = []

    for encoding in custom_encoded_dataset:

        # get custom encoded data ([None] adds the leading batch dimension)
        image_id = encoding['images_ids']
        chunk_id = encoding['chunk_ids']
        input_id = torch.tensor(encoding['input_ids'])[None]
        attention_mask = torch.tensor(encoding['attention_mask'])[None]
        bbox = torch.tensor(encoding['bbox'])[None]

        # save data in dictionaries
        if image_id not in images_ids_list:
            images_ids_list.append(image_id)
        chunk_ids.setdefault(image_id, []).append(chunk_id)
        input_ids.setdefault(image_id, []).append(input_id)
        bboxes.setdefault(image_id, []).append(bbox)

        # get prediction with forward pass
        with torch.no_grad():
            output = model(
                input_ids=input_id,
                attention_mask=attention_mask,
                bbox=bbox
            )

        # save probabilities of predictions in dictionary
        outputs.setdefault(image_id, []).append(F.softmax(output.logits.squeeze(), dim=-1))

    return outputs, images_ids_list, chunk_ids, input_ids, bboxes
|
588 |
+
|
589 |
+
from functools import reduce
|
590 |
+
|
591 |
+
# Get predictions (line level)
|
592 |
+
def predictions_line_level(dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes):
    """Merge the chunk predictions of each image and pick one label per line box.

    For every image, the chunk probabilities, input ids and boxes (special
    first/last positions removed) are written into page-long tensors; the
    ``doc_stride`` positions shared by two consecutive chunks get the mean of
    both probabilities. Consecutive tokens with the same denormalized box form
    a line, whose label is the one with the highest product of token
    probabilities.

    Args:
        dataset: Dataset with the columns "images_ids" and "images".
        outputs: Image id -> list of per-chunk probability tensors.
        images_ids_list: Image ids to process.
        chunk_ids: Image id -> list of chunk ids (looked up, not used further).
        input_ids: Image id -> list of per-chunk input-id tensors.
        bboxes: Image id -> list of per-chunk box tensors.

    Returns:
        ``(probs_bbox, bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df)``.
        ``probs_bbox`` is the box -> label-index dict of the LAST image only;
        the other four are keyed by image id. Returns None, after printing a
        message, when ``images_ids_list`` is empty.
    """
    if len(images_ids_list) == 0:
        print("An error occurred while getting predictions!")
        return None

    bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df = {}, {}, {}, {}

    for image_id in images_ids_list:

        # get image information
        page = dataset.filter(lambda example: example["images_ids"] == image_id)["images"][0]
        width, height = page.size

        # get data
        chunk_ids_list = chunk_ids[image_id]  # same lookup as before; value unused
        outputs_list = outputs[image_id]
        input_ids_list = input_ids[image_id]
        chunk_boxes_list = bboxes[image_id]

        # create the page-long tensors
        num_positions = (outputs_list[0].shape[0] - 2) * len(outputs_list)
        ten_probs = torch.zeros(num_positions, outputs_list[0].shape[1])
        ten_input_ids = torch.ones(size=(1, num_positions), dtype=int)
        ten_bboxes = torch.zeros(size=(1, num_positions, 4), dtype=int)

        if len(outputs_list) > 1:

            for num_output, (output, input_id, bbox) in enumerate(zip(outputs_list, input_ids_list, chunk_boxes_list)):
                start = num_output*(max_length - 2) - max(0, num_output)*doc_stride
                end = start + (max_length - 2)
                overlap_end = start + doc_stride

                if num_output == 0:
                    ten_probs[start:end, :] += output[1:-1]
                    ten_input_ids[:, start:end] = input_id[:, 1:-1]
                    ten_bboxes[:, start:end, :] = bbox[:, 1:-1, :]
                else:
                    # positions shared with the previous chunk: mean of both
                    ten_probs[start:overlap_end, :] += output[1:1 + doc_stride]
                    ten_probs[start:overlap_end, :] = ten_probs[start:overlap_end, :] * 0.5
                    ten_probs[overlap_end:end, :] += output[1 + doc_stride:-1]

                    ten_input_ids[:, start:overlap_end] = input_id[:, 1:1 + doc_stride]
                    ten_input_ids[:, overlap_end:end] = input_id[:, 1 + doc_stride:-1]

                    ten_bboxes[:, start:overlap_end, :] = bbox[:, 1:1 + doc_stride, :]
                    ten_bboxes[:, overlap_end:end, :] = bbox[:, 1 + doc_stride:-1, :]

        else:
            ten_probs += outputs_list[0][1:-1]
            ten_input_ids = input_ids_list[0][:, 1:-1]
            ten_bboxes = chunk_boxes_list[0][:, 1:-1]

        # group consecutive tokens sharing the same box (cls_box is ignored)
        line_boxes = []
        input_ids_dict, probs_dict = {}, {}
        bbox_prev = [-100, -100, -100, -100]
        for probs, input_id, bbox in zip(ten_probs.tolist(), ten_input_ids.tolist()[0], ten_bboxes.tolist()[0]):
            bbox = denormalize_box(bbox, width, height)
            if bbox != cls_box:
                key = str(bbox)
                if bbox != bbox_prev:
                    line_boxes.append(bbox)
                    input_ids_dict[key] = [input_id]
                    probs_dict[key] = [probs]
                else:
                    input_ids_dict[key].append(input_id)
                    probs_dict[key].append(probs)
            bbox_prev = bbox

        # label of a line = label with the highest product of token probabilities
        probs_bbox = {}
        for bbox in line_boxes:
            per_label = np.array(probs_dict[str(bbox)]).T.tolist()
            label_scores = [reduce(lambda x, y: x*y, token_probs) for token_probs in per_label]
            probs_bbox[str(bbox)] = label_scores.index(max(label_scores))

        bboxes_list_dict[image_id] = line_boxes
        input_ids_dict_dict[image_id] = input_ids_dict
        probs_dict_dict[image_id] = probs_bbox

        df[image_id] = pd.DataFrame()
        df[image_id]["bboxes"] = line_boxes
        df[image_id]["texts"] = [tokenizer.decode(input_ids_dict[str(bbox)]) for bbox in line_boxes]
        df[image_id]["labels"] = [id2label[probs_bbox[str(bbox)]] for bbox in line_boxes]

    return probs_bbox, bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df
|
685 |
+
|
686 |
+
# Get labeled images with lines bounding boxes
|
687 |
+
def get_labeled_images(dataset, images_ids_list, bboxes_list_dict, probs_dict_dict):
    """Draw the predicted line boxes and their labels on each page image.

    Args:
        dataset: Dataset with the columns "images_ids" and "images".
        images_ids_list: Ids of the images to annotate.
        bboxes_list_dict: Image id -> list of line boxes.
        probs_dict_dict: Image id -> dict mapping ``str(box)`` to a label index
            (key of the module-level ``id2label``).

    Returns:
        The list of annotated images, in the order of ``images_ids_list``
        (an empty list when there is no image id).
    """
    labeled_images = []
    if not images_ids_list:
        return labeled_images

    # The font does not depend on the page: resolve and load it once instead
    # of once per image.
    # https://stackoverflow.com/questions/66274858/choosing-a-pil-imagefont-by-font-name-rather-than-filename-and-cross-platform-f
    font_size = 30
    font_file = font_manager.findfont(font_manager.FontProperties(family='sans-serif', weight='bold'))
    font = ImageFont.truetype(font_file, font_size)

    for image_id in images_ids_list:

        # get image
        image = dataset.filter(lambda example: example["images_ids"] == image_id)["images"][0]

        # get predicted boxes and labels
        bboxes_list = bboxes_list_dict[image_id]
        probs_bbox = probs_dict_dict[image_id]

        draw = ImageDraw.Draw(image)
        for bbox in bboxes_list:
            predicted_label = id2label[probs_bbox[str(bbox)]]
            color = label2color[predicted_label]
            draw.rectangle(bbox, outline=color)
            # label written just above the top-left corner of the box
            draw.text((bbox[0] + 10, bbox[1] - font_size), text=predicted_label, fill=color, font=font)

        labeled_images.append(image)

    return labeled_images
|
717 |
+
|
718 |
+
# get data of encoded chunk
|
719 |
+
def get_encoded_chunk_inference(index_chunk=None):
    """Return the page image and the line-level data of one encoded chunk.

    Reads the module-level ``dataset`` and ``encoded_dataset``.

    Args:
        index_chunk: Index of the chunk in ``encoded_dataset``. A random chunk
            is picked when None.

    Returns:
        ``(image, df, num_tokens, page_no, num_pages)`` where ``df`` has the
        columns "texts", "input_ids" and "bboxes" (one row per run of tokens
        sharing the same box) and ``num_tokens`` counts the chunk's input ids.
    """
    # pick a random chunk when none is requested
    if index_chunk is None:
        index_chunk = random.randint(0, len(encoded_dataset) - 1)
    encoded_example = encoded_dataset[index_chunk]
    encoded_image_ids = encoded_example["images_ids"]

    # get the image
    page = dataset.filter(lambda row: row["images_ids"] == encoded_image_ids)[0]
    image = page["images"]  # original image
    width, height = image.size
    page_no = page["page_no"]
    num_pages = page["num_pages"]

    # get boxes and input ids without the first and last positions
    bboxes = encoded_example["normalized_bboxes"][1:-1]
    input_ids = encoded_example["input_ids"][1:-1]
    bboxes = [denormalize_box(bbox, width, height) for bbox in bboxes]
    num_tokens = len(input_ids) + 2

    # group consecutive tokens sharing the same box
    bboxes_list = []
    input_ids_dict = {}
    bbox_prev = [-100, -100, -100, -100]
    for bbox, input_id in zip(bboxes, input_ids):
        if bbox != bbox_prev:
            bboxes_list.append(bbox)
            input_ids_dict[str(bbox)] = [input_id]
        else:
            input_ids_dict[str(bbox)].append(input_id)
        bbox_prev = bbox

    # do not keep "</s><pad><pad>..."
    # (the emptiness guard avoids an IndexError on a chunk without any box)
    if bboxes_list and input_ids_dict[str(bboxes_list[-1])][0] == tokenizer.convert_tokens_to_ids('</s>'):
        del input_ids_dict[str(bboxes_list[-1])]
        bboxes_list = bboxes_list[:-1]

    # get texts by line
    input_ids_list = list(input_ids_dict.values())
    texts_list = [tokenizer.decode(line_input_ids) for line_input_ids in input_ids_list]

    # display DataFrame
    df = pd.DataFrame({"texts": texts_list, "input_ids": input_ids_list, "bboxes": bboxes_list})

    return image, df, num_tokens, page_no, num_pages
|
769 |
+
|
770 |
+
# display chunk of PDF image and its data
|
771 |
+
def display_chunk_lines_inference(index_chunk=None):
    """Show one encoded chunk: its page with the line boxes drawn, then its lines.

    Args:
        index_chunk: Index forwarded to ``get_encoded_chunk_inference``.
    """
    # get image and image data
    image, df, num_tokens, page_no, num_pages = get_encoded_chunk_inference(index_chunk=index_chunk)

    print(f'Chunk ({num_tokens} tokens) of the PDF (page: {page_no+1} / {num_pages})\n')

    # display image with bounding boxes
    print(">> PDF image with bounding boxes of lines\n")
    draw = ImageDraw.Draw(image)
    for box in df["bboxes"]:
        draw.rectangle(box, outline="red")

    # halve both dimensions before display
    width, height = image.size
    image = image.resize((int(0.5*width), int(0.5*height)))

    # convert to cv and display
    img = np.array(image, dtype='uint8')  # PIL to cv2
    cv2_imshow(img)
    cv2.waitKey(0)

    # display image dataframe
    print("\n>> Dataframe of annotated lines\n")
    display(df[["texts", "bboxes"]])
|
files/languages_iso.csv
ADDED
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Language,LangCode
|
2 |
+
Abkhazian,ab
|
3 |
+
Afar,aa
|
4 |
+
Afrikaans,af
|
5 |
+
Akan,ak
|
6 |
+
Albanian,sq
|
7 |
+
Amharic,am
|
8 |
+
Arabic,ar
|
9 |
+
Aragonese,an
|
10 |
+
Armenian,hy
|
11 |
+
Assamese,as
|
12 |
+
Avaric,av
|
13 |
+
Avestan,ae
|
14 |
+
Aymara,ay
|
15 |
+
Azerbaijani,az
|
16 |
+
Bambara,bm
|
17 |
+
Bashkir,ba
|
18 |
+
Basque,eu
|
19 |
+
Belarusian,be
|
20 |
+
Bengali,bn
|
21 |
+
Bislama,bi
|
22 |
+
Bosnian,bs
|
23 |
+
Breton,br
|
24 |
+
Bulgarian,bg
|
25 |
+
Burmese,my
|
26 |
+
"Catalan, Valencian",ca
|
27 |
+
Chamorro,ch
|
28 |
+
Chechen,ce
|
29 |
+
"Chichewa, Chewa, Nyanja",ny
|
30 |
+
Chinese,zh
|
31 |
+
"Church Slavonic, Old Slavonic, Old Church Slavonic",cu
|
32 |
+
Chuvash,cv
|
33 |
+
Cornish,kw
|
34 |
+
Corsican,co
|
35 |
+
Cree,cr
|
36 |
+
Croatian,hr
|
37 |
+
Czech,cs
|
38 |
+
Danish,da
|
39 |
+
"Divehi, Dhivehi, Maldivian",dv
|
40 |
+
"Dutch, Flemish",nl
|
41 |
+
Dzongkha,dz
|
42 |
+
English,en
|
43 |
+
Esperanto,eo
|
44 |
+
Estonian,et
|
45 |
+
Ewe,ee
|
46 |
+
Faroese,fo
|
47 |
+
Fijian,fj
|
48 |
+
Finnish,fi
|
49 |
+
French,fr
|
50 |
+
Western Frisian,fy
|
51 |
+
Fulah,ff
|
52 |
+
"Gaelic, Scottish Gaelic",gd
|
53 |
+
Galician,gl
|
54 |
+
Ganda,lg
|
55 |
+
Georgian,ka
|
56 |
+
German,de
|
57 |
+
"Greek, Modern (1453–)",el
|
58 |
+
"Kalaallisut, Greenlandic",kl
|
59 |
+
Guarani,gn
|
60 |
+
Gujarati,gu
|
61 |
+
"Haitian, Haitian Creole",ht
|
62 |
+
Hausa,ha
|
63 |
+
Hebrew,he
|
64 |
+
Herero,hz
|
65 |
+
Hindi,hi
|
66 |
+
Hiri Motu,ho
|
67 |
+
Hungarian,hu
|
68 |
+
Icelandic,is
|
69 |
+
Ido,io
|
70 |
+
Igbo,ig
|
71 |
+
Indonesian,id
|
72 |
+
Interlingua (International Auxiliary Language Association),ia
|
73 |
+
"Interlingue, Occidental",ie
|
74 |
+
Inuktitut,iu
|
75 |
+
Inupiaq,ik
|
76 |
+
Irish,ga
|
77 |
+
Italian,it
|
78 |
+
Japanese,ja
|
79 |
+
Javanese,jv
|
80 |
+
Kannada,kn
|
81 |
+
Kanuri,kr
|
82 |
+
Kashmiri,ks
|
83 |
+
Kazakh,kk
|
84 |
+
Central Khmer,km
|
85 |
+
"Kikuyu, Gikuyu",ki
|
86 |
+
Kinyarwanda,rw
|
87 |
+
"Kirghiz, Kyrgyz",ky
|
88 |
+
Komi,kv
|
89 |
+
Kongo,kg
|
90 |
+
Korean,ko
|
91 |
+
"Kuanyama, Kwanyama",kj
|
92 |
+
Kurdish,ku
|
93 |
+
Lao,lo
|
94 |
+
Latin,la
|
95 |
+
Latvian,lv
|
96 |
+
"Limburgan, Limburger, Limburgish",li
|
97 |
+
Lingala,ln
|
98 |
+
Lithuanian,lt
|
99 |
+
Luba-Katanga,lu
|
100 |
+
"Luxembourgish, Letzeburgesch",lb
|
101 |
+
Macedonian,mk
|
102 |
+
Malagasy,mg
|
103 |
+
Malay,ms
|
104 |
+
Malayalam,ml
|
105 |
+
Maltese,mt
|
106 |
+
Manx,gv
|
107 |
+
Maori,mi
|
108 |
+
Marathi,mr
|
109 |
+
Marshallese,mh
|
110 |
+
Mongolian,mn
|
111 |
+
Nauru,na
|
112 |
+
"Navajo, Navaho",nv
|
113 |
+
North Ndebele,nd
|
114 |
+
South Ndebele,nr
|
115 |
+
Ndonga,ng
|
116 |
+
Nepali,ne
|
117 |
+
Norwegian,no
|
118 |
+
Norwegian Bokmål,nb
|
119 |
+
Norwegian Nynorsk,nn
|
120 |
+
"Sichuan Yi, Nuosu",ii
|
121 |
+
Occitan,oc
|
122 |
+
Ojibwa,oj
|
123 |
+
Oriya,or
|
124 |
+
Oromo,om
|
125 |
+
"Ossetian, Ossetic",os
|
126 |
+
Pali,pi
|
127 |
+
"Pashto, Pushto",ps
|
128 |
+
Persian,fa
|
129 |
+
Polish,pl
|
130 |
+
Portuguese,pt
|
131 |
+
"Punjabi, Panjabi",pa
|
132 |
+
Quechua,qu
|
133 |
+
"Romanian, Moldavian, Moldovan",ro
|
134 |
+
Romansh,rm
|
135 |
+
Rundi,rn
|
136 |
+
Russian,ru
|
137 |
+
Northern Sami,se
|
138 |
+
Samoan,sm
|
139 |
+
Sango,sg
|
140 |
+
Sanskrit,sa
|
141 |
+
Sardinian,sc
|
142 |
+
Serbian,sr
|
143 |
+
Shona,sn
|
144 |
+
Sindhi,sd
|
145 |
+
"Sinhala, Sinhalese",si
|
146 |
+
Slovak,sk
|
147 |
+
Slovenian,sl
|
148 |
+
Somali,so
|
149 |
+
Southern Sotho,st
|
150 |
+
"Spanish, Castilian",es
|
151 |
+
Sundanese,su
|
152 |
+
Swahili,sw
|
153 |
+
Swati,ss
|
154 |
+
Swedish,sv
|
155 |
+
Tagalog,tl
|
156 |
+
Tahitian,ty
|
157 |
+
Tajik,tg
|
158 |
+
Tamil,ta
|
159 |
+
Tatar,tt
|
160 |
+
Telugu,te
|
161 |
+
Thai,th
|
162 |
+
Tibetan,bo
|
163 |
+
Tigrinya,ti
|
164 |
+
Tonga (Tonga Islands),to
|
165 |
+
Tsonga,ts
|
166 |
+
Tswana,tn
|
167 |
+
Turkish,tr
|
168 |
+
Turkmen,tk
|
169 |
+
Twi,tw
|
170 |
+
"Uighur, Uyghur",ug
|
171 |
+
Ukrainian,uk
|
172 |
+
Urdu,ur
|
173 |
+
Uzbek,uz
|
174 |
+
Venda,ve
|
175 |
+
Vietnamese,vi
|
176 |
+
Volapük,vo
|
177 |
+
Walloon,wa
|
178 |
+
Welsh,cy
|
179 |
+
Wolof,wo
|
180 |
+
Xhosa,xh
|
181 |
+
Yiddish,yi
|
182 |
+
Yoruba,yo
|
183 |
+
"Zhuang, Chuang",za
|
184 |
+
Zulu,zu
|
files/languages_tesseract.csv
ADDED
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Language,LangCode
|
2 |
+
Afrikaans,afr
|
3 |
+
Amharic,amh
|
4 |
+
Arabic,ara
|
5 |
+
Assamese,asm
|
6 |
+
Azerbaijani,aze
|
7 |
+
Azerbaijani - Cyrilic,aze_cyrl
|
8 |
+
Belarusian,bel
|
9 |
+
Bengali,ben
|
10 |
+
Tibetan,bod
|
11 |
+
Bosnian,bos
|
12 |
+
Breton,bre
|
13 |
+
Bulgarian,bul
|
14 |
+
Catalan; Valencian,cat
|
15 |
+
Cebuano,ceb
|
16 |
+
Czech,ces
|
17 |
+
Chinese - Simplified,chi_sim
|
18 |
+
Chinese - Traditional,chi_tra
|
19 |
+
Cherokee,chr
|
20 |
+
Corsican,cos
|
21 |
+
Welsh,cym
|
22 |
+
Danish,dan
|
23 |
+
Danish - Fraktur (contrib),dan_frak
|
24 |
+
German,deu
|
25 |
+
German - Fraktur (contrib),deu_frak
|
26 |
+
Dzongkha,dzo
|
27 |
+
"Greek, Modern (1453-)",ell
|
28 |
+
English,eng
|
29 |
+
"English, Middle (1100-1500)",enm
|
30 |
+
Esperanto,epo
|
31 |
+
Math / equation detection module,equ
|
32 |
+
Estonian,est
|
33 |
+
Basque,eus
|
34 |
+
Faroese,fao
|
35 |
+
Persian,fas
|
36 |
+
Filipino (old - Tagalog),fil
|
37 |
+
Finnish,fin
|
38 |
+
French,fra
|
39 |
+
German - Fraktur,frk
|
40 |
+
"French, Middle (ca.1400-1600)",frm
|
41 |
+
Western Frisian,fry
|
42 |
+
Scottish Gaelic,gla
|
43 |
+
Irish,gle
|
44 |
+
Galician,glg
|
45 |
+
"Greek, Ancient (to 1453) (contrib)",grc
|
46 |
+
Gujarati,guj
|
47 |
+
Haitian; Haitian Creole,hat
|
48 |
+
Hebrew,heb
|
49 |
+
Hindi,hin
|
50 |
+
Croatian,hrv
|
51 |
+
Hungarian,hun
|
52 |
+
Armenian,hye
|
53 |
+
Inuktitut,iku
|
54 |
+
Indonesian,ind
|
55 |
+
Icelandic,isl
|
56 |
+
Italian,ita
|
57 |
+
Italian - Old,ita_old
|
58 |
+
Javanese,jav
|
59 |
+
Japanese,jpn
|
60 |
+
Kannada,kan
|
61 |
+
Georgian,kat
|
62 |
+
Georgian - Old,kat_old
|
63 |
+
Kazakh,kaz
|
64 |
+
Central Khmer,khm
|
65 |
+
Kirghiz; Kyrgyz,kir
|
66 |
+
Kurmanji (Kurdish - Latin Script),kmr
|
67 |
+
Korean,kor
|
68 |
+
Korean (vertical),kor_vert
|
69 |
+
Kurdish (Arabic Script),kur
|
70 |
+
Lao,lao
|
71 |
+
Latin,lat
|
72 |
+
Latvian,lav
|
73 |
+
Lithuanian,lit
|
74 |
+
Luxembourgish,ltz
|
75 |
+
Malayalam,mal
|
76 |
+
Marathi,mar
|
77 |
+
Macedonian,mkd
|
78 |
+
Maltese,mlt
|
79 |
+
Mongolian,mon
|
80 |
+
Maori,mri
|
81 |
+
Malay,msa
|
82 |
+
Burmese,mya
|
83 |
+
Nepali,nep
|
84 |
+
Dutch; Flemish,nld
|
85 |
+
Norwegian,nor
|
86 |
+
Occitan (post 1500),oci
|
87 |
+
Oriya,ori
|
88 |
+
Orientation and script detection module,osd
|
89 |
+
Panjabi; Punjabi,pan
|
90 |
+
Polish,pol
|
91 |
+
Portuguese,por
|
92 |
+
Pushto; Pashto,pus
|
93 |
+
Quechua,que
|
94 |
+
Romanian; Moldavian; Moldovan,ron
|
95 |
+
Russian,rus
|
96 |
+
Sanskrit,san
|
97 |
+
Sinhala; Sinhalese,sin
|
98 |
+
Slovak,slk
|
99 |
+
Slovak - Fraktur (contrib),slk_frak
|
100 |
+
Slovenian,slv
|
101 |
+
Sindhi,snd
|
102 |
+
Spanish; Castilian,spa
|
103 |
+
Spanish; Castilian - Old,spa_old
|
104 |
+
Albanian,sqi
|
105 |
+
Serbian,srp
|
106 |
+
Serbian - Latin,srp_latn
|
107 |
+
Sundanese,sun
|
108 |
+
Swahili,swa
|
109 |
+
Swedish,swe
|
110 |
+
Syriac,syr
|
111 |
+
Tamil,tam
|
112 |
+
Tatar,tat
|
113 |
+
Telugu,tel
|
114 |
+
Tajik,tgk
|
115 |
+
Tagalog (new - Filipino),tgl
|
116 |
+
Thai,tha
|
117 |
+
Tigrinya,tir
|
118 |
+
Tonga,ton
|
119 |
+
Turkish,tur
|
120 |
+
Uighur; Uyghur,uig
|
121 |
+
Ukrainian,ukr
|
122 |
+
Urdu,urd
|
123 |
+
Uzbek,uzb
|
124 |
+
Uzbek - Cyrilic,uzb_cyrl
|
125 |
+
Vietnamese,vie
|
126 |
+
Yiddish,yid
|
127 |
+
Yoruba,yor
|
files/template.pdf
ADDED
Binary file (29.4 kB). View file
|
|
files/wo_content.png
ADDED
![]() |
packages.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
tesseract-ocr-all
|
2 |
+
poppler-utils
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch
|
2 |
+
transformers
|
3 |
+
datasets
|
4 |
+
pytesseract
|
5 |
+
opencv-python
|
6 |
+
pdf2image
|
7 |
+
pypdf
|
8 |
+
langdetect
|
9 |
+
gradio
|