pierreguillou committed on
Commit 858fd76
0 Parent(s):

Duplicate from pierreguillou/Inference-APP-Document-Understanding-at-linelevel-v1

.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,16 @@
+ ---
+ title: Inference APP for Document Understanding at line level (v1)
+ emoji: 🐢
+ colorFrom: blue
+ colorTo: yellow
+ sdk: gradio
+ sdk_version: 3.18.0
+ app_file: app.py
+ pinned: false
+ models:
+ - >-
+   pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-linelevel-ml384
+ duplicated_from: pierreguillou/Inference-APP-Document-Understanding-at-linelevel-v1
+ ---
+ 
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,171 @@
+ import os
+ import gradio as gr
+ import re
+ import string
+ 
+ from operator import itemgetter
+ import collections
+ 
+ import pypdf
+ from pypdf import PdfReader
+ from pypdf.errors import PdfReadError
+ 
+ import pdf2image
+ from pdf2image import convert_from_path
+ import langdetect
+ from langdetect import detect_langs
+ 
+ import pandas as pd
+ import numpy as np
+ import random
+ import tempfile
+ import itertools
+ 
+ from matplotlib import font_manager
+ from PIL import Image, ImageDraw, ImageFont
+ import cv2
+ 
+ ## files
+ 
+ import sys
+ sys.path.insert(0, 'files/')
+ 
+ import functions
+ from functions import *
+ 
+ # update pip
+ os.system('python -m pip install --upgrade pip')
+ 
+ # model
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
+ 
+ import torch
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ 
+ model_id = "pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-linelevel-ml384"
+ 
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ model = AutoModelForTokenClassification.from_pretrained(model_id)
+ model.to(device)
+ 
+ # APP outputs
+ def app_outputs(uploaded_pdf):
+     filename, msg, images = pdf_to_images(uploaded_pdf)
+     num_images = len(images)
+ 
+     if not msg.startswith("Error with the PDF"):
+ 
+         # extraction of image data (text and bounding boxes)
+         dataset, lines, row_indexes, par_boxes, line_boxes = extraction_data_from_image(images)
+         # prepare our data in the format of the model
+         encoded_dataset = dataset.map(prepare_inference_features, batched=True, batch_size=64, remove_columns=dataset.column_names)
+         custom_encoded_dataset = CustomDataset(encoded_dataset, tokenizer)
+         # get predictions (token level)
+         outputs, images_ids_list, chunk_ids, input_ids, bboxes = predictions_token_level(images, custom_encoded_dataset)
+         # get predictions (line level)
+         probs_bbox, bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df = predictions_line_level(dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes)
+         # get labeled images with line bounding boxes
+         images = get_labeled_images(dataset, images_ids_list, bboxes_list_dict, probs_dict_dict)
+ 
+         img_files = list()
+         # save an image file for each displayed PDF page
+         for i in range(num_images):
+             if filename != "files/blank.png": img_file = f"img_{i}_" + filename.replace(".pdf", ".png")
+             else: img_file = filename.replace(".pdf", ".png")
+             images[i].save(img_file)
+             img_files.append(img_file)
+ 
+         if num_images < max_imgboxes:
+             img_files += [image_blank]*(max_imgboxes - num_images)
+             images += [Image.open(image_blank)]*(max_imgboxes - num_images)
+             for count in range(max_imgboxes - num_images):
+                 df[num_images + count] = pd.DataFrame()
+         else:
+             img_files = img_files[:max_imgboxes]
+             images = images[:max_imgboxes]
+             df = dict(itertools.islice(df.items(), max_imgboxes))
+ 
+         # save
+         csv_files = list()
+         for i in range(max_imgboxes):
+             csv_file = f"csv_{i}_" + filename.replace(".pdf", ".csv")
+             csv_files.append(gr.File.update(value=csv_file, visible=True))
+             df[i].to_csv(csv_file, encoding="utf-8", index=False)
+ 
+     else:
+         img_files, images, csv_files = [""]*max_imgboxes, [""]*max_imgboxes, [""]*max_imgboxes
+         img_files[0], img_files[1] = image_blank, image_blank
+         images[0], images[1] = Image.open(image_blank), Image.open(image_blank)
+         csv_file = "csv_wo_content.csv"
+         csv_files[0], csv_files[1] = gr.File.update(value=csv_file, visible=True), gr.File.update(value=csv_file, visible=True)
+         df, df_empty = dict(), pd.DataFrame()
+         df[0], df[1] = df_empty.to_csv(csv_file, encoding="utf-8", index=False), df_empty.to_csv(csv_file, encoding="utf-8", index=False)
+ 
+     return msg, img_files[0], img_files[1], images[0], images[1], csv_files[0], csv_files[1], df[0], df[1]
+ 
+ # gradio APP
+ with gr.Blocks(title="Inference APP for Document Understanding at line level (v1)", css=".gradio-container") as demo:
+     gr.HTML("""
+     <div style="font-family:'Times New Roman', 'Serif'; font-size:26pt; font-weight:bold; text-align:center;"><h1>Inference APP for Document Understanding at line level (v1)</h1></div>
+     <div style="margin-top: 40px"><p><b>[ Check as well the <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v1" target="_blank">Inference APP for Document Understanding at PARAGRAPH level</a>! ]</b></p></div>
+     <div><p>(02/12/2023) This Inference APP uses the <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-linelevel-ml384" target="_blank">model LiLT base combined with XLM-RoBERTa base and finetuned on the dataset DocLayNet base at line level</a> (chunk size of 384 tokens).</p></div>
+     <div><p><a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://arxiv.org/abs/2202.13669" target="_blank">LiLT (Language-Independent Layout Transformer)</a> is a Document Understanding model that uses both layout and text in order to detect labels of bounding boxes. Combined with the model <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/xlm-roberta-base" target="_blank">XLM-RoBERTa base</a>, this finetuned model has the capacity to <b>understand any language</b>. Finetuned on the dataset <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a>, it can <b>classify any bounding box (and its OCR text) into 11 labels</b> (Caption, Footnote, Formula, List-item, Page-footer, Page-header, Picture, Section-header, Table, Text, Title).</p></div>
+     <div><p>It relies on an external OCR engine to get words and bounding boxes from the document image. Thus, this APP first runs an OCR engine (<a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/madmaze/pytesseract#python-tesseract" target="_blank">PyTesseract</a>) to get the bounding boxes, then runs LiLT (already fine-tuned on the dataset DocLayNet base at line level) on the individual tokens, and finally visualizes the results at line level!</p></div>
+     <div><p><b>It lets you get all pages of any PDF (in any language) with bounding boxes labeled at line level, plus the associated dataframes with labeled data (bounding boxes, texts, labels) :-)</b></p></div>
+     <div><p>However, the inference time per page can be high when running the model on CPU due to the number of line predictions to be made. Therefore, to avoid running this APP for too long, <b>only the first 2 pages are processed by this APP</b>. If you want to increase this limit, you can either clone this APP in Hugging Face Space (or run its <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/Gradio_inference_on_LiLT_model_finetuned_on_DocLayNet_base_in_any_language_at_levellines_ml384.ipynb" target="_blank">notebook</a> on your own platform) and change the value of the parameter <code>max_imgboxes</code>, or run the inference notebook "<a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/inference_on_LiLT_model_finetuned_on_DocLayNet_base_in_any_language_at_levellines_ml384.ipynb" target="_blank">Document AI | Inference at line level with a Document Understanding model (LiLT fine-tuned on DocLayNet dataset)</a>" on your own platform, as it does not have this limit.</p></div>
+     <div style="margin-top: 20px"><p>More information about the DocLayNet datasets, the finetuning of the model and this APP in the following blog posts:</p>
+     <ul><li>(02/16/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-inference-app-and-fine-tuning-notebook-for-document-understanding-at-paragraph-level-c18d16e53cf8" target="_blank">Document AI | Inference APP and fine-tuning notebook for Document Understanding at paragraph level</a></li><li>(02/14/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-inference-app-for-document-understanding-at-line-level-a35bbfa98893" target="_blank">Document AI | Inference APP for Document Understanding at line level</a></li><li>(02/10/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-document-understanding-model-at-line-level-with-lilt-tesseract-and-doclaynet-dataset-347107a643b8" target="_blank">Document AI | Document Understanding model at line level with LiLT, Tesseract and DocLayNet dataset</a></li><li>(01/31/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-doclaynet-image-viewer-app-3ac54c19956" target="_blank">Document AI | DocLayNet image viewer APP</a></li><li>(01/27/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></li></ul></div>
+     """)
+     with gr.Row():
+         pdf_file = gr.File(label="PDF")
+     with gr.Row():
+         submit_btn = gr.Button(f"Display first {max_imgboxes} labeled PDF pages")
+         reset_btn = gr.Button(value="Clear")
+     with gr.Row():
+         output_msg = gr.Textbox(label="Output message")
+     with gr.Row():
+         fileboxes = []
+         for num_page in range(max_imgboxes):
+             file_path = gr.File(visible=True, label=f"Image file of the PDF page n°{num_page}")
+             fileboxes.append(file_path)
+     with gr.Row():
+         imgboxes = []
+         for num_page in range(max_imgboxes):
+             img = gr.Image(type="pil", label=f"Image of the PDF page n°{num_page}")
+             imgboxes.append(img)
+     with gr.Row():
+         csvboxes = []
+         for num_page in range(max_imgboxes):
+             csv = gr.File(visible=True, label=f"CSV file at line level (page {num_page})")
+             csvboxes.append(csv)
+     with gr.Row():
+         dfboxes = []
+         for num_page in range(max_imgboxes):
+             df = gr.Dataframe(
+                 headers=["bounding boxes", "texts", "labels"],
+                 datatype=["str", "str", "str"],
+                 col_count=(3, "fixed"),
+                 visible=True,
+                 label=f"Data of page {num_page}",
+                 type="pandas",
+                 wrap=True
+             )
+             dfboxes.append(df)
+ 
+     outputboxes = [output_msg] + fileboxes + imgboxes + csvboxes + dfboxes
+     submit_btn.click(app_outputs, inputs=[pdf_file], outputs=outputboxes)
+     reset_btn.click(
+         lambda: [pdf_file.update(value=None), output_msg.update(value=None)] + [filebox.update(value=None) for filebox in fileboxes] + [imgbox.update(value=None) for imgbox in imgboxes] + [csvbox.update(value=None) for csvbox in csvboxes] + [dfbox.update(value=None) for dfbox in dfboxes],
+         inputs=[],
+         outputs=[pdf_file, output_msg] + fileboxes + imgboxes + csvboxes + dfboxes,
+     )
+ 
+     gr.Examples(
+         [["files/example.pdf"]],
+         [pdf_file],
+         outputboxes,
+         fn=app_outputs,
+         cache_examples=True,
+     )
+ 
+ demo.launch()
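
The app description above notes that LiLT relies on an external OCR engine for words and bounding boxes. As a quick illustration (a sketch, not part of the committed files; "page.png" is a hypothetical input image), this is the PyTesseract call that plays that role and the fields it returns, which get_data() in files/functions.py later groups into lines:

import pytesseract
from PIL import Image

img = Image.open("page.png")  # hypothetical page image
results = pytesseract.image_to_data(img, config="--oem 3 --psm 3 -l eng",
                                    output_type=pytesseract.Output.DICT)
# results is a dict of parallel lists; word i is described by:
# results["text"][i], results["conf"][i],
# results["block_num"][i], results["par_num"][i], results["line_num"][i],
# results["left"][i], results["top"][i], results["width"][i], results["height"][i]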
files/README.md ADDED
File without changes
files/blank.pdf ADDED
Binary file (1.15 kB).
 
files/blank.png ADDED
files/example.pdf ADDED
Binary file (343 kB).
 
files/functions.py ADDED
@@ -0,0 +1,805 @@
+ import os
+ import gradio as gr
+ import re
+ import string
+ import torch
+ 
+ from operator import itemgetter
+ import collections
+ 
+ import pypdf
+ from pypdf import PdfReader
+ from pypdf.errors import PdfReadError
+ 
+ import pdf2image
+ from pdf2image import convert_from_path
+ import langdetect
+ from langdetect import detect_langs
+ 
+ import pandas as pd
+ import numpy as np
+ import random
+ import tempfile
+ import itertools
+ 
+ from matplotlib import font_manager
+ from PIL import Image, ImageDraw, ImageFont
+ import cv2
+ 
+ # Tesseract
+ print(os.popen('cat /etc/debian_version').read())
+ print(os.popen('cat /etc/issue').read())
+ print(os.popen('apt search tesseract').read())
+ import pytesseract
+ 
+ ## Key parameters
+ 
+ # categories colors
+ label2color = {
+     'Caption': 'brown',
+     'Footnote': 'orange',
+     'Formula': 'gray',
+     'List-item': 'yellow',
+     'Page-footer': 'red',
+     'Page-header': 'red',
+     'Picture': 'violet',
+     'Section-header': 'orange',
+     'Table': 'green',
+     'Text': 'blue',
+     'Title': 'pink'
+ }
+ 
+ # bounding boxes of the start and end tokens of a sequence
+ cls_box = [0, 0, 0, 0]
+ sep_box = cls_box
+ 
+ # model
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ 
+ model_id = "pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-linelevel-ml384"
+ 
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ model = AutoModelForTokenClassification.from_pretrained(model_id)
+ model.to(device)
+ 
+ # get labels
+ id2label = model.config.id2label
+ label2id = model.config.label2id
+ num_labels = len(id2label)
+ 
+ # (tokenization) the maximum length of a feature (sequence)
+ if str(384) in model_id:
+     max_length = 384
+ elif str(512) in model_id:
+     max_length = 512
+ else:
+     print("Error with max_length of chunks!")
+ 
+ # (tokenization) overlap
+ doc_stride = 128 # the authorized overlap between two parts of the context when splitting is needed
+ 
+ # max PDF page images that will be displayed
+ max_imgboxes = 2
+ examples_dir = 'files/'
+ image_wo_content = examples_dir + "wo_content.png" # image without content
+ pdf_blank = examples_dir + "blank.pdf" # blank PDF
+ image_blank = examples_dir + "blank.png" # blank image
+ 
+ ## get langdetect2Tesseract dictionary
+ t = "files/languages_tesseract.csv"
+ l = "files/languages_iso.csv"
+ 
+ df_t = pd.read_csv(t)
+ df_l = pd.read_csv(l)
+ 
+ langs_t = df_t["Language"].to_list()
+ langs_t = [lang_t.lower().strip().translate(str.maketrans('', '', string.punctuation)) for lang_t in langs_t]
+ langs_l = df_l["Language"].to_list()
+ langs_l = [lang_l.lower().strip().translate(str.maketrans('', '', string.punctuation)) for lang_l in langs_l]
+ langscode_t = df_t["LangCode"].to_list()
+ langscode_l = df_l["LangCode"].to_list()
+ 
+ Tesseract2langdetect, langdetect2Tesseract = dict(), dict()
+ for lang_t, langcode_t in zip(langs_t, langscode_t):
+     try:
+         if lang_t == "Chinese - Simplified".lower().strip().translate(str.maketrans('', '', string.punctuation)): lang_t = "chinese"
+         index = langs_l.index(lang_t)
+         langcode_l = langscode_l[index]
+         Tesseract2langdetect[langcode_t] = langcode_l
+     except:
+         continue
+ 
+ langdetect2Tesseract = {v: k for k, v in Tesseract2langdetect.items()}
+ 
+ ## General
+ 
+ # get text and bounding boxes from an image
+ # https://stackoverflow.com/questions/61347755/how-can-i-get-line-coordinates-that-readed-by-tesseract
+ # https://medium.com/geekculture/tesseract-ocr-understanding-the-contents-of-documents-beyond-their-text-a98704b7c655
+ def get_data(results, factor, conf_min=0):
+ 
+     data = {}
+     for i in range(len(results['line_num'])):
+         level = results['level'][i]
+         block_num = results['block_num'][i]
+         par_num = results['par_num'][i]
+         line_num = results['line_num'][i]
+         top, left = results['top'][i], results['left'][i]
+         width, height = results['width'][i], results['height'][i]
+         conf = results['conf'][i]
+         text = results['text'][i]
+         if not (text == '' or text.isspace()):
+             if conf >= conf_min:
+                 tup = (text, left, top, width, height)
+                 if block_num in list(data.keys()):
+                     if par_num in list(data[block_num].keys()):
+                         if line_num in list(data[block_num][par_num].keys()):
+                             data[block_num][par_num][line_num].append(tup)
+                         else:
+                             data[block_num][par_num][line_num] = [tup]
+                     else:
+                         data[block_num][par_num] = {}
+                         data[block_num][par_num][line_num] = [tup]
+                 else:
+                     data[block_num] = {}
+                     data[block_num][par_num] = {}
+                     data[block_num][par_num][line_num] = [tup]
+ 
+     # get paragraphs dictionary with list of lines
+     par_data = {}
+     par_idx = 1
+     for _, b in data.items():
+         for _, p in b.items():
+             line_data = {}
+             line_idx = 1
+             for _, l in p.items():
+                 line_data[line_idx] = l
+                 line_idx += 1
+             par_data[par_idx] = line_data
+             par_idx += 1
+ 
+     # get lines of texts, grouped by paragraph
+     lines = list()
+     row_indexes = list()
+     row_index = 0
+     for _, par in par_data.items():
+         count_lines = 0
+         for _, line in par.items():
+             if count_lines == 0: row_indexes.append(row_index)
+             line_text = ' '.join([item[0] for item in line])
+             lines.append(line_text)
+             count_lines += 1
+             row_index += 1
+         # lines.append("\n")
+         row_index += 1
+     # lines = lines[:-1]
+ 
+     # get paragraph boxes (par_boxes)
+     # get line boxes (line_boxes)
+     par_boxes = list()
+     par_idx = 1
+     line_boxes = list()
+     line_idx = 1
+     for _, par in par_data.items():
+         xmins, ymins, xmaxs, ymaxs = list(), list(), list(), list()
+         for _, line in par.items():
+             xmin, ymin = line[0][1], line[0][2]
+             xmax, ymax = (line[-1][1] + line[-1][3]), (line[-1][2] + line[-1][4])
+             line_boxes.append([int(xmin/factor), int(ymin/factor), int(xmax/factor), int(ymax/factor)])
+             xmins.append(xmin)
+             ymins.append(ymin)
+             xmaxs.append(xmax)
+             ymaxs.append(ymax)
+             line_idx += 1
+         xmin, ymin, xmax, ymax = min(xmins), min(ymins), max(xmaxs), max(ymaxs)
+         par_boxes.append([int(xmin/factor), int(ymin/factor), int(xmax/factor), int(ymax/factor)])
+         par_idx += 1
+ 
+     return lines, row_indexes, par_boxes, line_boxes
+ 
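
A tiny worked example (not part of the committed file) of what get_data does with a hand-made, Tesseract-style results dict: two words sharing the same block/paragraph/line numbers are merged into one line whose box spans from the first word's top-left corner to the last word's bottom-right corner, divided by the rescale factor:

results = {
    "level":     [5, 5],
    "block_num": [1, 1],
    "par_num":   [1, 1],
    "line_num":  [1, 1],
    "left":      [100, 180],
    "top":       [50, 50],
    "width":     [70, 60],
    "height":    [20, 20],
    "conf":      [95, 92],
    "text":      ["Hello", "world"],
}
lines, row_indexes, par_boxes, line_boxes = get_data(results, factor=1.0)
print(lines)       # ['Hello world']
print(line_boxes)  # [[100, 50, 240, 70]] (xmax = 180+60, ymax = 50+20)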
+ # rescale image to get 300dpi
+ def set_image_dpi_resize(image):
+     """
+     Rescaling image to 300dpi while resizing
+     :param image: An image
+     :return: A rescaled image
+     """
+     length_x, width_y = image.size
+     factor = min(1, float(1024.0 / length_x))
+     size = int(factor * length_x), int(factor * width_y)
+     image_resize = image.resize(size, Image.Resampling.LANCZOS)
+     temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='1.png')
+     temp_filename = temp_file.name
+     image_resize.save(temp_filename, dpi=(300, 300))
+     return factor, temp_filename
+ 
+ # it is important that each bounding box is in (upper left, lower right) format
+ # source: https://github.com/NielsRogge/Transformers-Tutorials/issues/129
+ def upperleft_to_lowerright(bbox):
+     x0, y0, x1, y1 = tuple(bbox)
+     if bbox[2] < bbox[0]:
+         x0 = bbox[2]
+         x1 = bbox[0]
+     if bbox[3] < bbox[1]:
+         y0 = bbox[3]
+         y1 = bbox[1]
+     return [x0, y0, x1, y1]
+ 
+ # convert bounding boxes from (left, top, width, height) format to (left, top, left+width, top+height) format
+ def convert_box(bbox):
+     x, y, w, h = tuple(bbox) # the row comes in (left, top, width, height) format
+     return [x, y, x+w, y+h] # we turn it into (left, top, left+width, top+height) to get the actual box
+ 
+ # the LiLT model expects bounding boxes normalized to a 1000x1000 scale
+ def normalize_box(bbox, width, height):
+     return [
+         int(1000 * (bbox[0] / width)),
+         int(1000 * (bbox[1] / height)),
+         int(1000 * (bbox[2] / width)),
+         int(1000 * (bbox[3] / height)),
+     ]
+ 
+ # the LiLT model expects bounding boxes normalized to a 1000x1000 scale
+ def denormalize_box(bbox, width, height):
+     return [
+         int(width * (bbox[0] / 1000)),
+         int(height * (bbox[1] / 1000)),
+         int(width * (bbox[2] / 1000)),
+         int(height * (bbox[3] / 1000)),
+     ]
+ 
+ # get back original size
+ def original_box(box, original_width, original_height, coco_width, coco_height):
+     return [
+         int(original_width * (box[0] / coco_width)),
+         int(original_height * (box[1] / coco_height)),
+         int(original_width * (box[2] / coco_width)),
+         int(original_height * (box[3] / coco_height)),
+     ]
+ 
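
A quick sanity check (not part of the committed file) of the two normalization helpers above: LiLT expects coordinates on a 0-1000 scale, and the integer casts make the round trip lossy by a pixel or two:

box = [100, 50, 240, 70]                 # pixels, (left, top, right, bottom)
norm = normalize_box(box, width=1000, height=800)
print(norm)                              # [100, 62, 240, 87]
print(denormalize_box(norm, 1000, 800))  # [100, 49, 240, 69] -- rounding loss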
+ def get_blocks(bboxes_block, categories, texts):
+ 
+     # get list of unique block boxes
+     bbox_block_dict, bboxes_block_list, bbox_block_prec = dict(), list(), list()
+     for count_block, bbox_block in enumerate(bboxes_block):
+         if bbox_block != bbox_block_prec:
+             bbox_block_indexes = [i for i, bbox in enumerate(bboxes_block) if bbox == bbox_block]
+             bbox_block_dict[count_block] = bbox_block_indexes
+             bboxes_block_list.append(bbox_block)
+         bbox_block_prec = bbox_block
+ 
+     # get list of categories and texts by unique block boxes
+     category_block_list, text_block_list = list(), list()
+     for bbox_block in bboxes_block_list:
+         count_block = bboxes_block.index(bbox_block)
+         bbox_block_indexes = bbox_block_dict[count_block]
+         category_block = np.array(categories, dtype=object)[bbox_block_indexes].tolist()[0]
+         category_block_list.append(category_block)
+         text_block = np.array(texts, dtype=object)[bbox_block_indexes].tolist()
+         text_block = [text.replace("\n", "").strip() for text in text_block]
+         if id2label[category_block] == "Text" or id2label[category_block] == "Caption" or id2label[category_block] == "Footnote":
+             text_block = ' '.join(text_block)
+         else:
+             text_block = '\n'.join(text_block)
+         text_block_list.append(text_block)
+ 
+     return bboxes_block_list, category_block_list, text_block_list
+ 
+ # function to sort bounding boxes
+ def get_sorted_boxes(bboxes):
+ 
+     # sort by y from page top to bottom
+     sorted_bboxes = sorted(bboxes, key=itemgetter(1), reverse=False)
+     y_list = [bbox[1] for bbox in sorted_bboxes]
+ 
+     # sort by x from page left to right when boxes have the same y
+     if len(list(set(y_list))) != len(y_list):
+         y_list_duplicates_indexes = dict()
+         y_list_duplicates = [item for item, count in collections.Counter(y_list).items() if count > 1]
+         for item in y_list_duplicates:
+             y_list_duplicates_indexes[item] = [i for i, e in enumerate(y_list) if e == item]
+             bbox_list_y_duplicates = sorted(np.array(sorted_bboxes, dtype=object)[y_list_duplicates_indexes[item]].tolist(), key=itemgetter(0), reverse=False)
+             np_array_bboxes = np.array(sorted_bboxes)
+             np_array_bboxes[y_list_duplicates_indexes[item]] = np.array(bbox_list_y_duplicates)
+             sorted_bboxes = np_array_bboxes.tolist()
+ 
+     return sorted_bboxes
+ 
+ # sort data from y = 0 to end of page (and then, x = 0 to end of page when necessary)
+ def sort_data(bboxes, categories, texts):
+ 
+     sorted_bboxes = get_sorted_boxes(bboxes)
+     sorted_bboxes_indexes = [bboxes.index(bbox) for bbox in sorted_bboxes]
+     sorted_categories = np.array(categories, dtype=object)[sorted_bboxes_indexes].tolist()
+     sorted_texts = np.array(texts, dtype=object)[sorted_bboxes_indexes].tolist()
+ 
+     return sorted_bboxes, sorted_categories, sorted_texts
+ 
+ # sort data from y = 0 to end of page (and then, x = 0 to end of page when necessary)
+ def sort_data_wo_labels(bboxes, texts):
+ 
+     sorted_bboxes = get_sorted_boxes(bboxes)
+     sorted_bboxes_indexes = [bboxes.index(bbox) for bbox in sorted_bboxes]
+     sorted_texts = np.array(texts, dtype=object)[sorted_bboxes_indexes].tolist()
+ 
+     return sorted_bboxes, sorted_texts
+ 
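
An illustration (not part of the committed file) of the reading-order sort implemented by get_sorted_boxes: boxes are ordered top-to-bottom by their y coordinate, and boxes sharing the same y are then ordered left-to-right:

boxes = [[300, 10, 400, 30], [50, 40, 200, 60], [10, 10, 100, 30]]
print(get_sorted_boxes(boxes))
# [[10, 10, 100, 30], [300, 10, 400, 30], [50, 40, 200, 60]]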
+ ## PDF processing
+ 
+ # get filename and images of PDF pages
+ def pdf_to_images(uploaded_pdf):
+ 
+     # check if None object
+     if uploaded_pdf is None:
+         path_to_file = pdf_blank
+         filename = path_to_file.replace(examples_dir, "")
+         msg = "Invalid PDF file."
+         images = [Image.open(image_blank)]
+     else:
+         # path to the uploaded PDF
+         path_to_file = uploaded_pdf.name
+         filename = path_to_file.replace("/tmp/", "")
+ 
+         try:
+             PdfReader(path_to_file)
+         except PdfReadError:
+             path_to_file = pdf_blank
+             filename = path_to_file.replace(examples_dir, "")
+             msg = "Invalid PDF file."
+             images = [Image.open(image_blank)]
+         else:
+             try:
+                 images = convert_from_path(path_to_file, last_page=max_imgboxes)
+                 num_imgs = len(images)
+                 msg = f'The PDF "{filename}" was converted into {num_imgs} images.'
+             except:
+                 msg = f'Error with the PDF "{filename}": it was not converted into images.'
+                 images = [Image.open(image_wo_content)]
+ 
+     return filename, msg, images
+ 
+ # extraction of image data (text and bounding boxes)
+ def extraction_data_from_image(images):
+ 
+     num_imgs = len(images)
+ 
+     if num_imgs > 0:
+ 
+         # https://pyimagesearch.com/2021/11/15/tesseract-page-segmentation-modes-psms-explained-how-to-improve-your-ocr-accuracy/
+         custom_config = r'--oem 3 --psm 3 -l eng' # default config PyTesseract: --oem 3 --psm 3 -l eng+deu+fra+jpn+por+spa+rus+hin+chi_sim
+         results, lines, row_indexes, par_boxes, line_boxes = dict(), dict(), dict(), dict(), dict()
+         images_ids_list, lines_list, par_boxes_list, line_boxes_list, images_list, page_no_list, num_pages_list = list(), list(), list(), list(), list(), list(), list()
+ 
+         try:
+             for i, image in enumerate(images):
+                 # image preprocessing
+                 # https://docs.opencv.org/3.0-beta/doc/py_tutorials/py_imgproc/py_thresholding/py_thresholding.html
+                 img = image.copy()
+                 factor, path_to_img = set_image_dpi_resize(img) # rescaling to 300dpi
+                 img = Image.open(path_to_img)
+                 img = np.array(img, dtype='uint8') # convert PIL to cv2
+                 img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # grayscale image
+                 ret, img = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY)
+ 
+                 # OCR PyTesseract | get langs of page
+                 txt = pytesseract.image_to_string(img, config=custom_config)
+                 txt = txt.strip().lower()
+                 txt = re.sub(r" +", " ", txt) # multiple spaces
+                 txt = re.sub(r"(\n\s*)+\n+", "\n", txt) # multiple lines
+                 # txt = os.popen(f'tesseract {img_filepath} - {custom_config}').read()
+                 try:
+                     langs = detect_langs(txt)
+                     langs = [langdetect2Tesseract[langs[i].lang] for i in range(len(langs))]
+                     langs_string = '+'.join(langs)
+                 except:
+                     langs_string = "eng"
+                 langs_string += '+osd'
+                 custom_config = f'--oem 3 --psm 3 -l {langs_string}' # default config PyTesseract: --oem 3 --psm 3
+ 
+                 # OCR PyTesseract | get data
+                 results[i] = pytesseract.image_to_data(img, config=custom_config, output_type=pytesseract.Output.DICT)
+                 # results[i] = os.popen(f'tesseract {img_filepath} - {custom_config}').read()
+ 
+                 lines[i], row_indexes[i], par_boxes[i], line_boxes[i] = get_data(results[i], factor, conf_min=0)
+                 lines_list.append(lines[i])
+                 par_boxes_list.append(par_boxes[i])
+                 line_boxes_list.append(line_boxes[i])
+                 images_ids_list.append(i)
+                 images_list.append(images[i])
+                 page_no_list.append(i)
+                 num_pages_list.append(num_imgs)
+ 
+         except:
+             print("There was an error within the extraction of PDF text by the OCR!")
+         else:
+             from datasets import Dataset
+             dataset = Dataset.from_dict({"images_ids": images_ids_list, "images": images_list, "page_no": page_no_list, "num_pages": num_pages_list, "texts": lines_list, "bboxes_line": line_boxes_list})
+ 
+             # print("The text data was successfully extracted by the OCR!")
+ 
+         return dataset, lines, row_indexes, par_boxes, line_boxes
+ 
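
For reference (a sketch, not part of the committed file), this is the language-detection step used above in isolation: detect_langs guesses ISO 639-1 codes from the first OCR pass, and langdetect2Tesseract maps them to Tesseract codes for the -l argument of the second pass:

from langdetect import detect_langs

langs = detect_langs("Ceci est un document écrit en français.")
print(langs)                          # e.g. [fr:0.9999...] (objects with .lang and .prob)
codes = [lang.lang for lang in langs]
# langdetect2Tesseract would map "fr" -> "fra", giving custom_config "... -l fra+osd"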
+ ## Inference
+ 
+ def prepare_inference_features(example, cls_box=cls_box, sep_box=sep_box):
+ 
+     images_ids_list, chunks_ids_list, input_ids_list, attention_mask_list, bb_list = list(), list(), list(), list(), list()
+ 
+     # get batch
+     batch_images_ids = example["images_ids"]
+     batch_images = example["images"]
+     batch_bboxes_line = example["bboxes_line"]
+     batch_texts = example["texts"]
+     batch_images_size = [image.size for image in batch_images]
+ 
+     batch_width, batch_height = [image_size[0] for image_size in batch_images_size], [image_size[1] for image_size in batch_images_size]
+ 
+     # add a dimension if not a batch but only one image
+     if not isinstance(batch_images_ids, list):
+         batch_images_ids = [batch_images_ids]
+         batch_images = [batch_images]
+         batch_bboxes_line = [batch_bboxes_line]
+         batch_texts = [batch_texts]
+         batch_width, batch_height = [batch_width], [batch_height]
+ 
+     # process all images of the batch
+     for num_batch, (image_id, boxes, texts, width, height) in enumerate(zip(batch_images_ids, batch_bboxes_line, batch_texts, batch_width, batch_height)):
+         tokens_list = []
+         bboxes_list = []
+ 
+         # add a dimension if only one image
+         if not isinstance(texts, list):
+             texts, boxes = [texts], [boxes]
+ 
+         # normalize bounding boxes
+         normalize_bboxes_line = [normalize_box(upperleft_to_lowerright(box), width, height) for box in boxes]
+ 
+         # sort boxes with texts
+         # we want sorted lists from top to bottom of the image
+         boxes, texts = sort_data_wo_labels(normalize_bboxes_line, texts)
+ 
+         count = 0
+         for box, text in zip(boxes, texts):
+             tokens = tokenizer.tokenize(text)
+             num_tokens = len(tokens) # get number of tokens
+             tokens_list.extend(tokens)
+ 
+             bboxes_list.extend([box] * num_tokens) # number of boxes must be the same as the number of tokens
+ 
+         # use of return_overflowing_tokens=True / stride=doc_stride
+         # to get parts of image with overlap
+         # source: https://huggingface.co/course/chapter6/3b?fw=tf#handling-long-contexts
+         encodings = tokenizer(" ".join(texts),
+                               truncation=True,
+                               padding="max_length",
+                               max_length=max_length,
+                               stride=doc_stride,
+                               return_overflowing_tokens=True,
+                               return_offsets_mapping=True
+                               )
+ 
+         otsm = encodings.pop("overflow_to_sample_mapping")
+         offset_mapping = encodings.pop("offset_mapping")
+ 
+         # let's label those examples and get their boxes
+         sequence_length_prev = 0
+         for i, offsets in enumerate(offset_mapping):
+             # truncate tokens, boxes and labels based on length of chunk - 2 (special tokens <s> and </s>)
+             sequence_length = len(encodings.input_ids[i]) - 2
+             if i == 0: start = 0
+             else: start += sequence_length_prev - doc_stride
+             end = start + sequence_length
+             sequence_length_prev = sequence_length
+ 
+             # get tokens, boxes and labels of this image chunk
+             bb = [cls_box] + bboxes_list[start:end] + [sep_box]
+ 
+             # as the last chunk can have a length < max_length,
+             # we must add [tokenizer.pad_token] (tokens), [sep_box] (boxes) and [-100] (labels)
+             if len(bb) < max_length:
+                 bb = bb + [sep_box] * (max_length - len(bb))
+ 
+             # append results
+             input_ids_list.append(encodings["input_ids"][i])
+             attention_mask_list.append(encodings["attention_mask"][i])
+             bb_list.append(bb)
+             images_ids_list.append(image_id)
+             chunks_ids_list.append(i)
+ 
+     return {
+         "images_ids": images_ids_list,
+         "chunk_ids": chunks_ids_list,
+         "input_ids": input_ids_list,
+         "attention_mask": attention_mask_list,
+         "normalized_bboxes": bb_list,
+     }
+ 
+ from torch.utils.data import Dataset
+ 
+ class CustomDataset(Dataset):
+     def __init__(self, dataset, tokenizer):
+         self.dataset = dataset
+         self.tokenizer = tokenizer
+ 
+     def __len__(self):
+         return len(self.dataset)
+ 
+     def __getitem__(self, idx):
+         # get item
+         example = self.dataset[idx]
+         encoding = dict()
+         encoding["images_ids"] = example["images_ids"]
+         encoding["chunk_ids"] = example["chunk_ids"]
+         encoding["input_ids"] = example["input_ids"]
+         encoding["attention_mask"] = example["attention_mask"]
+         encoding["bbox"] = example["normalized_bboxes"]
+ 
+         return encoding
+ 
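
A worked example (not part of the committed file) of the chunk bookkeeping in prepare_inference_features with max_length=384 and doc_stride=128: each full chunk carries 382 content tokens (384 minus <s> and </s>), and consecutive chunks overlap by 128 tokens, so the start offsets in the flat token list advance by 254:

max_length, doc_stride = 384, 128
sequence_length = max_length - 2   # 382 content tokens per full chunk
start, starts = 0, []
for i in range(3):                 # first three chunks of a long page
    starts.append(start)
    start += sequence_length - doc_stride
print(starts)  # [0, 254, 508]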
+ import torch.nn.functional as F
+ 
+ # get predictions at token level
+ def predictions_token_level(images, custom_encoded_dataset):
+ 
+     num_imgs = len(images)
+     if num_imgs > 0:
+ 
+         chunk_ids, input_ids, bboxes, outputs, token_predictions = dict(), dict(), dict(), dict(), dict()
+         images_ids_list = list()
+ 
+         for i, encoding in enumerate(custom_encoded_dataset):
+ 
+             # get custom encoded data
+             image_id = encoding['images_ids']
+             chunk_id = encoding['chunk_ids']
+             input_id = torch.tensor(encoding['input_ids'])[None]
+             attention_mask = torch.tensor(encoding['attention_mask'])[None]
+             bbox = torch.tensor(encoding['bbox'])[None]
+ 
+             # save data in dictionaries
+             if image_id not in images_ids_list: images_ids_list.append(image_id)
+ 
+             if image_id in chunk_ids: chunk_ids[image_id].append(chunk_id)
+             else: chunk_ids[image_id] = [chunk_id]
+ 
+             if image_id in input_ids: input_ids[image_id].append(input_id)
+             else: input_ids[image_id] = [input_id]
+ 
+             if image_id in bboxes: bboxes[image_id].append(bbox)
+             else: bboxes[image_id] = [bbox]
+ 
+             # get prediction with forward pass
+             with torch.no_grad():
+                 output = model(
+                     input_ids=input_id,
+                     attention_mask=attention_mask,
+                     bbox=bbox
+                 )
+ 
+             # save probabilities of predictions in dictionary
+             if image_id in outputs: outputs[image_id].append(F.softmax(output.logits.squeeze(), dim=-1))
+             else: outputs[image_id] = [F.softmax(output.logits.squeeze(), dim=-1)]
+ 
+         return outputs, images_ids_list, chunk_ids, input_ids, bboxes
+ 
+     else:
+         print("An error occurred while getting predictions!")
+ 
+ from functools import reduce
+ 
+ # get predictions at line level
+ def predictions_line_level(dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes):
+ 
+     ten_probs_dict, ten_input_ids_dict, ten_bboxes_dict = dict(), dict(), dict()
+     bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df = dict(), dict(), dict(), dict()
+ 
+     if len(images_ids_list) > 0:
+ 
+         for i, image_id in enumerate(images_ids_list):
+ 
+             # get image information
+             images_list = dataset.filter(lambda example: example["images_ids"] == image_id)["images"]
+             image = images_list[0]
+             width, height = image.size
+ 
+             # get data
+             chunk_ids_list = chunk_ids[image_id]
+             outputs_list = outputs[image_id]
+             input_ids_list = input_ids[image_id]
+             bboxes_list = bboxes[image_id]
+ 
+             # create zeros tensors
+             ten_probs = torch.zeros((outputs_list[0].shape[0] - 2)*len(outputs_list), outputs_list[0].shape[1])
+             ten_input_ids = torch.ones(size=(1, (outputs_list[0].shape[0] - 2)*len(outputs_list)), dtype=int)
+             ten_bboxes = torch.zeros(size=(1, (outputs_list[0].shape[0] - 2)*len(outputs_list), 4), dtype=int)
+ 
+             if len(outputs_list) > 1:
+ 
+                 for num_output, (output, input_id, bbox) in enumerate(zip(outputs_list, input_ids_list, bboxes_list)):
+                     start = num_output*(max_length - 2) - max(0, num_output)*doc_stride
+                     end = start + (max_length - 2)
+ 
+                     if num_output == 0:
+                         ten_probs[start:end, :] += output[1:-1]
+                         ten_input_ids[:, start:end] = input_id[:, 1:-1]
+                         ten_bboxes[:, start:end, :] = bbox[:, 1:-1, :]
+                     else:
+                         ten_probs[start:start + doc_stride, :] += output[1:1 + doc_stride]
+                         ten_probs[start:start + doc_stride, :] = ten_probs[start:start + doc_stride, :] * 0.5
+                         ten_probs[start + doc_stride:end, :] += output[1 + doc_stride:-1]
+ 
+                         ten_input_ids[:, start:start + doc_stride] = input_id[:, 1:1 + doc_stride]
+                         ten_input_ids[:, start + doc_stride:end] = input_id[:, 1 + doc_stride:-1]
+ 
+                         ten_bboxes[:, start:start + doc_stride, :] = bbox[:, 1:1 + doc_stride, :]
+                         ten_bboxes[:, start + doc_stride:end, :] = bbox[:, 1 + doc_stride:-1, :]
+ 
+             else:
+                 ten_probs += outputs_list[0][1:-1]
+                 ten_input_ids = input_ids_list[0][:, 1:-1]
+                 ten_bboxes = bboxes_list[0][:, 1:-1]
+ 
+             ten_probs_list, ten_input_ids_list, ten_bboxes_list = ten_probs.tolist(), ten_input_ids.tolist()[0], ten_bboxes.tolist()[0]
+             bboxes_list = list()
+             input_ids_dict, probs_dict = dict(), dict()
+             bbox_prev = [-100, -100, -100, -100]
+             for probs, input_id, bbox in zip(ten_probs_list, ten_input_ids_list, ten_bboxes_list):
+                 bbox = denormalize_box(bbox, width, height)
+                 if bbox != bbox_prev and bbox != cls_box:
+                     bboxes_list.append(bbox)
+                     input_ids_dict[str(bbox)] = [input_id]
+                     probs_dict[str(bbox)] = [probs]
+                 else:
+                     if bbox != cls_box:
+                         input_ids_dict[str(bbox)].append(input_id)
+                         probs_dict[str(bbox)].append(probs)
+                 bbox_prev = bbox
+ 
+             probs_bbox = dict()
+             for i, bbox in enumerate(bboxes_list):
+                 probs = probs_dict[str(bbox)]
+                 probs = np.array(probs).T.tolist()
+ 
+                 probs_label = list()
+                 for probs_list in probs:
+                     prob_label = reduce(lambda x, y: x*y, probs_list)
+                     probs_label.append(prob_label)
+                 max_value = max(probs_label)
+                 max_index = probs_label.index(max_value)
+                 probs_bbox[str(bbox)] = max_index
+ 
+             bboxes_list_dict[image_id] = bboxes_list
+             input_ids_dict_dict[image_id] = input_ids_dict
+             probs_dict_dict[image_id] = probs_bbox
+ 
+             df[image_id] = pd.DataFrame()
+             df[image_id]["bboxes"] = bboxes_list
+             df[image_id]["texts"] = [tokenizer.decode(input_ids_dict[str(bbox)]) for bbox in bboxes_list]
+             df[image_id]["labels"] = [id2label[probs_bbox[str(bbox)]] for bbox in bboxes_list]
+ 
+         return probs_bbox, bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df
+ 
+     else:
+         print("An error occurred while getting predictions!")
+ 
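
An illustration (not part of the committed file) of the per-line vote in predictions_line_level: the token probabilities collected for one line box are multiplied class by class (the reduce above), and the class with the largest product becomes the line label:

from functools import reduce
import numpy as np

token_probs = [[0.7, 0.2, 0.1],   # P(class 0..2) for token 1 of the line
               [0.6, 0.3, 0.1]]   # same for token 2
per_class = np.array(token_probs).T.tolist()
probs_label = [reduce(lambda x, y: x * y, probs_list) for probs_list in per_class]
print(probs_label)                          # ~[0.42, 0.06, 0.01]
print(probs_label.index(max(probs_label)))  # 0 -> the line is labeled id2label[0]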
+ # get labeled images with line bounding boxes
+ def get_labeled_images(dataset, images_ids_list, bboxes_list_dict, probs_dict_dict):
+ 
+     labeled_images = list()
+ 
+     for i, image_id in enumerate(images_ids_list):
+ 
+         # get image
+         images_list = dataset.filter(lambda example: example["images_ids"] == image_id)["images"]
+         image = images_list[0]
+         width, height = image.size
+ 
+         # get predicted boxes and labels
+         bboxes_list = bboxes_list_dict[image_id]
+         probs_bbox = probs_dict_dict[image_id]
+ 
+         draw = ImageDraw.Draw(image)
+         # https://stackoverflow.com/questions/66274858/choosing-a-pil-imagefont-by-font-name-rather-than-filename-and-cross-platform-f
+         font = font_manager.FontProperties(family='sans-serif', weight='bold')
+         font_file = font_manager.findfont(font)
+         font_size = 30
+         font = ImageFont.truetype(font_file, font_size)
+ 
+         for bbox in bboxes_list:
+             predicted_label = id2label[probs_bbox[str(bbox)]]
+             draw.rectangle(bbox, outline=label2color[predicted_label])
+             draw.text((bbox[0] + 10, bbox[1] - font_size), text=predicted_label, fill=label2color[predicted_label], font=font)
+ 
+         labeled_images.append(image)
+ 
+     return labeled_images
+ 
+ # get data of encoded chunk
+ # note: expects the globals "dataset" and "encoded_dataset" created at inference time
+ def get_encoded_chunk_inference(index_chunk=None):
+ 
+     # get datasets
+     example = dataset
+     encoded_example = encoded_dataset
+ 
+     # get randomly a chunk in the dataset
+     if index_chunk is None: index_chunk = random.randint(0, len(encoded_example) - 1)
+     encoded_example = encoded_example[index_chunk]
+     encoded_image_ids = encoded_example["images_ids"]
+ 
+     # get the image
+     example = example.filter(lambda example: example["images_ids"] == encoded_image_ids)[0]
+     image = example["images"] # original image
+     width, height = image.size
+     page_no = example["page_no"]
+     num_pages = example["num_pages"]
+ 
+     # get boxes, texts, categories
+     bboxes, input_ids = encoded_example["normalized_bboxes"][1:-1], encoded_example["input_ids"][1:-1]
+     bboxes = [denormalize_box(bbox, width, height) for bbox in bboxes]
+     num_tokens = len(input_ids) + 2
+ 
+     # get unique bboxes and corresponding labels
+     bboxes_list, input_ids_list = list(), list()
+     input_ids_dict = dict()
+     bbox_prev = [-100, -100, -100, -100]
+     for i, (bbox, input_id) in enumerate(zip(bboxes, input_ids)):
+         if bbox != bbox_prev:
+             bboxes_list.append(bbox)
+             input_ids_dict[str(bbox)] = [input_id]
+         else:
+             input_ids_dict[str(bbox)].append(input_id)
+ 
+         # start_indexes_list.append(i)
+         bbox_prev = bbox
+ 
+     # do not keep "</s><pad><pad>..."
+     if input_ids_dict[str(bboxes_list[-1])][0] == (tokenizer.convert_tokens_to_ids('</s>')):
+         del input_ids_dict[str(bboxes_list[-1])]
+         bboxes_list = bboxes_list[:-1]
+ 
+     # get texts by line
+     input_ids_list = input_ids_dict.values()
+     texts_list = [tokenizer.decode(input_ids) for input_ids in input_ids_list]
+ 
+     # display DataFrame
+     df = pd.DataFrame({"texts": texts_list, "input_ids": input_ids_list, "bboxes": bboxes_list})
+ 
+     return image, df, num_tokens, page_no, num_pages
+ 
+ # display chunk of PDF image and its data
+ # note: uses the notebook helpers cv2_imshow (Google Colab) and display (IPython),
+ # so it is meant to be run in a notebook rather than in the Space itself
+ def display_chunk_lines_inference(index_chunk=None):
+ 
+     # get image and image data
+     image, df, num_tokens, page_no, num_pages = get_encoded_chunk_inference(index_chunk=index_chunk)
+ 
+     # get data from dataframe
+     input_ids = df["input_ids"]
+     texts = df["texts"]
+     bboxes = df["bboxes"]
+ 
+     print(f'Chunk ({num_tokens} tokens) of the PDF (page: {page_no+1} / {num_pages})\n')
+ 
+     # display image with bounding boxes
+     print(">> PDF image with bounding boxes of lines\n")
+     draw = ImageDraw.Draw(image)
+ 
+     labels = list()
+     for box, text in zip(bboxes, texts):
+         color = "red"
+         draw.rectangle(box, outline=color)
+ 
+     # resize image to half size
+     width, height = image.size
+     image = image.resize((int(0.5*width), int(0.5*height)))
+ 
+     # convert to cv and display
+     img = np.array(image, dtype='uint8') # PIL to cv2
+     cv2_imshow(img)
+     cv2.waitKey(0)
+ 
+     # display image dataframe
+     print("\n>> Dataframe of annotated lines\n")
+     cols = ["texts", "bboxes"]
+     df = df[cols]
+     display(df)
files/languages_iso.csv ADDED
@@ -0,0 +1,184 @@
+ Language,LangCode
+ Abkhazian,ab
+ Afar,aa
+ Afrikaans,af
+ Akan,ak
+ Albanian,sq
+ Amharic,am
+ Arabic,ar
+ Aragonese,an
+ Armenian,hy
+ Assamese,as
+ Avaric,av
+ Avestan,ae
+ Aymara,ay
+ Azerbaijani,az
+ Bambara,bm
+ Bashkir,ba
+ Basque,eu
+ Belarusian,be
+ Bengali,bn
+ Bislama,bi
+ Bosnian,bs
+ Breton,br
+ Bulgarian,bg
+ Burmese,my
+ "Catalan, Valencian",ca
+ Chamorro,ch
+ Chechen,ce
+ "Chichewa, Chewa, Nyanja",ny
+ Chinese,zh
+ "Church Slavonic, Old Slavonic, Old Church Slavonic",cu
+ Chuvash,cv
+ Cornish,kw
+ Corsican,co
+ Cree,cr
+ Croatian,hr
+ Czech,cs
+ Danish,da
+ "Divehi, Dhivehi, Maldivian",dv
+ "Dutch, Flemish",nl
+ Dzongkha,dz
+ English,en
+ Esperanto,eo
+ Estonian,et
+ Ewe,ee
+ Faroese,fo
+ Fijian,fj
+ Finnish,fi
+ French,fr
+ Western Frisian,fy
+ Fulah,ff
+ "Gaelic, Scottish Gaelic",gd
+ Galician,gl
+ Ganda,lg
+ Georgian,ka
+ German,de
+ "Greek, Modern (1453–)",el
+ "Kalaallisut, Greenlandic",kl
+ Guarani,gn
+ Gujarati,gu
+ "Haitian, Haitian Creole",ht
+ Hausa,ha
+ Hebrew,he
+ Herero,hz
+ Hindi,hi
+ Hiri Motu,ho
+ Hungarian,hu
+ Icelandic,is
+ Ido,io
+ Igbo,ig
+ Indonesian,id
+ Interlingua (International Auxiliary Language Association),ia
+ "Interlingue, Occidental",ie
+ Inuktitut,iu
+ Inupiaq,ik
+ Irish,ga
+ Italian,it
+ Japanese,ja
+ Javanese,jv
+ Kannada,kn
+ Kanuri,kr
+ Kashmiri,ks
+ Kazakh,kk
+ Central Khmer,km
+ "Kikuyu, Gikuyu",ki
+ Kinyarwanda,rw
+ "Kirghiz, Kyrgyz",ky
+ Komi,kv
+ Kongo,kg
+ Korean,ko
+ "Kuanyama, Kwanyama",kj
+ Kurdish,ku
+ Lao,lo
+ Latin,la
+ Latvian,lv
+ "Limburgan, Limburger, Limburgish",li
+ Lingala,ln
+ Lithuanian,lt
+ Luba-Katanga,lu
+ "Luxembourgish, Letzeburgesch",lb
+ Macedonian,mk
+ Malagasy,mg
+ Malay,ms
+ Malayalam,ml
+ Maltese,mt
+ Manx,gv
+ Maori,mi
+ Marathi,mr
+ Marshallese,mh
+ Mongolian,mn
+ Nauru,na
+ "Navajo, Navaho",nv
+ North Ndebele,nd
+ South Ndebele,nr
+ Ndonga,ng
+ Nepali,ne
+ Norwegian,no
+ Norwegian Bokmål,nb
+ Norwegian Nynorsk,nn
+ "Sichuan Yi, Nuosu",ii
+ Occitan,oc
+ Ojibwa,oj
+ Oriya,or
+ Oromo,om
+ "Ossetian, Ossetic",os
+ Pali,pi
+ "Pashto, Pushto",ps
+ Persian,fa
+ Polish,pl
+ Portuguese,pt
+ "Punjabi, Panjabi",pa
+ Quechua,qu
+ "Romanian, Moldavian, Moldovan",ro
+ Romansh,rm
+ Rundi,rn
+ Russian,ru
+ Northern Sami,se
+ Samoan,sm
+ Sango,sg
+ Sanskrit,sa
+ Sardinian,sc
+ Serbian,sr
+ Shona,sn
+ Sindhi,sd
+ "Sinhala, Sinhalese",si
+ Slovak,sk
+ Slovenian,sl
+ Somali,so
+ Southern Sotho,st
+ "Spanish, Castilian",es
+ Sundanese,su
+ Swahili,sw
+ Swati,ss
+ Swedish,sv
+ Tagalog,tl
+ Tahitian,ty
+ Tajik,tg
+ Tamil,ta
+ Tatar,tt
+ Telugu,te
+ Thai,th
+ Tibetan,bo
+ Tigrinya,ti
+ Tonga (Tonga Islands),to
+ Tsonga,ts
+ Tswana,tn
+ Turkish,tr
+ Turkmen,tk
+ Twi,tw
+ "Uighur, Uyghur",ug
+ Ukrainian,uk
+ Urdu,ur
+ Uzbek,uz
+ Venda,ve
+ Vietnamese,vi
+ Volapük,vo
+ Walloon,wa
+ Welsh,cy
+ Wolof,wo
+ Xhosa,xh
+ Yiddish,yi
+ Yoruba,yo
+ "Zhuang, Chuang",za
+ Zulu,zu
files/languages_tesseract.csv ADDED
@@ -0,0 +1,127 @@
+ Language,LangCode
+ Afrikaans,afr
+ Amharic,amh
+ Arabic,ara
+ Assamese,asm
+ Azerbaijani,aze
+ Azerbaijani - Cyrilic,aze_cyrl
+ Belarusian,bel
+ Bengali,ben
+ Tibetan,bod
+ Bosnian,bos
+ Breton,bre
+ Bulgarian,bul
+ Catalan; Valencian,cat
+ Cebuano,ceb
+ Czech,ces
+ Chinese - Simplified,chi_sim
+ Chinese - Traditional,chi_tra
+ Cherokee,chr
+ Corsican,cos
+ Welsh,cym
+ Danish,dan
+ Danish - Fraktur (contrib),dan_frak
+ German,deu
+ German - Fraktur (contrib),deu_frak
+ Dzongkha,dzo
+ "Greek, Modern (1453-)",ell
+ English,eng
+ "English, Middle (1100-1500)",enm
+ Esperanto,epo
+ Math / equation detection module,equ
+ Estonian,est
+ Basque,eus
+ Faroese,fao
+ Persian,fas
+ Filipino (old - Tagalog),fil
+ Finnish,fin
+ French,fra
+ German - Fraktur,frk
+ "French, Middle (ca.1400-1600)",frm
+ Western Frisian,fry
+ Scottish Gaelic,gla
+ Irish,gle
+ Galician,glg
+ "Greek, Ancient (to 1453) (contrib)",grc
+ Gujarati,guj
+ Haitian; Haitian Creole,hat
+ Hebrew,heb
+ Hindi,hin
+ Croatian,hrv
+ Hungarian,hun
+ Armenian,hye
+ Inuktitut,iku
+ Indonesian,ind
+ Icelandic,isl
+ Italian,ita
+ Italian - Old,ita_old
+ Javanese,jav
+ Japanese,jpn
+ Kannada,kan
+ Georgian,kat
+ Georgian - Old,kat_old
+ Kazakh,kaz
+ Central Khmer,khm
+ Kirghiz; Kyrgyz,kir
+ Kurmanji (Kurdish - Latin Script),kmr
+ Korean,kor
+ Korean (vertical),kor_vert
+ Kurdish (Arabic Script),kur
+ Lao,lao
+ Latin,lat
+ Latvian,lav
+ Lithuanian,lit
+ Luxembourgish,ltz
+ Malayalam,mal
+ Marathi,mar
+ Macedonian,mkd
+ Maltese,mlt
+ Mongolian,mon
+ Maori,mri
+ Malay,msa
+ Burmese,mya
+ Nepali,nep
+ Dutch; Flemish,nld
+ Norwegian,nor
+ Occitan (post 1500),oci
+ Oriya,ori
+ Orientation and script detection module,osd
+ Panjabi; Punjabi,pan
+ Polish,pol
+ Portuguese,por
+ Pushto; Pashto,pus
+ Quechua,que
+ Romanian; Moldavian; Moldovan,ron
+ Russian,rus
+ Sanskrit,san
+ Sinhala; Sinhalese,sin
+ Slovak,slk
+ Slovak - Fraktur (contrib),slk_frak
+ Slovenian,slv
+ Sindhi,snd
+ Spanish; Castilian,spa
+ Spanish; Castilian - Old,spa_old
+ Albanian,sqi
+ Serbian,srp
+ Serbian - Latin,srp_latn
+ Sundanese,sun
+ Swahili,swa
+ Swedish,swe
+ Syriac,syr
+ Tamil,tam
+ Tatar,tat
+ Telugu,tel
+ Tajik,tgk
+ Tagalog (new - Filipino),tgl
+ Thai,tha
+ Tigrinya,tir
+ Tonga,ton
+ Turkish,tur
+ Uighur; Uyghur,uig
+ Ukrainian,ukr
+ Urdu,urd
+ Uzbek,uzb
+ Uzbek - Cyrilic,uzb_cyrl
+ Vietnamese,vie
+ Yiddish,yid
+ Yoruba,yor
files/template.pdf ADDED
Binary file (29.4 kB).
 
files/wo_content.png ADDED
packages.txt ADDED
@@ -0,0 +1,2 @@
+ tesseract-ocr-all
+ poppler-utils
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ torch
+ transformers
+ datasets
+ pytesseract
+ opencv-python
+ pdf2image
+ pypdf
+ langdetect
+ gradio