Commit
·
00f09d5
1
Parent(s):
87e1451
Package updates
Browse files
- README.md +1 -1
- pyproject.toml +17 -18
- requirements.txt +17 -17
- tools/redaction_review.py +8 -8
README.md
CHANGED
@@ -10,7 +10,7 @@ license: agpl-3.0
|
|
10 |
---
|
11 |
# Document redaction
|
12 |
|
13 |
-
version: 0.
|
14 |
|
15 |
Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
|
16 |
|
|
|
10 |
---
|
11 |
# Document redaction
|
12 |
|
13 |
+
version: 0.8.0
|
14 |
|
15 |
Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
|
16 |
|
pyproject.toml
CHANGED
@@ -4,38 +4,37 @@ build-backend = "setuptools.build_meta"
|
|
4 |
|
5 |
[project]
|
6 |
name = "doc_redaction"
|
7 |
-
version = "0.
|
8 |
description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
|
9 |
readme = "README.md"
|
10 |
requires-python = ">=3.10"
|
11 |
|
12 |
dependencies = [
|
13 |
-
"pdfminer.six==
|
14 |
"pdf2image==1.17.0",
|
15 |
-
"pymupdf==1.26.
|
16 |
-
"opencv-python==4.
|
17 |
-
"presidio_analyzer==2.2.
|
18 |
-
"presidio_anonymizer==2.2.
|
19 |
-
"presidio-image-redactor==0.0.
|
20 |
-
"pikepdf==9.
|
21 |
-
"pandas==2.3.
|
22 |
-
"scikit-learn==1.
|
23 |
"spacy==3.8.7",
|
24 |
# Direct URL dependency for spacy model
|
25 |
"en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
|
26 |
-
"gradio==5.
|
27 |
-
"boto3==1.
|
28 |
-
"pyarrow==
|
29 |
"openpyxl==3.1.5",
|
30 |
-
"Faker==
|
31 |
-
"python-levenshtein==0.
|
32 |
"spaczz==0.6.1",
|
33 |
# Direct URL dependency for gradio_image_annotator wheel
|
34 |
"gradio_image_annotation @ https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl",
|
35 |
-
"rapidfuzz==3.
|
36 |
"python-dotenv==1.0.1",
|
37 |
-
"
|
38 |
-
"awslambdaric==3.0.1"
|
39 |
]
|
40 |
|
41 |
[project.urls]
|
|
|
4 |
|
5 |
[project]
|
6 |
name = "doc_redaction"
|
7 |
+
version = "0.8.0"
|
8 |
description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
|
9 |
readme = "README.md"
|
10 |
requires-python = ">=3.10"
|
11 |
|
12 |
dependencies = [
|
13 |
+
"pdfminer.six==20250506",
|
14 |
"pdf2image==1.17.0",
|
15 |
+
"pymupdf==1.26.3",
|
16 |
+
"opencv-python==4.12.0.88",
|
17 |
+
"presidio_analyzer==2.2.359",
|
18 |
+
"presidio_anonymizer==2.2.359",
|
19 |
+
"presidio-image-redactor==0.0.57",
|
20 |
+
"pikepdf==9.10.2",
|
21 |
+
"pandas==2.3.1",
|
22 |
+
"scikit-learn==1.7.1",
|
23 |
"spacy==3.8.7",
|
24 |
# Direct URL dependency for spacy model
|
25 |
"en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
|
26 |
+
"gradio==5.42.0",
|
27 |
+
"boto3==1.40.10",
|
28 |
+
"pyarrow==21.0.0",
|
29 |
"openpyxl==3.1.5",
|
30 |
+
"Faker==37.5.3",
|
31 |
+
"python-levenshtein==0.27.1",
|
32 |
"spaczz==0.6.1",
|
33 |
# Direct URL dependency for gradio_image_annotator wheel
|
34 |
"gradio_image_annotation @ https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl",
|
35 |
+
"rapidfuzz==3.13.0",
|
36 |
"python-dotenv==1.0.1",
|
37 |
+
"awslambdaric==3.1.1"
|
|
|
38 |
]
|
39 |
|
40 |
[project.urls]
|
requirements.txt
CHANGED
@@ -1,28 +1,28 @@
|
|
1 |
-
pdfminer.six==
|
2 |
pdf2image==1.17.0
|
3 |
-
pymupdf==1.26.
|
4 |
-
opencv-python==4.
|
5 |
-
presidio_analyzer==2.2.
|
6 |
-
presidio_anonymizer==2.2.
|
7 |
-
presidio-image-redactor==0.0.
|
8 |
-
pikepdf==9.
|
9 |
-
pandas==2.3.
|
10 |
-
scikit-learn==1.
|
11 |
spacy==3.8.7
|
12 |
en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
|
13 |
-
gradio==5.
|
14 |
-
boto3==1.
|
15 |
-
pyarrow==
|
16 |
openpyxl==3.1.5
|
17 |
-
Faker==
|
18 |
-
python-levenshtein==0.
|
19 |
spaczz==0.6.1
|
20 |
# The following version of gradio_image_annotation is installed directly from its GitHub release wheel
|
21 |
https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
|
22 |
-
rapidfuzz==3.
|
23 |
python-dotenv==1.0.1
|
24 |
-
numpy==1.26.4
|
25 |
-
awslambdaric==3.
|
26 |
|
27 |
|
28 |
|
|
|
1 |
+
pdfminer.six==20250506
|
2 |
pdf2image==1.17.0
|
3 |
+
pymupdf==1.26.3
|
4 |
+
opencv-python==4.12.0.88
|
5 |
+
presidio_analyzer==2.2.359
|
6 |
+
presidio_anonymizer==2.2.359
|
7 |
+
presidio-image-redactor==0.0.57
|
8 |
+
pikepdf==9.10.2
|
9 |
+
pandas==2.3.1
|
10 |
+
scikit-learn==1.7.1
|
11 |
spacy==3.8.7
|
12 |
en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
|
13 |
+
gradio==5.42.0
|
14 |
+
boto3==1.40.10
|
15 |
+
pyarrow==21.0.0
|
16 |
openpyxl==3.1.5
|
17 |
+
Faker==37.5.3
|
18 |
+
python-levenshtein==0.27.1
|
19 |
spaczz==0.6.1
|
20 |
# The following version of gradio_image_annotation is installed directly from its GitHub release wheel
|
21 |
https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
|
22 |
+
rapidfuzz==3.13.0
|
23 |
python-dotenv==1.0.1
|
24 |
+
#numpy==1.26.4
|
25 |
+
awslambdaric==3.1.1
|
26 |
|
27 |
|
28 |
|
tools/redaction_review.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
import os
|
2 |
import re
|
3 |
-
import gradio as gr
|
4 |
import pandas as pd
|
5 |
import numpy as np
|
6 |
import pandas as pd
|
@@ -17,6 +16,7 @@ import pymupdf
|
|
17 |
from PIL import ImageDraw, Image
|
18 |
from datetime import datetime, timezone, timedelta
|
19 |
from collections import defaultdict
|
|
|
20 |
|
21 |
from tools.config import OUTPUT_FOLDER, MAX_IMAGE_PIXELS, INPUT_FOLDER, COMPRESS_REDACTED_PDF
|
22 |
from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json, process_single_page_for_image_conversion, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, remove_duplicate_images_with_blank_boxes, fill_missing_ids, divide_coordinates_by_page_sizes, save_pdf_with_or_without_compression, fill_missing_ids_in_list
|
@@ -473,8 +473,7 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
|
|
473 |
existing_annotations_df: pd.DataFrame,
|
474 |
existing_annotations_list: List[Dict],
|
475 |
existing_recogniser_entity_df: pd.DataFrame,
|
476 |
-
progress
|
477 |
-
) -> Tuple[List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
|
478 |
"""
|
479 |
This function processes filtered OCR results with words to create new annotation objects. It merges these new annotations with existing ones, ensuring that horizontally adjacent boxes are combined for cleaner redactions. The function also updates the existing recogniser entity DataFrame and returns the updated annotations in both DataFrame and list-of-dicts formats.
|
480 |
|
@@ -491,7 +490,7 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
|
|
491 |
Tuple[List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing the updated annotations list, updated existing annotations list, updated annotations DataFrame, updated existing annotations DataFrame, updated recogniser entity DataFrame, and the original existing recogniser entity DataFrame.
|
492 |
"""
|
493 |
|
494 |
-
progress(0.2, "Identifying new redactions to add")
|
495 |
print("Identifying new redactions to add")
|
496 |
if filtered_ocr_results_with_words_df.empty:
|
497 |
print("No new annotations to add.")
|
@@ -520,11 +519,11 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
|
|
520 |
'word_text': 'text'
|
521 |
})
|
522 |
|
523 |
-
progress(0.3, "Checking for adjacent annotations to merge...")
|
524 |
print("Checking for adjacent annotations to merge...")
|
525 |
new_annotations_df = _merge_horizontally_adjacent_boxes(new_annotations_df)
|
526 |
|
527 |
-
progress(0.4, "Creating new redaction IDs...")
|
528 |
print("Creating new redaction IDs...")
|
529 |
existing_ids = set(existing_annotations_df['id'].dropna()) if 'id' in existing_annotations_df.columns else set()
|
530 |
num_new_ids = len(new_annotations_df)
|
@@ -536,7 +535,7 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
|
|
536 |
|
537 |
key_cols = ['page', 'label', 'xmin', 'ymin', 'xmax', 'ymax', 'text']
|
538 |
|
539 |
-
progress(0.5, "Checking suggested redactions against existing")
|
540 |
|
541 |
if existing_annotations_df.empty or not all(col in existing_annotations_df.columns for col in key_cols):
|
542 |
unique_new_df = new_annotations_df
|
@@ -584,7 +583,6 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
|
|
584 |
# 3. Sort the DataFrame based on this new custom order.
|
585 |
merged_df = merged_df.sort_values('image')
|
586 |
|
587 |
-
# --- NEW CODE END ---
|
588 |
|
589 |
final_annotations_list = []
|
590 |
box_cols = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
|
@@ -610,6 +608,8 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
|
|
610 |
"image": image_path,
|
611 |
"boxes": boxes
|
612 |
})
|
|
|
|
|
613 |
|
614 |
return final_annotations_list, existing_annotations_list, updated_annotations_df, existing_annotations_df, updated_recogniser_entity_df, existing_recogniser_entity_df
|
615 |
|
|
|
1 |
import os
|
2 |
import re
|
|
|
3 |
import pandas as pd
|
4 |
import numpy as np
|
5 |
import pandas as pd
|
|
|
16 |
from PIL import ImageDraw, Image
|
17 |
from datetime import datetime, timezone, timedelta
|
18 |
from collections import defaultdict
|
19 |
+
import gradio as gr
|
20 |
|
21 |
from tools.config import OUTPUT_FOLDER, MAX_IMAGE_PIXELS, INPUT_FOLDER, COMPRESS_REDACTED_PDF
|
22 |
from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json, process_single_page_for_image_conversion, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, remove_duplicate_images_with_blank_boxes, fill_missing_ids, divide_coordinates_by_page_sizes, save_pdf_with_or_without_compression, fill_missing_ids_in_list
|
|
|
473 |
existing_annotations_df: pd.DataFrame,
|
474 |
existing_annotations_list: List[Dict],
|
475 |
existing_recogniser_entity_df: pd.DataFrame,
|
476 |
+
progress:gr.Progress=gr.Progress()) -> Tuple[List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
|
|
|
477 |
"""
|
478 |
This function processes filtered OCR results with words to create new annotation objects. It merges these new annotations with existing ones, ensuring that horizontally adjacent boxes are combined for cleaner redactions. The function also updates the existing recogniser entity DataFrame and returns the updated annotations in both DataFrame and list-of-dicts formats.
|
479 |
|
|
|
490 |
Tuple[List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing the updated annotations list, updated existing annotations list, updated annotations DataFrame, updated existing annotations DataFrame, updated recogniser entity DataFrame, and the original existing recogniser entity DataFrame.
|
491 |
"""
|
492 |
|
493 |
+
progress(0.2, desc="Identifying new redactions to add")
|
494 |
print("Identifying new redactions to add")
|
495 |
if filtered_ocr_results_with_words_df.empty:
|
496 |
print("No new annotations to add.")
|
|
|
519 |
'word_text': 'text'
|
520 |
})
|
521 |
|
522 |
+
progress(0.3, desc="Checking for adjacent annotations to merge...")
|
523 |
print("Checking for adjacent annotations to merge...")
|
524 |
new_annotations_df = _merge_horizontally_adjacent_boxes(new_annotations_df)
|
525 |
|
526 |
+
progress(0.4, desc="Creating new redaction IDs...")
|
527 |
print("Creating new redaction IDs...")
|
528 |
existing_ids = set(existing_annotations_df['id'].dropna()) if 'id' in existing_annotations_df.columns else set()
|
529 |
num_new_ids = len(new_annotations_df)
|
|
|
535 |
|
536 |
key_cols = ['page', 'label', 'xmin', 'ymin', 'xmax', 'ymax', 'text']
|
537 |
|
538 |
+
progress(0.5, desc="Checking suggested redactions against existing")
|
539 |
|
540 |
if existing_annotations_df.empty or not all(col in existing_annotations_df.columns for col in key_cols):
|
541 |
unique_new_df = new_annotations_df
|
|
|
583 |
# 3. Sort the DataFrame based on this new custom order.
|
584 |
merged_df = merged_df.sort_values('image')
|
585 |
|
|
|
586 |
|
587 |
final_annotations_list = []
|
588 |
box_cols = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
|
|
|
608 |
"image": image_path,
|
609 |
"boxes": boxes
|
610 |
})
|
611 |
+
|
612 |
+
progress(1.0, desc="Completed annotation processing")
|
613 |
|
614 |
return final_annotations_list, existing_annotations_list, updated_annotations_df, existing_annotations_df, updated_recogniser_entity_df, existing_recogniser_entity_df
|
615 |
|