Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase committed on Aug 15

Commit

00f09d5

1 Parent(s): 87e1451

Package updates

Browse files

Files changed (4) hide show

README.md +1 -1
pyproject.toml +17 -18
requirements.txt +17 -17
tools/redaction_review.py +8 -8

README.md CHANGED Viewed

@@ -10,7 +10,7 @@ license: agpl-3.0
 ---
 # Document redaction
-version: 0.7.1
 Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.

 ---
 # Document redaction
+version: 0.8.0
 Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.

pyproject.toml CHANGED Viewed

@@ -4,38 +4,37 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "doc_redaction"
-version = "0.7.2"
 description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
 readme = "README.md"
 requires-python = ">=3.10"
 dependencies = [
-    "pdfminer.six==20240706",
     "pdf2image==1.17.0",
-    "pymupdf==1.26.1",
-    "opencv-python==4.10.0.84",
-    "presidio_analyzer==2.2.358",
-    "presidio_anonymizer==2.2.358",
-    "presidio-image-redactor==0.0.56",
-    "pikepdf==9.5.2",
-    "pandas==2.3.0",
-    "scikit-learn==1.6.1",
     "spacy==3.8.7",
     # Direct URL dependency for spacy model
     "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
-    "gradio==5.34.2",
-    "boto3==1.39.4",
-    "pyarrow==20.0.0",
     "openpyxl==3.1.5",
-    "Faker==36.1.1",
-    "python-levenshtein==0.26.1",
     "spaczz==0.6.1",
     # Direct URL dependency for gradio_image_annotator wheel
     "gradio_image_annotation @ https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl",
-    "rapidfuzz==3.12.1",
     "python-dotenv==1.0.1",
-    "numpy==1.26.4",
-    "awslambdaric==3.0.1"
 ]
 [project.urls]

 [project]
 name = "doc_redaction"
+version = "0.8.0"
 description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
 readme = "README.md"
 requires-python = ">=3.10"
 dependencies = [
+    "pdfminer.six==20250506",
     "pdf2image==1.17.0",
+    "pymupdf==1.26.3",
+    "opencv-python==4.12.0.88",
+    "presidio_analyzer==2.2.359",
+    "presidio_anonymizer==2.2.359",
+    "presidio-image-redactor==0.0.57",
+    "pikepdf==9.10.2",
+    "pandas==2.3.1",
+    "scikit-learn==1.7.1",
     "spacy==3.8.7",
     # Direct URL dependency for spacy model
     "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
+    "gradio==5.42.0",
+    "boto3==1.40.10",
+    "pyarrow==21.0.0",
     "openpyxl==3.1.5",
+    "Faker==37.5.3",
+    "python-levenshtein==0.27.1",
     "spaczz==0.6.1",
     # Direct URL dependency for gradio_image_annotator wheel
     "gradio_image_annotation @ https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl",
+    "rapidfuzz==3.13.0",
     "python-dotenv==1.0.1",
+    "awslambdaric==3.1.1"
 ]
 [project.urls]

requirements.txt CHANGED Viewed

@@ -1,28 +1,28 @@
-pdfminer.six==20240706
 pdf2image==1.17.0
-pymupdf==1.26.1
-opencv-python==4.10.0.84
-presidio_analyzer==2.2.358
-presidio_anonymizer==2.2.358
-presidio-image-redactor==0.0.56
-pikepdf==9.5.2
-pandas==2.3.0
-scikit-learn==1.6.1
 spacy==3.8.7
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
-gradio==5.34.2
-boto3==1.39.4
-pyarrow==20.0.0
 openpyxl==3.1.5
-Faker==36.1.1
-python-levenshtein==0.26.1
 spaczz==0.6.1
 # The following version of gradio_image_annotation is installed directly from its GitHub release wheel
 https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
-rapidfuzz==3.12.1
 python-dotenv==1.0.1
-numpy==1.26.4
-awslambdaric==3.0.1

+pdfminer.six==20250506
 pdf2image==1.17.0
+pymupdf==1.26.3
+opencv-python==4.12.0.88
+presidio_analyzer==2.2.359
+presidio_anonymizer==2.2.359
+presidio-image-redactor==0.0.57
+pikepdf==9.10.2
+pandas==2.3.1
+scikit-learn==1.7.1
 spacy==3.8.7
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
+gradio==5.42.0
+boto3==1.40.10
+pyarrow==21.0.0
 openpyxl==3.1.5
+Faker==37.5.3
+python-levenshtein==0.27.1
 spaczz==0.6.1
 # The following version of gradio_image_annotation is installed directly from its GitHub release wheel
 https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
+rapidfuzz==3.13.0
 python-dotenv==1.0.1
+#numpy==1.26.4
+awslambdaric==3.1.1

tools/redaction_review.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import os
 import re
-import gradio as gr
 import pandas as pd
 import numpy as np
 import pandas as pd
@@ -17,6 +16,7 @@ import pymupdf
 from PIL import ImageDraw, Image
 from datetime import datetime, timezone, timedelta
 from collections import defaultdict
 from tools.config import OUTPUT_FOLDER, MAX_IMAGE_PIXELS, INPUT_FOLDER, COMPRESS_REDACTED_PDF
 from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json, process_single_page_for_image_conversion, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, remove_duplicate_images_with_blank_boxes, fill_missing_ids, divide_coordinates_by_page_sizes, save_pdf_with_or_without_compression, fill_missing_ids_in_list
@@ -473,8 +473,7 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
     existing_annotations_df: pd.DataFrame,
     existing_annotations_list: List[Dict],
     existing_recogniser_entity_df: pd.DataFrame,
-    progress = gr.Progress(track_tqdm=True)
-) -> Tuple[List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
     """
     This function processes filtered OCR results with words to create new annotation objects. It merges these new annotations with existing ones, ensuring that horizontally adjacent boxes are combined for cleaner redactions. The function also updates the existing recogniser entity DataFrame and returns the updated annotations in both DataFrame and list-of-dicts formats.
@@ -491,7 +490,7 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
         Tuple[List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing the updated annotations list, updated existing annotations list, updated annotations DataFrame, updated existing annotations DataFrame, updated recogniser entity DataFrame, and the original existing recogniser entity DataFrame.
     """
-    progress(0.2, "Identifying new redactions to add")
     print("Identifying new redactions to add")
     if filtered_ocr_results_with_words_df.empty:
         print("No new annotations to add.")
@@ -520,11 +519,11 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
                 'word_text': 'text'
             })
-            progress(0.3, "Checking for adjacent annotations to merge...")
             print("Checking for adjacent annotations to merge...")
             new_annotations_df = _merge_horizontally_adjacent_boxes(new_annotations_df)
-            progress(0.4, "Creating new redaction IDs...")
             print("Creating new redaction IDs...")
             existing_ids = set(existing_annotations_df['id'].dropna()) if 'id' in existing_annotations_df.columns else set()
             num_new_ids = len(new_annotations_df)
@@ -536,7 +535,7 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
             key_cols = ['page', 'label', 'xmin', 'ymin', 'xmax', 'ymax', 'text']
-            progress(0.5, "Checking suggested redactions against existing")
             if existing_annotations_df.empty or not all(col in existing_annotations_df.columns for col in key_cols):
                 unique_new_df = new_annotations_df
@@ -584,7 +583,6 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
     # 3. Sort the DataFrame based on this new custom order.
     merged_df = merged_df.sort_values('image')
-    # --- NEW CODE END ---
     final_annotations_list = []
     box_cols = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
@@ -610,6 +608,8 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
             "image": image_path,
             "boxes": boxes
         })
     return final_annotations_list, existing_annotations_list, updated_annotations_df, existing_annotations_df, updated_recogniser_entity_df, existing_recogniser_entity_df

 import os
 import re
 import pandas as pd
 import numpy as np
 import pandas as pd
 from PIL import ImageDraw, Image
 from datetime import datetime, timezone, timedelta
 from collections import defaultdict
+import gradio as gr
 from tools.config import OUTPUT_FOLDER, MAX_IMAGE_PIXELS, INPUT_FOLDER, COMPRESS_REDACTED_PDF
 from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json, process_single_page_for_image_conversion, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, remove_duplicate_images_with_blank_boxes, fill_missing_ids, divide_coordinates_by_page_sizes, save_pdf_with_or_without_compression, fill_missing_ids_in_list
     existing_annotations_df: pd.DataFrame,
     existing_annotations_list: List[Dict],
     existing_recogniser_entity_df: pd.DataFrame,
+    progress:gr.Progress=gr.Progress()) -> Tuple[List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
     """
     This function processes filtered OCR results with words to create new annotation objects. It merges these new annotations with existing ones, ensuring that horizontally adjacent boxes are combined for cleaner redactions. The function also updates the existing recogniser entity DataFrame and returns the updated annotations in both DataFrame and list-of-dicts formats.
         Tuple[List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing the updated annotations list, updated existing annotations list, updated annotations DataFrame, updated existing annotations DataFrame, updated recogniser entity DataFrame, and the original existing recogniser entity DataFrame.
     """
+    progress(0.2, desc="Identifying new redactions to add")
     print("Identifying new redactions to add")
     if filtered_ocr_results_with_words_df.empty:
         print("No new annotations to add.")
                 'word_text': 'text'
             })
+            progress(0.3, desc="Checking for adjacent annotations to merge...")
             print("Checking for adjacent annotations to merge...")
             new_annotations_df = _merge_horizontally_adjacent_boxes(new_annotations_df)
+            progress(0.4, desc="Creating new redaction IDs...")
             print("Creating new redaction IDs...")
             existing_ids = set(existing_annotations_df['id'].dropna()) if 'id' in existing_annotations_df.columns else set()
             num_new_ids = len(new_annotations_df)
             key_cols = ['page', 'label', 'xmin', 'ymin', 'xmax', 'ymax', 'text']
+            progress(0.5, desc="Checking suggested redactions against existing")
             if existing_annotations_df.empty or not all(col in existing_annotations_df.columns for col in key_cols):
                 unique_new_df = new_annotations_df
     # 3. Sort the DataFrame based on this new custom order.
     merged_df = merged_df.sort_values('image')
     final_annotations_list = []
     box_cols = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
             "image": image_path,
             "boxes": boxes
         })
+    progress(1.0, desc="Completed annotation processing")
     return final_annotations_list, existing_annotations_list, updated_annotations_df, existing_annotations_df, updated_recogniser_entity_df, existing_recogniser_entity_df