seanpedrickcase committed on
Commit
00f09d5
·
1 Parent(s): 87e1451

Package updates

Browse files
Files changed (4) hide show
  1. README.md +1 -1
  2. pyproject.toml +17 -18
  3. requirements.txt +17 -17
  4. tools/redaction_review.py +8 -8
README.md CHANGED
@@ -10,7 +10,7 @@ license: agpl-3.0
10
  ---
11
  # Document redaction
12
 
13
- version: 0.7.1
14
 
15
  Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
16
 
 
10
  ---
11
  # Document redaction
12
 
13
+ version: 0.8.0
14
 
15
  Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
16
 
pyproject.toml CHANGED
@@ -4,38 +4,37 @@ build-backend = "setuptools.build_meta"
4
 
5
  [project]
6
  name = "doc_redaction"
7
- version = "0.7.2"
8
  description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
9
  readme = "README.md"
10
  requires-python = ">=3.10"
11
 
12
  dependencies = [
13
- "pdfminer.six==20240706",
14
  "pdf2image==1.17.0",
15
- "pymupdf==1.26.1",
16
- "opencv-python==4.10.0.84",
17
- "presidio_analyzer==2.2.358",
18
- "presidio_anonymizer==2.2.358",
19
- "presidio-image-redactor==0.0.56",
20
- "pikepdf==9.5.2",
21
- "pandas==2.3.0",
22
- "scikit-learn==1.6.1",
23
  "spacy==3.8.7",
24
  # Direct URL dependency for spacy model
25
  "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
26
- "gradio==5.34.2",
27
- "boto3==1.39.4",
28
- "pyarrow==20.0.0",
29
  "openpyxl==3.1.5",
30
- "Faker==36.1.1",
31
- "python-levenshtein==0.26.1",
32
  "spaczz==0.6.1",
33
  # Direct URL dependency for gradio_image_annotator wheel
34
  "gradio_image_annotation @ https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl",
35
- "rapidfuzz==3.12.1",
36
  "python-dotenv==1.0.1",
37
- "numpy==1.26.4",
38
- "awslambdaric==3.0.1"
39
  ]
40
 
41
  [project.urls]
 
4
 
5
  [project]
6
  name = "doc_redaction"
7
+ version = "0.8.0"
8
  description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
9
  readme = "README.md"
10
  requires-python = ">=3.10"
11
 
12
  dependencies = [
13
+ "pdfminer.six==20250506",
14
  "pdf2image==1.17.0",
15
+ "pymupdf==1.26.3",
16
+ "opencv-python==4.12.0.88",
17
+ "presidio_analyzer==2.2.359",
18
+ "presidio_anonymizer==2.2.359",
19
+ "presidio-image-redactor==0.0.57",
20
+ "pikepdf==9.10.2",
21
+ "pandas==2.3.1",
22
+ "scikit-learn==1.7.1",
23
  "spacy==3.8.7",
24
  # Direct URL dependency for spacy model
25
  "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
26
+ "gradio==5.42.0",
27
+ "boto3==1.40.10",
28
+ "pyarrow==21.0.0",
29
  "openpyxl==3.1.5",
30
+ "Faker==37.5.3",
31
+ "python-levenshtein==0.27.1",
32
  "spaczz==0.6.1",
33
  # Direct URL dependency for gradio_image_annotator wheel
34
  "gradio_image_annotation @ https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl",
35
+ "rapidfuzz==3.13.0",
36
  "python-dotenv==1.0.1",
37
+ "awslambdaric==3.1.1"
 
38
  ]
39
 
40
  [project.urls]
requirements.txt CHANGED
@@ -1,28 +1,28 @@
1
- pdfminer.six==20240706
2
  pdf2image==1.17.0
3
- pymupdf==1.26.1
4
- opencv-python==4.10.0.84
5
- presidio_analyzer==2.2.358
6
- presidio_anonymizer==2.2.358
7
- presidio-image-redactor==0.0.56
8
- pikepdf==9.5.2
9
- pandas==2.3.0
10
- scikit-learn==1.6.1
11
  spacy==3.8.7
12
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
13
- gradio==5.34.2
14
- boto3==1.39.4
15
- pyarrow==20.0.0
16
  openpyxl==3.1.5
17
- Faker==36.1.1
18
- python-levenshtein==0.26.1
19
  spaczz==0.6.1
20
  # The following version
21
  https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
22
- rapidfuzz==3.12.1
23
  python-dotenv==1.0.1
24
- numpy==1.26.4
25
- awslambdaric==3.0.1
26
 
27
 
28
 
 
1
+ pdfminer.six==20250506
2
  pdf2image==1.17.0
3
+ pymupdf==1.26.3
4
+ opencv-python==4.12.0.88
5
+ presidio_analyzer==2.2.359
6
+ presidio_anonymizer==2.2.359
7
+ presidio-image-redactor==0.0.57
8
+ pikepdf==9.10.2
9
+ pandas==2.3.1
10
+ scikit-learn==1.7.1
11
  spacy==3.8.7
12
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
13
+ gradio==5.42.0
14
+ boto3==1.40.10
15
+ pyarrow==21.0.0
16
  openpyxl==3.1.5
17
+ Faker==37.5.3
18
+ python-levenshtein==0.27.1
19
  spaczz==0.6.1
20
  # The following version
21
  https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
22
+ rapidfuzz==3.13.0
23
  python-dotenv==1.0.1
24
+ #numpy==1.26.4
25
+ awslambdaric==3.1.1
26
 
27
 
28
 
tools/redaction_review.py CHANGED
@@ -1,6 +1,5 @@
1
  import os
2
  import re
3
- import gradio as gr
4
  import pandas as pd
5
  import numpy as np
6
  import pandas as pd
@@ -17,6 +16,7 @@ import pymupdf
17
  from PIL import ImageDraw, Image
18
  from datetime import datetime, timezone, timedelta
19
  from collections import defaultdict
 
20
 
21
  from tools.config import OUTPUT_FOLDER, MAX_IMAGE_PIXELS, INPUT_FOLDER, COMPRESS_REDACTED_PDF
22
  from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json, process_single_page_for_image_conversion, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, remove_duplicate_images_with_blank_boxes, fill_missing_ids, divide_coordinates_by_page_sizes, save_pdf_with_or_without_compression, fill_missing_ids_in_list
@@ -473,8 +473,7 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
473
  existing_annotations_df: pd.DataFrame,
474
  existing_annotations_list: List[Dict],
475
  existing_recogniser_entity_df: pd.DataFrame,
476
- progress = gr.Progress(track_tqdm=True)
477
- ) -> Tuple[List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
478
  """
479
  This function processes filtered OCR results with words to create new annotation objects. It merges these new annotations with existing ones, ensuring that horizontally adjacent boxes are combined for cleaner redactions. The function also updates the existing recogniser entity DataFrame and returns the updated annotations in both DataFrame and list-of-dicts formats.
480
 
@@ -491,7 +490,7 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
491
  Tuple[List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing the updated annotations list, updated existing annotations list, updated annotations DataFrame, updated existing annotations DataFrame, updated recogniser entity DataFrame, and the original existing recogniser entity DataFrame.
492
  """
493
 
494
- progress(0.2, "Identifying new redactions to add")
495
  print("Identifying new redactions to add")
496
  if filtered_ocr_results_with_words_df.empty:
497
  print("No new annotations to add.")
@@ -520,11 +519,11 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
520
  'word_text': 'text'
521
  })
522
 
523
- progress(0.3, "Checking for adjacent annotations to merge...")
524
  print("Checking for adjacent annotations to merge...")
525
  new_annotations_df = _merge_horizontally_adjacent_boxes(new_annotations_df)
526
 
527
- progress(0.4, "Creating new redaction IDs...")
528
  print("Creating new redaction IDs...")
529
  existing_ids = set(existing_annotations_df['id'].dropna()) if 'id' in existing_annotations_df.columns else set()
530
  num_new_ids = len(new_annotations_df)
@@ -536,7 +535,7 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
536
 
537
  key_cols = ['page', 'label', 'xmin', 'ymin', 'xmax', 'ymax', 'text']
538
 
539
- progress(0.5, "Checking suggested redactions against existing")
540
 
541
  if existing_annotations_df.empty or not all(col in existing_annotations_df.columns for col in key_cols):
542
  unique_new_df = new_annotations_df
@@ -584,7 +583,6 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
584
  # 3. Sort the DataFrame based on this new custom order.
585
  merged_df = merged_df.sort_values('image')
586
 
587
- # --- NEW CODE END ---
588
 
589
  final_annotations_list = []
590
  box_cols = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
@@ -610,6 +608,8 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
610
  "image": image_path,
611
  "boxes": boxes
612
  })
 
 
613
 
614
  return final_annotations_list, existing_annotations_list, updated_annotations_df, existing_annotations_df, updated_recogniser_entity_df, existing_recogniser_entity_df
615
 
 
1
  import os
2
  import re
 
3
  import pandas as pd
4
  import numpy as np
5
  import pandas as pd
 
16
  from PIL import ImageDraw, Image
17
  from datetime import datetime, timezone, timedelta
18
  from collections import defaultdict
19
+ import gradio as gr
20
 
21
  from tools.config import OUTPUT_FOLDER, MAX_IMAGE_PIXELS, INPUT_FOLDER, COMPRESS_REDACTED_PDF
22
  from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json, process_single_page_for_image_conversion, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, remove_duplicate_images_with_blank_boxes, fill_missing_ids, divide_coordinates_by_page_sizes, save_pdf_with_or_without_compression, fill_missing_ids_in_list
 
473
  existing_annotations_df: pd.DataFrame,
474
  existing_annotations_list: List[Dict],
475
  existing_recogniser_entity_df: pd.DataFrame,
476
+ progress:gr.Progress=gr.Progress()) -> Tuple[List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
 
477
  """
478
  This function processes filtered OCR results with words to create new annotation objects. It merges these new annotations with existing ones, ensuring that horizontally adjacent boxes are combined for cleaner redactions. The function also updates the existing recogniser entity DataFrame and returns the updated annotations in both DataFrame and list-of-dicts formats.
479
 
 
490
  Tuple[List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing the updated annotations list, updated existing annotations list, updated annotations DataFrame, updated existing annotations DataFrame, updated recogniser entity DataFrame, and the original existing recogniser entity DataFrame.
491
  """
492
 
493
+ progress(0.2, desc="Identifying new redactions to add")
494
  print("Identifying new redactions to add")
495
  if filtered_ocr_results_with_words_df.empty:
496
  print("No new annotations to add.")
 
519
  'word_text': 'text'
520
  })
521
 
522
+ progress(0.3, desc="Checking for adjacent annotations to merge...")
523
  print("Checking for adjacent annotations to merge...")
524
  new_annotations_df = _merge_horizontally_adjacent_boxes(new_annotations_df)
525
 
526
+ progress(0.4, desc="Creating new redaction IDs...")
527
  print("Creating new redaction IDs...")
528
  existing_ids = set(existing_annotations_df['id'].dropna()) if 'id' in existing_annotations_df.columns else set()
529
  num_new_ids = len(new_annotations_df)
 
535
 
536
  key_cols = ['page', 'label', 'xmin', 'ymin', 'xmax', 'ymax', 'text']
537
 
538
+ progress(0.5, desc="Checking suggested redactions against existing")
539
 
540
  if existing_annotations_df.empty or not all(col in existing_annotations_df.columns for col in key_cols):
541
  unique_new_df = new_annotations_df
 
583
  # 3. Sort the DataFrame based on this new custom order.
584
  merged_df = merged_df.sort_values('image')
585
 
 
586
 
587
  final_annotations_list = []
588
  box_cols = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
 
608
  "image": image_path,
609
  "boxes": boxes
610
  })
611
+
612
+ progress(1.0, desc="Completed annotation processing")
613
 
614
  return final_annotations_list, existing_annotations_list, updated_annotations_df, existing_annotations_df, updated_recogniser_entity_df, existing_recogniser_entity_df
615