Spaces:

seanpedrickcase
/

document_redaction

Sleeping

App Files Files Community

seanpedrickcase committed 25 days ago

Commit

6f96988

1 Parent(s): 4c95b3c

Corrected some issues with redacting multiple xlsx/docx files. Package updates.

Browse files

Files changed (9) hide show

README.md +1 -1
app.py +1 -1
cdk/cdk_stack.py +15 -0
index.qmd +1 -1
pyproject.toml +2 -2
requirements.txt +1 -1
tools/custom_image_analyser_engine.py +0 -1
tools/data_anonymise.py +39 -49
tools/redaction_review.py +1 -1

README.md CHANGED Viewed

@@ -10,7 +10,7 @@ license: agpl-3.0
 ---
 # Document redaction
-version: 0.8.0
 Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.

 ---
 # Document redaction
+version: 1.0.0
 Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.

app.py CHANGED Viewed

@@ -536,7 +536,7 @@ with app:
         pii_identification_method_drop_tabular = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = DEFAULT_PII_DETECTION_MODEL, choices=TABULAR_PII_DETECTION_MODELS)
-        with gr.Accordion("Anonymisation output format", open = False):
             anon_strat = gr.Radio(choices=["replace with 'REDACTED'", "replace with <ENTITY_NAME>", "redact completely", "hash", "mask"], label="Select an anonymisation method.", value = "redact completely") # , "encrypt", "fake_first_name" are also available, but are not currently included as not that useful in current form
         tabular_data_redact_btn = gr.Button("Redact text/data files", variant="primary")

         pii_identification_method_drop_tabular = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = DEFAULT_PII_DETECTION_MODEL, choices=TABULAR_PII_DETECTION_MODELS)
+        with gr.Accordion("Anonymisation output format - by default will replace PII with a blank space", open = False):
             anon_strat = gr.Radio(choices=["replace with 'REDACTED'", "replace with <ENTITY_NAME>", "redact completely", "hash", "mask"], label="Select an anonymisation method.", value = "redact completely") # , "encrypt", "fake_first_name" are also available, but are not currently included as not that useful in current form
         tabular_data_redact_btn = gr.Button("Redact text/data files", variant="primary")

cdk/cdk_stack.py CHANGED Viewed

@@ -990,6 +990,21 @@ class CdkStack(Stack):
                         "sourceVolume": epheremal_storage_volume_name,
                         "containerPath": "/tmp/gradio_tmp",
                         "readOnly": False
                     }
                 ],
                 "readonlyRootFilesystem": read_only_file_system,

                         "sourceVolume": epheremal_storage_volume_name,
                         "containerPath": "/tmp/gradio_tmp",
                         "readOnly": False
+                    },
+                    {
+                        "sourceVolume": epheremal_storage_volume_name,
+                        "containerPath": "/home/user/.paddlex",
+                        "readOnly": False
+                    },
+                    {
+                        "sourceVolume": epheremal_storage_volume_name,
+                        "containerPath": "/home/user/.local/share/spacy/data",
+                        "readOnly": False
+                    },
+                    {
+                        "sourceVolume": epheremal_storage_volume_name,
+                        "containerPath": "/usr/share/tessdata",
+                        "readOnly": False
                     }
                 ],
                 "readonlyRootFilesystem": read_only_file_system,

index.qmd CHANGED Viewed

@@ -2,7 +2,7 @@
 title: "Home"
 ---
-version: 0.7.1
 Welcome to the Document Redaction App documentation. This site provides comprehensive documentation for the Document Redaction App.

 title: "Home"
 ---
+version: 1.0.0
 Welcome to the Document Redaction App documentation. This site provides comprehensive documentation for the Document Redaction App.

pyproject.toml CHANGED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "doc_redaction"
-version = "0.8.0"
 description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -23,7 +23,7 @@ dependencies = [
     "spacy==3.8.7",
     # Direct URL dependency for spacy model
     "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
-    "gradio==5.42.0",
     "boto3==1.40.10",
     "pyarrow==21.0.0",
     "openpyxl==3.1.5",

 [project]
 name = "doc_redaction"
+version = "1.0.0"
 description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
 readme = "README.md"
 requires-python = ">=3.10"
     "spacy==3.8.7",
     # Direct URL dependency for spacy model
     "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
+    "gradio==5.43.1",
     "boto3==1.40.10",
     "pyarrow==21.0.0",
     "openpyxl==3.1.5",

requirements.txt CHANGED Viewed

@@ -10,7 +10,7 @@ pandas==2.3.1
 scikit-learn==1.7.1
 spacy==3.8.7
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
-gradio==5.42.0
 boto3==1.40.10
 pyarrow==21.0.0
 openpyxl==3.1.5

 scikit-learn==1.7.1
 spacy==3.8.7
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
+gradio==5.43.1
 boto3==1.40.10
 pyarrow==21.0.0
 openpyxl==3.1.5

tools/custom_image_analyser_engine.py CHANGED Viewed

@@ -699,7 +699,6 @@ class CustomImageAnalyzerEngine:
         page_text_mapping = list()
         all_text_line_results = list()
         comprehend_query_number = 0
-        print("custom_entities:", custom_entities)
         if not nlp_analyser:
             nlp_analyser = self.analyzer_engine

         page_text_mapping = list()
         all_text_line_results = list()
         comprehend_query_number = 0
         if not nlp_analyser:
             nlp_analyser = self.analyzer_engine

tools/data_anonymise.py CHANGED Viewed

@@ -49,7 +49,7 @@ def initial_clean(text:str) -> str:
     return text
 def process_recognizer_result(result:RecognizerResult, recognizer_result:RecognizerResult, data_row:int, dictionary_key:int, df_dict:Dict[str, List[Any]], keys_to_keep:List[str]) -> List[str]:
-        output = []
         if hasattr(result, 'value'):
             text = result.value[data_row]
@@ -89,7 +89,7 @@ def generate_decision_process_output(analyzer_results: List[DictAnalyzerResult],
     Returns:
         str: A string containing the detailed decision process output.
     """
-    decision_process_output = []
     keys_to_keep = ['entity_type', 'start', 'end']
     # Run through each column to analyse for PII
@@ -124,16 +124,10 @@ def anon_consistent_names(df:pd.DataFrame) -> pd.DataFrame:
     analyzer_results = batch_analyzer.analyze_dict(df_dict, language=DEFAULT_LANGUAGE)
     analyzer_results = list(analyzer_results)
-    # + tags=[]
     text = analyzer_results[3].value
-    # + tags=[]
     recognizer_result = str(analyzer_results[3].recognizer_results)
-    # + tags=[]
-    recognizer_result
-    # + tags=[]
     data_str = recognizer_result  # abbreviated for brevity
     # Adjusting the parse_dict function to handle trailing ']'
@@ -156,7 +150,7 @@ def anon_consistent_names(df:pd.DataFrame) -> pd.DataFrame:
     # Re-running the improved processing code
-    result = []
     for lst_str in list_strs:
         # Splitting each list string into individual dictionary strings
@@ -167,41 +161,30 @@ def anon_consistent_names(df:pd.DataFrame) -> pd.DataFrame:
         dicts = [parse_dict(d) for d in dict_strs]
         result.append(dicts)
-    #result
-    # + tags=[]
-    names = []
     for idx, paragraph in enumerate(text):
-        paragraph_texts = []
         for dictionary in result[idx]:
             if dictionary['type'] == 'PERSON':
                 paragraph_texts.append(paragraph[dictionary['start']:dictionary['end']])
         names.append(paragraph_texts)
-    # + tags=[]
     # Flatten the list of lists and extract unique names
     unique_names = list(set(name for sublist in names for name in sublist))
-    # + tags=[]
     fake_names = pd.Series(unique_names).apply(fake_first_name)
-    # + tags=[]
     mapping_df = pd.DataFrame(data={"Unique names":unique_names,
                     "Fake names": fake_names})
-    # + tags=[]
-    # Convert mapping dataframe to dictionary
     # Convert mapping dataframe to dictionary, adding word boundaries for full-word match
     name_map = {r'\b' + k + r'\b': v for k, v in zip(mapping_df['Unique names'], mapping_df['Fake names'])}
-    # + tags=[]
     name_map
-    # + tags=[]
     scrubbed_df_consistent_names = df.replace(name_map, regex = True)
-    # + tags=[]
     scrubbed_df_consistent_names
     return scrubbed_df_consistent_names
@@ -230,8 +213,8 @@ def handle_docx_anonymisation(
     # 1. Load the document and extract text elements
     doc = docx.Document(file_path)
-    text_elements = []  # This will store the actual docx objects (paragraphs, cells)
-    original_texts = [] # This will store the text from those objects
     # Extract from paragraphs
     for para in doc.paragraphs:
@@ -307,16 +290,16 @@ def anonymise_files_with_open_text(file_paths: List[str],
                          chosen_redact_entities: List[str],
                          in_allow_list: List[str] = None,
                          latest_file_completed: int = 0,
-                         out_message: list = [],
-                         out_file_paths: list = [],
-                         log_files_output_paths: list = [],
-                         in_excel_sheets: list = [],
                          first_loop_state: bool = False,
                          output_folder: str = OUTPUT_FOLDER,
-                         in_deny_list:list[str]=[],
                          max_fuzzy_spelling_mistakes_num:int=0,
                          pii_identification_method:str="Local",
-                         chosen_redact_comprehend_entities:List[str]=[],
                          comprehend_query_number:int=0,
                          aws_access_key_textbox:str='',
                          aws_secret_key_textbox:str='',
@@ -367,8 +350,8 @@ def anonymise_files_with_open_text(file_paths: List[str],
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
         latest_file_completed = 0
-        out_message = []
-        out_file_paths = []
     # Load file
     # If out message or out_file_paths are blank, change to a list so it can be appended to
@@ -378,23 +361,23 @@ def anonymise_files_with_open_text(file_paths: List[str],
     #print("log_files_output_paths:",log_files_output_paths)
     if isinstance(log_files_output_paths, str):
-        log_files_output_paths = []
     if not out_file_paths:
-        out_file_paths = []
     if isinstance(in_allow_list, list):
         if in_allow_list:
             in_allow_list_flat = in_allow_list
         else:
-            in_allow_list_flat = []
     elif isinstance(in_allow_list, pd.DataFrame):
         if not in_allow_list.empty:
             in_allow_list_flat = list(in_allow_list.iloc[:, 0].unique())
         else:
-            in_allow_list_flat = []
     else:
-        in_allow_list_flat = []
     anon_df = pd.DataFrame()
@@ -520,6 +503,9 @@ def anonymise_files_with_open_text(file_paths: List[str],
         actual_time_taken_number += out_time_float
         out_message.append("Anonymisation of file '" + out_file_part + "' successfully completed in")
         out_message_out = '\n'.join(out_message)
@@ -549,11 +535,11 @@ def tabular_anonymise_wrapper_func(
     file_type: str,
     anon_xlsx_export_file_name: str,
     log_files_output_paths: List[str],
-    in_deny_list: List[str]=[],
     max_fuzzy_spelling_mistakes_num:int=0,
     pii_identification_method:str="Local",
     comprehend_language: Optional[str] = None,
-    chosen_redact_comprehend_entities:List[str]=[],
     comprehend_query_number:int=0,
     comprehend_client:botocore.client.BaseClient="",
     nlp_analyser: AnalyzerEngine = nlp_analyser,
@@ -599,7 +585,7 @@ def tabular_anonymise_wrapper_func(
         Returns:
             A list containing the common strings.
         """
-        common_strings = []
         for string in list1:
             if string in list2:
                 common_strings.append(string)
@@ -615,7 +601,9 @@ def tabular_anonymise_wrapper_func(
     if any_cols_found == False:
         out_message = "No chosen columns found in dataframe: " + out_file_part
         print(out_message)
     else:
         chosen_cols_in_anon_df = get_common_strings(chosen_cols, all_cols_original_order)
@@ -686,11 +674,11 @@ def anonymise_script(df:pd.DataFrame,
                      anon_strat:str,
                      language:str,
                      chosen_redact_entities:List[str],
-                     in_allow_list:List[str]=[],
-                     in_deny_list:List[str]=[],
                      max_fuzzy_spelling_mistakes_num:int=0,
                      pii_identification_method:str="Local",
-                     chosen_redact_comprehend_entities:List[str]=[],
                      comprehend_query_number:int=0,
                      comprehend_client:botocore.client.BaseClient="",
                      custom_entities:List[str]=custom_entities,
@@ -714,18 +702,20 @@ def anonymise_script(df:pd.DataFrame,
         if in_allow_list:
             in_allow_list_flat = in_allow_list
         else:
-            in_allow_list_flat = []
     elif isinstance(in_allow_list, pd.DataFrame):
         if not in_allow_list.empty:
             in_allow_list_flat = list(in_allow_list.iloc[:, 0].unique())
         else:
-            in_allow_list_flat = []
     else:
-        in_allow_list_flat = []
     ### Language check - check if selected language packs exist
     try:
-        progress(0.1, desc=f"Loading SpaCy model for {language}")
         load_spacy_model(language)
     except Exception as e:
@@ -748,7 +738,7 @@ def anonymise_script(df:pd.DataFrame,
             in_deny_list = in_deny_list.iloc[:, 0].tolist()
         else:
             # Handle the case where the DataFrame is empty
-            in_deny_list = []  # or some default value
         # Sort the strings in order from the longest string to the shortest
         in_deny_list = sorted(in_deny_list, key=len, reverse=True)
@@ -766,7 +756,7 @@ def anonymise_script(df:pd.DataFrame,
     batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
     anonymizer = AnonymizerEngine()#conflict_resolution=ConflictResolutionStrategy.MERGE_SIMILAR_OR_CONTAINED)
     batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
-    analyzer_results = []
     # Use provided comprehend language or fall back to main language
     language = language

     return text
 def process_recognizer_result(result:RecognizerResult, recognizer_result:RecognizerResult, data_row:int, dictionary_key:int, df_dict:Dict[str, List[Any]], keys_to_keep:List[str]) -> List[str]:
+        output = list()
         if hasattr(result, 'value'):
             text = result.value[data_row]
     Returns:
         str: A string containing the detailed decision process output.
     """
+    decision_process_output = list()
     keys_to_keep = ['entity_type', 'start', 'end']
     # Run through each column to analyse for PII
     analyzer_results = batch_analyzer.analyze_dict(df_dict, language=DEFAULT_LANGUAGE)
     analyzer_results = list(analyzer_results)
     text = analyzer_results[3].value
     recognizer_result = str(analyzer_results[3].recognizer_results)
     data_str = recognizer_result  # abbreviated for brevity
     # Adjusting the parse_dict function to handle trailing ']'
     # Re-running the improved processing code
+    result = list()
     for lst_str in list_strs:
         # Splitting each list string into individual dictionary strings
         dicts = [parse_dict(d) for d in dict_strs]
         result.append(dicts)
+    names = list()
     for idx, paragraph in enumerate(text):
+        paragraph_texts = list()
         for dictionary in result[idx]:
             if dictionary['type'] == 'PERSON':
                 paragraph_texts.append(paragraph[dictionary['start']:dictionary['end']])
         names.append(paragraph_texts)
     # Flatten the list of lists and extract unique names
     unique_names = list(set(name for sublist in names for name in sublist))
     fake_names = pd.Series(unique_names).apply(fake_first_name)
     mapping_df = pd.DataFrame(data={"Unique names":unique_names,
                     "Fake names": fake_names})
     # Convert mapping dataframe to dictionary, adding word boundaries for full-word match
     name_map = {r'\b' + k + r'\b': v for k, v in zip(mapping_df['Unique names'], mapping_df['Fake names'])}
     name_map
     scrubbed_df_consistent_names = df.replace(name_map, regex = True)
     scrubbed_df_consistent_names
     return scrubbed_df_consistent_names
     # 1. Load the document and extract text elements
     doc = docx.Document(file_path)
+    text_elements = list()  # This will store the actual docx objects (paragraphs, cells)
+    original_texts = list() # This will store the text from those objects
     # Extract from paragraphs
     for para in doc.paragraphs:
                          chosen_redact_entities: List[str],
                          in_allow_list: List[str] = None,
                          latest_file_completed: int = 0,
+                         out_message: list = list(),
+                         out_file_paths: list = list(),
+                         log_files_output_paths: list = list(),
+                         in_excel_sheets: list = list(),
                          first_loop_state: bool = False,
                          output_folder: str = OUTPUT_FOLDER,
+                         in_deny_list:list[str]=list(),
                          max_fuzzy_spelling_mistakes_num:int=0,
                          pii_identification_method:str="Local",
+                         chosen_redact_comprehend_entities:List[str]=list(),
                          comprehend_query_number:int=0,
                          aws_access_key_textbox:str='',
                          aws_secret_key_textbox:str='',
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
         latest_file_completed = 0
+        out_message = list()
+        out_file_paths = list()
     # Load file
     # If out message or out_file_paths are blank, change to a list so it can be appended to
     #print("log_files_output_paths:",log_files_output_paths)
     if isinstance(log_files_output_paths, str):
+        log_files_output_paths = list()
     if not out_file_paths:
+        out_file_paths = list()
     if isinstance(in_allow_list, list):
         if in_allow_list:
             in_allow_list_flat = in_allow_list
         else:
+            in_allow_list_flat = list()
     elif isinstance(in_allow_list, pd.DataFrame):
         if not in_allow_list.empty:
             in_allow_list_flat = list(in_allow_list.iloc[:, 0].unique())
         else:
+            in_allow_list_flat = list()
     else:
+        in_allow_list_flat = list()
     anon_df = pd.DataFrame()
         actual_time_taken_number += out_time_float
+        if isinstance(out_message, str):
+            out_message = [out_message]
         out_message.append("Anonymisation of file '" + out_file_part + "' successfully completed in")
         out_message_out = '\n'.join(out_message)
     file_type: str,
     anon_xlsx_export_file_name: str,
     log_files_output_paths: List[str],
+    in_deny_list: List[str]=list(),
     max_fuzzy_spelling_mistakes_num:int=0,
     pii_identification_method:str="Local",
     comprehend_language: Optional[str] = None,
+    chosen_redact_comprehend_entities:List[str]=list(),
     comprehend_query_number:int=0,
     comprehend_client:botocore.client.BaseClient="",
     nlp_analyser: AnalyzerEngine = nlp_analyser,
         Returns:
             A list containing the common strings.
         """
+        common_strings = list()
         for string in list1:
             if string in list2:
                 common_strings.append(string)
     if any_cols_found == False:
         out_message = "No chosen columns found in dataframe: " + out_file_part
+        key_string = ""
         print(out_message)
+        return out_file_paths, out_message, key_string, log_files_output_paths
     else:
         chosen_cols_in_anon_df = get_common_strings(chosen_cols, all_cols_original_order)
                      anon_strat:str,
                      language:str,
                      chosen_redact_entities:List[str],
+                     in_allow_list:List[str]=list(),
+                     in_deny_list:List[str]=list(),
                      max_fuzzy_spelling_mistakes_num:int=0,
                      pii_identification_method:str="Local",
+                     chosen_redact_comprehend_entities:List[str]=list(),
                      comprehend_query_number:int=0,
                      comprehend_client:botocore.client.BaseClient="",
                      custom_entities:List[str]=custom_entities,
         if in_allow_list:
             in_allow_list_flat = in_allow_list
         else:
+            in_allow_list_flat = list()
     elif isinstance(in_allow_list, pd.DataFrame):
         if not in_allow_list.empty:
             in_allow_list_flat = list(in_allow_list.iloc[:, 0].unique())
         else:
+            in_allow_list_flat = list()
     else:
+        in_allow_list_flat = list()
     ### Language check - check if selected language packs exist
     try:
+        if language != "en":
+            progress(0.1, desc=f"Loading SpaCy model for {language}")
         load_spacy_model(language)
     except Exception as e:
             in_deny_list = in_deny_list.iloc[:, 0].tolist()
         else:
             # Handle the case where the DataFrame is empty
+            in_deny_list = list()  # or some default value
         # Sort the strings in order from the longest string to the shortest
         in_deny_list = sorted(in_deny_list, key=len, reverse=True)
     batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
     anonymizer = AnonymizerEngine()#conflict_resolution=ConflictResolutionStrategy.MERGE_SIMILAR_OR_CONTAINED)
     batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
+    analyzer_results = list()
     # Use provided comprehend language or fall back to main language
     language = language

tools/redaction_review.py CHANGED Viewed

@@ -615,7 +615,7 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
     # Now, when we group, we use `sort=False`. This tells groupby to respect the
     # DataFrame's current order, which we have just manually set. This is slightly
     # more efficient than letting it sort again.
-    for image_path, group in merged_df.groupby('image', sort=False):
         # The progress.tqdm wrapper can be added back around the groupby object as you had it.
         # for image_path, group in progress.tqdm(merged_df.groupby('image', sort=False), ...):

     # Now, when we group, we use `sort=False`. This tells groupby to respect the
     # DataFrame's current order, which we have just manually set. This is slightly
     # more efficient than letting it sort again.
+    for image_path, group in merged_df.groupby('image', sort=False, observed=False):
         # The progress.tqdm wrapper can be added back around the groupby object as you had it.
         # for image_path, group in progress.tqdm(merged_df.groupby('image', sort=False), ...):