seanpedrickcase committed on
Commit 6f96988 · 1 Parent(s): 4c95b3c

Corrected some issues with redacting multiple xlsx/docx files; package updates.

README.md CHANGED
@@ -10,7 +10,7 @@ license: agpl-3.0
 ---
 # Document redaction
 
-version: 0.8.0
+version: 1.0.0
 
 Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
 
app.py CHANGED
@@ -536,7 +536,7 @@ with app:
 
                 pii_identification_method_drop_tabular = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = DEFAULT_PII_DETECTION_MODEL, choices=TABULAR_PII_DETECTION_MODELS)
 
-                with gr.Accordion("Anonymisation output format", open = False):
+                with gr.Accordion("Anonymisation output format - by default will replace PII with a blank space", open = False):
                     anon_strat = gr.Radio(choices=["replace with 'REDACTED'", "replace with <ENTITY_NAME>", "redact completely", "hash", "mask"], label="Select an anonymisation method.", value = "redact completely") # , "encrypt", "fake_first_name" are also available, but are not currently included as not that useful in current form
 
                 tabular_data_redact_btn = gr.Button("Redact text/data files", variant="primary")
cdk/cdk_stack.py CHANGED
@@ -990,6 +990,21 @@ class CdkStack(Stack):
                         "sourceVolume": epheremal_storage_volume_name,
                         "containerPath": "/tmp/gradio_tmp",
                         "readOnly": False
+                    },
+                    {
+                        "sourceVolume": epheremal_storage_volume_name,
+                        "containerPath": "/home/user/.paddlex",
+                        "readOnly": False
+                    },
+                    {
+                        "sourceVolume": epheremal_storage_volume_name,
+                        "containerPath": "/home/user/.local/share/spacy/data",
+                        "readOnly": False
+                    },
+                    {
+                        "sourceVolume": epheremal_storage_volume_name,
+                        "containerPath": "/usr/share/tessdata",
+                        "readOnly": False
                     }
                 ],
                 "readonlyRootFilesystem": read_only_file_system,
index.qmd CHANGED
@@ -2,7 +2,7 @@
 title: "Home"
 ---
 
-version: 0.7.1
+version: 1.0.0
 
 Welcome to the Document Redaction App documentation. This site provides comprehensive documentation for the Document Redaction App.
 
pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "doc_redaction"
-version = "0.8.0"
+version = "1.0.0"
 description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -23,7 +23,7 @@ dependencies = [
     "spacy==3.8.7",
     # Direct URL dependency for spacy model
     "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
-    "gradio==5.42.0",
+    "gradio==5.43.1",
     "boto3==1.40.10",
     "pyarrow==21.0.0",
     "openpyxl==3.1.5",
requirements.txt CHANGED
@@ -10,7 +10,7 @@ pandas==2.3.1
 scikit-learn==1.7.1
 spacy==3.8.7
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
-gradio==5.42.0
+gradio==5.43.1
 boto3==1.40.10
 pyarrow==21.0.0
 openpyxl==3.1.5
tools/custom_image_analyser_engine.py CHANGED
@@ -699,7 +699,6 @@ class CustomImageAnalyzerEngine:
         page_text_mapping = list()
         all_text_line_results = list()
         comprehend_query_number = 0
-        print("custom_entities:", custom_entities)
 
         if not nlp_analyser:
             nlp_analyser = self.analyzer_engine
tools/data_anonymise.py CHANGED
@@ -49,7 +49,7 @@ def initial_clean(text:str) -> str:
     return text
 
 def process_recognizer_result(result:RecognizerResult, recognizer_result:RecognizerResult, data_row:int, dictionary_key:int, df_dict:Dict[str, List[Any]], keys_to_keep:List[str]) -> List[str]:
-    output = []
+    output = list()
 
     if hasattr(result, 'value'):
         text = result.value[data_row]
@@ -89,7 +89,7 @@ def generate_decision_process_output(analyzer_results: List[DictAnalyzerResult],
     Returns:
         str: A string containing the detailed decision process output.
     """
-    decision_process_output = []
+    decision_process_output = list()
     keys_to_keep = ['entity_type', 'start', 'end']
 
     # Run through each column to analyse for PII
@@ -124,16 +124,10 @@ def anon_consistent_names(df:pd.DataFrame) -> pd.DataFrame:
     analyzer_results = batch_analyzer.analyze_dict(df_dict, language=DEFAULT_LANGUAGE)
     analyzer_results = list(analyzer_results)
 
-    # + tags=[]
     text = analyzer_results[3].value
 
-    # + tags=[]
     recognizer_result = str(analyzer_results[3].recognizer_results)
 
-    # + tags=[]
-    recognizer_result
-
-    # + tags=[]
     data_str = recognizer_result # abbreviated for brevity
 
     # Adjusting the parse_dict function to handle trailing ']'
@@ -156,7 +150,7 @@ def anon_consistent_names(df:pd.DataFrame) -> pd.DataFrame:
 
     # Re-running the improved processing code
 
-    result = []
+    result = list()
 
     for lst_str in list_strs:
         # Splitting each list string into individual dictionary strings
@@ -167,41 +161,30 @@ def anon_consistent_names(df:pd.DataFrame) -> pd.DataFrame:
         dicts = [parse_dict(d) for d in dict_strs]
         result.append(dicts)
 
-    #result
-
-    # + tags=[]
-    names = []
+    names = list()
 
     for idx, paragraph in enumerate(text):
-        paragraph_texts = []
+        paragraph_texts = list()
         for dictionary in result[idx]:
             if dictionary['type'] == 'PERSON':
                 paragraph_texts.append(paragraph[dictionary['start']:dictionary['end']])
         names.append(paragraph_texts)
 
-    # + tags=[]
     # Flatten the list of lists and extract unique names
     unique_names = list(set(name for sublist in names for name in sublist))
 
-    # + tags=[]
     fake_names = pd.Series(unique_names).apply(fake_first_name)
 
-    # + tags=[]
     mapping_df = pd.DataFrame(data={"Unique names":unique_names,
                                     "Fake names": fake_names})
 
-    # + tags=[]
-    # Convert mapping dataframe to dictionary
     # Convert mapping dataframe to dictionary, adding word boundaries for full-word match
     name_map = {r'\b' + k + r'\b': v for k, v in zip(mapping_df['Unique names'], mapping_df['Fake names'])}
 
-    # + tags=[]
     name_map
 
-    # + tags=[]
     scrubbed_df_consistent_names = df.replace(name_map, regex = True)
 
-    # + tags=[]
     scrubbed_df_consistent_names
 
     return scrubbed_df_consistent_names
@@ -230,8 +213,8 @@ def handle_docx_anonymisation(
 
     # 1. Load the document and extract text elements
    doc = docx.Document(file_path)
-    text_elements = [] # This will store the actual docx objects (paragraphs, cells)
-    original_texts = [] # This will store the text from those objects
+    text_elements = list() # This will store the actual docx objects (paragraphs, cells)
+    original_texts = list() # This will store the text from those objects
 
     # Extract from paragraphs
     for para in doc.paragraphs:
@@ -307,16 +290,16 @@ def anonymise_files_with_open_text(file_paths: List[str],
                                    chosen_redact_entities: List[str],
                                    in_allow_list: List[str] = None,
                                    latest_file_completed: int = 0,
-                                   out_message: list = [],
-                                   out_file_paths: list = [],
-                                   log_files_output_paths: list = [],
-                                   in_excel_sheets: list = [],
+                                   out_message: list = list(),
+                                   out_file_paths: list = list(),
+                                   log_files_output_paths: list = list(),
+                                   in_excel_sheets: list = list(),
                                    first_loop_state: bool = False,
                                    output_folder: str = OUTPUT_FOLDER,
-                                   in_deny_list:list[str]=[],
+                                   in_deny_list:list[str]=list(),
                                    max_fuzzy_spelling_mistakes_num:int=0,
                                    pii_identification_method:str="Local",
-                                   chosen_redact_comprehend_entities:List[str]=[],
+                                   chosen_redact_comprehend_entities:List[str]=list(),
                                    comprehend_query_number:int=0,
                                    aws_access_key_textbox:str='',
                                    aws_secret_key_textbox:str='',
@@ -367,8 +350,8 @@ def anonymise_files_with_open_text(file_paths: List[str],
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
         latest_file_completed = 0
-        out_message = []
-        out_file_paths = []
+        out_message = list()
+        out_file_paths = list()
 
     # Load file
     # If out message or out_file_paths are blank, change to a list so it can be appended to
@@ -378,23 +361,23 @@ def anonymise_files_with_open_text(file_paths: List[str],
     #print("log_files_output_paths:",log_files_output_paths)
 
     if isinstance(log_files_output_paths, str):
-        log_files_output_paths = []
+        log_files_output_paths = list()
 
     if not out_file_paths:
-        out_file_paths = []
+        out_file_paths = list()
 
     if isinstance(in_allow_list, list):
         if in_allow_list:
             in_allow_list_flat = in_allow_list
         else:
-            in_allow_list_flat = []
+            in_allow_list_flat = list()
     elif isinstance(in_allow_list, pd.DataFrame):
         if not in_allow_list.empty:
             in_allow_list_flat = list(in_allow_list.iloc[:, 0].unique())
         else:
-            in_allow_list_flat = []
+            in_allow_list_flat = list()
     else:
-        in_allow_list_flat = []
+        in_allow_list_flat = list()
 
     anon_df = pd.DataFrame()
 
@@ -520,6 +503,9 @@ def anonymise_files_with_open_text(file_paths: List[str],
 
         actual_time_taken_number += out_time_float
 
+        if isinstance(out_message, str):
+            out_message = [out_message]
+
         out_message.append("Anonymisation of file '" + out_file_part + "' successfully completed in")
 
         out_message_out = '\n'.join(out_message)
@@ -549,11 +535,11 @@ def tabular_anonymise_wrapper_func(
        file_type: str,
        anon_xlsx_export_file_name: str,
        log_files_output_paths: List[str],
-       in_deny_list: List[str]=[],
+       in_deny_list: List[str]=list(),
        max_fuzzy_spelling_mistakes_num:int=0,
        pii_identification_method:str="Local",
        comprehend_language: Optional[str] = None,
-       chosen_redact_comprehend_entities:List[str]=[],
+       chosen_redact_comprehend_entities:List[str]=list(),
        comprehend_query_number:int=0,
        comprehend_client:botocore.client.BaseClient="",
        nlp_analyser: AnalyzerEngine = nlp_analyser,
@@ -599,7 +585,7 @@ def tabular_anonymise_wrapper_func(
        Returns:
            A list containing the common strings.
        """
-        common_strings = []
+        common_strings = list()
        for string in list1:
            if string in list2:
                common_strings.append(string)
@@ -615,7 +601,9 @@ def tabular_anonymise_wrapper_func(
 
    if any_cols_found == False:
        out_message = "No chosen columns found in dataframe: " + out_file_part
+        key_string = ""
        print(out_message)
+        return out_file_paths, out_message, key_string, log_files_output_paths
    else:
        chosen_cols_in_anon_df = get_common_strings(chosen_cols, all_cols_original_order)
 
@@ -686,11 +674,11 @@ def anonymise_script(df:pd.DataFrame,
                      anon_strat:str,
                      language:str,
                      chosen_redact_entities:List[str],
-                      in_allow_list:List[str]=[],
-                      in_deny_list:List[str]=[],
+                      in_allow_list:List[str]=list(),
+                      in_deny_list:List[str]=list(),
                      max_fuzzy_spelling_mistakes_num:int=0,
                      pii_identification_method:str="Local",
-                      chosen_redact_comprehend_entities:List[str]=[],
+                      chosen_redact_comprehend_entities:List[str]=list(),
                      comprehend_query_number:int=0,
                      comprehend_client:botocore.client.BaseClient="",
                      custom_entities:List[str]=custom_entities,
@@ -714,18 +702,20 @@ def anonymise_script(df:pd.DataFrame,
        if in_allow_list:
            in_allow_list_flat = in_allow_list
        else:
-            in_allow_list_flat = []
+            in_allow_list_flat = list()
    elif isinstance(in_allow_list, pd.DataFrame):
        if not in_allow_list.empty:
            in_allow_list_flat = list(in_allow_list.iloc[:, 0].unique())
        else:
-            in_allow_list_flat = []
+            in_allow_list_flat = list()
    else:
-        in_allow_list_flat = []
+        in_allow_list_flat = list()
 
    ### Language check - check if selected language packs exist
    try:
-        progress(0.1, desc=f"Loading SpaCy model for {language}")
+        if language != "en":
+            progress(0.1, desc=f"Loading SpaCy model for {language}")
+
        load_spacy_model(language)
 
    except Exception as e:
@@ -748,7 +738,7 @@ def anonymise_script(df:pd.DataFrame,
        in_deny_list = in_deny_list.iloc[:, 0].tolist()
    else:
        # Handle the case where the DataFrame is empty
-        in_deny_list = [] # or some default value
+        in_deny_list = list() # or some default value
 
    # Sort the strings in order from the longest string to the shortest
    in_deny_list = sorted(in_deny_list, key=len, reverse=True)
@@ -766,7 +756,7 @@ def anonymise_script(df:pd.DataFrame,
    batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
    anonymizer = AnonymizerEngine()#conflict_resolution=ConflictResolutionStrategy.MERGE_SIMILAR_OR_CONTAINED)
    batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
-    analyzer_results = []
+    analyzer_results = list()
 
    # Use provided comprehend language or fall back to main language
    language = language
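Note: the two behavioural fixes in this file are the early return when none of the chosen columns exist in the dataframe, and normalising `out_message` to a list before appending. A minimal, standalone sketch of the second pattern (the file names are illustrative, not from the app) shows why the guard matters: a plain string has no `append` method, so a message left as a string by an earlier file would otherwise raise on the next iteration.

```python
# Sketch of the out_message guard added above; values are illustrative.
out_message = "Anonymisation of file 'report.xlsx' successfully completed in 2.1 seconds"

# Without this check, out_message.append(...) would raise AttributeError on a str.
if isinstance(out_message, str):
    out_message = [out_message]

out_message.append("Anonymisation of file 'summary.docx' successfully completed in")
print("\n".join(out_message))
```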
tools/redaction_review.py CHANGED
@@ -615,7 +615,7 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
     # Now, when we group, we use `sort=False`. This tells groupby to respect the
     # DataFrame's current order, which we have just manually set. This is slightly
     # more efficient than letting it sort again.
-    for image_path, group in merged_df.groupby('image', sort=False):
+    for image_path, group in merged_df.groupby('image', sort=False, observed=False):
         # The progress.tqdm wrapper can be added back around the groupby object as you had it.
         # for image_path, group in progress.tqdm(merged_df.groupby('image', sort=False), ...):
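Note: the explicit `observed=False` presumably pins the existing behaviour when the 'image' column is a pandas Categorical, where pandas 2.x warns that the default for `observed` will change. A minimal, standalone sketch (toy data, not from the app):

```python
import pandas as pd

# Toy data: 'image' is categorical and includes one category with no rows.
df = pd.DataFrame({
    "image": pd.Categorical(
        ["page1.png", "page1.png", "page2.png"],
        categories=["page1.png", "page2.png", "page3.png"],
    ),
    "text": ["alpha", "beta", "gamma"],
})

# observed=False keeps unobserved categories as (empty) groups and avoids the
# pandas 2.x FutureWarning about the default changing; sort=False keeps the
# groups in their existing order rather than re-sorting them.
for image_path, group in df.groupby("image", sort=False, observed=False):
    print(image_path, len(group))
```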