seanpedrickcase committed on
Commit
f188b10
·
1 Parent(s): f1425ca

Corrected an issue with finding valid language entities for AWS Comprehend redaction

Browse files
Files changed (2) hide show
  1. app.py +2 -2
  2. tools/custom_image_analyser_engine.py +11 -4
app.py CHANGED
@@ -275,7 +275,7 @@ with app:
275
  text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Change the settings in the tab below (AWS Textract signature detection) to change this.""", value = DEFAULT_TEXT_EXTRACTION_MODEL, choices=TEXT_EXTRACTION_MODELS)
276
 
277
  with gr.Accordion("Enable AWS Textract signature detection (default is off)", open = False):
278
- handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])
279
 
280
  with gr.Row(equal_height=True):
281
  pii_identification_method_drop = gr.Radio(label = """Choose personal information detection method. The local model is lower quality but costs nothing - it may be worth a try if you are willing to spend some time reviewing outputs, or if you are only interested in searching for custom search terms (see Redaction settings - custom deny list). AWS Comprehend has a cost of around £0.0075 ($0.01) per 10,000 characters.""", value = DEFAULT_PII_DETECTION_MODEL, choices=PII_DETECTION_MODELS)
@@ -302,7 +302,7 @@ with app:
302
  cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=True)
303
 
304
  if SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS == "True":
305
- with gr.Accordion("Submit whole document to AWS Textract API (quicker, max 3,000 pages per document)", open = False, visible=True):
306
  with gr.Row(equal_height=True):
307
  gr.Markdown("""Document will be submitted to AWS Textract API service to extract all text in the document. Processing will take place on (secure) AWS servers, and outputs will be stored on S3 for up to 7 days. To download the results, click 'Check status' below and they will be downloaded if ready.""")
308
  with gr.Row(equal_height=True):
 
275
  text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Change the settings in the tab below (AWS Textract signature detection) to change this.""", value = DEFAULT_TEXT_EXTRACTION_MODEL, choices=TEXT_EXTRACTION_MODELS)
276
 
277
  with gr.Accordion("Enable AWS Textract signature detection (default is off)", open = False):
278
+ handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting"])
279
 
280
  with gr.Row(equal_height=True):
281
  pii_identification_method_drop = gr.Radio(label = """Choose personal information detection method. The local model is lower quality but costs nothing - it may be worth a try if you are willing to spend some time reviewing outputs, or if you are only interested in searching for custom search terms (see Redaction settings - custom deny list). AWS Comprehend has a cost of around £0.0075 ($0.01) per 10,000 characters.""", value = DEFAULT_PII_DETECTION_MODEL, choices=PII_DETECTION_MODELS)
 
302
  cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=True)
303
 
304
  if SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS == "True":
305
+ with gr.Accordion("Submit whole document to AWS Textract API (quickest text extraction for large documents)", open = False, visible=True):
306
  with gr.Row(equal_height=True):
307
  gr.Markdown("""Document will be submitted to AWS Textract API service to extract all text in the document. Processing will take place on (secure) AWS servers, and outputs will be stored on S3 for up to 7 days. To download the results, click 'Check status' below and they will be downloaded if ready.""")
308
  with gr.Row(equal_height=True):
tools/custom_image_analyser_engine.py CHANGED
@@ -717,8 +717,10 @@ class CustomImageAnalyzerEngine:
717
  aws_language = language or getattr(self, 'language', None) or 'en'
718
 
719
  valid_language_entities = nlp_analyser.registry.get_supported_entities(languages=[language])
720
- valid_language_entities.append("CUSTOM")
721
- valid_language_entities.append("CUSTOM_FUZZY")
 
 
722
 
723
  # Process using either Local or AWS Comprehend
724
  if pii_identification_method == LOCAL_PII_OPTION:
@@ -1183,13 +1185,18 @@ def run_page_text_redaction(
1183
  page_text += text_line.text
1184
  page_text_mapping.append((start_pos, i, text_line, line_characters[i]))
1185
 
 
 
 
 
 
 
 
1186
  # Process based on identification method
1187
  if pii_identification_method == LOCAL_PII_OPTION:
1188
  if not nlp_analyser:
1189
  raise ValueError("nlp_analyser is required for Local identification method")
1190
 
1191
- valid_language_entities = nlp_analyser.registry.get_supported_entities(languages=[language])
1192
-
1193
  language_supported_entities = filter_entities_for_language(chosen_redact_entities, valid_language_entities, language)
1194
 
1195
  page_analyser_result = nlp_analyser.analyze(
 
717
  aws_language = language or getattr(self, 'language', None) or 'en'
718
 
719
  valid_language_entities = nlp_analyser.registry.get_supported_entities(languages=[language])
720
+ if "CUSTOM" not in valid_language_entities:
721
+ valid_language_entities.append("CUSTOM")
722
+ if "CUSTOM_FUZZY" not in valid_language_entities:
723
+ valid_language_entities.append("CUSTOM_FUZZY")
724
 
725
  # Process using either Local or AWS Comprehend
726
  if pii_identification_method == LOCAL_PII_OPTION:
 
1185
  page_text += text_line.text
1186
  page_text_mapping.append((start_pos, i, text_line, line_characters[i]))
1187
 
1188
+
1189
+ valid_language_entities = nlp_analyser.registry.get_supported_entities(languages=[language])
1190
+ if "CUSTOM" not in valid_language_entities:
1191
+ valid_language_entities.append("CUSTOM")
1192
+ if "CUSTOM_FUZZY" not in valid_language_entities:
1193
+ valid_language_entities.append("CUSTOM_FUZZY")
1194
+
1195
  # Process based on identification method
1196
  if pii_identification_method == LOCAL_PII_OPTION:
1197
  if not nlp_analyser:
1198
  raise ValueError("nlp_analyser is required for Local identification method")
1199
 
 
 
1200
  language_supported_entities = filter_entities_for_language(chosen_redact_entities, valid_language_entities, language)
1201
 
1202
  page_analyser_result = nlp_analyser.analyze(