Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase committed on Aug 21

Commit

f188b10

1 Parent(s): f1425ca

Corrected an issue with finding valid language entities for AWS Comprehend redaction

Browse files

Files changed (2) hide show

app.py +2 -2
tools/custom_image_analyser_engine.py +11 -4

app.py CHANGED Viewed

@@ -275,7 +275,7 @@ with app:
             text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Change the settings in the tab below (AWS Textract signature detection) to change this.""", value = DEFAULT_TEXT_EXTRACTION_MODEL, choices=TEXT_EXTRACTION_MODELS)
             with gr.Accordion("Enable AWS Textract signature detection (default is off)", open = False):
-                handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])
             with gr.Row(equal_height=True):
                 pii_identification_method_drop = gr.Radio(label = """Choose personal information detection method. The local model is lower quality but costs nothing - it may be worth a try if you are willing to spend some time reviewing outputs, or if you are only interested in searching for custom search terms (see Redaction settings - custom deny list). AWS Comprehend has a cost of around £0.0075 ($0.01) per 10,000 characters.""", value = DEFAULT_PII_DETECTION_MODEL, choices=PII_DETECTION_MODELS)
@@ -302,7 +302,7 @@ with app:
                             cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=True)
             if SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS == "True":
-                with gr.Accordion("Submit whole document to AWS Textract API (quicker, max 3,000 pages per document)", open = False, visible=True):
                     with gr.Row(equal_height=True):
                         gr.Markdown("""Document will be submitted to AWS Textract API service to extract all text in the document. Processing will take place on (secure) AWS servers, and outputs will be stored on S3 for up to 7 days. To download the results, click 'Check status' below and they will be downloaded if ready.""")
                     with gr.Row(equal_height=True):

             text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Change the settings in the tab below (AWS Textract signature detection) to change this.""", value = DEFAULT_TEXT_EXTRACTION_MODEL, choices=TEXT_EXTRACTION_MODELS)
             with gr.Accordion("Enable AWS Textract signature detection (default is off)", open = False):
+                handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting"])
             with gr.Row(equal_height=True):
                 pii_identification_method_drop = gr.Radio(label = """Choose personal information detection method. The local model is lower quality but costs nothing - it may be worth a try if you are willing to spend some time reviewing outputs, or if you are only interested in searching for custom search terms (see Redaction settings - custom deny list). AWS Comprehend has a cost of around £0.0075 ($0.01) per 10,000 characters.""", value = DEFAULT_PII_DETECTION_MODEL, choices=PII_DETECTION_MODELS)
                             cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=True)
             if SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS == "True":
+                with gr.Accordion("Submit whole document to AWS Textract API (quickest text extraction for large documents)", open = False, visible=True):
                     with gr.Row(equal_height=True):
                         gr.Markdown("""Document will be submitted to AWS Textract API service to extract all text in the document. Processing will take place on (secure) AWS servers, and outputs will be stored on S3 for up to 7 days. To download the results, click 'Check status' below and they will be downloaded if ready.""")
                     with gr.Row(equal_height=True):

tools/custom_image_analyser_engine.py CHANGED Viewed

@@ -717,8 +717,10 @@ class CustomImageAnalyzerEngine:
         aws_language = language or getattr(self, 'language', None) or 'en'
         valid_language_entities = nlp_analyser.registry.get_supported_entities(languages=[language])
-        valid_language_entities.append("CUSTOM")
-        valid_language_entities.append("CUSTOM_FUZZY")
         # Process using either Local or AWS Comprehend
         if pii_identification_method == LOCAL_PII_OPTION:
@@ -1183,13 +1185,18 @@ def run_page_text_redaction(
             page_text += text_line.text
             page_text_mapping.append((start_pos, i, text_line, line_characters[i]))
     # Process based on identification method
     if pii_identification_method == LOCAL_PII_OPTION:
         if not nlp_analyser:
             raise ValueError("nlp_analyser is required for Local identification method")
-        valid_language_entities = nlp_analyser.registry.get_supported_entities(languages=[language])
         language_supported_entities = filter_entities_for_language(chosen_redact_entities, valid_language_entities, language)
         page_analyser_result = nlp_analyser.analyze(

         aws_language = language or getattr(self, 'language', None) or 'en'
         valid_language_entities = nlp_analyser.registry.get_supported_entities(languages=[language])
+        if "CUSTOM" not in valid_language_entities:
+            valid_language_entities.append("CUSTOM")
+        if "CUSTOM_FUZZY" not in valid_language_entities:
+            valid_language_entities.append("CUSTOM_FUZZY")
         # Process using either Local or AWS Comprehend
         if pii_identification_method == LOCAL_PII_OPTION:
             page_text += text_line.text
             page_text_mapping.append((start_pos, i, text_line, line_characters[i]))
+    valid_language_entities = nlp_analyser.registry.get_supported_entities(languages=[language])
+    if "CUSTOM" not in valid_language_entities:
+        valid_language_entities.append("CUSTOM")
+    if "CUSTOM_FUZZY" not in valid_language_entities:
+        valid_language_entities.append("CUSTOM_FUZZY")
     # Process based on identification method
     if pii_identification_method == LOCAL_PII_OPTION:
         if not nlp_analyser:
             raise ValueError("nlp_analyser is required for Local identification method")
         language_supported_entities = filter_entities_for_language(chosen_redact_entities, valid_language_entities, language)
         page_analyser_result = nlp_analyser.analyze(