Commit
·
f188b10
1
Parent(s):
f1425ca
Corrected an issue with finding valid language entities for AWS Comprehend redaction
Browse files
- app.py +2 -2
- tools/custom_image_analyser_engine.py +11 -4
app.py
CHANGED
@@ -275,7 +275,7 @@ with app:
|
|
275 |
text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Change the settings in the tab below (AWS Textract signature detection) to change this.""", value = DEFAULT_TEXT_EXTRACTION_MODEL, choices=TEXT_EXTRACTION_MODELS)
|
276 |
|
277 |
with gr.Accordion("Enable AWS Textract signature detection (default is off)", open = False):
|
278 |
-
handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting"
|
279 |
|
280 |
with gr.Row(equal_height=True):
|
281 |
pii_identification_method_drop = gr.Radio(label = """Choose personal information detection method. The local model is lower quality but costs nothing - it may be worth a try if you are willing to spend some time reviewing outputs, or if you are only interested in searching for custom search terms (see Redaction settings - custom deny list). AWS Comprehend has a cost of around £0.0075 ($0.01) per 10,000 characters.""", value = DEFAULT_PII_DETECTION_MODEL, choices=PII_DETECTION_MODELS)
|
@@ -302,7 +302,7 @@ with app:
|
|
302 |
cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=True)
|
303 |
|
304 |
if SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS == "True":
|
305 |
-
with gr.Accordion("Submit whole document to AWS Textract API (
|
306 |
with gr.Row(equal_height=True):
|
307 |
gr.Markdown("""Document will be submitted to AWS Textract API service to extract all text in the document. Processing will take place on (secure) AWS servers, and outputs will be stored on S3 for up to 7 days. To download the results, click 'Check status' below and they will be downloaded if ready.""")
|
308 |
with gr.Row(equal_height=True):
|
|
|
275 |
text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Change the settings in the tab below (AWS Textract signature detection) to change this.""", value = DEFAULT_TEXT_EXTRACTION_MODEL, choices=TEXT_EXTRACTION_MODELS)
|
276 |
|
277 |
with gr.Accordion("Enable AWS Textract signature detection (default is off)", open = False):
|
278 |
+
handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting"])
|
279 |
|
280 |
with gr.Row(equal_height=True):
|
281 |
pii_identification_method_drop = gr.Radio(label = """Choose personal information detection method. The local model is lower quality but costs nothing - it may be worth a try if you are willing to spend some time reviewing outputs, or if you are only interested in searching for custom search terms (see Redaction settings - custom deny list). AWS Comprehend has a cost of around £0.0075 ($0.01) per 10,000 characters.""", value = DEFAULT_PII_DETECTION_MODEL, choices=PII_DETECTION_MODELS)
|
|
|
302 |
cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=True)
|
303 |
|
304 |
if SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS == "True":
|
305 |
+
with gr.Accordion("Submit whole document to AWS Textract API (quickest text extraction for large documents)", open = False, visible=True):
|
306 |
with gr.Row(equal_height=True):
|
307 |
gr.Markdown("""Document will be submitted to AWS Textract API service to extract all text in the document. Processing will take place on (secure) AWS servers, and outputs will be stored on S3 for up to 7 days. To download the results, click 'Check status' below and they will be downloaded if ready.""")
|
308 |
with gr.Row(equal_height=True):
|
tools/custom_image_analyser_engine.py
CHANGED
@@ -717,8 +717,10 @@ class CustomImageAnalyzerEngine:
|
|
717 |
aws_language = language or getattr(self, 'language', None) or 'en'
|
718 |
|
719 |
valid_language_entities = nlp_analyser.registry.get_supported_entities(languages=[language])
|
720 |
-
|
721 |
-
|
|
|
|
|
722 |
|
723 |
# Process using either Local or AWS Comprehend
|
724 |
if pii_identification_method == LOCAL_PII_OPTION:
|
@@ -1183,13 +1185,18 @@ def run_page_text_redaction(
|
|
1183 |
page_text += text_line.text
|
1184 |
page_text_mapping.append((start_pos, i, text_line, line_characters[i]))
|
1185 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1186 |
# Process based on identification method
|
1187 |
if pii_identification_method == LOCAL_PII_OPTION:
|
1188 |
if not nlp_analyser:
|
1189 |
raise ValueError("nlp_analyser is required for Local identification method")
|
1190 |
|
1191 |
-
valid_language_entities = nlp_analyser.registry.get_supported_entities(languages=[language])
|
1192 |
-
|
1193 |
language_supported_entities = filter_entities_for_language(chosen_redact_entities, valid_language_entities, language)
|
1194 |
|
1195 |
page_analyser_result = nlp_analyser.analyze(
|
|
|
717 |
aws_language = language or getattr(self, 'language', None) or 'en'
|
718 |
|
719 |
valid_language_entities = nlp_analyser.registry.get_supported_entities(languages=[language])
|
720 |
+
if "CUSTOM" not in valid_language_entities:
|
721 |
+
valid_language_entities.append("CUSTOM")
|
722 |
+
if "CUSTOM_FUZZY" not in valid_language_entities:
|
723 |
+
valid_language_entities.append("CUSTOM_FUZZY")
|
724 |
|
725 |
# Process using either Local or AWS Comprehend
|
726 |
if pii_identification_method == LOCAL_PII_OPTION:
|
|
|
1185 |
page_text += text_line.text
|
1186 |
page_text_mapping.append((start_pos, i, text_line, line_characters[i]))
|
1187 |
|
1188 |
+
|
1189 |
+
valid_language_entities = nlp_analyser.registry.get_supported_entities(languages=[language])
|
1190 |
+
if "CUSTOM" not in valid_language_entities:
|
1191 |
+
valid_language_entities.append("CUSTOM")
|
1192 |
+
if "CUSTOM_FUZZY" not in valid_language_entities:
|
1193 |
+
valid_language_entities.append("CUSTOM_FUZZY")
|
1194 |
+
|
1195 |
# Process based on identification method
|
1196 |
if pii_identification_method == LOCAL_PII_OPTION:
|
1197 |
if not nlp_analyser:
|
1198 |
raise ValueError("nlp_analyser is required for Local identification method")
|
1199 |
|
|
|
|
|
1200 |
language_supported_entities = filter_entities_for_language(chosen_redact_entities, valid_language_entities, language)
|
1201 |
|
1202 |
page_analyser_result = nlp_analyser.analyze(
|