Commit
·
2878a94
1
Parent(s):
57aca87
Added PaddleOCR support
Browse files
- Dockerfile +9 -2
- app.py +10 -9
- pyproject.toml +3 -1
- requirements.txt +2 -1
- tools/config.py +5 -0
- tools/custom_image_analyser_engine.py +0 -0
- tools/data_anonymise.py +2 -2
- tools/file_conversion.py +1 -1
- tools/file_redaction.py +19 -10
- tools/load_spacy_model_custom_recognisers.py +18 -16
Dockerfile
CHANGED
@@ -17,7 +17,7 @@ WORKDIR /src
|
|
17 |
|
18 |
COPY requirements.txt .
|
19 |
|
20 |
-
RUN pip install --no-cache-dir --target=/install -r requirements.txt && rm requirements.txt
|
21 |
|
22 |
# Add lambda entrypoint and script
|
23 |
COPY lambda_entrypoint.py .
|
@@ -81,7 +81,7 @@ RUN mkdir -p \
|
|
81 |
${APP_HOME}/app/logs \
|
82 |
${APP_HOME}/app/usage \
|
83 |
${APP_HOME}/app/feedback \
|
84 |
-
${APP_HOME}/app/config
|
85 |
|
86 |
# Now handle the /tmp and /var/tmp directories and their subdirectories
|
87 |
RUN mkdir -p /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache /tmp /var/tmp ${XDG_CACHE_HOME} \
|
@@ -89,6 +89,12 @@ RUN mkdir -p /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache /tmp /var/tmp ${XDG_
|
|
89 |
&& chmod 1777 /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache \
|
90 |
&& chmod 700 ${XDG_CACHE_HOME}
|
91 |
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
# Copy installed packages from builder stage
|
93 |
COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
|
94 |
|
@@ -115,6 +121,7 @@ VOLUME ["/home/user/app/logs"]
|
|
115 |
VOLUME ["/home/user/app/usage"]
|
116 |
VOLUME ["/home/user/app/feedback"]
|
117 |
VOLUME ["/home/user/app/config"]
|
|
|
118 |
VOLUME ["/tmp"]
|
119 |
VOLUME ["/var/tmp"]
|
120 |
|
|
|
17 |
|
18 |
COPY requirements.txt .
|
19 |
|
20 |
+
RUN pip install --no-cache-dir --verbose --target=/install -r requirements.txt && rm requirements.txt
|
21 |
|
22 |
# Add lambda entrypoint and script
|
23 |
COPY lambda_entrypoint.py .
|
|
|
81 |
${APP_HOME}/app/logs \
|
82 |
${APP_HOME}/app/usage \
|
83 |
${APP_HOME}/app/feedback \
|
84 |
+
${APP_HOME}/app/config
|
85 |
|
86 |
# Now handle the /tmp and /var/tmp directories and their subdirectories
|
87 |
RUN mkdir -p /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache /tmp /var/tmp ${XDG_CACHE_HOME} \
|
|
|
89 |
&& chmod 1777 /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache \
|
90 |
&& chmod 700 ${XDG_CACHE_HOME}
|
91 |
|
92 |
+
RUN mkdir -p ${APP_HOME}/.paddlex/official_models \
|
93 |
+
&& chown user:user \
|
94 |
+
${APP_HOME}/.paddlex/official_models \
|
95 |
+
&& chmod 755 \
|
96 |
+
${APP_HOME}/.paddlex/official_models
|
97 |
+
|
98 |
# Copy installed packages from builder stage
|
99 |
COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
|
100 |
|
|
|
121 |
VOLUME ["/home/user/app/usage"]
|
122 |
VOLUME ["/home/user/app/feedback"]
|
123 |
VOLUME ["/home/user/app/config"]
|
124 |
+
VOLUME ["/home/user/.paddlex/official_models"]
|
125 |
VOLUME ["/tmp"]
|
126 |
VOLUME ["/var/tmp"]
|
127 |
|
app.py
CHANGED
@@ -2,7 +2,7 @@ import os
|
|
2 |
import pandas as pd
|
3 |
import gradio as gr
|
4 |
from gradio_image_annotation import image_annotator
|
5 |
-
from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER
|
6 |
from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_relevant_ocr_output_with_words, reset_data_vars, reset_aws_call_vars, _get_env_list, ensure_folder_exists, reset_ocr_with_words_base_dataframe
|
7 |
from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
|
8 |
from tools.file_redaction import choose_and_run_redactor
|
@@ -71,6 +71,8 @@ with app:
|
|
71 |
all_page_line_level_ocr_results = gr.Dropdown("", label="all_page_line_level_ocr_results", allow_custom_value=True, visible=False)
|
72 |
all_page_line_level_ocr_results_with_words = gr.Dropdown("", label="all_page_line_level_ocr_results_with_words", allow_custom_value=True, visible=False)
|
73 |
|
|
|
|
|
74 |
session_hash_state = gr.Textbox(label= "session_hash_state", value="", visible=False)
|
75 |
host_name_textbox = gr.Textbox(label= "host_name_textbox", value=HOST_NAME, visible=False)
|
76 |
s3_output_folder_state = gr.Textbox(label= "s3_output_folder_state", value="", visible=False)
|
@@ -266,11 +268,11 @@ with app:
|
|
266 |
###
|
267 |
with gr.Tab("Redact PDFs/images"):
|
268 |
with gr.Accordion("Redact document", open = True):
|
269 |
-
in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json', '.zip'], height=FILE_INPUT_HEIGHT)
|
270 |
|
271 |
text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Change the settings in the tab below (AWS Textract signature detection) to change this.""", value = DEFAULT_TEXT_EXTRACTION_MODEL, choices=TEXT_EXTRACTION_MODELS)
|
272 |
|
273 |
-
with gr.Accordion("AWS Textract signature detection (default is
|
274 |
handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])
|
275 |
|
276 |
with gr.Row(equal_height=True):
|
@@ -646,15 +648,15 @@ with app:
|
|
646 |
# Run redaction function
|
647 |
document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number, all_page_line_level_ocr_results_with_words]).\
|
648 |
success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
|
649 |
-
success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base],
|
650 |
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state], api_name="redact_doc")
|
651 |
|
652 |
# If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
|
653 |
-
# current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base],
|
654 |
# outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state])
|
655 |
|
656 |
# If a file has been completed, the function will continue onto the next document
|
657 |
-
latest_file_completed_num.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base],
|
658 |
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state]).\
|
659 |
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
|
660 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
|
@@ -684,7 +686,7 @@ with app:
|
|
684 |
success(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox]).\
|
685 |
success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
|
686 |
success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
|
687 |
-
success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base],
|
688 |
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state]).\
|
689 |
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])
|
690 |
|
@@ -930,8 +932,7 @@ with app:
|
|
930 |
outputs=[review_file_df, all_image_annotations_state]).\
|
931 |
success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
|
932 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])
|
933 |
-
|
934 |
-
|
935 |
|
936 |
###
|
937 |
# SETTINGS PAGE INPUT / OUTPUT
|
|
|
2 |
import pandas as pd
|
3 |
import gradio as gr
|
4 |
from gradio_image_annotation import image_annotator
|
5 |
+
from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, CHOSEN_LOCAL_OCR_MODEL
|
6 |
from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_relevant_ocr_output_with_words, reset_data_vars, reset_aws_call_vars, _get_env_list, ensure_folder_exists, reset_ocr_with_words_base_dataframe
|
7 |
from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
|
8 |
from tools.file_redaction import choose_and_run_redactor
|
|
|
71 |
all_page_line_level_ocr_results = gr.Dropdown("", label="all_page_line_level_ocr_results", allow_custom_value=True, visible=False)
|
72 |
all_page_line_level_ocr_results_with_words = gr.Dropdown("", label="all_page_line_level_ocr_results_with_words", allow_custom_value=True, visible=False)
|
73 |
|
74 |
+
chosen_local_model_textbox = gr.Textbox(CHOSEN_LOCAL_OCR_MODEL, label="chosen_local_model_textbox", visible=False)
|
75 |
+
|
76 |
session_hash_state = gr.Textbox(label= "session_hash_state", value="", visible=False)
|
77 |
host_name_textbox = gr.Textbox(label= "host_name_textbox", value=HOST_NAME, visible=False)
|
78 |
s3_output_folder_state = gr.Textbox(label= "s3_output_folder_state", value="", visible=False)
|
|
|
268 |
###
|
269 |
with gr.Tab("Redact PDFs/images"):
|
270 |
with gr.Accordion("Redact document", open = True):
|
271 |
+
in_doc_files = gr.File(label="Choose a PDF document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json', '.zip'], height=FILE_INPUT_HEIGHT)
|
272 |
|
273 |
text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Change the settings in the tab below (AWS Textract signature detection) to change this.""", value = DEFAULT_TEXT_EXTRACTION_MODEL, choices=TEXT_EXTRACTION_MODELS)
|
274 |
|
275 |
+
with gr.Accordion("Enable AWS Textract signature detection (default is off)", open = False):
|
276 |
handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])
|
277 |
|
278 |
with gr.Row(equal_height=True):
|
|
|
648 |
# Run redaction function
|
649 |
document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number, all_page_line_level_ocr_results_with_words]).\
|
650 |
success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
|
651 |
+
success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox],
|
652 |
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state], api_name="redact_doc")
|
653 |
|
654 |
# If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
|
655 |
+
# current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox],
|
656 |
# outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state])
|
657 |
|
658 |
# If a file has been completed, the function will continue onto the next document
|
659 |
+
latest_file_completed_num.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox],
|
660 |
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state]).\
|
661 |
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
|
662 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
|
|
|
686 |
success(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox]).\
|
687 |
success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
|
688 |
success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
|
689 |
+
success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox],
|
690 |
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state]).\
|
691 |
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])
|
692 |
|
|
|
932 |
outputs=[review_file_df, all_image_annotations_state]).\
|
933 |
success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
|
934 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])
|
935 |
+
|
|
|
936 |
|
937 |
###
|
938 |
# SETTINGS PAGE INPUT / OUTPUT
|
pyproject.toml
CHANGED
@@ -35,7 +35,9 @@ dependencies = [
|
|
35 |
"rapidfuzz==3.13.0",
|
36 |
"python-dotenv==1.0.1",
|
37 |
"awslambdaric==3.1.1",
|
38 |
-
"python-docx==1.2.0"
|
|
|
|
|
39 |
]
|
40 |
|
41 |
[project.urls]
|
|
|
35 |
"rapidfuzz==3.13.0",
|
36 |
"python-dotenv==1.0.1",
|
37 |
"awslambdaric==3.1.1",
|
38 |
+
"python-docx==1.2.0",
|
39 |
+
"paddlepaddle==3.1.0",
|
40 |
+
"paddleocr==3.1.1"
|
41 |
]
|
42 |
|
43 |
[project.urls]
|
requirements.txt
CHANGED
@@ -21,9 +21,10 @@ spaczz==0.6.1
|
|
21 |
https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
|
22 |
rapidfuzz==3.13.0
|
23 |
python-dotenv==1.0.1
|
24 |
-
#numpy==1.26.4
|
25 |
awslambdaric==3.1.1
|
26 |
python-docx==1.2.0
|
|
|
|
|
27 |
|
28 |
|
29 |
|
|
|
21 |
https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
|
22 |
rapidfuzz==3.13.0
|
23 |
python-dotenv==1.0.1
|
|
|
24 |
awslambdaric==3.1.1
|
25 |
python-docx==1.2.0
|
26 |
+
paddlepaddle==3.1.0 # Consider installing the GPU version for faster local OCR inference with PaddleOCR: paddlepaddle-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
|
27 |
+
paddleocr==3.1.1
|
28 |
|
29 |
|
30 |
|
tools/config.py
CHANGED
@@ -265,6 +265,11 @@ TABULAR_PII_DETECTION_MODELS = PII_DETECTION_MODELS.copy()
|
|
265 |
if NO_REDACTION_PII_OPTION in TABULAR_PII_DETECTION_MODELS:
|
266 |
TABULAR_PII_DETECTION_MODELS.remove(NO_REDACTION_PII_OPTION)
|
267 |
|
|
|
|
|
|
|
|
|
|
|
268 |
# Entities for redaction
|
269 |
CHOSEN_COMPREHEND_ENTITIES = get_or_create_env_var('CHOSEN_COMPREHEND_ENTITIES', "['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']")
|
270 |
|
|
|
265 |
if NO_REDACTION_PII_OPTION in TABULAR_PII_DETECTION_MODELS:
|
266 |
TABULAR_PII_DETECTION_MODELS.remove(NO_REDACTION_PII_OPTION)
|
267 |
|
268 |
+
### Local OCR model - Tesseract vs PaddleOCR
|
269 |
+
CHOSEN_LOCAL_OCR_MODEL = get_or_create_env_var('CHOSEN_LOCAL_OCR_MODEL', "hybrid") # Choose between "tesseract", "hybrid", and "paddle"
|
270 |
+
|
271 |
+
PREPROCESS_LOCAL_OCR_IMAGES = get_or_create_env_var('PREPROCESS_LOCAL_OCR_IMAGES', "False") # Whether to preprocess images before extracting text. NOTE: in testing this often gave WORSE results for scanned pages, so it defaults to False
|
272 |
+
|
273 |
# Entities for redaction
|
274 |
CHOSEN_COMPREHEND_ENTITIES = get_or_create_env_var('CHOSEN_COMPREHEND_ENTITIES', "['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']")
|
275 |
|
tools/custom_image_analyser_engine.py
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
tools/data_anonymise.py
CHANGED
@@ -116,8 +116,8 @@ def anon_consistent_names(df:pd.DataFrame) -> pd.DataFrame:
|
|
116 |
# ## Pick out common names and replace them with the same person value
|
117 |
df_dict = df.to_dict(orient="list")
|
118 |
|
119 |
-
analyzer = AnalyzerEngine()
|
120 |
-
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=
|
121 |
|
122 |
analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")
|
123 |
analyzer_results = list(analyzer_results)
|
|
|
116 |
# ## Pick out common names and replace them with the same person value
|
117 |
df_dict = df.to_dict(orient="list")
|
118 |
|
119 |
+
#analyzer = AnalyzerEngine()
|
120 |
+
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
|
121 |
|
122 |
analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")
|
123 |
analyzer_results = list(analyzer_results)
|
tools/file_conversion.py
CHANGED
@@ -1677,7 +1677,7 @@ def convert_annotation_json_to_review_df(
|
|
1677 |
if 'color' in review_file_df.columns:
|
1678 |
# Check if the column actually contains lists before applying lambda
|
1679 |
if review_file_df['color'].apply(lambda x: isinstance(x, list)).any():
|
1680 |
-
review_file_df["color"] = review_file_df["color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
|
1681 |
|
1682 |
# Sort the results
|
1683 |
# Ensure sort columns exist before sorting
|
|
|
1677 |
if 'color' in review_file_df.columns:
|
1678 |
# Check if the column actually contains lists before applying lambda
|
1679 |
if review_file_df['color'].apply(lambda x: isinstance(x, list)).any():
|
1680 |
+
review_file_df.loc[:, "color"] = review_file_df.loc[:, "color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
|
1681 |
|
1682 |
# Sort the results
|
1683 |
# Ensure sort columns exist before sorting
|
tools/file_redaction.py
CHANGED
@@ -133,6 +133,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
133 |
all_page_line_level_ocr_results:list[dict] = list(),
|
134 |
all_page_line_level_ocr_results_with_words:list[dict] = list(),
|
135 |
all_page_line_level_ocr_results_with_words_df:pd.DataFrame=None,
|
|
|
136 |
prepare_images:bool=True,
|
137 |
RETURN_PDF_END_OF_REDACTION:bool=RETURN_PDF_END_OF_REDACTION,
|
138 |
progress=gr.Progress(track_tqdm=True)):
|
@@ -186,6 +187,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
186 |
- all_page_line_level_ocr_results (list, optional): All line level text on the page with bounding boxes.
|
187 |
- all_page_line_level_ocr_results_with_words (list, optional): All word level text on the page with bounding boxes.
|
188 |
- all_page_line_level_ocr_results_with_words_df (pd.DataFrame, optional): All word level text on the page with bounding boxes as a dataframe.
|
|
|
189 |
- prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
|
190 |
- RETURN_PDF_END_OF_REDACTION (bool, optional): Boolean to determine whether to return a redacted PDF at the end of the redaction process.
|
191 |
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
@@ -202,8 +204,6 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
202 |
all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
203 |
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
204 |
|
205 |
-
print("all_page_line_level_ocr_results_with_words at start of choose and run...:", all_page_line_level_ocr_results_with_words)
|
206 |
-
|
207 |
if all_page_line_level_ocr_results_with_words_df is None:
|
208 |
all_page_line_level_ocr_results_with_words_df = pd.DataFrame()
|
209 |
|
@@ -538,6 +538,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
538 |
text_extraction_only,
|
539 |
all_page_line_level_ocr_results,
|
540 |
all_page_line_level_ocr_results_with_words,
|
|
|
541 |
log_files_output_paths=log_files_output_paths,
|
542 |
output_folder=output_folder)
|
543 |
|
@@ -1347,6 +1348,7 @@ def redact_image_pdf(file_path:str,
|
|
1347 |
text_extraction_only:bool=False,
|
1348 |
all_page_line_level_ocr_results = list(),
|
1349 |
all_page_line_level_ocr_results_with_words = list(),
|
|
|
1350 |
page_break_val:int=int(PAGE_BREAK_VALUE),
|
1351 |
log_files_output_paths:List=list(),
|
1352 |
max_time:int=int(MAX_TIME_VALUE),
|
@@ -1354,7 +1356,7 @@ def redact_image_pdf(file_path:str,
|
|
1354 |
progress=Progress(track_tqdm=True)):
|
1355 |
|
1356 |
'''
|
1357 |
-
This function redacts sensitive information from a PDF document. It takes the following parameters:
|
1358 |
|
1359 |
- file_path (str): The path to the PDF file to be redacted.
|
1360 |
- pdf_image_file_paths (List[str]): A list of paths to the PDF file pages converted to images.
|
@@ -1367,6 +1369,7 @@ def redact_image_pdf(file_path:str,
|
|
1367 |
- text_extraction_method (str, optional): The type of analysis to perform on the PDF. Defaults to TESSERACT_TEXT_EXTRACT_OPTION.
|
1368 |
- handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
|
1369 |
- textract_request_metadata (list, optional): Metadata related to the redaction request. Defaults to an empty string.
|
|
|
1370 |
- page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
|
1371 |
- annotations_all_pages (List, optional): List of annotations on all pages that is used by the gradio_image_annotation object.
|
1372 |
- all_page_line_level_ocr_results_df (pd.DataFrame, optional): All line level OCR results for the document as a Pandas dataframe,
|
@@ -1382,7 +1385,10 @@ def redact_image_pdf(file_path:str,
|
|
1382 |
- match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
|
1383 |
- page_sizes_df (pd.DataFrame, optional): A pandas dataframe of PDF page sizes in PDF or image format.
|
1384 |
- text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
|
1385 |
-
-
|
|
|
|
|
|
|
1386 |
- log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
|
1387 |
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
1388 |
- output_folder (str, optional): The folder for file outputs.
|
@@ -1393,8 +1399,6 @@ def redact_image_pdf(file_path:str,
|
|
1393 |
|
1394 |
tic = time.perf_counter()
|
1395 |
|
1396 |
-
print("all_page_line_level_ocr_results_with_words in redact_image_pdf:", all_page_line_level_ocr_results_with_words)
|
1397 |
-
|
1398 |
file_name = get_file_name_without_type(file_path)
|
1399 |
comprehend_query_number_new = 0
|
1400 |
|
@@ -1408,7 +1412,11 @@ def redact_image_pdf(file_path:str,
|
|
1408 |
new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
|
1409 |
nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
|
1410 |
|
1411 |
-
|
|
|
|
|
|
|
|
|
1412 |
|
1413 |
if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
|
1414 |
out_message = "Connection to AWS Comprehend service unsuccessful."
|
@@ -1418,7 +1426,8 @@ def redact_image_pdf(file_path:str,
|
|
1418 |
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION and textract_client == "":
|
1419 |
out_message_warning = "Connection to AWS Textract service unsuccessful. Redaction will only continue if local AWS Textract results can be found."
|
1420 |
print(out_message_warning)
|
1421 |
-
#raise Exception(out_message)
|
|
|
1422 |
|
1423 |
number_of_pages = pymupdf_doc.page_count
|
1424 |
print("Number of pages:", str(number_of_pages))
|
@@ -1437,7 +1446,7 @@ def redact_image_pdf(file_path:str,
|
|
1437 |
textract_data, is_missing, log_files_output_paths = load_and_convert_textract_json(textract_json_file_path, log_files_output_paths, page_sizes_df)
|
1438 |
original_textract_data = textract_data.copy()
|
1439 |
|
1440 |
-
print("Successfully loaded in Textract analysis results from file")
|
1441 |
|
1442 |
# If running local OCR option, check if file already exists. If it does, load in existing data
|
1443 |
if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
|
@@ -1445,7 +1454,7 @@ def redact_image_pdf(file_path:str,
|
|
1445 |
all_page_line_level_ocr_results_with_words, is_missing, log_files_output_paths = load_and_convert_ocr_results_with_words_json(all_page_line_level_ocr_results_with_words_json_file_path, log_files_output_paths, page_sizes_df)
|
1446 |
original_all_page_line_level_ocr_results_with_words = all_page_line_level_ocr_results_with_words.copy()
|
1447 |
|
1448 |
-
print("Loaded in local OCR analysis results from file")
|
1449 |
|
1450 |
###
|
1451 |
if current_loop_page == 0: page_loop_start = 0
|
|
|
133 |
all_page_line_level_ocr_results:list[dict] = list(),
|
134 |
all_page_line_level_ocr_results_with_words:list[dict] = list(),
|
135 |
all_page_line_level_ocr_results_with_words_df:pd.DataFrame=None,
|
136 |
+
chosen_local_model:str="tesseract",
|
137 |
prepare_images:bool=True,
|
138 |
RETURN_PDF_END_OF_REDACTION:bool=RETURN_PDF_END_OF_REDACTION,
|
139 |
progress=gr.Progress(track_tqdm=True)):
|
|
|
187 |
- all_page_line_level_ocr_results (list, optional): All line level text on the page with bounding boxes.
|
188 |
- all_page_line_level_ocr_results_with_words (list, optional): All word level text on the page with bounding boxes.
|
189 |
- all_page_line_level_ocr_results_with_words_df (pd.DataFrame, optional): All word level text on the page with bounding boxes as a dataframe.
|
190 |
+
- chosen_local_model (str, optional): Which local model is used for OCR on images: "tesseract", "paddle" for PaddleOCR, or "hybrid" to combine both. Defaults to "tesseract".
|
191 |
- prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
|
192 |
- RETURN_PDF_END_OF_REDACTION (bool, optional): Boolean to determine whether to return a redacted PDF at the end of the redaction process.
|
193 |
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
|
|
204 |
all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
205 |
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
206 |
|
|
|
|
|
207 |
if all_page_line_level_ocr_results_with_words_df is None:
|
208 |
all_page_line_level_ocr_results_with_words_df = pd.DataFrame()
|
209 |
|
|
|
538 |
text_extraction_only,
|
539 |
all_page_line_level_ocr_results,
|
540 |
all_page_line_level_ocr_results_with_words,
|
541 |
+
chosen_local_model,
|
542 |
log_files_output_paths=log_files_output_paths,
|
543 |
output_folder=output_folder)
|
544 |
|
|
|
1348 |
text_extraction_only:bool=False,
|
1349 |
all_page_line_level_ocr_results = list(),
|
1350 |
all_page_line_level_ocr_results_with_words = list(),
|
1351 |
+
chosen_local_model:str="tesseract",
|
1352 |
page_break_val:int=int(PAGE_BREAK_VALUE),
|
1353 |
log_files_output_paths:List=list(),
|
1354 |
max_time:int=int(MAX_TIME_VALUE),
|
|
|
1356 |
progress=Progress(track_tqdm=True)):
|
1357 |
|
1358 |
'''
|
1359 |
+
This function redacts sensitive information from a PDF document. It takes the following parameters in order:
|
1360 |
|
1361 |
- file_path (str): The path to the PDF file to be redacted.
|
1362 |
- pdf_image_file_paths (List[str]): A list of paths to the PDF file pages converted to images.
|
|
|
1369 |
- text_extraction_method (str, optional): The type of analysis to perform on the PDF. Defaults to TESSERACT_TEXT_EXTRACT_OPTION.
|
1370 |
- handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
|
1371 |
- textract_request_metadata (list, optional): Metadata related to the redaction request. Defaults to an empty string.
|
1372 |
+
- current_loop_page (int, optional): The current page being processed. Defaults to 0.
|
1373 |
- page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
|
1374 |
- annotations_all_pages (List, optional): List of annotations on all pages that is used by the gradio_image_annotation object.
|
1375 |
- all_page_line_level_ocr_results_df (pd.DataFrame, optional): All line level OCR results for the document as a Pandas dataframe,
|
|
|
1385 |
- match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
|
1386 |
- page_sizes_df (pd.DataFrame, optional): A pandas dataframe of PDF page sizes in PDF or image format.
|
1387 |
- text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
|
1388 |
+
- all_page_line_level_ocr_results (list, optional): List of line level OCR results for all pages.
|
1389 |
+
- all_page_line_level_ocr_results_with_words (list, optional): List of line level OCR results, including word level detail, for all pages.
|
1390 |
+
- chosen_local_model (str, optional): The local model chosen for OCR. Defaults to "tesseract"; other choices are "paddle" for PaddleOCR, or "hybrid" for a combination of both.
|
1391 |
+
- page_break_val (int, optional): The value at which to trigger a page break. Defaults to PAGE_BREAK_VALUE.
|
1392 |
- log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
|
1393 |
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
1394 |
- output_folder (str, optional): The folder for file outputs.
|
|
|
1399 |
|
1400 |
tic = time.perf_counter()
|
1401 |
|
|
|
|
|
1402 |
file_name = get_file_name_without_type(file_path)
|
1403 |
comprehend_query_number_new = 0
|
1404 |
|
|
|
1412 |
new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
|
1413 |
nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
|
1414 |
|
1415 |
+
# Only load in PaddleOCR models if not running Textract
|
1416 |
+
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
1417 |
+
image_analyser = CustomImageAnalyzerEngine(nlp_analyser, ocr_engine="tesseract")
|
1418 |
+
else:
|
1419 |
+
image_analyser = CustomImageAnalyzerEngine(nlp_analyser, ocr_engine=chosen_local_model)
|
1420 |
|
1421 |
if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
|
1422 |
out_message = "Connection to AWS Comprehend service unsuccessful."
|
|
|
1426 |
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION and textract_client == "":
|
1427 |
out_message_warning = "Connection to AWS Textract service unsuccessful. Redaction will only continue if local AWS Textract results can be found."
|
1428 |
print(out_message_warning)
|
1429 |
+
#raise Exception(out_message)
|
1430 |
+
|
1431 |
|
1432 |
number_of_pages = pymupdf_doc.page_count
|
1433 |
print("Number of pages:", str(number_of_pages))
|
|
|
1446 |
textract_data, is_missing, log_files_output_paths = load_and_convert_textract_json(textract_json_file_path, log_files_output_paths, page_sizes_df)
|
1447 |
original_textract_data = textract_data.copy()
|
1448 |
|
1449 |
+
#print("Successfully loaded in Textract analysis results from file")
|
1450 |
|
1451 |
# If running local OCR option, check if file already exists. If it does, load in existing data
|
1452 |
if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
|
|
|
1454 |
all_page_line_level_ocr_results_with_words, is_missing, log_files_output_paths = load_and_convert_ocr_results_with_words_json(all_page_line_level_ocr_results_with_words_json_file_path, log_files_output_paths, page_sizes_df)
|
1455 |
original_all_page_line_level_ocr_results_with_words = all_page_line_level_ocr_results_with_words.copy()
|
1456 |
|
1457 |
+
#print("Loaded in local OCR analysis results from file")
|
1458 |
|
1459 |
###
|
1460 |
if current_loop_page == 0: page_loop_start = 0
|
tools/load_spacy_model_custom_recognisers.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
from typing import List
|
2 |
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
|
3 |
-
from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
|
|
|
4 |
import spacy
|
5 |
from spacy.matcher import Matcher, PhraseMatcher
|
6 |
from spaczz.matcher import FuzzyMatcher
|
@@ -25,6 +26,22 @@ except:
|
|
25 |
nlp = spacy.load(model_name)
|
26 |
print("Successfully downloaded and imported spaCy model", model_name)
|
27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
# #### Custom recognisers
|
29 |
def custom_word_list_recogniser(custom_list:List[str]=[]):
|
30 |
# Create regex pattern, handling quotes carefully
|
@@ -314,21 +331,6 @@ class CustomWordFuzzyRecognizer(EntityRecognizer):
|
|
314 |
custom_list_default = []
|
315 |
custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_list_default)
|
316 |
|
317 |
-
# Create a class inheriting from SpacyNlpEngine
|
318 |
-
class LoadedSpacyNlpEngine(SpacyNlpEngine):
|
319 |
-
def __init__(self, loaded_spacy_model):
|
320 |
-
super().__init__()
|
321 |
-
self.nlp = {"en": loaded_spacy_model}
|
322 |
-
|
323 |
-
# Pass the loaded model to the new LoadedSpacyNlpEngine
|
324 |
-
loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
|
325 |
-
|
326 |
-
|
327 |
-
nlp_analyser = AnalyzerEngine(nlp_engine=loaded_nlp_engine,
|
328 |
-
default_score_threshold=score_threshold,
|
329 |
-
supported_languages=["en"],
|
330 |
-
log_decision_process=False,
|
331 |
-
)
|
332 |
|
333 |
# Add custom recognisers to nlp_analyser
|
334 |
nlp_analyser.registry.add_recognizer(street_recogniser)
|
|
|
1 |
from typing import List
|
2 |
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
|
3 |
+
from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts, NerModelConfiguration
|
4 |
+
|
5 |
import spacy
|
6 |
from spacy.matcher import Matcher, PhraseMatcher
|
7 |
from spaczz.matcher import FuzzyMatcher
|
|
|
26 |
nlp = spacy.load(model_name)
|
27 |
print("Successfully downloaded and imported spaCy model", model_name)
|
28 |
|
29 |
+
# Create a class inheriting from SpacyNlpEngine
|
30 |
+
class LoadedSpacyNlpEngine(SpacyNlpEngine):
|
31 |
+
def __init__(self, loaded_spacy_model):
|
32 |
+
super().__init__(ner_model_configuration=NerModelConfiguration(labels_to_ignore=["CARDINAL", "ORDINAL"])) # Ignore non-relevant labels
|
33 |
+
self.nlp = {"en": loaded_spacy_model}
|
34 |
+
|
35 |
+
# Pass the loaded model to the new LoadedSpacyNlpEngine
|
36 |
+
loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
|
37 |
+
|
38 |
+
|
39 |
+
nlp_analyser = AnalyzerEngine(nlp_engine=loaded_nlp_engine,
|
40 |
+
default_score_threshold=score_threshold,
|
41 |
+
supported_languages=["en"],
|
42 |
+
log_decision_process=False,
|
43 |
+
) # New custom recognisers based on the following functions are added at the end of this script
|
44 |
+
|
45 |
# #### Custom recognisers
|
46 |
def custom_word_list_recogniser(custom_list:List[str]=[]):
|
47 |
# Create regex pattern, handling quotes carefully
|
|
|
331 |
custom_list_default = []
|
332 |
custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_list_default)
|
333 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
334 |
|
335 |
# Add custom recognisers to nlp_analyser
|
336 |
nlp_analyser.registry.add_recognizer(street_recogniser)
|