seanpedrickcase committed on
Commit
2878a94
·
1 Parent(s): 57aca87

Added PaddleOCR support

Browse files
Dockerfile CHANGED
@@ -17,7 +17,7 @@ WORKDIR /src
17
 
18
  COPY requirements.txt .
19
 
20
- RUN pip install --no-cache-dir --target=/install -r requirements.txt && rm requirements.txt
21
 
22
  # Add lambda entrypoint and script
23
  COPY lambda_entrypoint.py .
@@ -81,7 +81,7 @@ RUN mkdir -p \
81
  ${APP_HOME}/app/logs \
82
  ${APP_HOME}/app/usage \
83
  ${APP_HOME}/app/feedback \
84
- ${APP_HOME}/app/config
85
 
86
  # Now handle the /tmp and /var/tmp directories and their subdirectories
87
  RUN mkdir -p /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache /tmp /var/tmp ${XDG_CACHE_HOME} \
@@ -89,6 +89,12 @@ RUN mkdir -p /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache /tmp /var/tmp ${XDG_
89
  && chmod 1777 /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache \
90
  && chmod 700 ${XDG_CACHE_HOME}
91
 
 
 
 
 
 
 
92
  # Copy installed packages from builder stage
93
  COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
94
 
@@ -115,6 +121,7 @@ VOLUME ["/home/user/app/logs"]
115
  VOLUME ["/home/user/app/usage"]
116
  VOLUME ["/home/user/app/feedback"]
117
  VOLUME ["/home/user/app/config"]
 
118
  VOLUME ["/tmp"]
119
  VOLUME ["/var/tmp"]
120
 
 
17
 
18
  COPY requirements.txt .
19
 
20
+ RUN pip install --no-cache-dir --verbose --target=/install -r requirements.txt && rm requirements.txt
21
 
22
  # Add lambda entrypoint and script
23
  COPY lambda_entrypoint.py .
 
81
  ${APP_HOME}/app/logs \
82
  ${APP_HOME}/app/usage \
83
  ${APP_HOME}/app/feedback \
84
+ ${APP_HOME}/app/config
85
 
86
  # Now handle the /tmp and /var/tmp directories and their subdirectories
87
  RUN mkdir -p /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache /tmp /var/tmp ${XDG_CACHE_HOME} \
 
89
  && chmod 1777 /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache \
90
  && chmod 700 ${XDG_CACHE_HOME}
91
 
92
+ RUN mkdir -p ${APP_HOME}/.paddlex/official_models \
93
+ && chown user:user \
94
+ ${APP_HOME}/.paddlex/official_models \
95
+ && chmod 755 \
96
+ ${APP_HOME}/.paddlex/official_models
97
+
98
  # Copy installed packages from builder stage
99
  COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
100
 
 
121
  VOLUME ["/home/user/app/usage"]
122
  VOLUME ["/home/user/app/feedback"]
123
  VOLUME ["/home/user/app/config"]
124
+ VOLUME ["/home/user/.paddlex/official_models"]
125
  VOLUME ["/tmp"]
126
  VOLUME ["/var/tmp"]
127
 
app.py CHANGED
@@ -2,7 +2,7 @@ import os
2
  import pandas as pd
3
  import gradio as gr
4
  from gradio_image_annotation import image_annotator
5
- from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER
6
  from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_relevant_ocr_output_with_words, reset_data_vars, reset_aws_call_vars, _get_env_list, ensure_folder_exists, reset_ocr_with_words_base_dataframe
7
  from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
8
  from tools.file_redaction import choose_and_run_redactor
@@ -71,6 +71,8 @@ with app:
71
  all_page_line_level_ocr_results = gr.Dropdown("", label="all_page_line_level_ocr_results", allow_custom_value=True, visible=False)
72
  all_page_line_level_ocr_results_with_words = gr.Dropdown("", label="all_page_line_level_ocr_results_with_words", allow_custom_value=True, visible=False)
73
 
 
 
74
  session_hash_state = gr.Textbox(label= "session_hash_state", value="", visible=False)
75
  host_name_textbox = gr.Textbox(label= "host_name_textbox", value=HOST_NAME, visible=False)
76
  s3_output_folder_state = gr.Textbox(label= "s3_output_folder_state", value="", visible=False)
@@ -266,11 +268,11 @@ with app:
266
  ###
267
  with gr.Tab("Redact PDFs/images"):
268
  with gr.Accordion("Redact document", open = True):
269
- in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json', '.zip'], height=FILE_INPUT_HEIGHT)
270
 
271
  text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Change the settings in the tab below (AWS Textract signature detection) to change this.""", value = DEFAULT_TEXT_EXTRACTION_MODEL, choices=TEXT_EXTRACTION_MODELS)
272
 
273
- with gr.Accordion("AWS Textract signature detection (default is on)", open = False):
274
  handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])
275
 
276
  with gr.Row(equal_height=True):
@@ -646,15 +648,15 @@ with app:
646
  # Run redaction function
647
  document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number, all_page_line_level_ocr_results_with_words]).\
648
  success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
649
- success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base],
650
  outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state], api_name="redact_doc")
651
 
652
  # If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
653
- # current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base],
654
  # outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state])
655
 
656
  # If a file has been completed, the function will continue onto the next document
657
- latest_file_completed_num.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base],
658
  outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state]).\
659
  success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
660
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
@@ -684,7 +686,7 @@ with app:
684
  success(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox]).\
685
  success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
686
  success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
687
- success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base],
688
  outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state]).\
689
  success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])
690
 
@@ -930,8 +932,7 @@ with app:
930
  outputs=[review_file_df, all_image_annotations_state]).\
931
  success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
932
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])
933
-
934
-
935
 
936
  ###
937
  # SETTINGS PAGE INPUT / OUTPUT
 
2
  import pandas as pd
3
  import gradio as gr
4
  from gradio_image_annotation import image_annotator
5
+ from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, CHOSEN_LOCAL_OCR_MODEL
6
  from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_relevant_ocr_output_with_words, reset_data_vars, reset_aws_call_vars, _get_env_list, ensure_folder_exists, reset_ocr_with_words_base_dataframe
7
  from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
8
  from tools.file_redaction import choose_and_run_redactor
 
71
  all_page_line_level_ocr_results = gr.Dropdown("", label="all_page_line_level_ocr_results", allow_custom_value=True, visible=False)
72
  all_page_line_level_ocr_results_with_words = gr.Dropdown("", label="all_page_line_level_ocr_results_with_words", allow_custom_value=True, visible=False)
73
 
74
+ chosen_local_model_textbox = gr.Textbox(CHOSEN_LOCAL_OCR_MODEL, label="chosen_local_model_textbox", visible=False)
75
+
76
  session_hash_state = gr.Textbox(label= "session_hash_state", value="", visible=False)
77
  host_name_textbox = gr.Textbox(label= "host_name_textbox", value=HOST_NAME, visible=False)
78
  s3_output_folder_state = gr.Textbox(label= "s3_output_folder_state", value="", visible=False)
 
268
  ###
269
  with gr.Tab("Redact PDFs/images"):
270
  with gr.Accordion("Redact document", open = True):
271
+ in_doc_files = gr.File(label="Choose a PDF document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json', '.zip'], height=FILE_INPUT_HEIGHT)
272
 
273
  text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Change the settings in the tab below (AWS Textract signature detection) to change this.""", value = DEFAULT_TEXT_EXTRACTION_MODEL, choices=TEXT_EXTRACTION_MODELS)
274
 
275
+ with gr.Accordion("Enable AWS Textract signature detection (default is off)", open = False):
276
  handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])
277
 
278
  with gr.Row(equal_height=True):
 
648
  # Run redaction function
649
  document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number, all_page_line_level_ocr_results_with_words]).\
650
  success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
651
+ success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox],
652
  outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state], api_name="redact_doc")
653
 
654
  # If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
655
+ # current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox],
656
  # outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state])
657
 
658
  # If a file has been completed, the function will continue onto the next document
659
+ latest_file_completed_num.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox],
660
  outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state]).\
661
  success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
662
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
 
686
  success(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox]).\
687
  success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
688
  success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
689
+ success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox],
690
  outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state]).\
691
  success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])
692
 
 
932
  outputs=[review_file_df, all_image_annotations_state]).\
933
  success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
934
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])
935
+
 
936
 
937
  ###
938
  # SETTINGS PAGE INPUT / OUTPUT
pyproject.toml CHANGED
@@ -35,7 +35,9 @@ dependencies = [
35
  "rapidfuzz==3.13.0",
36
  "python-dotenv==1.0.1",
37
  "awslambdaric==3.1.1",
38
- "python-docx==1.2.0"
 
 
39
  ]
40
 
41
  [project.urls]
 
35
  "rapidfuzz==3.13.0",
36
  "python-dotenv==1.0.1",
37
  "awslambdaric==3.1.1",
38
+ "python-docx==1.2.0",
39
+ "paddlepaddle==3.1.0",
40
+ "paddleocr==3.1.1"
41
  ]
42
 
43
  [project.urls]
requirements.txt CHANGED
@@ -21,9 +21,10 @@ spaczz==0.6.1
21
  https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
22
  rapidfuzz==3.13.0
23
  python-dotenv==1.0.1
24
- #numpy==1.26.4
25
  awslambdaric==3.1.1
26
  python-docx==1.2.0
 
 
27
 
28
 
29
 
 
21
  https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
22
  rapidfuzz==3.13.0
23
  python-dotenv==1.0.1
 
24
  awslambdaric==3.1.1
25
  python-docx==1.2.0
26
+ paddlepaddle==3.1.0 # Consider installing the GPU version for faster local OCR inference with PaddleOCR: paddlepaddle-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
27
+ paddleocr==3.1.1
28
 
29
 
30
 
tools/config.py CHANGED
@@ -265,6 +265,11 @@ TABULAR_PII_DETECTION_MODELS = PII_DETECTION_MODELS.copy()
265
  if NO_REDACTION_PII_OPTION in TABULAR_PII_DETECTION_MODELS:
266
  TABULAR_PII_DETECTION_MODELS.remove(NO_REDACTION_PII_OPTION)
267
 
 
 
 
 
 
268
  # Entities for redaction
269
  CHOSEN_COMPREHEND_ENTITIES = get_or_create_env_var('CHOSEN_COMPREHEND_ENTITIES', "['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']")
270
 
 
265
  if NO_REDACTION_PII_OPTION in TABULAR_PII_DETECTION_MODELS:
266
  TABULAR_PII_DETECTION_MODELS.remove(NO_REDACTION_PII_OPTION)
267
 
268
+ ### Local OCR model - Tesseract vs PaddleOCR
269
+ CHOSEN_LOCAL_OCR_MODEL = get_or_create_env_var('CHOSEN_LOCAL_OCR_MODEL', "hybrid") # Choose between "tesseract", "hybrid", and "paddle"
270
+
271
+ PREPROCESS_LOCAL_OCR_IMAGES = get_or_create_env_var('PREPROCESS_LOCAL_OCR_IMAGES', "False") # Whether to try and preprocess images before extracting text. NOTE: I have found in testing that this often results in WORSE results for scanned pages, so it is default False
272
+
273
  # Entities for redaction
274
  CHOSEN_COMPREHEND_ENTITIES = get_or_create_env_var('CHOSEN_COMPREHEND_ENTITIES', "['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']")
275
 
tools/custom_image_analyser_engine.py CHANGED
The diff for this file is too large to render. See raw diff
 
tools/data_anonymise.py CHANGED
@@ -116,8 +116,8 @@ def anon_consistent_names(df:pd.DataFrame) -> pd.DataFrame:
116
  # ## Pick out common names and replace them with the same person value
117
  df_dict = df.to_dict(orient="list")
118
 
119
- analyzer = AnalyzerEngine()
120
- batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
121
 
122
  analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")
123
  analyzer_results = list(analyzer_results)
 
116
  # ## Pick out common names and replace them with the same person value
117
  df_dict = df.to_dict(orient="list")
118
 
119
+ #analyzer = AnalyzerEngine()
120
+ batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
121
 
122
  analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")
123
  analyzer_results = list(analyzer_results)
tools/file_conversion.py CHANGED
@@ -1677,7 +1677,7 @@ def convert_annotation_json_to_review_df(
1677
  if 'color' in review_file_df.columns:
1678
  # Check if the column actually contains lists before applying lambda
1679
  if review_file_df['color'].apply(lambda x: isinstance(x, list)).any():
1680
- review_file_df["color"] = review_file_df["color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
1681
 
1682
  # Sort the results
1683
  # Ensure sort columns exist before sorting
 
1677
  if 'color' in review_file_df.columns:
1678
  # Check if the column actually contains lists before applying lambda
1679
  if review_file_df['color'].apply(lambda x: isinstance(x, list)).any():
1680
+ review_file_df.loc[:, "color"] = review_file_df.loc[:, "color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
1681
 
1682
  # Sort the results
1683
  # Ensure sort columns exist before sorting
tools/file_redaction.py CHANGED
@@ -133,6 +133,7 @@ def choose_and_run_redactor(file_paths:List[str],
133
  all_page_line_level_ocr_results:list[dict] = list(),
134
  all_page_line_level_ocr_results_with_words:list[dict] = list(),
135
  all_page_line_level_ocr_results_with_words_df:pd.DataFrame=None,
 
136
  prepare_images:bool=True,
137
  RETURN_PDF_END_OF_REDACTION:bool=RETURN_PDF_END_OF_REDACTION,
138
  progress=gr.Progress(track_tqdm=True)):
@@ -186,6 +187,7 @@ def choose_and_run_redactor(file_paths:List[str],
186
  - all_page_line_level_ocr_results (list, optional): All line level text on the page with bounding boxes.
187
  - all_page_line_level_ocr_results_with_words (list, optional): All word level text on the page with bounding boxes.
188
  - all_page_line_level_ocr_results_with_words_df (pd.Dataframe, optional): All word level text on the page with bounding boxes as a dataframe.
 
189
  - prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
190
  - RETURN_PDF_END_OF_REDACTION (bool, optional): Boolean to determine whether to return a redacted PDF at the end of the redaction process.
191
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
@@ -202,8 +204,6 @@ def choose_and_run_redactor(file_paths:List[str],
202
  all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
203
  review_out_file_paths = [prepared_pdf_file_paths[0]]
204
 
205
- print("all_page_line_level_ocr_results_with_words at start of choose and run...:", all_page_line_level_ocr_results_with_words)
206
-
207
  if all_page_line_level_ocr_results_with_words_df is None:
208
  all_page_line_level_ocr_results_with_words_df = pd.DataFrame()
209
 
@@ -538,6 +538,7 @@ def choose_and_run_redactor(file_paths:List[str],
538
  text_extraction_only,
539
  all_page_line_level_ocr_results,
540
  all_page_line_level_ocr_results_with_words,
 
541
  log_files_output_paths=log_files_output_paths,
542
  output_folder=output_folder)
543
 
@@ -1347,6 +1348,7 @@ def redact_image_pdf(file_path:str,
1347
  text_extraction_only:bool=False,
1348
  all_page_line_level_ocr_results = list(),
1349
  all_page_line_level_ocr_results_with_words = list(),
 
1350
  page_break_val:int=int(PAGE_BREAK_VALUE),
1351
  log_files_output_paths:List=list(),
1352
  max_time:int=int(MAX_TIME_VALUE),
@@ -1354,7 +1356,7 @@ def redact_image_pdf(file_path:str,
1354
  progress=Progress(track_tqdm=True)):
1355
 
1356
  '''
1357
- This function redacts sensitive information from a PDF document. It takes the following parameters:
1358
 
1359
  - file_path (str): The path to the PDF file to be redacted.
1360
  - pdf_image_file_paths (List[str]): A list of paths to the PDF file pages converted to images.
@@ -1367,6 +1369,7 @@ def redact_image_pdf(file_path:str,
1367
  - text_extraction_method (str, optional): The type of analysis to perform on the PDF. Defaults to TESSERACT_TEXT_EXTRACT_OPTION.
1368
  - handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
1369
  - textract_request_metadata (list, optional): Metadata related to the redaction request. Defaults to an empty string.
 
1370
  - page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
1371
  - annotations_all_pages (List, optional): List of annotations on all pages that is used by the gradio_image_annotation object.
1372
  - all_page_line_level_ocr_results_df (pd.DataFrame, optional): All line level OCR results for the document as a Pandas dataframe,
@@ -1382,7 +1385,10 @@ def redact_image_pdf(file_path:str,
1382
  - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
1383
  - page_sizes_df (pd.DataFrame, optional): A pandas dataframe of PDF page sizes in PDF or image format.
1384
  - text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
1385
- - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
 
 
 
1386
  - log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
1387
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
1388
  - output_folder (str, optional): The folder for file outputs.
@@ -1393,8 +1399,6 @@ def redact_image_pdf(file_path:str,
1393
 
1394
  tic = time.perf_counter()
1395
 
1396
- print("all_page_line_level_ocr_results_with_words in redact_image_pdf:", all_page_line_level_ocr_results_with_words)
1397
-
1398
  file_name = get_file_name_without_type(file_path)
1399
  comprehend_query_number_new = 0
1400
 
@@ -1408,7 +1412,11 @@ def redact_image_pdf(file_path:str,
1408
  new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
1409
  nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
1410
 
1411
- image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
 
 
 
 
1412
 
1413
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
1414
  out_message = "Connection to AWS Comprehend service unsuccessful."
@@ -1418,7 +1426,8 @@ def redact_image_pdf(file_path:str,
1418
  if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION and textract_client == "":
1419
  out_message_warning = "Connection to AWS Textract service unsuccessful. Redaction will only continue if local AWS Textract results can be found."
1420
  print(out_message_warning)
1421
- #raise Exception(out_message)
 
1422
 
1423
  number_of_pages = pymupdf_doc.page_count
1424
  print("Number of pages:", str(number_of_pages))
@@ -1437,7 +1446,7 @@ def redact_image_pdf(file_path:str,
1437
  textract_data, is_missing, log_files_output_paths = load_and_convert_textract_json(textract_json_file_path, log_files_output_paths, page_sizes_df)
1438
  original_textract_data = textract_data.copy()
1439
 
1440
- print("Successfully loaded in Textract analysis results from file")
1441
 
1442
  # If running local OCR option, check if file already exists. If it does, load in existing data
1443
  if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
@@ -1445,7 +1454,7 @@ def redact_image_pdf(file_path:str,
1445
  all_page_line_level_ocr_results_with_words, is_missing, log_files_output_paths = load_and_convert_ocr_results_with_words_json(all_page_line_level_ocr_results_with_words_json_file_path, log_files_output_paths, page_sizes_df)
1446
  original_all_page_line_level_ocr_results_with_words = all_page_line_level_ocr_results_with_words.copy()
1447
 
1448
- print("Loaded in local OCR analysis results from file")
1449
 
1450
  ###
1451
  if current_loop_page == 0: page_loop_start = 0
 
133
  all_page_line_level_ocr_results:list[dict] = list(),
134
  all_page_line_level_ocr_results_with_words:list[dict] = list(),
135
  all_page_line_level_ocr_results_with_words_df:pd.DataFrame=None,
136
+ chosen_local_model:str="tesseract",
137
  prepare_images:bool=True,
138
  RETURN_PDF_END_OF_REDACTION:bool=RETURN_PDF_END_OF_REDACTION,
139
  progress=gr.Progress(track_tqdm=True)):
 
187
  - all_page_line_level_ocr_results (list, optional): All line level text on the page with bounding boxes.
188
  - all_page_line_level_ocr_results_with_words (list, optional): All word level text on the page with bounding boxes.
189
  - all_page_line_level_ocr_results_with_words_df (pd.Dataframe, optional): All word level text on the page with bounding boxes as a dataframe.
190
+ - chosen_local_model (str): Which local model is being used for OCR on images - "tesseract", "paddle" for PaddleOCR, or "hybrid" to combine both.
191
  - prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
192
  - RETURN_PDF_END_OF_REDACTION (bool, optional): Boolean to determine whether to return a redacted PDF at the end of the redaction process.
193
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
 
204
  all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
205
  review_out_file_paths = [prepared_pdf_file_paths[0]]
206
 
 
 
207
  if all_page_line_level_ocr_results_with_words_df is None:
208
  all_page_line_level_ocr_results_with_words_df = pd.DataFrame()
209
 
 
538
  text_extraction_only,
539
  all_page_line_level_ocr_results,
540
  all_page_line_level_ocr_results_with_words,
541
+ chosen_local_model,
542
  log_files_output_paths=log_files_output_paths,
543
  output_folder=output_folder)
544
 
 
1348
  text_extraction_only:bool=False,
1349
  all_page_line_level_ocr_results = list(),
1350
  all_page_line_level_ocr_results_with_words = list(),
1351
+ chosen_local_model:str="tesseract",
1352
  page_break_val:int=int(PAGE_BREAK_VALUE),
1353
  log_files_output_paths:List=list(),
1354
  max_time:int=int(MAX_TIME_VALUE),
 
1356
  progress=Progress(track_tqdm=True)):
1357
 
1358
  '''
1359
+ This function redacts sensitive information from a PDF document. It takes the following parameters in order:
1360
 
1361
  - file_path (str): The path to the PDF file to be redacted.
1362
  - pdf_image_file_paths (List[str]): A list of paths to the PDF file pages converted to images.
 
1369
  - text_extraction_method (str, optional): The type of analysis to perform on the PDF. Defaults to TESSERACT_TEXT_EXTRACT_OPTION.
1370
  - handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
1371
  - textract_request_metadata (list, optional): Metadata related to the redaction request. Defaults to an empty string.
1372
+ - current_loop_page (int, optional): The current page being processed. Defaults to 0.
1373
  - page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
1374
  - annotations_all_pages (List, optional): List of annotations on all pages that is used by the gradio_image_annotation object.
1375
  - all_page_line_level_ocr_results_df (pd.DataFrame, optional): All line level OCR results for the document as a Pandas dataframe,
 
1385
  - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
1386
  - page_sizes_df (pd.DataFrame, optional): A pandas dataframe of PDF page sizes in PDF or image format.
1387
  - text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
1388
+ - all_page_line_level_ocr_results (optional): List of all page line level OCR results.
1389
+ - all_page_line_level_ocr_results_with_words (optional): List of all page line level OCR results with words.
1390
+ - chosen_local_model (str, optional): The local model chosen for OCR. Defaults to "tesseract", other choices are "paddle" for PaddleOCR, or "hybrid" for a combination of both.
1391
+ - page_break_val (int, optional): The value at which to trigger a page break. Defaults to PAGE_BREAK_VALUE.
1392
  - log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
1393
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
1394
  - output_folder (str, optional): The folder for file outputs.
 
1399
 
1400
  tic = time.perf_counter()
1401
 
 
 
1402
  file_name = get_file_name_without_type(file_path)
1403
  comprehend_query_number_new = 0
1404
 
 
1412
  new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
1413
  nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
1414
 
1415
+ # Only load in PaddleOCR models if not running Textract
1416
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
1417
+ image_analyser = CustomImageAnalyzerEngine(nlp_analyser, ocr_engine="tesseract")
1418
+ else:
1419
+ image_analyser = CustomImageAnalyzerEngine(nlp_analyser, ocr_engine=chosen_local_model)
1420
 
1421
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
1422
  out_message = "Connection to AWS Comprehend service unsuccessful."
 
1426
  if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION and textract_client == "":
1427
  out_message_warning = "Connection to AWS Textract service unsuccessful. Redaction will only continue if local AWS Textract results can be found."
1428
  print(out_message_warning)
1429
+ #raise Exception(out_message)
1430
+
1431
 
1432
  number_of_pages = pymupdf_doc.page_count
1433
  print("Number of pages:", str(number_of_pages))
 
1446
  textract_data, is_missing, log_files_output_paths = load_and_convert_textract_json(textract_json_file_path, log_files_output_paths, page_sizes_df)
1447
  original_textract_data = textract_data.copy()
1448
 
1449
+ #print("Successfully loaded in Textract analysis results from file")
1450
 
1451
  # If running local OCR option, check if file already exists. If it does, load in existing data
1452
  if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
 
1454
  all_page_line_level_ocr_results_with_words, is_missing, log_files_output_paths = load_and_convert_ocr_results_with_words_json(all_page_line_level_ocr_results_with_words_json_file_path, log_files_output_paths, page_sizes_df)
1455
  original_all_page_line_level_ocr_results_with_words = all_page_line_level_ocr_results_with_words.copy()
1456
 
1457
+ #print("Loaded in local OCR analysis results from file")
1458
 
1459
  ###
1460
  if current_loop_page == 0: page_loop_start = 0
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -1,6 +1,7 @@
1
  from typing import List
2
  from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
3
- from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
 
4
  import spacy
5
  from spacy.matcher import Matcher, PhraseMatcher
6
  from spaczz.matcher import FuzzyMatcher
@@ -25,6 +26,22 @@ except:
25
  nlp = spacy.load(model_name)
26
  print("Successfully downloaded and imported spaCy model", model_name)
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  # #### Custom recognisers
29
  def custom_word_list_recogniser(custom_list:List[str]=[]):
30
  # Create regex pattern, handling quotes carefully
@@ -314,21 +331,6 @@ class CustomWordFuzzyRecognizer(EntityRecognizer):
314
  custom_list_default = []
315
  custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_list_default)
316
 
317
- # Create a class inheriting from SpacyNlpEngine
318
- class LoadedSpacyNlpEngine(SpacyNlpEngine):
319
- def __init__(self, loaded_spacy_model):
320
- super().__init__()
321
- self.nlp = {"en": loaded_spacy_model}
322
-
323
- # Pass the loaded model to the new LoadedSpacyNlpEngine
324
- loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
325
-
326
-
327
- nlp_analyser = AnalyzerEngine(nlp_engine=loaded_nlp_engine,
328
- default_score_threshold=score_threshold,
329
- supported_languages=["en"],
330
- log_decision_process=False,
331
- )
332
 
333
  # Add custom recognisers to nlp_analyser
334
  nlp_analyser.registry.add_recognizer(street_recogniser)
 
1
  from typing import List
2
  from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
3
+ from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts, NerModelConfiguration
4
+
5
  import spacy
6
  from spacy.matcher import Matcher, PhraseMatcher
7
  from spaczz.matcher import FuzzyMatcher
 
26
  nlp = spacy.load(model_name)
27
  print("Successfully downloaded and imported spaCy model", model_name)
28
 
29
+ # Create a class inheriting from SpacyNlpEngine
30
+ class LoadedSpacyNlpEngine(SpacyNlpEngine):
31
+ def __init__(self, loaded_spacy_model):
32
+ super().__init__(ner_model_configuration=NerModelConfiguration(labels_to_ignore=["CARDINAL", "ORDINAL"])) # Ignore non-relevant labels
33
+ self.nlp = {"en": loaded_spacy_model}
34
+
35
+ # Pass the loaded model to the new LoadedSpacyNlpEngine
36
+ loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
37
+
38
+
39
+ nlp_analyser = AnalyzerEngine(nlp_engine=loaded_nlp_engine,
40
+ default_score_threshold=score_threshold,
41
+ supported_languages=["en"],
42
+ log_decision_process=False,
43
+ ) # New custom recognisers based on the following functions are added at the end of this script
44
+
45
  # #### Custom recognisers
46
  def custom_word_list_recogniser(custom_list:List[str]=[]):
47
  # Create regex pattern, handling quotes carefully
 
331
  custom_list_default = []
332
  custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_list_default)
333
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
 
335
  # Add custom recognisers to nlp_analyser
336
  nlp_analyser.registry.add_recognizer(street_recogniser)