Commit
·
ef4000e
1
Parent(s):
c8ffcd4
Local text redaction now produces OCR results with word-level JSON and can output dataframe format
Browse files
- app.py +23 -22
- tools/file_conversion.py +17 -14
- tools/file_redaction.py +245 -41
- tools/helper_functions.py +19 -5
app.py
CHANGED
@@ -3,7 +3,7 @@ import pandas as pd
|
|
3 |
import gradio as gr
|
4 |
from gradio_image_annotation import image_annotator
|
5 |
from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER
|
6 |
-
from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select,
|
7 |
from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
|
8 |
from tools.file_redaction import choose_and_run_redactor
|
9 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
|
@@ -63,7 +63,7 @@ with app:
|
|
63 |
###
|
64 |
|
65 |
# Pymupdf doc and all image annotations objects need to be stored as State objects as they do not have a standard Gradio component equivalent
|
66 |
-
pdf_doc_state = gr.State([])
|
67 |
all_image_annotations_state = gr.State([])
|
68 |
|
69 |
all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas", wrap=True)
|
@@ -211,7 +211,7 @@ with app:
|
|
211 |
cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis. Please contact Finance if you can't find your cost code in the given list.", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=False)
|
212 |
|
213 |
textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=False)
|
214 |
-
|
215 |
total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=False)
|
216 |
estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost ($)", value=0, visible=False, precision=2)
|
217 |
estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=False, precision=2)
|
@@ -274,7 +274,7 @@ with app:
|
|
274 |
with gr.Row(equal_height=True):
|
275 |
with gr.Column(scale=1):
|
276 |
textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=True)
|
277 |
-
|
278 |
with gr.Column(scale=4):
|
279 |
with gr.Row(equal_height=True):
|
280 |
total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True, interactive=False)
|
@@ -576,7 +576,8 @@ with app:
|
|
576 |
if SHOW_COSTS == 'True':
|
577 |
# Calculate costs
|
578 |
total_pdf_page_count.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
579 |
-
text_extract_method_radio.change(
|
|
|
580 |
pii_identification_method_drop.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
581 |
handwrite_signature_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
582 |
textract_output_found_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
@@ -584,14 +585,14 @@ with app:
|
|
584 |
textract_output_found_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
585 |
|
586 |
# Calculate time taken
|
587 |
-
total_pdf_page_count.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio,
|
588 |
-
text_extract_method_radio.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio,
|
589 |
-
pii_identification_method_drop.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio,
|
590 |
-
handwrite_signature_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio,
|
591 |
-
textract_output_found_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio,
|
592 |
-
only_extract_text_radio.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio,
|
593 |
-
textract_output_found_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio,
|
594 |
-
|
595 |
|
596 |
# Allow user to select items from cost code dataframe for cost code
|
597 |
if SHOW_COSTS=="True" and (GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True"):
|
@@ -601,9 +602,9 @@ with app:
|
|
601 |
cost_code_choice_drop.select(update_cost_code_dataframe_from_dropdown_select, inputs=[cost_code_choice_drop, cost_code_dataframe_base], outputs=[cost_code_dataframe])
|
602 |
|
603 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
604 |
-
success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base,
|
605 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
|
606 |
-
success(fn=
|
607 |
|
608 |
# Run redaction function
|
609 |
document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
|
@@ -620,7 +621,7 @@ with app:
|
|
620 |
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children]).\
|
621 |
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
|
622 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
|
623 |
-
success(fn=
|
624 |
success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title]).\
|
625 |
success(fn = reset_aws_call_vars, outputs=[comprehend_query_number, textract_query_number])
|
626 |
|
@@ -640,9 +641,9 @@ with app:
|
|
640 |
textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
|
641 |
|
642 |
convert_textract_outputs_to_ocr_results.click(replace_existing_pdf_input_for_whole_document_outputs, inputs = [s3_whole_document_textract_input_subfolder, doc_file_name_no_extension_textbox, output_folder_textbox, s3_whole_document_textract_default_bucket, in_doc_files, input_folder_textbox], outputs = [in_doc_files, doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
643 |
-
success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base,
|
644 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
|
645 |
-
success(fn=
|
646 |
success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
|
647 |
success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
|
648 |
success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
|
@@ -657,7 +658,7 @@ with app:
|
|
657 |
# Upload previous files for modifying redactions
|
658 |
upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
659 |
success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
660 |
-
success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base,
|
661 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
|
662 |
|
663 |
# Manual updates to review di
|
@@ -753,12 +754,12 @@ with app:
|
|
753 |
|
754 |
# Convert review file to xfdf Adobe format
|
755 |
convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
756 |
-
success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder,
|
757 |
success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
|
758 |
|
759 |
# Convert xfdf Adobe file back to review_file.csv
|
760 |
convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
761 |
-
success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder,
|
762 |
success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
|
763 |
|
764 |
###
|
@@ -779,7 +780,7 @@ with app:
|
|
779 |
###
|
780 |
# IDENTIFY DUPLICATE PAGES
|
781 |
###
|
782 |
-
#in_duplicate_pages.upload(fn = prepare_image_or_pdf, inputs=[in_duplicate_pages, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base,
|
783 |
|
784 |
find_duplicate_pages_btn.click(
|
785 |
fn=run_duplicate_analysis,
|
|
|
3 |
import gradio as gr
|
4 |
from gradio_image_annotation import image_annotator
|
5 |
from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER
|
6 |
+
from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_relevant_ocr_output_with_words, reset_data_vars, reset_aws_call_vars, _get_env_list, ensure_folder_exists
|
7 |
from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
|
8 |
from tools.file_redaction import choose_and_run_redactor
|
9 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
|
|
|
63 |
###
|
64 |
|
65 |
# Pymupdf doc and all image annotations objects need to be stored as State objects as they do not have a standard Gradio component equivalent
|
66 |
+
pdf_doc_state = gr.State([])
|
67 |
all_image_annotations_state = gr.State([])
|
68 |
|
69 |
all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas", wrap=True)
|
|
|
211 |
cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis. Please contact Finance if you can't find your cost code in the given list.", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=False)
|
212 |
|
213 |
textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=False)
|
214 |
+
relevant_ocr_output_with_words_found_checkbox = gr.Checkbox(value= False, label="Existing local OCR output file found", interactive=False, visible=False)
|
215 |
total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=False)
|
216 |
estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost ($)", value=0, visible=False, precision=2)
|
217 |
estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=False, precision=2)
|
|
|
274 |
with gr.Row(equal_height=True):
|
275 |
with gr.Column(scale=1):
|
276 |
textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=True)
|
277 |
+
relevant_ocr_output_with_words_found_checkbox = gr.Checkbox(value= False, label="Existing local OCR output file found", interactive=False, visible=True)
|
278 |
with gr.Column(scale=4):
|
279 |
with gr.Row(equal_height=True):
|
280 |
total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True, interactive=False)
|
|
|
576 |
if SHOW_COSTS == 'True':
|
577 |
# Calculate costs
|
578 |
total_pdf_page_count.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
579 |
+
text_extract_method_radio.change(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox]).\
|
580 |
+
success(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
581 |
pii_identification_method_drop.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
582 |
handwrite_signature_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
583 |
textract_output_found_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
|
|
585 |
textract_output_found_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
586 |
|
587 |
# Calculate time taken
|
588 |
+
total_pdf_page_count.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
|
589 |
+
text_extract_method_radio.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
|
590 |
+
pii_identification_method_drop.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
|
591 |
+
handwrite_signature_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
|
592 |
+
textract_output_found_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
|
593 |
+
only_extract_text_radio.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
|
594 |
+
textract_output_found_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
|
595 |
+
relevant_ocr_output_with_words_found_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
|
596 |
|
597 |
# Allow user to select items from cost code dataframe for cost code
|
598 |
if SHOW_COSTS=="True" and (GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True"):
|
|
|
602 |
cost_code_choice_drop.select(update_cost_code_dataframe_from_dropdown_select, inputs=[cost_code_choice_drop, cost_code_dataframe_base], outputs=[cost_code_dataframe])
|
603 |
|
604 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
605 |
+
success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, relevant_ocr_output_with_words_found_checkbox]).\
|
606 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
|
607 |
+
success(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox])
|
608 |
|
609 |
# Run redaction function
|
610 |
document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
|
|
|
621 |
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children]).\
|
622 |
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
|
623 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
|
624 |
+
success(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox]).\
|
625 |
success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title]).\
|
626 |
success(fn = reset_aws_call_vars, outputs=[comprehend_query_number, textract_query_number])
|
627 |
|
|
|
641 |
textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
|
642 |
|
643 |
convert_textract_outputs_to_ocr_results.click(replace_existing_pdf_input_for_whole_document_outputs, inputs = [s3_whole_document_textract_input_subfolder, doc_file_name_no_extension_textbox, output_folder_textbox, s3_whole_document_textract_default_bucket, in_doc_files, input_folder_textbox], outputs = [in_doc_files, doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
644 |
+
success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, relevant_ocr_output_with_words_found_checkbox]).\
|
645 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
|
646 |
+
success(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox]).\
|
647 |
success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
|
648 |
success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
|
649 |
success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
|
|
|
658 |
# Upload previous files for modifying redactions
|
659 |
upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
660 |
success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
661 |
+
success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, relevant_ocr_output_with_words_found_checkbox], api_name="prepare_doc").\
|
662 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
|
663 |
|
664 |
# Manual updates to review di
|
|
|
754 |
|
755 |
# Convert review file to xfdf Adobe format
|
756 |
convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
757 |
+
success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, relevant_ocr_output_with_words_found_checkbox]).\
|
758 |
success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
|
759 |
|
760 |
# Convert xfdf Adobe file back to review_file.csv
|
761 |
convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
762 |
+
success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, relevant_ocr_output_with_words_found_checkbox]).\
|
763 |
success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
|
764 |
|
765 |
###
|
|
|
780 |
###
|
781 |
# IDENTIFY DUPLICATE PAGES
|
782 |
###
|
783 |
+
#in_duplicate_pages.upload(fn = prepare_image_or_pdf, inputs=[in_duplicate_pages, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, relevant_ocr_output_with_words_found_checkbox])
|
784 |
|
785 |
find_duplicate_pages_btn.click(
|
786 |
fn=run_duplicate_analysis,
|
tools/file_conversion.py
CHANGED
@@ -454,7 +454,7 @@ def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float]
|
|
454 |
|
455 |
def prepare_image_or_pdf(
|
456 |
file_paths: List[str],
|
457 |
-
|
458 |
all_line_level_ocr_results_df:pd.DataFrame,
|
459 |
latest_file_completed: int = 0,
|
460 |
out_message: List[str] = [],
|
@@ -468,7 +468,7 @@ def prepare_image_or_pdf(
|
|
468 |
prepare_images:bool=True,
|
469 |
page_sizes:list[dict]=[],
|
470 |
textract_output_found:bool = False,
|
471 |
-
|
472 |
progress: Progress = Progress(track_tqdm=True)
|
473 |
) -> tuple[List[str], List[str]]:
|
474 |
"""
|
@@ -479,7 +479,7 @@ def prepare_image_or_pdf(
|
|
479 |
|
480 |
Args:
|
481 |
file_paths (List[str]): List of file paths to process.
|
482 |
-
|
483 |
latest_file_completed (optional, int): Index of the last completed file.
|
484 |
out_message (optional, List[str]): List to store output messages.
|
485 |
first_loop_state (optional, bool): Flag indicating if this is the first iteration.
|
@@ -491,7 +491,7 @@ def prepare_image_or_pdf(
|
|
491 |
prepare_images (optional, bool): A boolean indicating whether to create images for each PDF page. Defaults to True.
|
492 |
page_sizes(optional, List[dict]): A list of dicts containing information about page sizes in various formats.
|
493 |
textract_output_found (optional, bool): A boolean indicating whether Textract analysis output has already been found. Defaults to False.
|
494 |
-
|
495 |
progress (optional, Progress): Progress tracker for the operation
|
496 |
|
497 |
|
@@ -542,7 +542,7 @@ def prepare_image_or_pdf(
|
|
542 |
final_out_message = '\n'.join(out_message)
|
543 |
else:
|
544 |
final_out_message = out_message
|
545 |
-
return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df,
|
546 |
|
547 |
progress(0.1, desc='Preparing file')
|
548 |
|
@@ -599,8 +599,8 @@ def prepare_image_or_pdf(
|
|
599 |
|
600 |
elif is_pdf_or_image(file_path): # Alternatively, if it's an image
|
601 |
# Check if the file is an image type and the user selected text ocr option
|
602 |
-
if file_extension in ['.jpg', '.jpeg', '.png'] and
|
603 |
-
|
604 |
|
605 |
# Convert image to a pymupdf document
|
606 |
pymupdf_doc = pymupdf.open() # Create a new empty document
|
@@ -663,15 +663,18 @@ def prepare_image_or_pdf(
|
|
663 |
elif (file_extension in ['.json']) and '_ocr_results_with_words' in file_path_without_ext: #(prepare_for_review != True):
|
664 |
print("Saving local OCR output")
|
665 |
# Copy it to the output folder so it can be used later.
|
666 |
-
output_ocr_results_with_words_json_file_name = file_path_without_ext
|
667 |
-
if not file_path.endswith("_ocr_results_with_words.json"): output_ocr_results_with_words_json_file_name = file_path_without_ext + "_ocr_results_with_words.json"
|
668 |
-
else: output_ocr_results_with_words_json_file_name = file_path_without_ext + ".json"
|
669 |
|
670 |
out_ocr_results_with_words_path = os.path.join(output_folder, output_ocr_results_with_words_json_file_name)
|
671 |
|
672 |
# Use shutil to copy the file directly
|
673 |
shutil.copy2(file_path, out_ocr_results_with_words_path) # Preserves metadata
|
674 |
-
|
|
|
|
|
|
|
675 |
continue
|
676 |
|
677 |
# NEW IF STATEMENT
|
@@ -768,13 +771,13 @@ def prepare_image_or_pdf(
|
|
768 |
|
769 |
# Must be something else, return with error message
|
770 |
else:
|
771 |
-
if
|
772 |
if is_pdf_or_image(file_path) == False:
|
773 |
out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
|
774 |
print(out_message)
|
775 |
raise Exception(out_message)
|
776 |
|
777 |
-
elif
|
778 |
if is_pdf(file_path) == False:
|
779 |
out_message = "Please upload a PDF file for text analysis."
|
780 |
print(out_message)
|
@@ -793,7 +796,7 @@ def prepare_image_or_pdf(
|
|
793 |
|
794 |
number_of_pages = len(page_sizes)#len(image_file_paths)
|
795 |
|
796 |
-
return combined_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df,
|
797 |
|
798 |
def load_and_convert_ocr_results_with_words_json(ocr_results_with_words_json_file_path:str, log_files_output_paths:str, page_sizes_df:pd.DataFrame):
|
799 |
"""
|
|
|
454 |
|
455 |
def prepare_image_or_pdf(
|
456 |
file_paths: List[str],
|
457 |
+
text_extract_method: str,
|
458 |
all_line_level_ocr_results_df:pd.DataFrame,
|
459 |
latest_file_completed: int = 0,
|
460 |
out_message: List[str] = [],
|
|
|
468 |
prepare_images:bool=True,
|
469 |
page_sizes:list[dict]=[],
|
470 |
textract_output_found:bool = False,
|
471 |
+
relevant_ocr_output_with_words_found:bool = False,
|
472 |
progress: Progress = Progress(track_tqdm=True)
|
473 |
) -> tuple[List[str], List[str]]:
|
474 |
"""
|
|
|
479 |
|
480 |
Args:
|
481 |
file_paths (List[str]): List of file paths to process.
|
482 |
+
text_extract_method (str): The redaction method to use.
|
483 |
latest_file_completed (optional, int): Index of the last completed file.
|
484 |
out_message (optional, List[str]): List to store output messages.
|
485 |
first_loop_state (optional, bool): Flag indicating if this is the first iteration.
|
|
|
491 |
prepare_images (optional, bool): A boolean indicating whether to create images for each PDF page. Defaults to True.
|
492 |
page_sizes(optional, List[dict]): A list of dicts containing information about page sizes in various formats.
|
493 |
textract_output_found (optional, bool): A boolean indicating whether Textract analysis output has already been found. Defaults to False.
|
494 |
+
relevant_ocr_output_with_words_found (optional, bool): A boolean indicating whether local OCR analysis output has already been found. Defaults to False.
|
495 |
progress (optional, Progress): Progress tracker for the operation
|
496 |
|
497 |
|
|
|
542 |
final_out_message = '\n'.join(out_message)
|
543 |
else:
|
544 |
final_out_message = out_message
|
545 |
+
return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df, relevant_ocr_output_with_words_found
|
546 |
|
547 |
progress(0.1, desc='Preparing file')
|
548 |
|
|
|
599 |
|
600 |
elif is_pdf_or_image(file_path): # Alternatively, if it's an image
|
601 |
# Check if the file is an image type and the user selected text ocr option
|
602 |
+
if file_extension in ['.jpg', '.jpeg', '.png'] and text_extract_method == SELECTABLE_TEXT_EXTRACT_OPTION:
|
603 |
+
text_extract_method = TESSERACT_TEXT_EXTRACT_OPTION
|
604 |
|
605 |
# Convert image to a pymupdf document
|
606 |
pymupdf_doc = pymupdf.open() # Create a new empty document
|
|
|
663 |
elif (file_extension in ['.json']) and '_ocr_results_with_words' in file_path_without_ext: #(prepare_for_review != True):
|
664 |
print("Saving local OCR output")
|
665 |
# Copy it to the output folder so it can be used later.
|
666 |
+
output_ocr_results_with_words_json_file_name = file_path_without_ext + ".json"
|
667 |
+
# if not file_path.endswith("_ocr_results_with_words.json"): output_ocr_results_with_words_json_file_name = file_path_without_ext + "_ocr_results_with_words.json"
|
668 |
+
# else: output_ocr_results_with_words_json_file_name = file_path_without_ext + ".json"
|
669 |
|
670 |
out_ocr_results_with_words_path = os.path.join(output_folder, output_ocr_results_with_words_json_file_name)
|
671 |
|
672 |
# Use shutil to copy the file directly
|
673 |
shutil.copy2(file_path, out_ocr_results_with_words_path) # Preserves metadata
|
674 |
+
|
675 |
+
if text_extract_method == SELECTABLE_TEXT_EXTRACT_OPTION and file_path.endswith("_ocr_results_with_words_local_text.json"): relevant_ocr_output_with_words_found = True
|
676 |
+
if text_extract_method == TESSERACT_TEXT_EXTRACT_OPTION and file_path.endswith("_ocr_results_with_words_local_ocr.json"): relevant_ocr_output_with_words_found = True
|
677 |
+
if text_extract_method == TEXTRACT_TEXT_EXTRACT_OPTION and file_path.endswith("_ocr_results_with_words_textract.json"): relevant_ocr_output_with_words_found = True
|
678 |
continue
|
679 |
|
680 |
# NEW IF STATEMENT
|
|
|
771 |
|
772 |
# Must be something else, return with error message
|
773 |
else:
|
774 |
+
if text_extract_method == TESSERACT_TEXT_EXTRACT_OPTION or text_extract_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
775 |
if is_pdf_or_image(file_path) == False:
|
776 |
out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
|
777 |
print(out_message)
|
778 |
raise Exception(out_message)
|
779 |
|
780 |
+
elif text_extract_method == SELECTABLE_TEXT_EXTRACT_OPTION:
|
781 |
if is_pdf(file_path) == False:
|
782 |
out_message = "Please upload a PDF file for text analysis."
|
783 |
print(out_message)
|
|
|
796 |
|
797 |
number_of_pages = len(page_sizes)#len(image_file_paths)
|
798 |
|
799 |
+
return combined_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df, relevant_ocr_output_with_words_found
|
800 |
|
801 |
def load_and_convert_ocr_results_with_words_json(ocr_results_with_words_json_file_path:str, log_files_output_paths:str, page_sizes_df:pd.DataFrame):
|
802 |
"""
|
tools/file_redaction.py
CHANGED
@@ -8,7 +8,7 @@ import copy
|
|
8 |
|
9 |
from tqdm import tqdm
|
10 |
from PIL import Image, ImageChops, ImageFile, ImageDraw
|
11 |
-
from typing import List, Dict, Tuple, Optional
|
12 |
import pandas as pd
|
13 |
|
14 |
from pdfminer.high_level import extract_pages
|
@@ -59,6 +59,49 @@ def sum_numbers_before_seconds(string:str):
|
|
59 |
|
60 |
return sum_of_numbers
|
61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
def choose_and_run_redactor(file_paths:List[str],
|
63 |
prepared_pdf_file_paths:List[str],
|
64 |
pdf_image_file_paths:List[str],
|
@@ -499,7 +542,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
499 |
# Analyse text-based pdf
|
500 |
print('Redacting file as text-based PDF')
|
501 |
|
502 |
-
pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number = redact_text_pdf(
|
503 |
file_path,
|
504 |
language,
|
505 |
chosen_redact_entities,
|
@@ -513,6 +556,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
513 |
all_line_level_ocr_results_df,
|
514 |
all_pages_decision_process_table,
|
515 |
pymupdf_doc,
|
|
|
516 |
pii_identification_method,
|
517 |
comprehend_query_number,
|
518 |
comprehend_client,
|
@@ -522,7 +566,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
522 |
match_fuzzy_whole_phrase_bool,
|
523 |
page_sizes_df,
|
524 |
document_cropboxes,
|
525 |
-
text_extraction_only
|
|
|
526 |
else:
|
527 |
out_message = "No redaction method selected"
|
528 |
print(out_message)
|
@@ -536,9 +581,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
536 |
current_loop_page = 999
|
537 |
|
538 |
if latest_file_completed != len(file_paths_list):
|
539 |
-
print("Completed file number:", str(latest_file_completed), "there are more files to do")
|
540 |
-
|
541 |
-
|
542 |
|
543 |
# Save redacted file
|
544 |
if pii_identification_method != NO_REDACTION_PII_OPTION:
|
@@ -572,6 +615,30 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
572 |
|
573 |
duplication_file_path_outputs.append(ocr_file_path)
|
574 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
575 |
# Convert the gradio annotation boxes to relative coordinates
|
576 |
# Convert annotations_all_pages to a consistent relative coordinate format output
|
577 |
progress(0.93, "Creating review file output")
|
@@ -1343,7 +1410,7 @@ def redact_image_pdf(file_path:str,
|
|
1343 |
|
1344 |
# If running local OCR option, check if file already exists. If it does, load in existing data
|
1345 |
if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
|
1346 |
-
all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + "
|
1347 |
all_page_line_level_ocr_results_with_words, is_missing, log_files_output_paths = load_and_convert_ocr_results_with_words_json(all_page_line_level_ocr_results_with_words_json_file_path, log_files_output_paths, page_sizes_df)
|
1348 |
original_all_page_line_level_ocr_results_with_words = all_page_line_level_ocr_results_with_words.copy()
|
1349 |
|
@@ -1662,32 +1729,37 @@ def redact_image_pdf(file_path:str,
|
|
1662 |
# Append new annotation if it doesn't exist
|
1663 |
annotations_all_pages.append(page_image_annotations)
|
1664 |
|
1665 |
-
|
1666 |
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
1667 |
if original_textract_data != textract_data:
|
1668 |
# Write the updated existing textract data back to the JSON file
|
1669 |
with open(textract_json_file_path, 'w') as json_file:
|
1670 |
json.dump(textract_data, json_file, separators=(",", ":")) # indent=4 makes the JSON file pretty-printed
|
1671 |
|
1672 |
-
|
1673 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1674 |
|
1675 |
if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
|
1676 |
if original_all_page_line_level_ocr_results_with_words != all_page_line_level_ocr_results_with_words:
|
1677 |
# Write the updated existing textract data back to the JSON file
|
|
|
1678 |
with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
|
1679 |
-
json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
|
1680 |
|
1681 |
if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
|
1682 |
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
|
1683 |
|
1684 |
-
#all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
|
1685 |
-
#all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
|
1686 |
-
|
1687 |
all_pages_decision_process_table = pd.DataFrame(all_pages_decision_process_list)
|
1688 |
all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list)
|
1689 |
|
1690 |
-
|
1691 |
current_loop_page += 1
|
1692 |
|
1693 |
return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
|
@@ -1784,22 +1856,21 @@ def get_text_container_characters(text_container:LTTextContainer):
|
|
1784 |
return characters
|
1785 |
return []
|
1786 |
|
1787 |
-
def
|
1788 |
'''
|
1789 |
Create an OCRResult object based on a list of pdfminer LTChar objects.
|
1790 |
'''
|
1791 |
|
1792 |
line_level_results_out = []
|
1793 |
line_level_characters_out = []
|
1794 |
-
|
1795 |
-
character_objects_out = []
|
1796 |
-
# character_text_objects_out = []
|
1797 |
|
1798 |
# Initialize variables
|
1799 |
full_text = ""
|
1800 |
added_text = ""
|
1801 |
overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # [x0, y0, x1, y1]
|
1802 |
-
|
1803 |
|
1804 |
# Iterate through the character objects
|
1805 |
current_word = ""
|
@@ -1813,7 +1884,6 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
|
|
1813 |
# character_text_objects_out.append(character_text)
|
1814 |
|
1815 |
if isinstance(char, LTAnno):
|
1816 |
-
|
1817 |
added_text = char.get_text()
|
1818 |
|
1819 |
# Handle double quotes
|
@@ -1822,17 +1892,17 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
|
|
1822 |
# Handle space separately by finalizing the word
|
1823 |
full_text += added_text # Adds space or newline
|
1824 |
|
1825 |
-
if current_word: # Only
|
1826 |
-
|
1827 |
current_word = ""
|
1828 |
current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # Reset for next word
|
1829 |
|
1830 |
# Check for line break (assuming a new line is indicated by a specific character)
|
1831 |
if '\n' in added_text:
|
1832 |
|
1833 |
-
#
|
1834 |
if current_word:
|
1835 |
-
|
1836 |
# Create an OCRResult for the current line
|
1837 |
line_level_results_out.append(OCRResult(full_text.strip(), round(overall_bbox[0], 2), round(overall_bbox[1], 2), round(overall_bbox[2] - overall_bbox[0], 2), round(overall_bbox[3] - overall_bbox[1], 2)))
|
1838 |
line_level_characters_out.append(character_objects_out)
|
@@ -1872,23 +1942,138 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
|
|
1872 |
current_word_bbox[2] = max(current_word_bbox[2], x1) # x1
|
1873 |
current_word_bbox[3] = max(current_word_bbox[3], y1) # y1
|
1874 |
|
1875 |
-
#
|
1876 |
if current_word:
|
1877 |
-
|
1878 |
|
1879 |
if full_text:
|
|
|
1880 |
if re.search(r'[^\x00-\x7F]', full_text): # Matches any non-ASCII character
|
1881 |
# Convert special characters to a human-readable format
|
1882 |
|
1883 |
full_text = clean_unicode_text(full_text)
|
1884 |
full_text = full_text.strip()
|
1885 |
|
|
|
1886 |
|
1887 |
-
|
1888 |
|
1889 |
-
|
1890 |
|
1891 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1892 |
|
1893 |
def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
|
1894 |
decision_process_table = pd.DataFrame()
|
@@ -1938,7 +2123,7 @@ def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
|
|
1938 |
return pikepdf_redaction_annotations_on_page
|
1939 |
|
1940 |
def redact_text_pdf(
|
1941 |
-
|
1942 |
language: str, # Language of the PDF content
|
1943 |
chosen_redact_entities: List[str], # List of entities to be redacted
|
1944 |
chosen_redact_comprehend_entities: List[str],
|
@@ -1951,6 +2136,7 @@ def redact_text_pdf(
|
|
1951 |
all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"]), # DataFrame for OCR results
|
1952 |
all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"]), # DataFrame for decision process table
|
1953 |
pymupdf_doc: List = [], # List of PyMuPDF documents
|
|
|
1954 |
pii_identification_method: str = "Local",
|
1955 |
comprehend_query_number:int = 0,
|
1956 |
comprehend_client="",
|
@@ -1961,6 +2147,7 @@ def redact_text_pdf(
|
|
1961 |
page_sizes_df:pd.DataFrame=pd.DataFrame(),
|
1962 |
original_cropboxes:List[dict]=[],
|
1963 |
text_extraction_only:bool=False,
|
|
|
1964 |
page_break_val: int = int(PAGE_BREAK_VALUE), # Value for page break
|
1965 |
max_time: int = int(MAX_TIME_VALUE),
|
1966 |
progress: Progress = Progress(track_tqdm=True) # Progress tracking object
|
@@ -1970,7 +2157,7 @@ def redact_text_pdf(
|
|
1970 |
Redact chosen entities from a PDF that is made up of multiple pages that are not images.
|
1971 |
|
1972 |
Input Variables:
|
1973 |
-
-
|
1974 |
- language: Language of the PDF content
|
1975 |
- chosen_redact_entities: List of entities to be redacted
|
1976 |
- chosen_redact_comprehend_entities: List of entities to be redacted for AWS Comprehend
|
@@ -1994,6 +2181,7 @@ def redact_text_pdf(
|
|
1994 |
- page_sizes_df (pd.DataFrame, optional): A pandas dataframe containing page size information.
|
1995 |
- original_cropboxes (List[dict], optional): A list of dictionaries containing pymupdf cropbox information.
|
1996 |
- text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
|
|
|
1997 |
- page_break_val: Value for page break
|
1998 |
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
1999 |
- progress: Progress tracking object
|
@@ -2023,8 +2211,13 @@ def redact_text_pdf(
|
|
2023 |
nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
|
2024 |
|
2025 |
# Open with Pikepdf to get text lines
|
2026 |
-
pikepdf_pdf = Pdf.open(
|
2027 |
-
number_of_pages = len(pikepdf_pdf.pages)
|
|
|
|
|
|
|
|
|
|
|
2028 |
|
2029 |
# Check that page_min and page_max are within expected ranges
|
2030 |
if page_max > number_of_pages or page_max == 0:
|
@@ -2056,7 +2249,7 @@ def redact_text_pdf(
|
|
2056 |
|
2057 |
if page_min <= page_no < page_max:
|
2058 |
# Go page by page
|
2059 |
-
for page_layout in extract_pages(
|
2060 |
|
2061 |
all_page_line_text_extraction_characters = []
|
2062 |
all_page_line_level_text_extraction_results_list = []
|
@@ -2068,14 +2261,18 @@ def redact_text_pdf(
|
|
2068 |
page_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"])
|
2069 |
page_text_ocr_outputs = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"])
|
2070 |
|
|
|
2071 |
for n, text_container in enumerate(page_layout):
|
2072 |
characters = []
|
2073 |
|
2074 |
if isinstance(text_container, LTTextContainer) or isinstance(text_container, LTAnno):
|
2075 |
characters = get_text_container_characters(text_container)
|
|
|
2076 |
|
2077 |
# Create dataframe for all the text on the page
|
2078 |
-
line_level_text_results_list, line_characters =
|
|
|
|
|
2079 |
|
2080 |
### Create page_text_ocr_outputs (OCR format outputs)
|
2081 |
if line_level_text_results_list:
|
@@ -2093,6 +2290,7 @@ def redact_text_pdf(
|
|
2093 |
|
2094 |
all_page_line_level_text_extraction_results_list.extend(line_level_text_results_list)
|
2095 |
all_page_line_text_extraction_characters.extend(line_characters)
|
|
|
2096 |
|
2097 |
### REDACTION
|
2098 |
if pii_identification_method != NO_REDACTION_PII_OPTION:
|
@@ -2143,9 +2341,9 @@ def redact_text_pdf(
|
|
2143 |
|
2144 |
# Join extracted text outputs for all lines together
|
2145 |
if not page_text_ocr_outputs.empty:
|
2146 |
-
page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
|
2147 |
page_text_ocr_outputs = page_text_ocr_outputs.loc[:, ["page", "text", "left", "top", "width", "height"]]
|
2148 |
-
all_line_level_ocr_results_list.append(page_text_ocr_outputs)
|
2149 |
|
2150 |
toc = time.perf_counter()
|
2151 |
|
@@ -2174,7 +2372,7 @@ def redact_text_pdf(
|
|
2174 |
|
2175 |
current_loop_page += 1
|
2176 |
|
2177 |
-
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
2178 |
|
2179 |
# Check if the image already exists in annotations_all_pages
|
2180 |
existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == page_image_annotations["image"]), None)
|
@@ -2195,7 +2393,7 @@ def redact_text_pdf(
|
|
2195 |
# Write logs
|
2196 |
all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
|
2197 |
|
2198 |
-
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
2199 |
|
2200 |
# Write all page outputs
|
2201 |
all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
|
@@ -2222,5 +2420,11 @@ def redact_text_pdf(
|
|
2222 |
if not all_line_level_ocr_results_df.empty:
|
2223 |
all_line_level_ocr_results_df['top'] = all_line_level_ocr_results_df['top'].astype(float)
|
2224 |
all_line_level_ocr_results_df['top'] = 1 - all_line_level_ocr_results_df['top']
|
|
|
|
|
|
|
|
|
|
|
|
|
2225 |
|
2226 |
-
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
|
|
8 |
|
9 |
from tqdm import tqdm
|
10 |
from PIL import Image, ImageChops, ImageFile, ImageDraw
|
11 |
+
from typing import List, Dict, Tuple, Optional, Any
|
12 |
import pandas as pd
|
13 |
|
14 |
from pdfminer.high_level import extract_pages
|
|
|
59 |
|
60 |
return sum_of_numbers
|
61 |
|
62 |
+
def merge_page_results(data):
    """Collapse per-line OCR entries into a single entry per page.

    Each element of *data* is a dict with a "page" key and a "results" dict
    (mapping e.g. "text_line_N" -> line data). Entries sharing the same page
    have their "results" dicts merged together; later entries overwrite
    earlier ones on key collision (dict.update semantics).

    Args:
        data: Iterable of {"page": ..., "results": {...}} dicts.

    Returns:
        A list of merged page dicts, in first-seen page order.
    """
    pages_by_key = {}

    for entry in data:
        page_key = entry["page"]
        # First occurrence of a page creates its bucket; later ones reuse it.
        bucket = pages_by_key.setdefault(page_key, {"page": page_key, "results": {}})
        # Fold this entry's line-level results into the page bucket.
        bucket["results"].update(entry.get("results", {}))

    return list(pages_by_key.values())
|
78 |
+
|
79 |
+
def word_level_ocr_output_to_dataframe(ocr_result: dict) -> pd.DataFrame:
    """Flatten word-level OCR results into a long-format dataframe.

    *ocr_result* is expected to be a list of per-page dicts (as produced by
    merge_page_results), each of the form::

        {"page": <n>, "results": {"text_line_<i>": {"line": <i>, "text": ...,
         "bounding_box": [x0, y0, x1, y1], "words": [{"text": ...,
         "bounding_box": [x0, y0, x1, y1]}, ...]}, ...}}

    One output row is produced per word, carrying both the word's own
    coordinates and those of its parent line.

    Args:
        ocr_result: List of per-page OCR dicts (a single page dict is also
            accepted and treated as a one-page list).

    Returns:
        A pandas DataFrame with columns: page, line, word_text,
        word_x0/y0/x1/y1, line_text, line_x0/y0/x1/y1. Empty input yields an
        empty DataFrame.
    """
    # Bug fix: the previous implementation took only ocr_result[0], silently
    # dropping every page after the first (and crashing on empty input).
    if isinstance(ocr_result, dict):
        ocr_result = [ocr_result]

    rows = []
    for page_result in ocr_result:
        page_number = int(page_result['page'])

        for line_key, line_data in page_result['results'].items():
            line_number = int(line_data['line'])
            for word in line_data['words']:
                rows.append({
                    'page': page_number,
                    'line': line_number,
                    'word_text': word['text'],
                    'word_x0': word['bounding_box'][0],
                    'word_y0': word['bounding_box'][1],
                    'word_x1': word['bounding_box'][2],
                    'word_y1': word['bounding_box'][3],
                    'line_text': line_data['text'],
                    'line_x0': line_data['bounding_box'][0],
                    'line_y0': line_data['bounding_box'][1],
                    'line_x1': line_data['bounding_box'][2],
                    'line_y1': line_data['bounding_box'][3],
                })

    return pd.DataFrame(rows)
|
104 |
+
|
105 |
def choose_and_run_redactor(file_paths:List[str],
|
106 |
prepared_pdf_file_paths:List[str],
|
107 |
pdf_image_file_paths:List[str],
|
|
|
542 |
# Analyse text-based pdf
|
543 |
print('Redacting file as text-based PDF')
|
544 |
|
545 |
+
pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words = redact_text_pdf(
|
546 |
file_path,
|
547 |
language,
|
548 |
chosen_redact_entities,
|
|
|
556 |
all_line_level_ocr_results_df,
|
557 |
all_pages_decision_process_table,
|
558 |
pymupdf_doc,
|
559 |
+
[], # All line level ocr results with words
|
560 |
pii_identification_method,
|
561 |
comprehend_query_number,
|
562 |
comprehend_client,
|
|
|
566 |
match_fuzzy_whole_phrase_bool,
|
567 |
page_sizes_df,
|
568 |
document_cropboxes,
|
569 |
+
text_extraction_only,
|
570 |
+
output_folder=output_folder)
|
571 |
else:
|
572 |
out_message = "No redaction method selected"
|
573 |
print(out_message)
|
|
|
581 |
current_loop_page = 999
|
582 |
|
583 |
if latest_file_completed != len(file_paths_list):
|
584 |
+
print("Completed file number:", str(latest_file_completed), "there are more files to do")
|
|
|
|
|
585 |
|
586 |
# Save redacted file
|
587 |
if pii_identification_method != NO_REDACTION_PII_OPTION:
|
|
|
615 |
|
616 |
duplication_file_path_outputs.append(ocr_file_path)
|
617 |
|
618 |
+
if all_page_line_level_ocr_results_with_words:
|
619 |
+
#print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
|
620 |
+
|
621 |
+
all_page_line_level_ocr_results_with_words = merge_page_results(all_page_line_level_ocr_results_with_words)
|
622 |
+
|
623 |
+
# print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
|
624 |
+
|
625 |
+
file_name = get_file_name_without_type(file_path)
|
626 |
+
|
627 |
+
all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + "_ocr_results_with_words.json"
|
628 |
+
|
629 |
+
with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
|
630 |
+
json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
|
631 |
+
|
632 |
+
all_page_line_level_ocr_results_with_words_df = word_level_ocr_output_to_dataframe(all_page_line_level_ocr_results_with_words)
|
633 |
+
|
634 |
+
all_page_line_level_ocr_results_with_words_df = divide_coordinates_by_page_sizes(all_page_line_level_ocr_results_with_words_df, page_sizes_df, xmin="word_x0", xmax="word_x1", ymin="word_y0", ymax="word_y1")
|
635 |
+
|
636 |
+
all_page_line_level_ocr_results_with_words_df = divide_coordinates_by_page_sizes(all_page_line_level_ocr_results_with_words_df, page_sizes_df, xmin="line_x0", xmax="line_x1", ymin="line_y0", ymax="line_y1")
|
637 |
+
|
638 |
+
all_page_line_level_ocr_results_with_words_df_file_path = output_folder + file_name + "_ocr_results_with_words.csv"
|
639 |
+
|
640 |
+
all_page_line_level_ocr_results_with_words_df.to_csv(all_page_line_level_ocr_results_with_words_df_file_path)
|
641 |
+
|
642 |
# Convert the gradio annotation boxes to relative coordinates
|
643 |
# Convert annotations_all_pages to a consistent relative coordinate format output
|
644 |
progress(0.93, "Creating review file output")
|
|
|
1410 |
|
1411 |
# If running local OCR option, check if file already exists. If it does, load in existing data
|
1412 |
if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
|
1413 |
+
all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + "_ocr_results_with_words_local_ocr.json"
|
1414 |
all_page_line_level_ocr_results_with_words, is_missing, log_files_output_paths = load_and_convert_ocr_results_with_words_json(all_page_line_level_ocr_results_with_words_json_file_path, log_files_output_paths, page_sizes_df)
|
1415 |
original_all_page_line_level_ocr_results_with_words = all_page_line_level_ocr_results_with_words.copy()
|
1416 |
|
|
|
1729 |
# Append new annotation if it doesn't exist
|
1730 |
annotations_all_pages.append(page_image_annotations)
|
1731 |
|
1732 |
+
# Save word level options
|
1733 |
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
1734 |
if original_textract_data != textract_data:
|
1735 |
# Write the updated existing textract data back to the JSON file
|
1736 |
with open(textract_json_file_path, 'w') as json_file:
|
1737 |
json.dump(textract_data, json_file, separators=(",", ":")) # indent=4 makes the JSON file pretty-printed
|
1738 |
|
1739 |
+
if textract_json_file_path not in log_files_output_paths:
|
1740 |
+
log_files_output_paths.append(textract_json_file_path)
|
1741 |
+
|
1742 |
+
all_page_line_level_ocr_results_with_words_json_file_path_textract = output_folder + file_name + "_ocr_results_with_words_textract.json"
|
1743 |
+
|
1744 |
+
with open(all_page_line_level_ocr_results_with_words_json_file_path_textract, 'w') as json_file:
|
1745 |
+
json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":")) # indent=4 makes the JSON file pretty-printed
|
1746 |
+
|
1747 |
+
if all_page_line_level_ocr_results_with_words_json_file_path_textract not in log_files_output_paths:
|
1748 |
+
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path_textract)
|
1749 |
|
1750 |
if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
|
1751 |
if original_all_page_line_level_ocr_results_with_words != all_page_line_level_ocr_results_with_words:
|
1752 |
# Write the updated existing textract data back to the JSON file
|
1753 |
+
|
1754 |
with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
|
1755 |
+
json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
|
1756 |
|
1757 |
if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
|
1758 |
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
|
1759 |
|
|
|
|
|
|
|
1760 |
all_pages_decision_process_table = pd.DataFrame(all_pages_decision_process_list)
|
1761 |
all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list)
|
1762 |
|
|
|
1763 |
current_loop_page += 1
|
1764 |
|
1765 |
return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
|
|
|
1856 |
return characters
|
1857 |
return []
|
1858 |
|
1859 |
+
def create_line_level_ocr_results_from_characters(char_objects:List[LTChar]) -> Tuple[List[OCRResult], List[LTChar]]:
|
1860 |
'''
|
1861 |
Create an OCRResult object based on a list of pdfminer LTChar objects.
|
1862 |
'''
|
1863 |
|
1864 |
line_level_results_out = []
|
1865 |
line_level_characters_out = []
|
1866 |
+
line_level_words_out = {}
|
1867 |
+
character_objects_out = []
|
|
|
1868 |
|
1869 |
# Initialize variables
|
1870 |
full_text = ""
|
1871 |
added_text = ""
|
1872 |
overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # [x0, y0, x1, y1]
|
1873 |
+
line_bboxes = []
|
1874 |
|
1875 |
# Iterate through the character objects
|
1876 |
current_word = ""
|
|
|
1884 |
# character_text_objects_out.append(character_text)
|
1885 |
|
1886 |
if isinstance(char, LTAnno):
|
|
|
1887 |
added_text = char.get_text()
|
1888 |
|
1889 |
# Handle double quotes
|
|
|
1892 |
# Handle space separately by finalizing the word
|
1893 |
full_text += added_text # Adds space or newline
|
1894 |
|
1895 |
+
if current_word: # Only finalise if there is a current word
|
1896 |
+
line_bboxes.append((current_word, current_word_bbox))
|
1897 |
current_word = ""
|
1898 |
current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # Reset for next word
|
1899 |
|
1900 |
# Check for line break (assuming a new line is indicated by a specific character)
|
1901 |
if '\n' in added_text:
|
1902 |
|
1903 |
+
# finalise the current line
|
1904 |
if current_word:
|
1905 |
+
line_bboxes.append((current_word, current_word_bbox))
|
1906 |
# Create an OCRResult for the current line
|
1907 |
line_level_results_out.append(OCRResult(full_text.strip(), round(overall_bbox[0], 2), round(overall_bbox[1], 2), round(overall_bbox[2] - overall_bbox[0], 2), round(overall_bbox[3] - overall_bbox[1], 2)))
|
1908 |
line_level_characters_out.append(character_objects_out)
|
|
|
1942 |
current_word_bbox[2] = max(current_word_bbox[2], x1) # x1
|
1943 |
current_word_bbox[3] = max(current_word_bbox[3], y1) # y1
|
1944 |
|
1945 |
+
# Finalise the last word if any
|
1946 |
if current_word:
|
1947 |
+
line_bboxes.append((current_word, current_word_bbox))
|
1948 |
|
1949 |
if full_text:
|
1950 |
+
print("full_text found")
|
1951 |
if re.search(r'[^\x00-\x7F]', full_text): # Matches any non-ASCII character
|
1952 |
# Convert special characters to a human-readable format
|
1953 |
|
1954 |
full_text = clean_unicode_text(full_text)
|
1955 |
full_text = full_text.strip()
|
1956 |
|
1957 |
+
line_ocr_result_bbox = round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)
|
1958 |
|
1959 |
+
line_ocr_result = OCRResult(full_text.strip(), line_ocr_result_bbox)
|
1960 |
|
1961 |
+
line_level_results_out.append(line_ocr_result)
|
1962 |
|
1963 |
+
else:
|
1964 |
+
line_ocr_result_bbox = []
|
1965 |
+
|
1966 |
+
# if line_ocr_result_bbox:
|
1967 |
+
# line_level_words_out["page"] = 1
|
1968 |
+
# line_level_words_out['results'] = {'text_line_1':{"line":1, "text":full_text, "bounding_box": line_ocr_result_bbox, "words": line_bboxes}}
|
1969 |
+
# else:
|
1970 |
+
# line_level_words_out = {}
|
1971 |
+
|
1972 |
+
|
1973 |
+
return line_level_results_out, line_level_characters_out # Return both results and character objects
|
1974 |
+
|
1975 |
+
def generate_word_level_ocr(char_objects: List, page_number: int, text_line_number:int) -> Dict[str, Any]:
    """
    Generate line- and word-level OCR results from a list of pdfminer.six objects.

    Handles real-world pdfminer.six output by:
    1. Filtering out LTAnno objects, which have no coordinate data.
    2. Sorting the remaining LTChar objects into reading order.
    3. Using adaptive thresholds (based on character size) to detect spaces
       and line breaks.

    Args:
        char_objects: A mixed list of pdfminer.six LTChar and LTAnno objects from a single page.
        page_number: The page number where the characters are from.
        text_line_number: The line number to start counting output lines from.

    Returns:
        A dict of the form
        {"page": "<n>", "results": {"text_line_<i>": {"line": <i>, "text": ...,
         "bounding_box": [x0, y0, x1, y1], "words": [...]}}}.
        Coordinates are pdfminer page coordinates (origin bottom-left), rounded to 2 dp.
    """
    # LTAnno objects carry inferred text (spaces/newlines) but no .bbox, so they
    # cannot participate in layout analysis; drop them up front.
    text_chars = [c for c in char_objects if isinstance(c, LTChar)]

    if not text_chars:
        return {"page": str(page_number), "results": {}}

    # Reading order: highest y1 first (PDF y-axis points up, so the top of the
    # page has the largest y), then left-to-right by x0.
    text_chars.sort(key=lambda c: (-c.bbox[3], c.bbox[0]))

    page_data = {"page": str(page_number), "results": {}}
    line_number = text_line_number

    # State variables for the line/word accumulation below.
    # bboxes start as [+inf, +inf, -1, -1] so min/max updates work from empty.
    line_text, line_bbox, line_words = "", [float('inf'), float('inf'), -1, -1], []
    current_word_text, current_word_bbox = "", [float('inf'), float('inf'), -1, -1]
    prev_char = None

    def finalize_word():
        # Flush the accumulated word (if non-blank) into line_words and reset
        # the word accumulator.
        nonlocal current_word_text, current_word_bbox
        word_text = current_word_text.strip()
        if word_text:
            line_words.append({
                "text": word_text,
                "bounding_box": [round(b, 2) for b in current_word_bbox]
            })
        current_word_text = ""
        current_word_bbox = [float('inf'), float('inf'), -1, -1]

    def finalize_line():
        # Flush the pending word, then emit the accumulated line (if non-blank)
        # into page_data under a "text_line_<n>" key, and reset line state.
        nonlocal line_text, line_bbox, line_words, line_number, prev_char
        finalize_word()
        if line_text.strip():
            page_data["results"][f"text_line_{line_number}"] = {
                "line": line_number,
                "text": line_text.strip(),
                "bounding_box": [round(b, 2) for b in line_bbox],
                "words": line_words
            }
            line_number += 1
        line_text, line_bbox, line_words = "", [float('inf'), float('inf'), -1, -1], []
        # Resetting prev_char prevents the first char of the new line from
        # being compared against the last char of the previous line.
        prev_char = None

    for char in text_chars:
        char_text = clean_unicode_text(char.get_text())

        if prev_char:
            char_height = char.bbox[3] - char.bbox[1]
            vertical_gap = abs(char.bbox[1] - prev_char.bbox[1])

            # Line break detection: a vertical jump larger than ~70% of the
            # character height is treated as a new line. NOTE(review): 0.7 is a
            # heuristic tuning constant — confirm against varied documents.
            if vertical_gap > char_height * 0.7:
                finalize_line()
            else:
                # Space detection: a horizontal gap larger than half the font
                # size (with a 1.0pt floor) is treated as a word separator.
                space_threshold = char.size * 0.5
                gap = char.bbox[0] - prev_char.bbox[2]
                if gap > max(space_threshold, 1.0):
                    finalize_word()
                    line_text += " "

        # An explicit space character always ends the current word.
        if char_text == " ":
            finalize_word()
            line_text += " "
            prev_char = char
            continue

        current_word_text += char_text
        line_text += char_text

        # Grow the word and line bounding boxes to include this character.
        current_word_bbox[0] = min(current_word_bbox[0], char.bbox[0])
        current_word_bbox[1] = min(current_word_bbox[1], char.bbox[1])
        current_word_bbox[2] = max(current_word_bbox[2], char.bbox[2])
        current_word_bbox[3] = max(current_word_bbox[3], char.bbox[3])

        line_bbox[0] = min(line_bbox[0], char.bbox[0])
        line_bbox[1] = min(line_bbox[1], char.bbox[1])
        line_bbox[2] = max(line_bbox[2], char.bbox[2])
        line_bbox[3] = max(line_bbox[3], char.bbox[3])

        prev_char = char

    # Flush whatever is left after the last character.
    finalize_line()

    return page_data
|
2077 |
|
2078 |
def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
|
2079 |
decision_process_table = pd.DataFrame()
|
|
|
2123 |
return pikepdf_redaction_annotations_on_page
|
2124 |
|
2125 |
def redact_text_pdf(
|
2126 |
+
file_path: str, # Path to the PDF file to be redacted
|
2127 |
language: str, # Language of the PDF content
|
2128 |
chosen_redact_entities: List[str], # List of entities to be redacted
|
2129 |
chosen_redact_comprehend_entities: List[str],
|
|
|
2136 |
all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"]), # DataFrame for OCR results
|
2137 |
all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"]), # DataFrame for decision process table
|
2138 |
pymupdf_doc: List = [], # List of PyMuPDF documents
|
2139 |
+
all_page_line_level_ocr_results_with_words: List = [],
|
2140 |
pii_identification_method: str = "Local",
|
2141 |
comprehend_query_number:int = 0,
|
2142 |
comprehend_client="",
|
|
|
2147 |
page_sizes_df:pd.DataFrame=pd.DataFrame(),
|
2148 |
original_cropboxes:List[dict]=[],
|
2149 |
text_extraction_only:bool=False,
|
2150 |
+
output_folder:str=OUTPUT_FOLDER,
|
2151 |
page_break_val: int = int(PAGE_BREAK_VALUE), # Value for page break
|
2152 |
max_time: int = int(MAX_TIME_VALUE),
|
2153 |
progress: Progress = Progress(track_tqdm=True) # Progress tracking object
|
|
|
2157 |
Redact chosen entities from a PDF that is made up of multiple pages that are not images.
|
2158 |
|
2159 |
Input Variables:
|
2160 |
+
- file_path: Path to the PDF file to be redacted
|
2161 |
- language: Language of the PDF content
|
2162 |
- chosen_redact_entities: List of entities to be redacted
|
2163 |
- chosen_redact_comprehend_entities: List of entities to be redacted for AWS Comprehend
|
|
|
2181 |
- page_sizes_df (pd.DataFrame, optional): A pandas dataframe containing page size information.
|
2182 |
- original_cropboxes (List[dict], optional): A list of dictionaries containing pymupdf cropbox information.
|
2183 |
- text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
|
2184 |
+
- output_folder (str, optional): The output folder for the function
|
2185 |
- page_break_val: Value for page break
|
2186 |
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
2187 |
- progress: Progress tracking object
|
|
|
2211 |
nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
|
2212 |
|
2213 |
# Open with Pikepdf to get text lines
|
2214 |
+
pikepdf_pdf = Pdf.open(file_path)
|
2215 |
+
number_of_pages = len(pikepdf_pdf.pages)
|
2216 |
+
|
2217 |
+
file_name = get_file_name_without_type(file_path)
|
2218 |
+
|
2219 |
+
if not all_page_line_level_ocr_results_with_words:
|
2220 |
+
all_page_line_level_ocr_results_with_words = []
|
2221 |
|
2222 |
# Check that page_min and page_max are within expected ranges
|
2223 |
if page_max > number_of_pages or page_max == 0:
|
|
|
2249 |
|
2250 |
if page_min <= page_no < page_max:
|
2251 |
# Go page by page
|
2252 |
+
for page_layout in extract_pages(file_path, page_numbers = [page_no], maxpages=1):
|
2253 |
|
2254 |
all_page_line_text_extraction_characters = []
|
2255 |
all_page_line_level_text_extraction_results_list = []
|
|
|
2261 |
page_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"])
|
2262 |
page_text_ocr_outputs = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"])
|
2263 |
|
2264 |
+
text_line_no = 0
|
2265 |
for n, text_container in enumerate(page_layout):
|
2266 |
characters = []
|
2267 |
|
2268 |
if isinstance(text_container, LTTextContainer) or isinstance(text_container, LTAnno):
|
2269 |
characters = get_text_container_characters(text_container)
|
2270 |
+
text_line_no += 1
|
2271 |
|
2272 |
# Create dataframe for all the text on the page
|
2273 |
+
line_level_text_results_list, line_characters, = create_line_level_ocr_results_from_characters(characters)
|
2274 |
+
|
2275 |
+
line_level_ocr_results_with_words = generate_word_level_ocr(characters, page_number=int(reported_page_number), text_line_number=text_line_no)
|
2276 |
|
2277 |
### Create page_text_ocr_outputs (OCR format outputs)
|
2278 |
if line_level_text_results_list:
|
|
|
2290 |
|
2291 |
all_page_line_level_text_extraction_results_list.extend(line_level_text_results_list)
|
2292 |
all_page_line_text_extraction_characters.extend(line_characters)
|
2293 |
+
all_page_line_level_ocr_results_with_words.append(line_level_ocr_results_with_words)
|
2294 |
|
2295 |
### REDACTION
|
2296 |
if pii_identification_method != NO_REDACTION_PII_OPTION:
|
|
|
2341 |
|
2342 |
# Join extracted text outputs for all lines together
|
2343 |
if not page_text_ocr_outputs.empty:
|
2344 |
+
#page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
|
2345 |
page_text_ocr_outputs = page_text_ocr_outputs.loc[:, ["page", "text", "left", "top", "width", "height"]]
|
2346 |
+
all_line_level_ocr_results_list.append(page_text_ocr_outputs)
|
2347 |
|
2348 |
toc = time.perf_counter()
|
2349 |
|
|
|
2372 |
|
2373 |
current_loop_page += 1
|
2374 |
|
2375 |
+
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words
|
2376 |
|
2377 |
# Check if the image already exists in annotations_all_pages
|
2378 |
existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == page_image_annotations["image"]), None)
|
|
|
2393 |
# Write logs
|
2394 |
all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
|
2395 |
|
2396 |
+
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words
|
2397 |
|
2398 |
# Write all page outputs
|
2399 |
all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
|
|
|
2420 |
if not all_line_level_ocr_results_df.empty:
|
2421 |
all_line_level_ocr_results_df['top'] = all_line_level_ocr_results_df['top'].astype(float)
|
2422 |
all_line_level_ocr_results_df['top'] = 1 - all_line_level_ocr_results_df['top']
|
2423 |
+
|
2424 |
+
all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + "_ocr_results_with_words_local_text.json"
|
2425 |
+
|
2426 |
+
#print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
|
2427 |
+
with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
|
2428 |
+
json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
|
2429 |
|
2430 |
+
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words
|
tools/helper_functions.py
CHANGED
@@ -244,13 +244,27 @@ def check_for_existing_textract_file(doc_file_name_no_extension_textbox:str, out
|
|
244 |
else:
|
245 |
return False
|
246 |
|
247 |
-
def
|
248 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
249 |
|
250 |
if os.path.exists(local_ocr_output_path):
|
251 |
-
print("Existing
|
252 |
-
return True
|
253 |
-
|
254 |
else:
|
255 |
return False
|
256 |
|
|
|
244 |
else:
|
245 |
return False
|
246 |
|
247 |
+
def check_for_relevant_ocr_output_with_words(doc_file_name_no_extension_textbox:str, text_extraction_method:str, output_folder:str=OUTPUT_FOLDER):
    """Check whether a word-level OCR JSON output already exists for a document.

    The expected file name suffix depends on the text extraction method
    (selectable text / local OCR / Textract). Returns True when the matching
    "<doc name><suffix>" file is present in *output_folder*, False otherwise
    (including for an unrecognised extraction method).

    Args:
        doc_file_name_no_extension_textbox: Document file name without extension.
        text_extraction_method: One of the supported text extraction options.
        output_folder: Folder in which to look for the JSON output.
    """
    suffix_by_method = {
        SELECTABLE_TEXT_EXTRACT_OPTION: "_ocr_results_with_words_local_text.json",
        TESSERACT_TEXT_EXTRACT_OPTION: "_ocr_results_with_words_local_ocr.json",
        TEXTRACT_TEXT_EXTRACT_OPTION: "_ocr_results_with_words_textract.json",
    }

    if text_extraction_method not in suffix_by_method:
        print("No valid text extraction method found. Returning False")
        return False

    print("doc_file_name_no_extension_textbox:", doc_file_name_no_extension_textbox)

    doc_file_with_ending = doc_file_name_no_extension_textbox + suffix_by_method[text_extraction_method]

    print("doc_file_with_ending:", doc_file_with_ending)

    local_ocr_output_path = os.path.join(output_folder, doc_file_with_ending)

    print("looking for file path:", local_ocr_output_path)

    if not os.path.exists(local_ocr_output_path):
        return False

    print("Existing OCR with words analysis output file found.")
    return True
|
270 |
|