Commit
·
ef4000e
1
Parent(s):
c8ffcd4
Local text redaction now produces OCR results with word-level JSON and can output dataframe format
Browse files
- app.py +23 -22
- tools/file_conversion.py +17 -14
- tools/file_redaction.py +245 -41
- tools/helper_functions.py +19 -5
app.py
CHANGED
@@ -3,7 +3,7 @@ import pandas as pd
|
|
3 |
import gradio as gr
|
4 |
from gradio_image_annotation import image_annotator
|
5 |
from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER
|
6 |
-
from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select,
|
7 |
from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
|
8 |
from tools.file_redaction import choose_and_run_redactor
|
9 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
|
@@ -63,7 +63,7 @@ with app:
|
|
63 |
###
|
64 |
|
65 |
# Pymupdf doc and all image annotations objects need to be stored as State objects as they do not have a standard Gradio component equivalent
|
66 |
-
pdf_doc_state = gr.State([])
|
67 |
all_image_annotations_state = gr.State([])
|
68 |
|
69 |
all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas", wrap=True)
|
@@ -211,7 +211,7 @@ with app:
|
|
211 |
cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis. Please contact Finance if you can't find your cost code in the given list.", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=False)
|
212 |
|
213 |
textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=False)
|
214 |
-
|
215 |
total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=False)
|
216 |
estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost ($)", value=0, visible=False, precision=2)
|
217 |
estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=False, precision=2)
|
@@ -274,7 +274,7 @@ with app:
|
|
274 |
with gr.Row(equal_height=True):
|
275 |
with gr.Column(scale=1):
|
276 |
textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=True)
|
277 |
-
|
278 |
with gr.Column(scale=4):
|
279 |
with gr.Row(equal_height=True):
|
280 |
total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True, interactive=False)
|
@@ -576,7 +576,8 @@ with app:
|
|
576 |
if SHOW_COSTS == 'True':
|
577 |
# Calculate costs
|
578 |
total_pdf_page_count.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
579 |
-
text_extract_method_radio.change(
|
|
|
580 |
pii_identification_method_drop.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
581 |
handwrite_signature_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
582 |
textract_output_found_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
@@ -584,14 +585,14 @@ with app:
|
|
584 |
textract_output_found_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
585 |
|
586 |
# Calculate time taken
|
587 |
-
total_pdf_page_count.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio,
|
588 |
-
text_extract_method_radio.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio,
|
589 |
-
pii_identification_method_drop.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio,
|
590 |
-
handwrite_signature_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio,
|
591 |
-
textract_output_found_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio,
|
592 |
-
only_extract_text_radio.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio,
|
593 |
-
textract_output_found_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio,
|
594 |
-
|
595 |
|
596 |
# Allow user to select items from cost code dataframe for cost code
|
597 |
if SHOW_COSTS=="True" and (GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True"):
|
@@ -601,9 +602,9 @@ with app:
|
|
601 |
cost_code_choice_drop.select(update_cost_code_dataframe_from_dropdown_select, inputs=[cost_code_choice_drop, cost_code_dataframe_base], outputs=[cost_code_dataframe])
|
602 |
|
603 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
604 |
-
success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base,
|
605 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
|
606 |
-
success(fn=
|
607 |
|
608 |
# Run redaction function
|
609 |
document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
|
@@ -620,7 +621,7 @@ with app:
|
|
620 |
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children]).\
|
621 |
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
|
622 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
|
623 |
-
success(fn=
|
624 |
success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title]).\
|
625 |
success(fn = reset_aws_call_vars, outputs=[comprehend_query_number, textract_query_number])
|
626 |
|
@@ -640,9 +641,9 @@ with app:
|
|
640 |
textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
|
641 |
|
642 |
convert_textract_outputs_to_ocr_results.click(replace_existing_pdf_input_for_whole_document_outputs, inputs = [s3_whole_document_textract_input_subfolder, doc_file_name_no_extension_textbox, output_folder_textbox, s3_whole_document_textract_default_bucket, in_doc_files, input_folder_textbox], outputs = [in_doc_files, doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
643 |
-
success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base,
|
644 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
|
645 |
-
success(fn=
|
646 |
success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
|
647 |
success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
|
648 |
success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
|
@@ -657,7 +658,7 @@ with app:
|
|
657 |
# Upload previous files for modifying redactions
|
658 |
upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
659 |
success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
660 |
-
success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base,
|
661 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
|
662 |
|
663 |
# Manual updates to review di
|
@@ -753,12 +754,12 @@ with app:
|
|
753 |
|
754 |
# Convert review file to xfdf Adobe format
|
755 |
convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
756 |
-
success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder,
|
757 |
success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
|
758 |
|
759 |
# Convert xfdf Adobe file back to review_file.csv
|
760 |
convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
761 |
-
success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder,
|
762 |
success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
|
763 |
|
764 |
###
|
@@ -779,7 +780,7 @@ with app:
|
|
779 |
###
|
780 |
# IDENTIFY DUPLICATE PAGES
|
781 |
###
|
782 |
-
#in_duplicate_pages.upload(fn = prepare_image_or_pdf, inputs=[in_duplicate_pages, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base,
|
783 |
|
784 |
find_duplicate_pages_btn.click(
|
785 |
fn=run_duplicate_analysis,
|
|
|
3 |
import gradio as gr
|
4 |
from gradio_image_annotation import image_annotator
|
5 |
from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER
|
6 |
+
from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_relevant_ocr_output_with_words, reset_data_vars, reset_aws_call_vars, _get_env_list, ensure_folder_exists
|
7 |
from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
|
8 |
from tools.file_redaction import choose_and_run_redactor
|
9 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
|
|
|
63 |
###
|
64 |
|
65 |
# Pymupdf doc and all image annotations objects need to be stored as State objects as they do not have a standard Gradio component equivalent
|
66 |
+
pdf_doc_state = gr.State([])
|
67 |
all_image_annotations_state = gr.State([])
|
68 |
|
69 |
all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas", wrap=True)
|
|
|
211 |
cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis. Please contact Finance if you can't find your cost code in the given list.", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=False)
|
212 |
|
213 |
textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=False)
|
214 |
+
relevant_ocr_output_with_words_found_checkbox = gr.Checkbox(value= False, label="Existing local OCR output file found", interactive=False, visible=False)
|
215 |
total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=False)
|
216 |
estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost ($)", value=0, visible=False, precision=2)
|
217 |
estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=False, precision=2)
|
|
|
274 |
with gr.Row(equal_height=True):
|
275 |
with gr.Column(scale=1):
|
276 |
textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=True)
|
277 |
+
relevant_ocr_output_with_words_found_checkbox = gr.Checkbox(value= False, label="Existing local OCR output file found", interactive=False, visible=True)
|
278 |
with gr.Column(scale=4):
|
279 |
with gr.Row(equal_height=True):
|
280 |
total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True, interactive=False)
|
|
|
576 |
if SHOW_COSTS == 'True':
|
577 |
# Calculate costs
|
578 |
total_pdf_page_count.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
579 |
+
text_extract_method_radio.change(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox]).\
|
580 |
+
success(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
581 |
pii_identification_method_drop.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
582 |
handwrite_signature_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
583 |
textract_output_found_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
|
|
585 |
textract_output_found_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
|
586 |
|
587 |
# Calculate time taken
|
588 |
+
total_pdf_page_count.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
|
589 |
+
text_extract_method_radio.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
|
590 |
+
pii_identification_method_drop.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
|
591 |
+
handwrite_signature_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
|
592 |
+
textract_output_found_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
|
593 |
+
only_extract_text_radio.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
|
594 |
+
textract_output_found_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
|
595 |
+
relevant_ocr_output_with_words_found_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
|
596 |
|
597 |
# Allow user to select items from cost code dataframe for cost code
|
598 |
if SHOW_COSTS=="True" and (GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True"):
|
|
|
602 |
cost_code_choice_drop.select(update_cost_code_dataframe_from_dropdown_select, inputs=[cost_code_choice_drop, cost_code_dataframe_base], outputs=[cost_code_dataframe])
|
603 |
|
604 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
605 |
+
success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, relevant_ocr_output_with_words_found_checkbox]).\
|
606 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
|
607 |
+
success(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox])
|
608 |
|
609 |
# Run redaction function
|
610 |
document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
|
|
|
621 |
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children]).\
|
622 |
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
|
623 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
|
624 |
+
success(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox]).\
|
625 |
success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title]).\
|
626 |
success(fn = reset_aws_call_vars, outputs=[comprehend_query_number, textract_query_number])
|
627 |
|
|
|
641 |
textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
|
642 |
|
643 |
convert_textract_outputs_to_ocr_results.click(replace_existing_pdf_input_for_whole_document_outputs, inputs = [s3_whole_document_textract_input_subfolder, doc_file_name_no_extension_textbox, output_folder_textbox, s3_whole_document_textract_default_bucket, in_doc_files, input_folder_textbox], outputs = [in_doc_files, doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
644 |
+
success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, relevant_ocr_output_with_words_found_checkbox]).\
|
645 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
|
646 |
+
success(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox]).\
|
647 |
success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
|
648 |
success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
|
649 |
success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
|
|
|
658 |
# Upload previous files for modifying redactions
|
659 |
upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
660 |
success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
661 |
+
success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, relevant_ocr_output_with_words_found_checkbox], api_name="prepare_doc").\
|
662 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
|
663 |
|
664 |
# Manual updates to review di
|
|
|
754 |
|
755 |
# Convert review file to xfdf Adobe format
|
756 |
convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
757 |
+
success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, relevant_ocr_output_with_words_found_checkbox]).\
|
758 |
success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
|
759 |
|
760 |
# Convert xfdf Adobe file back to review_file.csv
|
761 |
convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
|
762 |
+
success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, relevant_ocr_output_with_words_found_checkbox]).\
|
763 |
success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
|
764 |
|
765 |
###
|
|
|
780 |
###
|
781 |
# IDENTIFY DUPLICATE PAGES
|
782 |
###
|
783 |
+
#in_duplicate_pages.upload(fn = prepare_image_or_pdf, inputs=[in_duplicate_pages, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, relevant_ocr_output_with_words_found_checkbox])
|
784 |
|
785 |
find_duplicate_pages_btn.click(
|
786 |
fn=run_duplicate_analysis,
|
tools/file_conversion.py
CHANGED
@@ -454,7 +454,7 @@ def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float]
|
|
454 |
|
455 |
def prepare_image_or_pdf(
|
456 |
file_paths: List[str],
|
457 |
-
|
458 |
all_line_level_ocr_results_df:pd.DataFrame,
|
459 |
latest_file_completed: int = 0,
|
460 |
out_message: List[str] = [],
|
@@ -468,7 +468,7 @@ def prepare_image_or_pdf(
|
|
468 |
prepare_images:bool=True,
|
469 |
page_sizes:list[dict]=[],
|
470 |
textract_output_found:bool = False,
|
471 |
-
|
472 |
progress: Progress = Progress(track_tqdm=True)
|
473 |
) -> tuple[List[str], List[str]]:
|
474 |
"""
|
@@ -479,7 +479,7 @@ def prepare_image_or_pdf(
|
|
479 |
|
480 |
Args:
|
481 |
file_paths (List[str]): List of file paths to process.
|
482 |
-
|
483 |
latest_file_completed (optional, int): Index of the last completed file.
|
484 |
out_message (optional, List[str]): List to store output messages.
|
485 |
first_loop_state (optional, bool): Flag indicating if this is the first iteration.
|
@@ -491,7 +491,7 @@ def prepare_image_or_pdf(
|
|
491 |
prepare_images (optional, bool): A boolean indicating whether to create images for each PDF page. Defaults to True.
|
492 |
page_sizes(optional, List[dict]): A list of dicts containing information about page sizes in various formats.
|
493 |
textract_output_found (optional, bool): A boolean indicating whether Textract analysis output has already been found. Defaults to False.
|
494 |
-
|
495 |
progress (optional, Progress): Progress tracker for the operation
|
496 |
|
497 |
|
@@ -542,7 +542,7 @@ def prepare_image_or_pdf(
|
|
542 |
final_out_message = '\n'.join(out_message)
|
543 |
else:
|
544 |
final_out_message = out_message
|
545 |
-
return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df,
|
546 |
|
547 |
progress(0.1, desc='Preparing file')
|
548 |
|
@@ -599,8 +599,8 @@ def prepare_image_or_pdf(
|
|
599 |
|
600 |
elif is_pdf_or_image(file_path): # Alternatively, if it's an image
|
601 |
# Check if the file is an image type and the user selected text ocr option
|
602 |
-
if file_extension in ['.jpg', '.jpeg', '.png'] and
|
603 |
-
|
604 |
|
605 |
# Convert image to a pymupdf document
|
606 |
pymupdf_doc = pymupdf.open() # Create a new empty document
|
@@ -663,15 +663,18 @@ def prepare_image_or_pdf(
|
|
663 |
elif (file_extension in ['.json']) and '_ocr_results_with_words' in file_path_without_ext: #(prepare_for_review != True):
|
664 |
print("Saving local OCR output")
|
665 |
# Copy it to the output folder so it can be used later.
|
666 |
-
output_ocr_results_with_words_json_file_name = file_path_without_ext
|
667 |
-
if not file_path.endswith("_ocr_results_with_words.json"): output_ocr_results_with_words_json_file_name = file_path_without_ext + "_ocr_results_with_words.json"
|
668 |
-
else: output_ocr_results_with_words_json_file_name = file_path_without_ext + ".json"
|
669 |
|
670 |
out_ocr_results_with_words_path = os.path.join(output_folder, output_ocr_results_with_words_json_file_name)
|
671 |
|
672 |
# Use shutil to copy the file directly
|
673 |
shutil.copy2(file_path, out_ocr_results_with_words_path) # Preserves metadata
|
674 |
-
|
|
|
|
|
|
|
675 |
continue
|
676 |
|
677 |
# NEW IF STATEMENT
|
@@ -768,13 +771,13 @@ def prepare_image_or_pdf(
|
|
768 |
|
769 |
# Must be something else, return with error message
|
770 |
else:
|
771 |
-
if
|
772 |
if is_pdf_or_image(file_path) == False:
|
773 |
out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
|
774 |
print(out_message)
|
775 |
raise Exception(out_message)
|
776 |
|
777 |
-
elif
|
778 |
if is_pdf(file_path) == False:
|
779 |
out_message = "Please upload a PDF file for text analysis."
|
780 |
print(out_message)
|
@@ -793,7 +796,7 @@ def prepare_image_or_pdf(
|
|
793 |
|
794 |
number_of_pages = len(page_sizes)#len(image_file_paths)
|
795 |
|
796 |
-
return combined_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df,
|
797 |
|
798 |
def load_and_convert_ocr_results_with_words_json(ocr_results_with_words_json_file_path:str, log_files_output_paths:str, page_sizes_df:pd.DataFrame):
|
799 |
"""
|
|
|
454 |
|
455 |
def prepare_image_or_pdf(
|
456 |
file_paths: List[str],
|
457 |
+
text_extract_method: str,
|
458 |
all_line_level_ocr_results_df:pd.DataFrame,
|
459 |
latest_file_completed: int = 0,
|
460 |
out_message: List[str] = [],
|
|
|
468 |
prepare_images:bool=True,
|
469 |
page_sizes:list[dict]=[],
|
470 |
textract_output_found:bool = False,
|
471 |
+
relevant_ocr_output_with_words_found:bool = False,
|
472 |
progress: Progress = Progress(track_tqdm=True)
|
473 |
) -> tuple[List[str], List[str]]:
|
474 |
"""
|
|
|
479 |
|
480 |
Args:
|
481 |
file_paths (List[str]): List of file paths to process.
|
482 |
+
text_extract_method (str): The redaction method to use.
|
483 |
latest_file_completed (optional, int): Index of the last completed file.
|
484 |
out_message (optional, List[str]): List to store output messages.
|
485 |
first_loop_state (optional, bool): Flag indicating if this is the first iteration.
|
|
|
491 |
prepare_images (optional, bool): A boolean indicating whether to create images for each PDF page. Defaults to True.
|
492 |
page_sizes(optional, List[dict]): A list of dicts containing information about page sizes in various formats.
|
493 |
textract_output_found (optional, bool): A boolean indicating whether Textract analysis output has already been found. Defaults to False.
|
494 |
+
relevant_ocr_output_with_words_found (optional, bool): A boolean indicating whether local OCR analysis output has already been found. Defaults to False.
|
495 |
progress (optional, Progress): Progress tracker for the operation
|
496 |
|
497 |
|
|
|
542 |
final_out_message = '\n'.join(out_message)
|
543 |
else:
|
544 |
final_out_message = out_message
|
545 |
+
return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df, relevant_ocr_output_with_words_found
|
546 |
|
547 |
progress(0.1, desc='Preparing file')
|
548 |
|
|
|
599 |
|
600 |
elif is_pdf_or_image(file_path): # Alternatively, if it's an image
|
601 |
# Check if the file is an image type and the user selected text ocr option
|
602 |
+
if file_extension in ['.jpg', '.jpeg', '.png'] and text_extract_method == SELECTABLE_TEXT_EXTRACT_OPTION:
|
603 |
+
text_extract_method = TESSERACT_TEXT_EXTRACT_OPTION
|
604 |
|
605 |
# Convert image to a pymupdf document
|
606 |
pymupdf_doc = pymupdf.open() # Create a new empty document
|
|
|
663 |
elif (file_extension in ['.json']) and '_ocr_results_with_words' in file_path_without_ext: #(prepare_for_review != True):
|
664 |
print("Saving local OCR output")
|
665 |
# Copy it to the output folder so it can be used later.
|
666 |
+
output_ocr_results_with_words_json_file_name = file_path_without_ext + ".json"
|
667 |
+
# if not file_path.endswith("_ocr_results_with_words.json"): output_ocr_results_with_words_json_file_name = file_path_without_ext + "_ocr_results_with_words.json"
|
668 |
+
# else: output_ocr_results_with_words_json_file_name = file_path_without_ext + ".json"
|
669 |
|
670 |
out_ocr_results_with_words_path = os.path.join(output_folder, output_ocr_results_with_words_json_file_name)
|
671 |
|
672 |
# Use shutil to copy the file directly
|
673 |
shutil.copy2(file_path, out_ocr_results_with_words_path) # Preserves metadata
|
674 |
+
|
675 |
+
if text_extract_method == SELECTABLE_TEXT_EXTRACT_OPTION and file_path.endswith("_ocr_results_with_words_local_text.json"): relevant_ocr_output_with_words_found = True
|
676 |
+
if text_extract_method == TESSERACT_TEXT_EXTRACT_OPTION and file_path.endswith("_ocr_results_with_words_local_ocr.json"): relevant_ocr_output_with_words_found = True
|
677 |
+
if text_extract_method == TEXTRACT_TEXT_EXTRACT_OPTION and file_path.endswith("_ocr_results_with_words_textract.json"): relevant_ocr_output_with_words_found = True
|
678 |
continue
|
679 |
|
680 |
# NEW IF STATEMENT
|
|
|
771 |
|
772 |
# Must be something else, return with error message
|
773 |
else:
|
774 |
+
if text_extract_method == TESSERACT_TEXT_EXTRACT_OPTION or text_extract_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
775 |
if is_pdf_or_image(file_path) == False:
|
776 |
out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
|
777 |
print(out_message)
|
778 |
raise Exception(out_message)
|
779 |
|
780 |
+
elif text_extract_method == SELECTABLE_TEXT_EXTRACT_OPTION:
|
781 |
if is_pdf(file_path) == False:
|
782 |
out_message = "Please upload a PDF file for text analysis."
|
783 |
print(out_message)
|
|
|
796 |
|
797 |
number_of_pages = len(page_sizes)#len(image_file_paths)
|
798 |
|
799 |
+
return combined_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df, relevant_ocr_output_with_words_found
|
800 |
|
801 |
def load_and_convert_ocr_results_with_words_json(ocr_results_with_words_json_file_path:str, log_files_output_paths:str, page_sizes_df:pd.DataFrame):
|
802 |
"""
|
tools/file_redaction.py
CHANGED
@@ -8,7 +8,7 @@ import copy
|
|
8 |
|
9 |
from tqdm import tqdm
|
10 |
from PIL import Image, ImageChops, ImageFile, ImageDraw
|
11 |
-
from typing import List, Dict, Tuple, Optional
|
12 |
import pandas as pd
|
13 |
|
14 |
from pdfminer.high_level import extract_pages
|
@@ -59,6 +59,49 @@ def sum_numbers_before_seconds(string:str):
|
|
59 |
|
60 |
return sum_of_numbers
|
61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
def choose_and_run_redactor(file_paths:List[str],
|
63 |
prepared_pdf_file_paths:List[str],
|
64 |
pdf_image_file_paths:List[str],
|
@@ -499,7 +542,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
499 |
# Analyse text-based pdf
|
500 |
print('Redacting file as text-based PDF')
|
501 |
|
502 |
-
pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number = redact_text_pdf(
|
503 |
file_path,
|
504 |
language,
|
505 |
chosen_redact_entities,
|
@@ -513,6 +556,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
513 |
all_line_level_ocr_results_df,
|
514 |
all_pages_decision_process_table,
|
515 |
pymupdf_doc,
|
|
|
516 |
pii_identification_method,
|
517 |
comprehend_query_number,
|
518 |
comprehend_client,
|
@@ -522,7 +566,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
522 |
match_fuzzy_whole_phrase_bool,
|
523 |
page_sizes_df,
|
524 |
document_cropboxes,
|
525 |
-
text_extraction_only
|
|
|
526 |
else:
|
527 |
out_message = "No redaction method selected"
|
528 |
print(out_message)
|
@@ -536,9 +581,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
536 |
current_loop_page = 999
|
537 |
|
538 |
if latest_file_completed != len(file_paths_list):
|
539 |
-
print("Completed file number:", str(latest_file_completed), "there are more files to do")
|
540 |
-
|
541 |
-
|
542 |
|
543 |
# Save redacted file
|
544 |
if pii_identification_method != NO_REDACTION_PII_OPTION:
|
@@ -572,6 +615,30 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
572 |
|
573 |
duplication_file_path_outputs.append(ocr_file_path)
|
574 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
575 |
# Convert the gradio annotation boxes to relative coordinates
|
576 |
# Convert annotations_all_pages to a consistent relative coordinate format output
|
577 |
progress(0.93, "Creating review file output")
|
@@ -1343,7 +1410,7 @@ def redact_image_pdf(file_path:str,
|
|
1343 |
|
1344 |
# If running local OCR option, check if file already exists. If it does, load in existing data
|
1345 |
if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
|
1346 |
-
all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + "
|
1347 |
all_page_line_level_ocr_results_with_words, is_missing, log_files_output_paths = load_and_convert_ocr_results_with_words_json(all_page_line_level_ocr_results_with_words_json_file_path, log_files_output_paths, page_sizes_df)
|
1348 |
original_all_page_line_level_ocr_results_with_words = all_page_line_level_ocr_results_with_words.copy()
|
1349 |
|
@@ -1662,32 +1729,37 @@ def redact_image_pdf(file_path:str,
|
|
1662 |
# Append new annotation if it doesn't exist
|
1663 |
annotations_all_pages.append(page_image_annotations)
|
1664 |
|
1665 |
-
|
1666 |
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
1667 |
if original_textract_data != textract_data:
|
1668 |
# Write the updated existing textract data back to the JSON file
|
1669 |
with open(textract_json_file_path, 'w') as json_file:
|
1670 |
json.dump(textract_data, json_file, separators=(",", ":")) # indent=4 makes the JSON file pretty-printed
|
1671 |
|
1672 |
-
|
1673 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1674 |
|
1675 |
if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
|
1676 |
if original_all_page_line_level_ocr_results_with_words != all_page_line_level_ocr_results_with_words:
|
1677 |
# Write the updated existing textract data back to the JSON file
|
|
|
1678 |
with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
|
1679 |
-
json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
|
1680 |
|
1681 |
if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
|
1682 |
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
|
1683 |
|
1684 |
-
#all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
|
1685 |
-
#all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
|
1686 |
-
|
1687 |
all_pages_decision_process_table = pd.DataFrame(all_pages_decision_process_list)
|
1688 |
all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list)
|
1689 |
|
1690 |
-
|
1691 |
current_loop_page += 1
|
1692 |
|
1693 |
return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
|
@@ -1784,22 +1856,21 @@ def get_text_container_characters(text_container:LTTextContainer):
|
|
1784 |
return characters
|
1785 |
return []
|
1786 |
|
1787 |
-
def
|
1788 |
'''
|
1789 |
Create an OCRResult object based on a list of pdfminer LTChar objects.
|
1790 |
'''
|
1791 |
|
1792 |
line_level_results_out = []
|
1793 |
line_level_characters_out = []
|
1794 |
-
|
1795 |
-
character_objects_out = []
|
1796 |
-
# character_text_objects_out = []
|
1797 |
|
1798 |
# Initialize variables
|
1799 |
full_text = ""
|
1800 |
added_text = ""
|
1801 |
overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # [x0, y0, x1, y1]
|
1802 |
-
|
1803 |
|
1804 |
# Iterate through the character objects
|
1805 |
current_word = ""
|
@@ -1813,7 +1884,6 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
|
|
1813 |
# character_text_objects_out.append(character_text)
|
1814 |
|
1815 |
if isinstance(char, LTAnno):
|
1816 |
-
|
1817 |
added_text = char.get_text()
|
1818 |
|
1819 |
# Handle double quotes
|
@@ -1822,17 +1892,17 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
|
|
1822 |
# Handle space separately by finalizing the word
|
1823 |
full_text += added_text # Adds space or newline
|
1824 |
|
1825 |
-
if current_word: # Only
|
1826 |
-
|
1827 |
current_word = ""
|
1828 |
current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # Reset for next word
|
1829 |
|
1830 |
# Check for line break (assuming a new line is indicated by a specific character)
|
1831 |
if '\n' in added_text:
|
1832 |
|
1833 |
-
#
|
1834 |
if current_word:
|
1835 |
-
|
1836 |
# Create an OCRResult for the current line
|
1837 |
line_level_results_out.append(OCRResult(full_text.strip(), round(overall_bbox[0], 2), round(overall_bbox[1], 2), round(overall_bbox[2] - overall_bbox[0], 2), round(overall_bbox[3] - overall_bbox[1], 2)))
|
1838 |
line_level_characters_out.append(character_objects_out)
|
@@ -1872,23 +1942,138 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
|
|
1872 |
current_word_bbox[2] = max(current_word_bbox[2], x1) # x1
|
1873 |
current_word_bbox[3] = max(current_word_bbox[3], y1) # y1
|
1874 |
|
1875 |
-
#
|
1876 |
if current_word:
|
1877 |
-
|
1878 |
|
1879 |
if full_text:
|
|
|
1880 |
if re.search(r'[^\x00-\x7F]', full_text): # Matches any non-ASCII character
|
1881 |
# Convert special characters to a human-readable format
|
1882 |
|
1883 |
full_text = clean_unicode_text(full_text)
|
1884 |
full_text = full_text.strip()
|
1885 |
|
|
|
1886 |
|
1887 |
-
|
1888 |
|
1889 |
-
|
1890 |
|
1891 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1892 |
|
1893 |
def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
|
1894 |
decision_process_table = pd.DataFrame()
|
@@ -1938,7 +2123,7 @@ def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
|
|
1938 |
return pikepdf_redaction_annotations_on_page
|
1939 |
|
1940 |
def redact_text_pdf(
|
1941 |
-
|
1942 |
language: str, # Language of the PDF content
|
1943 |
chosen_redact_entities: List[str], # List of entities to be redacted
|
1944 |
chosen_redact_comprehend_entities: List[str],
|
@@ -1951,6 +2136,7 @@ def redact_text_pdf(
|
|
1951 |
all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"]), # DataFrame for OCR results
|
1952 |
all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"]), # DataFrame for decision process table
|
1953 |
pymupdf_doc: List = [], # List of PyMuPDF documents
|
|
|
1954 |
pii_identification_method: str = "Local",
|
1955 |
comprehend_query_number:int = 0,
|
1956 |
comprehend_client="",
|
@@ -1961,6 +2147,7 @@ def redact_text_pdf(
|
|
1961 |
page_sizes_df:pd.DataFrame=pd.DataFrame(),
|
1962 |
original_cropboxes:List[dict]=[],
|
1963 |
text_extraction_only:bool=False,
|
|
|
1964 |
page_break_val: int = int(PAGE_BREAK_VALUE), # Value for page break
|
1965 |
max_time: int = int(MAX_TIME_VALUE),
|
1966 |
progress: Progress = Progress(track_tqdm=True) # Progress tracking object
|
@@ -1970,7 +2157,7 @@ def redact_text_pdf(
|
|
1970 |
Redact chosen entities from a PDF that is made up of multiple pages that are not images.
|
1971 |
|
1972 |
Input Variables:
|
1973 |
-
-
|
1974 |
- language: Language of the PDF content
|
1975 |
- chosen_redact_entities: List of entities to be redacted
|
1976 |
- chosen_redact_comprehend_entities: List of entities to be redacted for AWS Comprehend
|
@@ -1994,6 +2181,7 @@ def redact_text_pdf(
|
|
1994 |
- page_sizes_df (pd.DataFrame, optional): A pandas dataframe containing page size information.
|
1995 |
- original_cropboxes (List[dict], optional): A list of dictionaries containing pymupdf cropbox information.
|
1996 |
- text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
|
|
|
1997 |
- page_break_val: Value for page break
|
1998 |
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
1999 |
- progress: Progress tracking object
|
@@ -2023,8 +2211,13 @@ def redact_text_pdf(
|
|
2023 |
nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
|
2024 |
|
2025 |
# Open with Pikepdf to get text lines
|
2026 |
-
pikepdf_pdf = Pdf.open(
|
2027 |
-
number_of_pages = len(pikepdf_pdf.pages)
|
|
|
|
|
|
|
|
|
|
|
2028 |
|
2029 |
# Check that page_min and page_max are within expected ranges
|
2030 |
if page_max > number_of_pages or page_max == 0:
|
@@ -2056,7 +2249,7 @@ def redact_text_pdf(
|
|
2056 |
|
2057 |
if page_min <= page_no < page_max:
|
2058 |
# Go page by page
|
2059 |
-
for page_layout in extract_pages(
|
2060 |
|
2061 |
all_page_line_text_extraction_characters = []
|
2062 |
all_page_line_level_text_extraction_results_list = []
|
@@ -2068,14 +2261,18 @@ def redact_text_pdf(
|
|
2068 |
page_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"])
|
2069 |
page_text_ocr_outputs = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"])
|
2070 |
|
|
|
2071 |
for n, text_container in enumerate(page_layout):
|
2072 |
characters = []
|
2073 |
|
2074 |
if isinstance(text_container, LTTextContainer) or isinstance(text_container, LTAnno):
|
2075 |
characters = get_text_container_characters(text_container)
|
|
|
2076 |
|
2077 |
# Create dataframe for all the text on the page
|
2078 |
-
line_level_text_results_list, line_characters =
|
|
|
|
|
2079 |
|
2080 |
### Create page_text_ocr_outputs (OCR format outputs)
|
2081 |
if line_level_text_results_list:
|
@@ -2093,6 +2290,7 @@ def redact_text_pdf(
|
|
2093 |
|
2094 |
all_page_line_level_text_extraction_results_list.extend(line_level_text_results_list)
|
2095 |
all_page_line_text_extraction_characters.extend(line_characters)
|
|
|
2096 |
|
2097 |
### REDACTION
|
2098 |
if pii_identification_method != NO_REDACTION_PII_OPTION:
|
@@ -2143,9 +2341,9 @@ def redact_text_pdf(
|
|
2143 |
|
2144 |
# Join extracted text outputs for all lines together
|
2145 |
if not page_text_ocr_outputs.empty:
|
2146 |
-
page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
|
2147 |
page_text_ocr_outputs = page_text_ocr_outputs.loc[:, ["page", "text", "left", "top", "width", "height"]]
|
2148 |
-
all_line_level_ocr_results_list.append(page_text_ocr_outputs)
|
2149 |
|
2150 |
toc = time.perf_counter()
|
2151 |
|
@@ -2174,7 +2372,7 @@ def redact_text_pdf(
|
|
2174 |
|
2175 |
current_loop_page += 1
|
2176 |
|
2177 |
-
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
2178 |
|
2179 |
# Check if the image already exists in annotations_all_pages
|
2180 |
existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == page_image_annotations["image"]), None)
|
@@ -2195,7 +2393,7 @@ def redact_text_pdf(
|
|
2195 |
# Write logs
|
2196 |
all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
|
2197 |
|
2198 |
-
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
2199 |
|
2200 |
# Write all page outputs
|
2201 |
all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
|
@@ -2222,5 +2420,11 @@ def redact_text_pdf(
|
|
2222 |
if not all_line_level_ocr_results_df.empty:
|
2223 |
all_line_level_ocr_results_df['top'] = all_line_level_ocr_results_df['top'].astype(float)
|
2224 |
all_line_level_ocr_results_df['top'] = 1 - all_line_level_ocr_results_df['top']
|
|
|
|
|
|
|
|
|
|
|
|
|
2225 |
|
2226 |
-
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
|
|
8 |
|
9 |
from tqdm import tqdm
|
10 |
from PIL import Image, ImageChops, ImageFile, ImageDraw
|
11 |
+
from typing import List, Dict, Tuple, Optional, Any
|
12 |
import pandas as pd
|
13 |
|
14 |
from pdfminer.high_level import extract_pages
|
|
|
59 |
|
60 |
return sum_of_numbers
|
61 |
|
62 |
+
def merge_page_results(data):
    """Collapse per-line OCR entries into a single entry per page.

    Each element of *data* is a dict with a "page" key and a "results" dict
    (mapping e.g. "text_line_N" -> line data). Entries sharing the same page
    have their "results" dicts merged together; later entries overwrite
    earlier ones on key collision (dict.update semantics).

    Args:
        data: Iterable of {"page": ..., "results": {...}} dicts.

    Returns:
        A list of merged page dicts, in first-seen page order.
    """
    pages_by_key = {}

    for entry in data:
        page_key = entry["page"]
        # First occurrence of a page creates its bucket; later ones reuse it.
        bucket = pages_by_key.setdefault(page_key, {"page": page_key, "results": {}})
        # Fold this entry's line-level results into the page bucket.
        bucket["results"].update(entry.get("results", {}))

    return list(pages_by_key.values())
|
78 |
+
|
79 |
+
def word_level_ocr_output_to_dataframe(ocr_result: dict) -> pd.DataFrame:
    """Flatten word-level OCR results into a long-format dataframe.

    *ocr_result* is expected to be a list of per-page dicts (as produced by
    merge_page_results), each of the form::

        {"page": <n>, "results": {"text_line_<i>": {"line": <i>, "text": ...,
         "bounding_box": [x0, y0, x1, y1], "words": [{"text": ...,
         "bounding_box": [x0, y0, x1, y1]}, ...]}, ...}}

    One output row is produced per word, carrying both the word's own
    coordinates and those of its parent line.

    Args:
        ocr_result: List of per-page OCR dicts (a single page dict is also
            accepted and treated as a one-page list).

    Returns:
        A pandas DataFrame with columns: page, line, word_text,
        word_x0/y0/x1/y1, line_text, line_x0/y0/x1/y1. Empty input yields an
        empty DataFrame.
    """
    # Bug fix: the previous implementation took only ocr_result[0], silently
    # dropping every page after the first (and crashing on empty input).
    if isinstance(ocr_result, dict):
        ocr_result = [ocr_result]

    rows = []
    for page_result in ocr_result:
        page_number = int(page_result['page'])

        for line_key, line_data in page_result['results'].items():
            line_number = int(line_data['line'])
            for word in line_data['words']:
                rows.append({
                    'page': page_number,
                    'line': line_number,
                    'word_text': word['text'],
                    'word_x0': word['bounding_box'][0],
                    'word_y0': word['bounding_box'][1],
                    'word_x1': word['bounding_box'][2],
                    'word_y1': word['bounding_box'][3],
                    'line_text': line_data['text'],
                    'line_x0': line_data['bounding_box'][0],
                    'line_y0': line_data['bounding_box'][1],
                    'line_x1': line_data['bounding_box'][2],
                    'line_y1': line_data['bounding_box'][3],
                })

    return pd.DataFrame(rows)
|
104 |
+
|
105 |
def choose_and_run_redactor(file_paths:List[str],
|
106 |
prepared_pdf_file_paths:List[str],
|
107 |
pdf_image_file_paths:List[str],
|
|
|
542 |
# Analyse text-based pdf
|
543 |
print('Redacting file as text-based PDF')
|
544 |
|
545 |
+
pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words = redact_text_pdf(
|
546 |
file_path,
|
547 |
language,
|
548 |
chosen_redact_entities,
|
|
|
556 |
all_line_level_ocr_results_df,
|
557 |
all_pages_decision_process_table,
|
558 |
pymupdf_doc,
|
559 |
+
[], # All line level ocr results with words
|
560 |
pii_identification_method,
|
561 |
comprehend_query_number,
|
562 |
comprehend_client,
|
|
|
566 |
match_fuzzy_whole_phrase_bool,
|
567 |
page_sizes_df,
|
568 |
document_cropboxes,
|
569 |
+
text_extraction_only,
|
570 |
+
output_folder=output_folder)
|
571 |
else:
|
572 |
out_message = "No redaction method selected"
|
573 |
print(out_message)
|
|
|
581 |
current_loop_page = 999
|
582 |
|
583 |
if latest_file_completed != len(file_paths_list):
|
584 |
+
print("Completed file number:", str(latest_file_completed), "there are more files to do")
|
|
|
|
|
585 |
|
586 |
# Save redacted file
|
587 |
if pii_identification_method != NO_REDACTION_PII_OPTION:
|
|
|
615 |
|
616 |
duplication_file_path_outputs.append(ocr_file_path)
|
617 |
|
618 |
+
if all_page_line_level_ocr_results_with_words:
|
619 |
+
#print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
|
620 |
+
|
621 |
+
all_page_line_level_ocr_results_with_words = merge_page_results(all_page_line_level_ocr_results_with_words)
|
622 |
+
|
623 |
+
# print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
|
624 |
+
|
625 |
+
file_name = get_file_name_without_type(file_path)
|
626 |
+
|
627 |
+
all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + "_ocr_results_with_words.json"
|
628 |
+
|
629 |
+
with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
|
630 |
+
json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
|
631 |
+
|
632 |
+
all_page_line_level_ocr_results_with_words_df = word_level_ocr_output_to_dataframe(all_page_line_level_ocr_results_with_words)
|
633 |
+
|
634 |
+
all_page_line_level_ocr_results_with_words_df = divide_coordinates_by_page_sizes(all_page_line_level_ocr_results_with_words_df, page_sizes_df, xmin="word_x0", xmax="word_x1", ymin="word_y0", ymax="word_y1")
|
635 |
+
|
636 |
+
all_page_line_level_ocr_results_with_words_df = divide_coordinates_by_page_sizes(all_page_line_level_ocr_results_with_words_df, page_sizes_df, xmin="line_x0", xmax="line_x1", ymin="line_y0", ymax="line_y1")
|
637 |
+
|
638 |
+
all_page_line_level_ocr_results_with_words_df_file_path = output_folder + file_name + "_ocr_results_with_words.csv"
|
639 |
+
|
640 |
+
all_page_line_level_ocr_results_with_words_df.to_csv(all_page_line_level_ocr_results_with_words_df_file_path)
|
641 |
+
|
642 |
# Convert the gradio annotation boxes to relative coordinates
|
643 |
# Convert annotations_all_pages to a consistent relative coordinate format output
|
644 |
progress(0.93, "Creating review file output")
|
|
|
1410 |
|
1411 |
# If running local OCR option, check if file already exists. If it does, load in existing data
|
1412 |
if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
|
1413 |
+
all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + "_ocr_results_with_words_local_ocr.json"
|
1414 |
all_page_line_level_ocr_results_with_words, is_missing, log_files_output_paths = load_and_convert_ocr_results_with_words_json(all_page_line_level_ocr_results_with_words_json_file_path, log_files_output_paths, page_sizes_df)
|
1415 |
original_all_page_line_level_ocr_results_with_words = all_page_line_level_ocr_results_with_words.copy()
|
1416 |
|
|
|
1729 |
# Append new annotation if it doesn't exist
|
1730 |
annotations_all_pages.append(page_image_annotations)
|
1731 |
|
1732 |
+
# Save word level options
|
1733 |
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
1734 |
if original_textract_data != textract_data:
|
1735 |
# Write the updated existing textract data back to the JSON file
|
1736 |
with open(textract_json_file_path, 'w') as json_file:
|
1737 |
json.dump(textract_data, json_file, separators=(",", ":")) # indent=4 makes the JSON file pretty-printed
|
1738 |
|
1739 |
+
if textract_json_file_path not in log_files_output_paths:
|
1740 |
+
log_files_output_paths.append(textract_json_file_path)
|
1741 |
+
|
1742 |
+
all_page_line_level_ocr_results_with_words_json_file_path_textract = output_folder + file_name + "_ocr_results_with_words_textract.json"
|
1743 |
+
|
1744 |
+
with open(all_page_line_level_ocr_results_with_words_json_file_path_textract, 'w') as json_file:
|
1745 |
+
json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":")) # indent=4 makes the JSON file pretty-printed
|
1746 |
+
|
1747 |
+
if all_page_line_level_ocr_results_with_words_json_file_path_textract not in log_files_output_paths:
|
1748 |
+
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path_textract)
|
1749 |
|
1750 |
if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
|
1751 |
if original_all_page_line_level_ocr_results_with_words != all_page_line_level_ocr_results_with_words:
|
1752 |
# Write the updated existing textract data back to the JSON file
|
1753 |
+
|
1754 |
with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
|
1755 |
+
json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
|
1756 |
|
1757 |
if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
|
1758 |
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
|
1759 |
|
|
|
|
|
|
|
1760 |
all_pages_decision_process_table = pd.DataFrame(all_pages_decision_process_list)
|
1761 |
all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list)
|
1762 |
|
|
|
1763 |
current_loop_page += 1
|
1764 |
|
1765 |
return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
|
|
|
1856 |
return characters
|
1857 |
return []
|
1858 |
|
1859 |
+
def create_line_level_ocr_results_from_characters(char_objects:List[LTChar]) -> Tuple[List[OCRResult], List[LTChar]]:
|
1860 |
'''
|
1861 |
Create an OCRResult object based on a list of pdfminer LTChar objects.
|
1862 |
'''
|
1863 |
|
1864 |
line_level_results_out = []
|
1865 |
line_level_characters_out = []
|
1866 |
+
line_level_words_out = {}
|
1867 |
+
character_objects_out = []
|
|
|
1868 |
|
1869 |
# Initialize variables
|
1870 |
full_text = ""
|
1871 |
added_text = ""
|
1872 |
overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # [x0, y0, x1, y1]
|
1873 |
+
line_bboxes = []
|
1874 |
|
1875 |
# Iterate through the character objects
|
1876 |
current_word = ""
|
|
|
1884 |
# character_text_objects_out.append(character_text)
|
1885 |
|
1886 |
if isinstance(char, LTAnno):
|
|
|
1887 |
added_text = char.get_text()
|
1888 |
|
1889 |
# Handle double quotes
|
|
|
1892 |
# Handle space separately by finalizing the word
|
1893 |
full_text += added_text # Adds space or newline
|
1894 |
|
1895 |
+
if current_word: # Only finalise if there is a current word
|
1896 |
+
line_bboxes.append((current_word, current_word_bbox))
|
1897 |
current_word = ""
|
1898 |
current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # Reset for next word
|
1899 |
|
1900 |
# Check for line break (assuming a new line is indicated by a specific character)
|
1901 |
if '\n' in added_text:
|
1902 |
|
1903 |
+
# finalise the current line
|
1904 |
if current_word:
|
1905 |
+
line_bboxes.append((current_word, current_word_bbox))
|
1906 |
# Create an OCRResult for the current line
|
1907 |
line_level_results_out.append(OCRResult(full_text.strip(), round(overall_bbox[0], 2), round(overall_bbox[1], 2), round(overall_bbox[2] - overall_bbox[0], 2), round(overall_bbox[3] - overall_bbox[1], 2)))
|
1908 |
line_level_characters_out.append(character_objects_out)
|
|
|
1942 |
current_word_bbox[2] = max(current_word_bbox[2], x1) # x1
|
1943 |
current_word_bbox[3] = max(current_word_bbox[3], y1) # y1
|
1944 |
|
1945 |
+
# Finalise the last word if any
|
1946 |
if current_word:
|
1947 |
+
line_bboxes.append((current_word, current_word_bbox))
|
1948 |
|
1949 |
if full_text:
|
1950 |
+
print("full_text found")
|
1951 |
if re.search(r'[^\x00-\x7F]', full_text): # Matches any non-ASCII character
|
1952 |
# Convert special characters to a human-readable format
|
1953 |
|
1954 |
full_text = clean_unicode_text(full_text)
|
1955 |
full_text = full_text.strip()
|
1956 |
|
1957 |
+
line_ocr_result_bbox = round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)
|
1958 |
|
1959 |
+
line_ocr_result = OCRResult(full_text.strip(), line_ocr_result_bbox)
|
1960 |
|
1961 |
+
line_level_results_out.append(line_ocr_result)
|
1962 |
|
1963 |
+
else:
|
1964 |
+
line_ocr_result_bbox = []
|
1965 |
+
|
1966 |
+
# if line_ocr_result_bbox:
|
1967 |
+
# line_level_words_out["page"] = 1
|
1968 |
+
# line_level_words_out['results'] = {'text_line_1':{"line":1, "text":full_text, "bounding_box": line_ocr_result_bbox, "words": line_bboxes}}
|
1969 |
+
# else:
|
1970 |
+
# line_level_words_out = {}
|
1971 |
+
|
1972 |
+
|
1973 |
+
return line_level_results_out, line_level_characters_out # Return both results and character objects
|
1974 |
+
|
1975 |
+
def generate_word_level_ocr(char_objects: List, page_number: int, text_line_number:int) -> Dict[str, Any]:
    """
    Generate line- and word-level OCR results from a list of pdfminer.six objects.

    Handles real-world pdfminer.six output by:
    1. Filtering out LTAnno objects, which have no coordinate data.
    2. Sorting the remaining LTChar objects into reading order.
    3. Using adaptive thresholds (based on character size) to detect spaces
       and line breaks.

    Args:
        char_objects: A mixed list of pdfminer.six LTChar and LTAnno objects from a single page.
        page_number: The page number where the characters are from.
        text_line_number: The line number to start counting output lines from.

    Returns:
        A dict of the form
        {"page": "<n>", "results": {"text_line_<i>": {"line": <i>, "text": ...,
         "bounding_box": [x0, y0, x1, y1], "words": [...]}}}.
        Coordinates are pdfminer page coordinates (origin bottom-left), rounded to 2 dp.
    """
    # LTAnno objects carry inferred text (spaces/newlines) but no .bbox, so they
    # cannot participate in layout analysis; drop them up front.
    text_chars = [c for c in char_objects if isinstance(c, LTChar)]

    if not text_chars:
        return {"page": str(page_number), "results": {}}

    # Reading order: highest y1 first (PDF y-axis points up, so the top of the
    # page has the largest y), then left-to-right by x0.
    text_chars.sort(key=lambda c: (-c.bbox[3], c.bbox[0]))

    page_data = {"page": str(page_number), "results": {}}
    line_number = text_line_number

    # State variables for the line/word accumulation below.
    # bboxes start as [+inf, +inf, -1, -1] so min/max updates work from empty.
    line_text, line_bbox, line_words = "", [float('inf'), float('inf'), -1, -1], []
    current_word_text, current_word_bbox = "", [float('inf'), float('inf'), -1, -1]
    prev_char = None

    def finalize_word():
        # Flush the accumulated word (if non-blank) into line_words and reset
        # the word accumulator.
        nonlocal current_word_text, current_word_bbox
        word_text = current_word_text.strip()
        if word_text:
            line_words.append({
                "text": word_text,
                "bounding_box": [round(b, 2) for b in current_word_bbox]
            })
        current_word_text = ""
        current_word_bbox = [float('inf'), float('inf'), -1, -1]

    def finalize_line():
        # Flush the pending word, then emit the accumulated line (if non-blank)
        # into page_data under a "text_line_<n>" key, and reset line state.
        nonlocal line_text, line_bbox, line_words, line_number, prev_char
        finalize_word()
        if line_text.strip():
            page_data["results"][f"text_line_{line_number}"] = {
                "line": line_number,
                "text": line_text.strip(),
                "bounding_box": [round(b, 2) for b in line_bbox],
                "words": line_words
            }
            line_number += 1
        line_text, line_bbox, line_words = "", [float('inf'), float('inf'), -1, -1], []
        # Resetting prev_char prevents the first char of the new line from
        # being compared against the last char of the previous line.
        prev_char = None

    for char in text_chars:
        char_text = clean_unicode_text(char.get_text())

        if prev_char:
            char_height = char.bbox[3] - char.bbox[1]
            vertical_gap = abs(char.bbox[1] - prev_char.bbox[1])

            # Line break detection: a vertical jump larger than ~70% of the
            # character height is treated as a new line. NOTE(review): 0.7 is a
            # heuristic tuning constant — confirm against varied documents.
            if vertical_gap > char_height * 0.7:
                finalize_line()
            else:
                # Space detection: a horizontal gap larger than half the font
                # size (with a 1.0pt floor) is treated as a word separator.
                space_threshold = char.size * 0.5
                gap = char.bbox[0] - prev_char.bbox[2]
                if gap > max(space_threshold, 1.0):
                    finalize_word()
                    line_text += " "

        # An explicit space character always ends the current word.
        if char_text == " ":
            finalize_word()
            line_text += " "
            prev_char = char
            continue

        current_word_text += char_text
        line_text += char_text

        # Grow the word and line bounding boxes to include this character.
        current_word_bbox[0] = min(current_word_bbox[0], char.bbox[0])
        current_word_bbox[1] = min(current_word_bbox[1], char.bbox[1])
        current_word_bbox[2] = max(current_word_bbox[2], char.bbox[2])
        current_word_bbox[3] = max(current_word_bbox[3], char.bbox[3])

        line_bbox[0] = min(line_bbox[0], char.bbox[0])
        line_bbox[1] = min(line_bbox[1], char.bbox[1])
        line_bbox[2] = max(line_bbox[2], char.bbox[2])
        line_bbox[3] = max(line_bbox[3], char.bbox[3])

        prev_char = char

    # Flush whatever is left after the last character.
    finalize_line()

    return page_data
|
2077 |
|
2078 |
def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
|
2079 |
decision_process_table = pd.DataFrame()
|
|
|
2123 |
return pikepdf_redaction_annotations_on_page
|
2124 |
|
2125 |
def redact_text_pdf(
|
2126 |
+
file_path: str, # Path to the PDF file to be redacted
|
2127 |
language: str, # Language of the PDF content
|
2128 |
chosen_redact_entities: List[str], # List of entities to be redacted
|
2129 |
chosen_redact_comprehend_entities: List[str],
|
|
|
2136 |
all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"]), # DataFrame for OCR results
|
2137 |
all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"]), # DataFrame for decision process table
|
2138 |
pymupdf_doc: List = [], # List of PyMuPDF documents
|
2139 |
+
all_page_line_level_ocr_results_with_words: List = [],
|
2140 |
pii_identification_method: str = "Local",
|
2141 |
comprehend_query_number:int = 0,
|
2142 |
comprehend_client="",
|
|
|
2147 |
page_sizes_df:pd.DataFrame=pd.DataFrame(),
|
2148 |
original_cropboxes:List[dict]=[],
|
2149 |
text_extraction_only:bool=False,
|
2150 |
+
output_folder:str=OUTPUT_FOLDER,
|
2151 |
page_break_val: int = int(PAGE_BREAK_VALUE), # Value for page break
|
2152 |
max_time: int = int(MAX_TIME_VALUE),
|
2153 |
progress: Progress = Progress(track_tqdm=True) # Progress tracking object
|
|
|
2157 |
Redact chosen entities from a PDF that is made up of multiple pages that are not images.
|
2158 |
|
2159 |
Input Variables:
|
2160 |
+
- file_path: Path to the PDF file to be redacted
|
2161 |
- language: Language of the PDF content
|
2162 |
- chosen_redact_entities: List of entities to be redacted
|
2163 |
- chosen_redact_comprehend_entities: List of entities to be redacted for AWS Comprehend
|
|
|
2181 |
- page_sizes_df (pd.DataFrame, optional): A pandas dataframe containing page size information.
|
2182 |
- original_cropboxes (List[dict], optional): A list of dictionaries containing pymupdf cropbox information.
|
2183 |
- text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
|
2184 |
+
- output_folder (str, optional): The output folder for the function
|
2185 |
- page_break_val: Value for page break
|
2186 |
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
2187 |
- progress: Progress tracking object
|
|
|
2211 |
nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
|
2212 |
|
2213 |
# Open with Pikepdf to get text lines
|
2214 |
+
pikepdf_pdf = Pdf.open(file_path)
|
2215 |
+
number_of_pages = len(pikepdf_pdf.pages)
|
2216 |
+
|
2217 |
+
file_name = get_file_name_without_type(file_path)
|
2218 |
+
|
2219 |
+
if not all_page_line_level_ocr_results_with_words:
|
2220 |
+
all_page_line_level_ocr_results_with_words = []
|
2221 |
|
2222 |
# Check that page_min and page_max are within expected ranges
|
2223 |
if page_max > number_of_pages or page_max == 0:
|
|
|
2249 |
|
2250 |
if page_min <= page_no < page_max:
|
2251 |
# Go page by page
|
2252 |
+
for page_layout in extract_pages(file_path, page_numbers = [page_no], maxpages=1):
|
2253 |
|
2254 |
all_page_line_text_extraction_characters = []
|
2255 |
all_page_line_level_text_extraction_results_list = []
|
|
|
2261 |
page_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"])
|
2262 |
page_text_ocr_outputs = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"])
|
2263 |
|
2264 |
+
text_line_no = 0
|
2265 |
for n, text_container in enumerate(page_layout):
|
2266 |
characters = []
|
2267 |
|
2268 |
if isinstance(text_container, LTTextContainer) or isinstance(text_container, LTAnno):
|
2269 |
characters = get_text_container_characters(text_container)
|
2270 |
+
text_line_no += 1
|
2271 |
|
2272 |
# Create dataframe for all the text on the page
|
2273 |
+
line_level_text_results_list, line_characters, = create_line_level_ocr_results_from_characters(characters)
|
2274 |
+
|
2275 |
+
line_level_ocr_results_with_words = generate_word_level_ocr(characters, page_number=int(reported_page_number), text_line_number=text_line_no)
|
2276 |
|
2277 |
### Create page_text_ocr_outputs (OCR format outputs)
|
2278 |
if line_level_text_results_list:
|
|
|
2290 |
|
2291 |
all_page_line_level_text_extraction_results_list.extend(line_level_text_results_list)
|
2292 |
all_page_line_text_extraction_characters.extend(line_characters)
|
2293 |
+
all_page_line_level_ocr_results_with_words.append(line_level_ocr_results_with_words)
|
2294 |
|
2295 |
### REDACTION
|
2296 |
if pii_identification_method != NO_REDACTION_PII_OPTION:
|
|
|
2341 |
|
2342 |
# Join extracted text outputs for all lines together
|
2343 |
if not page_text_ocr_outputs.empty:
|
2344 |
+
#page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
|
2345 |
page_text_ocr_outputs = page_text_ocr_outputs.loc[:, ["page", "text", "left", "top", "width", "height"]]
|
2346 |
+
all_line_level_ocr_results_list.append(page_text_ocr_outputs)
|
2347 |
|
2348 |
toc = time.perf_counter()
|
2349 |
|
|
|
2372 |
|
2373 |
current_loop_page += 1
|
2374 |
|
2375 |
+
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words
|
2376 |
|
2377 |
# Check if the image already exists in annotations_all_pages
|
2378 |
existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == page_image_annotations["image"]), None)
|
|
|
2393 |
# Write logs
|
2394 |
all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
|
2395 |
|
2396 |
+
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words
|
2397 |
|
2398 |
# Write all page outputs
|
2399 |
all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
|
|
|
2420 |
if not all_line_level_ocr_results_df.empty:
|
2421 |
all_line_level_ocr_results_df['top'] = all_line_level_ocr_results_df['top'].astype(float)
|
2422 |
all_line_level_ocr_results_df['top'] = 1 - all_line_level_ocr_results_df['top']
|
2423 |
+
|
2424 |
+
all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + "_ocr_results_with_words_local_text.json"
|
2425 |
+
|
2426 |
+
#print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
|
2427 |
+
with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
|
2428 |
+
json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
|
2429 |
|
2430 |
+
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words
|
tools/helper_functions.py
CHANGED
@@ -244,13 +244,27 @@ def check_for_existing_textract_file(doc_file_name_no_extension_textbox:str, out
|
|
244 |
else:
|
245 |
return False
|
246 |
|
247 |
-
def
|
248 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
249 |
|
250 |
if os.path.exists(local_ocr_output_path):
|
251 |
-
print("Existing
|
252 |
-
return True
|
253 |
-
|
254 |
else:
|
255 |
return False
|
256 |
|
|
|
244 |
else:
|
245 |
return False
|
246 |
|
247 |
+
def check_for_relevant_ocr_output_with_words(doc_file_name_no_extension_textbox:str, text_extraction_method:str, output_folder:str=OUTPUT_FOLDER):
    """Check whether a word-level OCR JSON output already exists for a document.

    The expected file name suffix depends on the text extraction method
    (selectable text / local OCR / Textract). Returns True when the matching
    "<doc name><suffix>" file is present in *output_folder*, False otherwise
    (including for an unrecognised extraction method).

    Args:
        doc_file_name_no_extension_textbox: Document file name without extension.
        text_extraction_method: One of the supported text extraction options.
        output_folder: Folder in which to look for the JSON output.
    """
    suffix_by_method = {
        SELECTABLE_TEXT_EXTRACT_OPTION: "_ocr_results_with_words_local_text.json",
        TESSERACT_TEXT_EXTRACT_OPTION: "_ocr_results_with_words_local_ocr.json",
        TEXTRACT_TEXT_EXTRACT_OPTION: "_ocr_results_with_words_textract.json",
    }

    if text_extraction_method not in suffix_by_method:
        print("No valid text extraction method found. Returning False")
        return False

    print("doc_file_name_no_extension_textbox:", doc_file_name_no_extension_textbox)

    doc_file_with_ending = doc_file_name_no_extension_textbox + suffix_by_method[text_extraction_method]

    print("doc_file_with_ending:", doc_file_with_ending)

    local_ocr_output_path = os.path.join(output_folder, doc_file_with_ending)

    print("looking for file path:", local_ocr_output_path)

    if not os.path.exists(local_ocr_output_path):
        return False

    print("Existing OCR with words analysis output file found.")
    return True
|
270 |
|