seanpedrickcase committed
Commit ef4000e · 1 Parent(s): c8ffcd4

Local text redaction now produces OCR results with words as JSON and can output them in dataframe format
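As context for the change described above, here is a minimal sketch of flattening a word-level OCR results JSON into a pandas dataframe. The file layout, field names and helper name are illustrative assumptions; the conversion code actually added by this commit is not shown in this view.

import json
import pandas as pd

def ocr_results_with_words_json_to_df(json_path: str) -> pd.DataFrame:
    # Assumed layout: a list of page entries, each holding lines that in turn
    # hold word entries with their text and a bounding box.
    with open(json_path, "r") as f:
        pages = json.load(f)

    rows = []
    for page in pages:
        for line in page.get("lines", []):
            for word in line.get("words", []):
                x0, y0, x1, y1 = word.get("bounding_box", (None, None, None, None))
                rows.append({
                    "page": page.get("page"),
                    "line_text": line.get("text", ""),
                    "word_text": word.get("text", ""),
                    "word_x0": x0,
                    "word_y0": y0,
                    "word_x1": x1,
                    "word_y1": y1,
                })
    return pd.DataFrame(rows)

# Example (file name is hypothetical):
# df = ocr_results_with_words_json_to_df("example_ocr_results_with_words_local_text.json")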

app.py CHANGED
@@ -3,7 +3,7 @@ import pandas as pd
3
  import gradio as gr
4
  from gradio_image_annotation import image_annotator
5
  from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER
6
- from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_existing_local_ocr_file, reset_data_vars, reset_aws_call_vars, _get_env_list, ensure_folder_exists
7
  from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
8
  from tools.file_redaction import choose_and_run_redactor
9
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
@@ -63,7 +63,7 @@ with app:
63
  ###
64
 
65
  # Pymupdf doc and all image annotations objects need to be stored as State objects as they do not have a standard Gradio component equivalent
66
- pdf_doc_state = gr.State([])
67
  all_image_annotations_state = gr.State([])
68
 
69
  all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas", wrap=True)
@@ -211,7 +211,7 @@ with app:
211
  cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis. Please contact Finance if you can't find your cost code in the given list.", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=False)
212
 
213
  textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=False)
214
- local_ocr_output_found_checkbox = gr.Checkbox(value= False, label="Existing local OCR output file found", interactive=False, visible=False)
215
  total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=False)
216
  estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost ($)", value=0, visible=False, precision=2)
217
  estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=False, precision=2)
@@ -274,7 +274,7 @@ with app:
274
  with gr.Row(equal_height=True):
275
  with gr.Column(scale=1):
276
  textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=True)
277
- local_ocr_output_found_checkbox = gr.Checkbox(value= False, label="Existing local OCR output file found", interactive=False, visible=True)
278
  with gr.Column(scale=4):
279
  with gr.Row(equal_height=True):
280
  total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True, interactive=False)
@@ -576,7 +576,8 @@ with app:
576
  if SHOW_COSTS == 'True':
577
  # Calculate costs
578
  total_pdf_page_count.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
579
- text_extract_method_radio.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
 
580
  pii_identification_method_drop.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
581
  handwrite_signature_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
582
  textract_output_found_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
@@ -584,14 +585,14 @@ with app:
584
  textract_output_found_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
585
 
586
  # Calculate time taken
587
- total_pdf_page_count.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, local_ocr_output_found_checkbox], outputs=[estimated_time_taken_number])
588
- text_extract_method_radio.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, local_ocr_output_found_checkbox], outputs=[estimated_time_taken_number])
589
- pii_identification_method_drop.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, local_ocr_output_found_checkbox], outputs=[estimated_time_taken_number])
590
- handwrite_signature_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, local_ocr_output_found_checkbox], outputs=[estimated_time_taken_number])
591
- textract_output_found_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, local_ocr_output_found_checkbox], outputs=[estimated_time_taken_number])
592
- only_extract_text_radio.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, local_ocr_output_found_checkbox], outputs=[estimated_time_taken_number])
593
- textract_output_found_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, local_ocr_output_found_checkbox], outputs=[estimated_time_taken_number])
594
- local_ocr_output_found_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, local_ocr_output_found_checkbox], outputs=[estimated_time_taken_number])
595
 
596
  # Allow user to select items from cost code dataframe for cost code
597
  if SHOW_COSTS=="True" and (GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True"):
@@ -601,9 +602,9 @@ with app:
601
  cost_code_choice_drop.select(update_cost_code_dataframe_from_dropdown_select, inputs=[cost_code_choice_drop, cost_code_dataframe_base], outputs=[cost_code_dataframe])
602
 
603
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
604
- success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox]).\
605
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
606
- success(fn=check_for_existing_local_ocr_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[local_ocr_output_found_checkbox])
607
 
608
  # Run redaction function
609
  document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
@@ -620,7 +621,7 @@ with app:
620
  outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children]).\
621
  success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
622
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
623
- success(fn=check_for_existing_local_ocr_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[local_ocr_output_found_checkbox]).\
624
  success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title]).\
625
  success(fn = reset_aws_call_vars, outputs=[comprehend_query_number, textract_query_number])
626
 
@@ -640,9 +641,9 @@ with app:
640
  textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
641
 
642
  convert_textract_outputs_to_ocr_results.click(replace_existing_pdf_input_for_whole_document_outputs, inputs = [s3_whole_document_textract_input_subfolder, doc_file_name_no_extension_textbox, output_folder_textbox, s3_whole_document_textract_default_bucket, in_doc_files, input_folder_textbox], outputs = [in_doc_files, doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
643
- success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox]).\
644
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
645
- success(fn=check_for_existing_local_ocr_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[local_ocr_output_found_checkbox]).\
646
  success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
647
  success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
648
  success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
@@ -657,7 +658,7 @@ with app:
657
  # Upload previous files for modifying redactions
658
  upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
659
  success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
660
- success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox], api_name="prepare_doc").\
661
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
662
 
663
  # Manual updates to review di
@@ -753,12 +754,12 @@ with app:
753
 
754
  # Convert review file to xfdf Adobe format
755
  convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
756
- success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, local_ocr_output_found_checkbox]).\
757
  success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
758
 
759
  # Convert xfdf Adobe file back to review_file.csv
760
  convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
761
- success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, local_ocr_output_found_checkbox]).\
762
  success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
763
 
764
  ###
@@ -779,7 +780,7 @@ with app:
779
  ###
780
  # IDENTIFY DUPLICATE PAGES
781
  ###
782
- #in_duplicate_pages.upload(fn = prepare_image_or_pdf, inputs=[in_duplicate_pages, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox])
783
 
784
  find_duplicate_pages_btn.click(
785
  fn=run_duplicate_analysis,
 
3
  import gradio as gr
4
  from gradio_image_annotation import image_annotator
5
  from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER
6
+ from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_relevant_ocr_output_with_words, reset_data_vars, reset_aws_call_vars, _get_env_list, ensure_folder_exists
7
  from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
8
  from tools.file_redaction import choose_and_run_redactor
9
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
 
63
  ###
64
 
65
  # Pymupdf doc and all image annotations objects need to be stored as State objects as they do not have a standard Gradio component equivalent
66
+ pdf_doc_state = gr.State([])
67
  all_image_annotations_state = gr.State([])
68
 
69
  all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas", wrap=True)
 
211
  cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis. Please contact Finance if you can't find your cost code in the given list.", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=False)
212
 
213
  textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=False)
214
+ relevant_ocr_output_with_words_found_checkbox = gr.Checkbox(value= False, label="Existing local OCR output file found", interactive=False, visible=False)
215
  total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=False)
216
  estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost ($)", value=0, visible=False, precision=2)
217
  estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=False, precision=2)
 
274
  with gr.Row(equal_height=True):
275
  with gr.Column(scale=1):
276
  textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=True)
277
+ relevant_ocr_output_with_words_found_checkbox = gr.Checkbox(value= False, label="Existing local OCR output file found", interactive=False, visible=True)
278
  with gr.Column(scale=4):
279
  with gr.Row(equal_height=True):
280
  total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True, interactive=False)
 
576
  if SHOW_COSTS == 'True':
577
  # Calculate costs
578
  total_pdf_page_count.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
579
+ text_extract_method_radio.change(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox]).\
580
+ success(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
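The new check_for_relevant_ocr_output_with_words helper is imported from tools.helper_functions above, but its body is not part of this diff. Going only by the inputs wired here and the file-name suffix checks added to prepare_image_or_pdf further down, a hedged sketch of what it might do (the option strings are placeholders for the project's constants):

import os

# Placeholder values standing in for SELECTABLE_TEXT_EXTRACT_OPTION,
# TESSERACT_TEXT_EXTRACT_OPTION and TEXTRACT_TEXT_EXTRACT_OPTION.
SELECTABLE_TEXT_EXTRACT_OPTION = "selectable_text"
TESSERACT_TEXT_EXTRACT_OPTION = "local_ocr"
TEXTRACT_TEXT_EXTRACT_OPTION = "textract"

# Suffixes mirror the checks added to prepare_image_or_pdf in tools/file_conversion.py.
METHOD_TO_WORDS_JSON_SUFFIX = {
    SELECTABLE_TEXT_EXTRACT_OPTION: "_ocr_results_with_words_local_text.json",
    TESSERACT_TEXT_EXTRACT_OPTION: "_ocr_results_with_words_local_ocr.json",
    TEXTRACT_TEXT_EXTRACT_OPTION: "_ocr_results_with_words_textract.json",
}

def check_for_relevant_ocr_output_with_words_sketch(doc_file_name_no_extension: str,
                                                    text_extract_method: str,
                                                    output_folder: str) -> bool:
    # True when a word-level OCR JSON matching the chosen extraction method already exists.
    suffix = METHOD_TO_WORDS_JSON_SUFFIX.get(text_extract_method)
    if suffix is None:
        return False
    return os.path.exists(os.path.join(output_folder, doc_file_name_no_extension + suffix))

Chaining .success(...) onto the .change(...) event, as in the two lines above, runs the cost recalculation only if the preceding existence check completed without raising an error.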
581
  pii_identification_method_drop.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
582
  handwrite_signature_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
583
  textract_output_found_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
 
585
  textract_output_found_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
586
 
587
  # Calculate time taken
588
+ total_pdf_page_count.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
589
+ text_extract_method_radio.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
590
+ pii_identification_method_drop.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
591
+ handwrite_signature_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
592
+ textract_output_found_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
593
+ only_extract_text_radio.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
594
+ textract_output_found_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
595
+ relevant_ocr_output_with_words_found_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio, relevant_ocr_output_with_words_found_checkbox], outputs=[estimated_time_taken_number])
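The block above attaches the same calculate_time_taken handler to several components through near-identical .change(...) calls. A self-contained sketch of equivalent wiring done in a loop; the component set and the stub handler are illustrative, not code from this commit:

import gradio as gr

def calculate_time_taken_stub(pages, method, pii_method, textract_found, text_only, words_found):
    # Stand-in for tools.helper_functions.calculate_time_taken.
    return round(pages * 0.05, 2)

with gr.Blocks() as demo:
    total_pages = gr.Number(label="Total page count", value=0)
    extract_method = gr.Radio(choices=["Local text", "Local OCR", "Textract"], value="Local text")
    pii_method = gr.Dropdown(choices=["Local", "AWS Comprehend"], value="Local")
    textract_found = gr.Checkbox(value=False, label="Existing Textract output file found")
    text_only = gr.Checkbox(value=False, label="Only extract text")
    words_found = gr.Checkbox(value=False, label="Existing local OCR output file found")
    estimated_minutes = gr.Number(label="Approximate time taken (minutes)", value=0)

    time_inputs = [total_pages, extract_method, pii_method, textract_found, text_only, words_found]
    # One loop replaces the repeated .change(...) registrations.
    for component in time_inputs:
        component.change(calculate_time_taken_stub, inputs=time_inputs, outputs=[estimated_minutes])

# demo.launch()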
596
 
597
  # Allow user to select items from cost code dataframe for cost code
598
  if SHOW_COSTS=="True" and (GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True"):
 
602
  cost_code_choice_drop.select(update_cost_code_dataframe_from_dropdown_select, inputs=[cost_code_choice_drop, cost_code_dataframe_base], outputs=[cost_code_dataframe])
603
 
604
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
605
+ success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, relevant_ocr_output_with_words_found_checkbox]).\
606
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
607
+ success(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox])
608
 
609
  # Run redaction function
610
  document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
 
621
  outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children]).\
622
  success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
623
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
624
+ success(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox]).\
625
  success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title]).\
626
  success(fn = reset_aws_call_vars, outputs=[comprehend_query_number, textract_query_number])
627
 
 
641
  textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
642
 
643
  convert_textract_outputs_to_ocr_results.click(replace_existing_pdf_input_for_whole_document_outputs, inputs = [s3_whole_document_textract_input_subfolder, doc_file_name_no_extension_textbox, output_folder_textbox, s3_whole_document_textract_default_bucket, in_doc_files, input_folder_textbox], outputs = [in_doc_files, doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
644
+ success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, relevant_ocr_output_with_words_found_checkbox]).\
645
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
646
+ success(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox]).\
647
  success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
648
  success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
649
  success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
 
658
  # Upload previous files for modifying redactions
659
  upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
660
  success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
661
+ success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, relevant_ocr_output_with_words_found_checkbox], api_name="prepare_doc").\
662
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
663
 
664
  # Manual updates to review di
 
754
 
755
  # Convert review file to xfdf Adobe format
756
  convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
757
+ success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, relevant_ocr_output_with_words_found_checkbox]).\
758
  success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
759
 
760
  # Convert xfdf Adobe file back to review_file.csv
761
  convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
762
+ success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, relevant_ocr_output_with_words_found_checkbox]).\
763
  success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
764
 
765
  ###
 
780
  ###
781
  # IDENTIFY DUPLICATE PAGES
782
  ###
783
+ #in_duplicate_pages.upload(fn = prepare_image_or_pdf, inputs=[in_duplicate_pages, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, relevant_ocr_output_with_words_found_checkbox])
784
 
785
  find_duplicate_pages_btn.click(
786
  fn=run_duplicate_analysis,
tools/file_conversion.py CHANGED
@@ -454,7 +454,7 @@ def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float]
454
 
455
  def prepare_image_or_pdf(
456
  file_paths: List[str],
457
- in_redact_method: str,
458
  all_line_level_ocr_results_df:pd.DataFrame,
459
  latest_file_completed: int = 0,
460
  out_message: List[str] = [],
@@ -468,7 +468,7 @@ def prepare_image_or_pdf(
468
  prepare_images:bool=True,
469
  page_sizes:list[dict]=[],
470
  textract_output_found:bool = False,
471
- local_ocr_output_found:bool = False,
472
  progress: Progress = Progress(track_tqdm=True)
473
  ) -> tuple[List[str], List[str]]:
474
  """
@@ -479,7 +479,7 @@ def prepare_image_or_pdf(
479
 
480
  Args:
481
  file_paths (List[str]): List of file paths to process.
482
- in_redact_method (str): The redaction method to use.
483
  latest_file_completed (optional, int): Index of the last completed file.
484
  out_message (optional, List[str]): List to store output messages.
485
  first_loop_state (optional, bool): Flag indicating if this is the first iteration.
@@ -491,7 +491,7 @@ def prepare_image_or_pdf(
491
  prepare_images (optional, bool): A boolean indicating whether to create images for each PDF page. Defaults to True.
492
  page_sizes(optional, List[dict]): A list of dicts containing information about page sizes in various formats.
493
  textract_output_found (optional, bool): A boolean indicating whether Textract analysis output has already been found. Defaults to False.
494
- local_ocr_output_found (optional, bool): A boolean indicating whether local OCR analysis output has already been found. Defaults to False.
495
  progress (optional, Progress): Progress tracker for the operation
496
 
497
 
@@ -542,7 +542,7 @@ def prepare_image_or_pdf(
542
  final_out_message = '\n'.join(out_message)
543
  else:
544
  final_out_message = out_message
545
- return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df, local_ocr_output_found
546
 
547
  progress(0.1, desc='Preparing file')
548
 
@@ -599,8 +599,8 @@ def prepare_image_or_pdf(
599
 
600
  elif is_pdf_or_image(file_path): # Alternatively, if it's an image
601
  # Check if the file is an image type and the user selected text ocr option
602
- if file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == SELECTABLE_TEXT_EXTRACT_OPTION:
603
- in_redact_method = TESSERACT_TEXT_EXTRACT_OPTION
604
 
605
  # Convert image to a pymupdf document
606
  pymupdf_doc = pymupdf.open() # Create a new empty document
@@ -663,15 +663,18 @@ def prepare_image_or_pdf(
663
  elif (file_extension in ['.json']) and '_ocr_results_with_words' in file_path_without_ext: #(prepare_for_review != True):
664
  print("Saving local OCR output")
665
  # Copy it to the output folder so it can be used later.
666
- output_ocr_results_with_words_json_file_name = file_path_without_ext
667
- if not file_path.endswith("_ocr_results_with_words.json"): output_ocr_results_with_words_json_file_name = file_path_without_ext + "_ocr_results_with_words.json"
668
- else: output_ocr_results_with_words_json_file_name = file_path_without_ext + ".json"
669
 
670
  out_ocr_results_with_words_path = os.path.join(output_folder, output_ocr_results_with_words_json_file_name)
671
 
672
  # Use shutil to copy the file directly
673
  shutil.copy2(file_path, out_ocr_results_with_words_path) # Preserves metadata
674
- local_ocr_output_found = True
 
 
 
675
  continue
676
 
677
  # NEW IF STATEMENT
@@ -768,13 +771,13 @@ def prepare_image_or_pdf(
768
 
769
  # Must be something else, return with error message
770
  else:
771
- if in_redact_method == TESSERACT_TEXT_EXTRACT_OPTION or in_redact_method == TEXTRACT_TEXT_EXTRACT_OPTION:
772
  if is_pdf_or_image(file_path) == False:
773
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
774
  print(out_message)
775
  raise Exception(out_message)
776
 
777
- elif in_redact_method == SELECTABLE_TEXT_EXTRACT_OPTION:
778
  if is_pdf(file_path) == False:
779
  out_message = "Please upload a PDF file for text analysis."
780
  print(out_message)
@@ -793,7 +796,7 @@ def prepare_image_or_pdf(
793
 
794
  number_of_pages = len(page_sizes)#len(image_file_paths)
795
 
796
- return combined_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df, local_ocr_output_found
797
 
798
  def load_and_convert_ocr_results_with_words_json(ocr_results_with_words_json_file_path:str, log_files_output_paths:str, page_sizes_df:pd.DataFrame):
799
  """
 
454
 
455
  def prepare_image_or_pdf(
456
  file_paths: List[str],
457
+ text_extract_method: str,
458
  all_line_level_ocr_results_df:pd.DataFrame,
459
  latest_file_completed: int = 0,
460
  out_message: List[str] = [],
 
468
  prepare_images:bool=True,
469
  page_sizes:list[dict]=[],
470
  textract_output_found:bool = False,
471
+ relevant_ocr_output_with_words_found:bool = False,
472
  progress: Progress = Progress(track_tqdm=True)
473
  ) -> tuple[List[str], List[str]]:
474
  """
 
479
 
480
  Args:
481
  file_paths (List[str]): List of file paths to process.
482
+ text_extract_method (str): The text extraction method to use.
483
  latest_file_completed (optional, int): Index of the last completed file.
484
  out_message (optional, List[str]): List to store output messages.
485
  first_loop_state (optional, bool): Flag indicating if this is the first iteration.
 
491
  prepare_images (optional, bool): A boolean indicating whether to create images for each PDF page. Defaults to True.
492
  page_sizes(optional, List[dict]): A list of dicts containing information about page sizes in various formats.
493
  textract_output_found (optional, bool): A boolean indicating whether Textract analysis output has already been found. Defaults to False.
494
+ relevant_ocr_output_with_words_found (optional, bool): A boolean indicating whether local OCR analysis output has already been found. Defaults to False.
495
  progress (optional, Progress): Progress tracker for the operation
496
 
497
 
 
542
  final_out_message = '\n'.join(out_message)
543
  else:
544
  final_out_message = out_message
545
+ return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df, relevant_ocr_output_with_words_found
546
 
547
  progress(0.1, desc='Preparing file')
548
 
 
599
 
600
  elif is_pdf_or_image(file_path): # Alternatively, if it's an image
601
  # Check if the file is an image type and the user selected text ocr option
602
+ if file_extension in ['.jpg', '.jpeg', '.png'] and text_extract_method == SELECTABLE_TEXT_EXTRACT_OPTION:
603
+ text_extract_method = TESSERACT_TEXT_EXTRACT_OPTION
604
 
605
  # Convert image to a pymupdf document
606
  pymupdf_doc = pymupdf.open() # Create a new empty document
 
663
  elif (file_extension in ['.json']) and '_ocr_results_with_words' in file_path_without_ext: #(prepare_for_review != True):
664
  print("Saving local OCR output")
665
  # Copy it to the output folder so it can be used later.
666
+ output_ocr_results_with_words_json_file_name = file_path_without_ext + ".json"
667
+ # if not file_path.endswith("_ocr_results_with_words.json"): output_ocr_results_with_words_json_file_name = file_path_without_ext + "_ocr_results_with_words.json"
668
+ # else: output_ocr_results_with_words_json_file_name = file_path_without_ext + ".json"
669
 
670
  out_ocr_results_with_words_path = os.path.join(output_folder, output_ocr_results_with_words_json_file_name)
671
 
672
  # Use shutil to copy the file directly
673
  shutil.copy2(file_path, out_ocr_results_with_words_path) # Preserves metadata
674
+
675
+ if text_extract_method == SELECTABLE_TEXT_EXTRACT_OPTION and file_path.endswith("_ocr_results_with_words_local_text.json"): relevant_ocr_output_with_words_found = True
676
+ if text_extract_method == TESSERACT_TEXT_EXTRACT_OPTION and file_path.endswith("_ocr_results_with_words_local_ocr.json"): relevant_ocr_output_with_words_found = True
677
+ if text_extract_method == TEXTRACT_TEXT_EXTRACT_OPTION and file_path.endswith("_ocr_results_with_words_textract.json"): relevant_ocr_output_with_words_found = True
678
  continue
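As a small illustration of this branch: an uploaded *_ocr_results_with_words*.json file is copied into the output folder and, when its suffix matches the chosen extraction method, the relevance flag is set. A compact sketch of that flow with hypothetical names (shutil.copy2 is used here, as above, because it preserves file metadata):

import os
import shutil

def stash_uploaded_words_json(file_path: str, output_folder: str, expected_suffix: str) -> tuple[str, bool]:
    # Copy the uploaded JSON into the output folder and report whether it is
    # relevant to the currently selected text extraction method.
    os.makedirs(output_folder, exist_ok=True)
    out_path = os.path.join(output_folder, os.path.basename(file_path))
    shutil.copy2(file_path, out_path)  # preserves timestamps and other metadata
    return out_path, file_path.endswith(expected_suffix)

# Example (paths are hypothetical):
# path, relevant = stash_uploaded_words_json("uploads/doc_ocr_results_with_words_local_ocr.json",
#                                            "output/", "_ocr_results_with_words_local_ocr.json")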
679
 
680
  # NEW IF STATEMENT
 
771
 
772
  # Must be something else, return with error message
773
  else:
774
+ if text_extract_method == TESSERACT_TEXT_EXTRACT_OPTION or text_extract_method == TEXTRACT_TEXT_EXTRACT_OPTION:
775
  if is_pdf_or_image(file_path) == False:
776
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
777
  print(out_message)
778
  raise Exception(out_message)
779
 
780
+ elif text_extract_method == SELECTABLE_TEXT_EXTRACT_OPTION:
781
  if is_pdf(file_path) == False:
782
  out_message = "Please upload a PDF file for text analysis."
783
  print(out_message)
 
796
 
797
  number_of_pages = len(page_sizes)#len(image_file_paths)
798
 
799
+ return combined_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df, relevant_ocr_output_with_words_found
800
 
801
  def load_and_convert_ocr_results_with_words_json(ocr_results_with_words_json_file_path:str, log_files_output_paths:str, page_sizes_df:pd.DataFrame):
802
  """
tools/file_redaction.py CHANGED
@@ -8,7 +8,7 @@ import copy
8
 
9
  from tqdm import tqdm
10
  from PIL import Image, ImageChops, ImageFile, ImageDraw
11
- from typing import List, Dict, Tuple, Optional
12
  import pandas as pd
13
 
14
  from pdfminer.high_level import extract_pages
@@ -59,6 +59,49 @@ def sum_numbers_before_seconds(string:str):
59
 
60
  return sum_of_numbers
61
 
 
 
62
  def choose_and_run_redactor(file_paths:List[str],
63
  prepared_pdf_file_paths:List[str],
64
  pdf_image_file_paths:List[str],
@@ -499,7 +542,7 @@ def choose_and_run_redactor(file_paths:List[str],
499
  # Analyse text-based pdf
500
  print('Redacting file as text-based PDF')
501
 
502
- pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number = redact_text_pdf(
503
  file_path,
504
  language,
505
  chosen_redact_entities,
@@ -513,6 +556,7 @@ def choose_and_run_redactor(file_paths:List[str],
513
  all_line_level_ocr_results_df,
514
  all_pages_decision_process_table,
515
  pymupdf_doc,
 
516
  pii_identification_method,
517
  comprehend_query_number,
518
  comprehend_client,
@@ -522,7 +566,8 @@ def choose_and_run_redactor(file_paths:List[str],
522
  match_fuzzy_whole_phrase_bool,
523
  page_sizes_df,
524
  document_cropboxes,
525
- text_extraction_only)
 
526
  else:
527
  out_message = "No redaction method selected"
528
  print(out_message)
@@ -536,9 +581,7 @@ def choose_and_run_redactor(file_paths:List[str],
536
  current_loop_page = 999
537
 
538
  if latest_file_completed != len(file_paths_list):
539
- print("Completed file number:", str(latest_file_completed), "there are more files to do")
540
-
541
-
542
 
543
  # Save redacted file
544
  if pii_identification_method != NO_REDACTION_PII_OPTION:
@@ -572,6 +615,30 @@ def choose_and_run_redactor(file_paths:List[str],
572
 
573
  duplication_file_path_outputs.append(ocr_file_path)
574
 
 
 
575
  # Convert the gradio annotation boxes to relative coordinates
576
  # Convert annotations_all_pages to a consistent relative coordinate format output
577
  progress(0.93, "Creating review file output")
@@ -1343,7 +1410,7 @@ def redact_image_pdf(file_path:str,
1343
 
1344
  # If running local OCR option, check if file already exists. If it does, load in existing data
1345
  if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
1346
- all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + "_ocr_results_with_words.json"
1347
  all_page_line_level_ocr_results_with_words, is_missing, log_files_output_paths = load_and_convert_ocr_results_with_words_json(all_page_line_level_ocr_results_with_words_json_file_path, log_files_output_paths, page_sizes_df)
1348
  original_all_page_line_level_ocr_results_with_words = all_page_line_level_ocr_results_with_words.copy()
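One caveat worth noting about the snapshot taken on the line above: .copy() on a nested dict/list structure is shallow, so if the word-level results are later modified in place the snapshot changes with them and the inequality check lower down would not trigger a rewrite. Whether that can actually happen depends on code not shown in this diff; a small self-contained illustration of the difference:

import copy

page_results = {"1": {"words": [{"text": "old"}]}}

shallow = page_results.copy()
deep = copy.deepcopy(page_results)

# Mutate the nested structure in place.
page_results["1"]["words"][0]["text"] = "new"

print(page_results != shallow)  # False - the shallow snapshot changed along with the original
print(page_results != deep)     # True  - the deep snapshot still holds the old value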
1349
 
@@ -1662,32 +1729,37 @@ def redact_image_pdf(file_path:str,
1662
  # Append new annotation if it doesn't exist
1663
  annotations_all_pages.append(page_image_annotations)
1664
 
1665
-
1666
  if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
1667
  if original_textract_data != textract_data:
1668
  # Write the updated existing textract data back to the JSON file
1669
  with open(textract_json_file_path, 'w') as json_file:
1670
  json.dump(textract_data, json_file, separators=(",", ":")) # indent=4 makes the JSON file pretty-printed
1671
 
1672
- if textract_json_file_path not in log_files_output_paths:
1673
- log_files_output_paths.append(textract_json_file_path)
1674
 
1675
  if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
1676
  if original_all_page_line_level_ocr_results_with_words != all_page_line_level_ocr_results_with_words:
1677
  # Write the updated existing textract data back to the JSON file
 
1678
  with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
1679
- json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":")) # indent=4 makes the JSON file pretty-printed
1680
 
1681
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
1682
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
1683
 
1684
- #all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
1685
- #all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
1686
-
1687
  all_pages_decision_process_table = pd.DataFrame(all_pages_decision_process_list)
1688
  all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list)
1689
 
1690
-
1691
  current_loop_page += 1
1692
 
1693
  return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
@@ -1784,22 +1856,21 @@ def get_text_container_characters(text_container:LTTextContainer):
1784
  return characters
1785
  return []
1786
 
1787
- def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tuple[List[OCRResult], List[LTChar]]:
1788
  '''
1789
  Create an OCRResult object based on a list of pdfminer LTChar objects.
1790
  '''
1791
 
1792
  line_level_results_out = []
1793
  line_level_characters_out = []
1794
- #all_line_level_characters_out = []
1795
- character_objects_out = [] # New list to store character objects
1796
- # character_text_objects_out = []
1797
 
1798
  # Initialize variables
1799
  full_text = ""
1800
  added_text = ""
1801
  overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # [x0, y0, x1, y1]
1802
- word_bboxes = []
1803
 
1804
  # Iterate through the character objects
1805
  current_word = ""
@@ -1813,7 +1884,6 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
1813
  # character_text_objects_out.append(character_text)
1814
 
1815
  if isinstance(char, LTAnno):
1816
-
1817
  added_text = char.get_text()
1818
 
1819
  # Handle double quotes
@@ -1822,17 +1892,17 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
1822
  # Handle space separately by finalizing the word
1823
  full_text += added_text # Adds space or newline
1824
 
1825
- if current_word: # Only finalize if there is a current word
1826
- word_bboxes.append((current_word, current_word_bbox))
1827
  current_word = ""
1828
  current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # Reset for next word
1829
 
1830
  # Check for line break (assuming a new line is indicated by a specific character)
1831
  if '\n' in added_text:
1832
 
1833
- # Finalize the current line
1834
  if current_word:
1835
- word_bboxes.append((current_word, current_word_bbox))
1836
  # Create an OCRResult for the current line
1837
  line_level_results_out.append(OCRResult(full_text.strip(), round(overall_bbox[0], 2), round(overall_bbox[1], 2), round(overall_bbox[2] - overall_bbox[0], 2), round(overall_bbox[3] - overall_bbox[1], 2)))
1838
  line_level_characters_out.append(character_objects_out)
@@ -1872,23 +1942,138 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
1872
  current_word_bbox[2] = max(current_word_bbox[2], x1) # x1
1873
  current_word_bbox[3] = max(current_word_bbox[3], y1) # y1
1874
 
1875
- # Finalize the last word if any
1876
  if current_word:
1877
- word_bboxes.append((current_word, current_word_bbox))
1878
 
1879
  if full_text:
 
1880
  if re.search(r'[^\x00-\x7F]', full_text): # Matches any non-ASCII character
1881
  # Convert special characters to a human-readable format
1882
 
1883
  full_text = clean_unicode_text(full_text)
1884
  full_text = full_text.strip()
1885
 
 
1886
 
1887
- line_level_results_out.append(OCRResult(full_text.strip(), round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)))
1888
 
1889
- #line_level_characters_out = character_objects_out
1890
 
1891
- return line_level_results_out, line_level_characters_out # Return both results and character objects
1892
 
1893
  def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
1894
  decision_process_table = pd.DataFrame()
@@ -1938,7 +2123,7 @@ def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
1938
  return pikepdf_redaction_annotations_on_page
1939
 
1940
  def redact_text_pdf(
1941
- filename: str, # Path to the PDF file to be redacted
1942
  language: str, # Language of the PDF content
1943
  chosen_redact_entities: List[str], # List of entities to be redacted
1944
  chosen_redact_comprehend_entities: List[str],
@@ -1951,6 +2136,7 @@ def redact_text_pdf(
1951
  all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"]), # DataFrame for OCR results
1952
  all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"]), # DataFrame for decision process table
1953
  pymupdf_doc: List = [], # List of PyMuPDF documents
 
1954
  pii_identification_method: str = "Local",
1955
  comprehend_query_number:int = 0,
1956
  comprehend_client="",
@@ -1961,6 +2147,7 @@ def redact_text_pdf(
1961
  page_sizes_df:pd.DataFrame=pd.DataFrame(),
1962
  original_cropboxes:List[dict]=[],
1963
  text_extraction_only:bool=False,
 
1964
  page_break_val: int = int(PAGE_BREAK_VALUE), # Value for page break
1965
  max_time: int = int(MAX_TIME_VALUE),
1966
  progress: Progress = Progress(track_tqdm=True) # Progress tracking object
@@ -1970,7 +2157,7 @@ def redact_text_pdf(
1970
  Redact chosen entities from a PDF that is made up of multiple pages that are not images.
1971
 
1972
  Input Variables:
1973
- - filename: Path to the PDF file to be redacted
1974
  - language: Language of the PDF content
1975
  - chosen_redact_entities: List of entities to be redacted
1976
  - chosen_redact_comprehend_entities: List of entities to be redacted for AWS Comprehend
@@ -1994,6 +2181,7 @@ def redact_text_pdf(
1994
  - page_sizes_df (pd.DataFrame, optional): A pandas dataframe containing page size information.
1995
  - original_cropboxes (List[dict], optional): A list of dictionaries containing pymupdf cropbox information.
1996
  - text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
 
1997
  - page_break_val: Value for page break
1998
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
1999
  - progress: Progress tracking object
@@ -2023,8 +2211,13 @@ def redact_text_pdf(
2023
  nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
2024
 
2025
  # Open with Pikepdf to get text lines
2026
- pikepdf_pdf = Pdf.open(filename)
2027
- number_of_pages = len(pikepdf_pdf.pages)
2028
 
2029
  # Check that page_min and page_max are within expected ranges
2030
  if page_max > number_of_pages or page_max == 0:
@@ -2056,7 +2249,7 @@ def redact_text_pdf(
2056
 
2057
  if page_min <= page_no < page_max:
2058
  # Go page by page
2059
- for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
2060
 
2061
  all_page_line_text_extraction_characters = []
2062
  all_page_line_level_text_extraction_results_list = []
@@ -2068,14 +2261,18 @@ def redact_text_pdf(
2068
  page_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"])
2069
  page_text_ocr_outputs = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"])
2070
 
 
2071
  for n, text_container in enumerate(page_layout):
2072
  characters = []
2073
 
2074
  if isinstance(text_container, LTTextContainer) or isinstance(text_container, LTAnno):
2075
  characters = get_text_container_characters(text_container)
 
2076
 
2077
  # Create dataframe for all the text on the page
2078
- line_level_text_results_list, line_characters = create_text_bounding_boxes_from_characters(characters)
 
 
2079
 
2080
  ### Create page_text_ocr_outputs (OCR format outputs)
2081
  if line_level_text_results_list:
@@ -2093,6 +2290,7 @@ def redact_text_pdf(
2093
 
2094
  all_page_line_level_text_extraction_results_list.extend(line_level_text_results_list)
2095
  all_page_line_text_extraction_characters.extend(line_characters)
 
2096
 
2097
  ### REDACTION
2098
  if pii_identification_method != NO_REDACTION_PII_OPTION:
@@ -2143,9 +2341,9 @@ def redact_text_pdf(
2143
 
2144
  # Join extracted text outputs for all lines together
2145
  if not page_text_ocr_outputs.empty:
2146
- page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
2147
  page_text_ocr_outputs = page_text_ocr_outputs.loc[:, ["page", "text", "left", "top", "width", "height"]]
2148
- all_line_level_ocr_results_list.append(page_text_ocr_outputs)
2149
 
2150
  toc = time.perf_counter()
2151
 
@@ -2174,7 +2372,7 @@ def redact_text_pdf(
2174
 
2175
  current_loop_page += 1
2176
 
2177
- return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
2178
 
2179
  # Check if the image already exists in annotations_all_pages
2180
  existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == page_image_annotations["image"]), None)
@@ -2195,7 +2393,7 @@ def redact_text_pdf(
2195
  # Write logs
2196
  all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
2197
 
2198
- return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
2199
 
2200
  # Write all page outputs
2201
  all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
@@ -2222,5 +2420,11 @@ def redact_text_pdf(
2222
  if not all_line_level_ocr_results_df.empty:
2223
  all_line_level_ocr_results_df['top'] = all_line_level_ocr_results_df['top'].astype(float)
2224
  all_line_level_ocr_results_df['top'] = 1 - all_line_level_ocr_results_df['top']
2225
 
2226
- return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
 
8
 
9
  from tqdm import tqdm
10
  from PIL import Image, ImageChops, ImageFile, ImageDraw
11
+ from typing import List, Dict, Tuple, Optional, Any
12
  import pandas as pd
13
 
14
  from pdfminer.high_level import extract_pages
 
59
 
60
  return sum_of_numbers
61
 
62
+ def merge_page_results(data):
63
+ merged = {}
64
+
65
+ for item in data:
66
+ page = item["page"]
67
+
68
+ if page not in merged:
69
+ merged[page] = {
70
+ "page": page,
71
+ "results": {}
72
+ }
73
+
74
+ # Merge line-level results into the existing page
75
+ merged[page]["results"].update(item.get("results", {}))
76
+
77
+ return list(merged.values())
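For orientation, merge_page_results folds the per-container dictionaries produced by the extraction loop into one entry per page. A minimal sketch with invented values:

partial_results = [
    {"page": 1, "results": {"text_line_1": {"line": 1, "text": "Hello world", "bounding_box": [72.0, 700.0, 160.0, 712.0], "words": []}}},
    {"page": 1, "results": {"text_line_2": {"line": 2, "text": "Second line", "bounding_box": [72.0, 680.0, 155.0, 692.0], "words": []}}},
]
merged = merge_page_results(partial_results)
# merged -> one entry for page 1, with text_line_1 and text_line_2 both under "results"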
78
+
79
+ def word_level_ocr_output_to_dataframe(ocr_result: dict) -> pd.DataFrame:
80
+ rows = []
81
+ ocr_result = ocr_result[0]
82
+
83
+ page_number = int(ocr_result['page'])
84
+
85
+ for line_key, line_data in ocr_result['results'].items():
86
+ line_number = int(line_data['line'])
87
+ for word in line_data['words']:
88
+ rows.append({
89
+ 'page': page_number,
90
+ 'line': line_number,
91
+ 'word_text': word['text'],
92
+ 'word_x0': word['bounding_box'][0],
93
+ 'word_y0': word['bounding_box'][1],
94
+ 'word_x1': word['bounding_box'][2],
95
+ 'word_y1': word['bounding_box'][3],
96
+ 'line_text': line_data['text'],
97
+ 'line_x0': line_data['bounding_box'][0],
98
+ 'line_y0': line_data['bounding_box'][1],
99
+ 'line_x1': line_data['bounding_box'][2],
100
+ 'line_y1': line_data['bounding_box'][3],
101
+ })
102
+
103
+ return pd.DataFrame(rows)
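A short sketch of the flattening step with an invented input; note that the function as written reads only the first page entry of the list it receives (ocr_result[0]):

merged = [{
    "page": "1",
    "results": {
        "text_line_1": {
            "line": 1,
            "text": "Hello world",
            "bounding_box": [72.0, 700.0, 160.0, 712.0],
            "words": [
                {"text": "Hello", "bounding_box": [72.0, 700.0, 110.0, 712.0]},
                {"text": "world", "bounding_box": [115.0, 700.0, 160.0, 712.0]},
            ],
        }
    },
}]
words_df = word_level_ocr_output_to_dataframe(merged)
# One row per word, with columns: page, line, word_text, word_x0, word_y0, word_x1, word_y1,
# line_text, line_x0, line_y0, line_x1, line_y1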
104
+
105
  def choose_and_run_redactor(file_paths:List[str],
106
  prepared_pdf_file_paths:List[str],
107
  pdf_image_file_paths:List[str],
 
542
  # Analyse text-based pdf
543
  print('Redacting file as text-based PDF')
544
 
545
+ pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words = redact_text_pdf(
546
  file_path,
547
  language,
548
  chosen_redact_entities,
 
556
  all_line_level_ocr_results_df,
557
  all_pages_decision_process_table,
558
  pymupdf_doc,
559
+ [], # All line level ocr results with words
560
  pii_identification_method,
561
  comprehend_query_number,
562
  comprehend_client,
 
566
  match_fuzzy_whole_phrase_bool,
567
  page_sizes_df,
568
  document_cropboxes,
569
+ text_extraction_only,
570
+ output_folder=output_folder)
571
  else:
572
  out_message = "No redaction method selected"
573
  print(out_message)
 
581
  current_loop_page = 999
582
 
583
  if latest_file_completed != len(file_paths_list):
584
+ print("Completed file number:", str(latest_file_completed), "there are more files to do")
 
 
585
 
586
  # Save redacted file
587
  if pii_identification_method != NO_REDACTION_PII_OPTION:
 
615
 
616
  duplication_file_path_outputs.append(ocr_file_path)
617
 
618
+ if all_page_line_level_ocr_results_with_words:
619
+ #print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
620
+
621
+ all_page_line_level_ocr_results_with_words = merge_page_results(all_page_line_level_ocr_results_with_words)
622
+
623
+ # print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
624
+
625
+ file_name = get_file_name_without_type(file_path)
626
+
627
+ all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + "_ocr_results_with_words.json"
628
+
629
+ with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
630
+ json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
631
+
632
+ all_page_line_level_ocr_results_with_words_df = word_level_ocr_output_to_dataframe(all_page_line_level_ocr_results_with_words)
633
+
634
+ all_page_line_level_ocr_results_with_words_df = divide_coordinates_by_page_sizes(all_page_line_level_ocr_results_with_words_df, page_sizes_df, xmin="word_x0", xmax="word_x1", ymin="word_y0", ymax="word_y1")
635
+
636
+ all_page_line_level_ocr_results_with_words_df = divide_coordinates_by_page_sizes(all_page_line_level_ocr_results_with_words_df, page_sizes_df, xmin="line_x0", xmax="line_x1", ymin="line_y0", ymax="line_y1")
637
+
638
+ all_page_line_level_ocr_results_with_words_df_file_path = output_folder + file_name + "_ocr_results_with_words.csv"
639
+
640
+ all_page_line_level_ocr_results_with_words_df.to_csv(all_page_line_level_ocr_results_with_words_df_file_path)
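Once written, the word-level CSV can be inspected directly. A minimal sketch (the file name is illustrative), assuming divide_coordinates_by_page_sizes has already converted the absolute word and line boxes into page-relative coordinates:

import pandas as pd

words_df = pd.read_csv("output/example_document_ocr_results_with_words.csv", index_col=0)
first_page_words = words_df[words_df["page"] == 1]
print(first_page_words[["line", "word_text", "word_x0", "word_y0", "word_x1", "word_y1"]].head())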
641
+
642
  # Convert the gradio annotation boxes to relative coordinates
643
  # Convert annotations_all_pages to a consistent relative coordinate format output
644
  progress(0.93, "Creating review file output")
 
1410
 
1411
  # If running local OCR option, check if file already exists. If it does, load in existing data
1412
  if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
1413
+ all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + "_ocr_results_with_words_local_ocr.json"
1414
  all_page_line_level_ocr_results_with_words, is_missing, log_files_output_paths = load_and_convert_ocr_results_with_words_json(all_page_line_level_ocr_results_with_words_json_file_path, log_files_output_paths, page_sizes_df)
1415
  original_all_page_line_level_ocr_results_with_words = all_page_line_level_ocr_results_with_words.copy()
1416
 
 
1729
  # Append new annotation if it doesn't exist
1730
  annotations_all_pages.append(page_image_annotations)
1731
 
1732
+ # Save word level options
1733
  if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
1734
  if original_textract_data != textract_data:
1735
  # Write the updated existing textract data back to the JSON file
1736
  with open(textract_json_file_path, 'w') as json_file:
1737
  json.dump(textract_data, json_file, separators=(",", ":")) # indent=4 makes the JSON file pretty-printed
1738
 
1739
+ if textract_json_file_path not in log_files_output_paths:
1740
+ log_files_output_paths.append(textract_json_file_path)
1741
+
1742
+ all_page_line_level_ocr_results_with_words_json_file_path_textract = output_folder + file_name + "_ocr_results_with_words_textract.json"
1743
+
1744
+ with open(all_page_line_level_ocr_results_with_words_json_file_path_textract, 'w') as json_file:
1745
+ json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":")) # indent=4 makes the JSON file pretty-printed
1746
+
1747
+ if all_page_line_level_ocr_results_with_words_json_file_path_textract not in log_files_output_paths:
1748
+ log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path_textract)
1749
 
1750
  if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
1751
  if original_all_page_line_level_ocr_results_with_words != all_page_line_level_ocr_results_with_words:
1752
  # Write the updated existing textract data back to the JSON file
1753
+
1754
  with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
1755
+ json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
1756
 
1757
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
1758
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
1759
 
 
 
 
1760
  all_pages_decision_process_table = pd.DataFrame(all_pages_decision_process_list)
1761
  all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list)
1762
 
 
1763
  current_loop_page += 1
1764
 
1765
  return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
 
1856
  return characters
1857
  return []
1858
 
1859
+ def create_line_level_ocr_results_from_characters(char_objects:List[LTChar]) -> Tuple[List[OCRResult], List[LTChar]]:
1860
  '''
1861
  Create an OCRResult object based on a list of pdfminer LTChar objects.
1862
  '''
1863
 
1864
  line_level_results_out = []
1865
  line_level_characters_out = []
1866
+ line_level_words_out = {}
1867
+ character_objects_out = []
 
1868
 
1869
  # Initialize variables
1870
  full_text = ""
1871
  added_text = ""
1872
  overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # [x0, y0, x1, y1]
1873
+ line_bboxes = []
1874
 
1875
  # Iterate through the character objects
1876
  current_word = ""
 
1884
  # character_text_objects_out.append(character_text)
1885
 
1886
  if isinstance(char, LTAnno):
 
1887
  added_text = char.get_text()
1888
 
1889
  # Handle double quotes
 
1892
  # Handle space separately by finalizing the word
1893
  full_text += added_text # Adds space or newline
1894
 
1895
+ if current_word: # Only finalise if there is a current word
1896
+ line_bboxes.append((current_word, current_word_bbox))
1897
  current_word = ""
1898
  current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # Reset for next word
1899
 
1900
  # Check for line break (assuming a new line is indicated by a specific character)
1901
  if '\n' in added_text:
1902
 
1903
+ # finalise the current line
1904
  if current_word:
1905
+ line_bboxes.append((current_word, current_word_bbox))
1906
  # Create an OCRResult for the current line
1907
  line_level_results_out.append(OCRResult(full_text.strip(), round(overall_bbox[0], 2), round(overall_bbox[1], 2), round(overall_bbox[2] - overall_bbox[0], 2), round(overall_bbox[3] - overall_bbox[1], 2)))
1908
  line_level_characters_out.append(character_objects_out)
 
1942
  current_word_bbox[2] = max(current_word_bbox[2], x1) # x1
1943
  current_word_bbox[3] = max(current_word_bbox[3], y1) # y1
1944
 
1945
+ # Finalise the last word if any
1946
  if current_word:
1947
+ line_bboxes.append((current_word, current_word_bbox))
1948
 
1949
  if full_text:
1950
+ print("full_text found")
1951
  if re.search(r'[^\x00-\x7F]', full_text): # Matches any non-ASCII character
1952
  # Convert special characters to a human-readable format
1953
 
1954
  full_text = clean_unicode_text(full_text)
1955
  full_text = full_text.strip()
1956
 
1957
+ line_ocr_result_bbox = round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)
1958
 
1959
+ line_ocr_result = OCRResult(full_text.strip(), line_ocr_result_bbox)
1960
 
1961
+ line_level_results_out.append(line_ocr_result)
1962
 
1963
+ else:
1964
+ line_ocr_result_bbox = []
1965
+
1966
+ # if line_ocr_result_bbox:
1967
+ # line_level_words_out["page"] = 1
1968
+ # line_level_words_out['results'] = {'text_line_1':{"line":1, "text":full_text, "bounding_box": line_ocr_result_bbox, "words": line_bboxes}}
1969
+ # else:
1970
+ # line_level_words_out = {}
1971
+
1972
+
1973
+ return line_level_results_out, line_level_characters_out # Return both results and character objects
1974
+
1975
+ def generate_word_level_ocr(char_objects: List, page_number: int, text_line_number:int) -> Dict[str, Any]:
1976
+ """
1977
+ Generates a dictionary with line and word-level OCR results from a list of pdfminer.six objects.
1978
+
1979
+ This robust version handles real-world pdfminer.six output by:
1980
+ 1. Filtering out non-character (LTAnno) objects that lack coordinate data.
1981
+ 2. Sorting all text characters (LTChar) into a proper reading order.
1982
+ 3. Using an adaptive threshold for detecting spaces based on character font size.
1983
+
1984
+ Args:
1985
+ char_objects: A mixed list of pdfminer.six LTChar and LTAnno objects from a single page.
1986
+ page_number: The page number where the characters are from.
1987
+
1988
+ Returns:
1989
+ A dictionary formatted with page, line, and word-level results.
1990
+ """
1991
+ # **CRITICAL FIX: Filter out LTAnno objects, as they lack '.bbox' and are not needed for layout analysis.**
1992
+ text_chars = [c for c in char_objects if isinstance(c, LTChar)]
1993
+
1994
+ if not text_chars:
1995
+ return {"page": str(page_number), "results": {}}
1996
+
1997
+ # Sort the remaining text characters into reading order.
1998
+ text_chars.sort(key=lambda c: (-c.bbox[3], c.bbox[0]))
1999
+
2000
+ page_data = {"page": str(page_number), "results": {}}
2001
+ line_number = text_line_number
2002
+
2003
+ # State variables
2004
+ line_text, line_bbox, line_words = "", [float('inf'), float('inf'), -1, -1], []
2005
+ current_word_text, current_word_bbox = "", [float('inf'), float('inf'), -1, -1]
2006
+ prev_char = None
2007
+
2008
+ def finalize_word():
2009
+ nonlocal current_word_text, current_word_bbox
2010
+ word_text = current_word_text.strip()
2011
+ if word_text:
2012
+ line_words.append({
2013
+ "text": word_text,
2014
+ "bounding_box": [round(b, 2) for b in current_word_bbox]
2015
+ })
2016
+ current_word_text = ""
2017
+ current_word_bbox = [float('inf'), float('inf'), -1, -1]
2018
+
2019
+ def finalize_line():
2020
+ nonlocal line_text, line_bbox, line_words, line_number, prev_char
2021
+ finalize_word()
2022
+ if line_text.strip():
2023
+ page_data["results"][f"text_line_{line_number}"] = {
2024
+ "line": line_number,
2025
+ "text": line_text.strip(),
2026
+ "bounding_box": [round(b, 2) for b in line_bbox],
2027
+ "words": line_words
2028
+ }
2029
+ line_number += 1
2030
+ line_text, line_bbox, line_words = "", [float('inf'), float('inf'), -1, -1], []
2031
+ prev_char = None
2032
+
2033
+ for char in text_chars:
2034
+ char_text = clean_unicode_text(char.get_text())
2035
+
2036
+ if prev_char:
2037
+ char_height = char.bbox[3] - char.bbox[1]
2038
+ vertical_gap = abs(char.bbox[1] - prev_char.bbox[1])
2039
+
2040
+ # Line break detection
2041
+ if vertical_gap > char_height * 0.7:
2042
+ finalize_line()
2043
+ else:
2044
+ # Check for spacing between characters
2045
+ space_threshold = char.size * 0.5
2046
+ gap = char.bbox[0] - prev_char.bbox[2]
2047
+ if gap > max(space_threshold, 1.0):
2048
+ finalize_word()
2049
+ line_text += " "
2050
+
2051
+ # ✅ Explicitly finalize if space character
2052
+ if char_text == " ":
2053
+ finalize_word()
2054
+ line_text += " "
2055
+ prev_char = char
2056
+ continue
2057
+
2058
+ current_word_text += char_text
2059
+ line_text += char_text
2060
+
2061
+ # Update bounding boxes
2062
+ current_word_bbox[0] = min(current_word_bbox[0], char.bbox[0])
2063
+ current_word_bbox[1] = min(current_word_bbox[1], char.bbox[1])
2064
+ current_word_bbox[2] = max(current_word_bbox[2], char.bbox[2])
2065
+ current_word_bbox[3] = max(current_word_bbox[3], char.bbox[3])
2066
+
2067
+ line_bbox[0] = min(line_bbox[0], char.bbox[0])
2068
+ line_bbox[1] = min(line_bbox[1], char.bbox[1])
2069
+ line_bbox[2] = max(line_bbox[2], char.bbox[2])
2070
+ line_bbox[3] = max(line_bbox[3], char.bbox[3])
2071
+
2072
+ prev_char = char
2073
+
2074
+ finalize_line()
2075
+
2076
+ return page_data
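A sketch of how this builder is wired into the selectable-text route below: characters are pulled from each pdfminer text container and passed to both the line-level and word-level builders ("example.pdf" and the loop counters are placeholders):

from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer

page_word_results = []
for page_no, page_layout in enumerate(extract_pages("example.pdf"), start=1):
    text_line_no = 0
    for text_container in page_layout:
        if isinstance(text_container, LTTextContainer):
            text_line_no += 1
            characters = get_text_container_characters(text_container)
            # Line-level OCR-style results plus the word-level dictionary for this container
            line_results, line_characters = create_line_level_ocr_results_from_characters(characters)
            page_word_results.append(generate_word_level_ocr(characters, page_number=page_no, text_line_number=text_line_no))

merged_pages = merge_page_results(page_word_results)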
2077
 
2078
  def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
2079
  decision_process_table = pd.DataFrame()
 
2123
  return pikepdf_redaction_annotations_on_page
2124
 
2125
  def redact_text_pdf(
2126
+ file_path: str, # Path to the PDF file to be redacted
2127
  language: str, # Language of the PDF content
2128
  chosen_redact_entities: List[str], # List of entities to be redacted
2129
  chosen_redact_comprehend_entities: List[str],
 
2136
  all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"]), # DataFrame for OCR results
2137
  all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"]), # DataFrame for decision process table
2138
  pymupdf_doc: List = [], # List of PyMuPDF documents
2139
+ all_page_line_level_ocr_results_with_words: List = [],
2140
  pii_identification_method: str = "Local",
2141
  comprehend_query_number:int = 0,
2142
  comprehend_client="",
 
2147
  page_sizes_df:pd.DataFrame=pd.DataFrame(),
2148
  original_cropboxes:List[dict]=[],
2149
  text_extraction_only:bool=False,
2150
+ output_folder:str=OUTPUT_FOLDER,
2151
  page_break_val: int = int(PAGE_BREAK_VALUE), # Value for page break
2152
  max_time: int = int(MAX_TIME_VALUE),
2153
  progress: Progress = Progress(track_tqdm=True) # Progress tracking object
 
2157
  Redact chosen entities from a PDF that is made up of multiple pages that are not images.
2158
 
2159
  Input Variables:
2160
+ - file_path: Path to the PDF file to be redacted
2161
  - language: Language of the PDF content
2162
  - chosen_redact_entities: List of entities to be redacted
2163
  - chosen_redact_comprehend_entities: List of entities to be redacted for AWS Comprehend
 
2181
  - page_sizes_df (pd.DataFrame, optional): A pandas dataframe containing page size information.
2182
  - original_cropboxes (List[dict], optional): A list of dictionaries containing pymupdf cropbox information.
2183
  - text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
2184
+ - output_folder (str, optional): The output folder for the function
2185
  - page_break_val: Value for page break
2186
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
2187
  - progress: Progress tracking object
 
2211
  nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
2212
 
2213
  # Open with Pikepdf to get text lines
2214
+ pikepdf_pdf = Pdf.open(file_path)
2215
+ number_of_pages = len(pikepdf_pdf.pages)
2216
+
2217
+ file_name = get_file_name_without_type(file_path)
2218
+
2219
+ if not all_page_line_level_ocr_results_with_words:
2220
+ all_page_line_level_ocr_results_with_words = []
2221
 
2222
  # Check that page_min and page_max are within expected ranges
2223
  if page_max > number_of_pages or page_max == 0:
 
2249
 
2250
  if page_min <= page_no < page_max:
2251
  # Go page by page
2252
+ for page_layout in extract_pages(file_path, page_numbers = [page_no], maxpages=1):
2253
 
2254
  all_page_line_text_extraction_characters = []
2255
  all_page_line_level_text_extraction_results_list = []
 
2261
  page_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"])
2262
  page_text_ocr_outputs = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"])
2263
 
2264
+ text_line_no = 0
2265
  for n, text_container in enumerate(page_layout):
2266
  characters = []
2267
 
2268
  if isinstance(text_container, LTTextContainer) or isinstance(text_container, LTAnno):
2269
  characters = get_text_container_characters(text_container)
2270
+ text_line_no += 1
2271
 
2272
  # Create dataframe for all the text on the page
2273
+ line_level_text_results_list, line_characters, = create_line_level_ocr_results_from_characters(characters)
2274
+
2275
+ line_level_ocr_results_with_words = generate_word_level_ocr(characters, page_number=int(reported_page_number), text_line_number=text_line_no)
2276
 
2277
  ### Create page_text_ocr_outputs (OCR format outputs)
2278
  if line_level_text_results_list:
 
2290
 
2291
  all_page_line_level_text_extraction_results_list.extend(line_level_text_results_list)
2292
  all_page_line_text_extraction_characters.extend(line_characters)
2293
+ all_page_line_level_ocr_results_with_words.append(line_level_ocr_results_with_words)
2294
 
2295
  ### REDACTION
2296
  if pii_identification_method != NO_REDACTION_PII_OPTION:
 
2341
 
2342
  # Join extracted text outputs for all lines together
2343
  if not page_text_ocr_outputs.empty:
2344
+ #page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
2345
  page_text_ocr_outputs = page_text_ocr_outputs.loc[:, ["page", "text", "left", "top", "width", "height"]]
2346
+ all_line_level_ocr_results_list.append(page_text_ocr_outputs)
2347
 
2348
  toc = time.perf_counter()
2349
 
 
2372
 
2373
  current_loop_page += 1
2374
 
2375
+ return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words
2376
 
2377
  # Check if the image already exists in annotations_all_pages
2378
  existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == page_image_annotations["image"]), None)
 
2393
  # Write logs
2394
  all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
2395
 
2396
+ return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words
2397
 
2398
  # Write all page outputs
2399
  all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
 
2420
  if not all_line_level_ocr_results_df.empty:
2421
  all_line_level_ocr_results_df['top'] = all_line_level_ocr_results_df['top'].astype(float)
2422
  all_line_level_ocr_results_df['top'] = 1 - all_line_level_ocr_results_df['top']
2423
+
2424
+ all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + "_ocr_results_with_words_local_text.json"
2425
+
2426
+ #print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
2427
+ with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
2428
+ json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
2429
 
2430
+ return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words
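Each extraction route now writes its word-level JSON under its own suffix, so outputs from different methods do not overwrite one another. A short sketch of the naming scheme (the folder, file name and dictionary keys are illustrative):

ocr_with_words_suffixes = {
    "selectable_text": "_ocr_results_with_words_local_text.json",  # redact_text_pdf, above
    "local_ocr": "_ocr_results_with_words_local_ocr.json",         # Tesseract route in redact_image_pdf
    "textract": "_ocr_results_with_words_textract.json",           # Textract route in redact_image_pdf
}
example_path = "output/" + "example_document" + ocr_with_words_suffixes["selectable_text"]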
tools/helper_functions.py CHANGED
@@ -244,13 +244,27 @@ def check_for_existing_textract_file(doc_file_name_no_extension_textbox:str, out
244
  else:
245
  return False
246
 
247
- def check_for_existing_local_ocr_file(doc_file_name_no_extension_textbox:str, output_folder:str=OUTPUT_FOLDER):
248
- local_ocr_output_path = os.path.join(output_folder, doc_file_name_no_extension_textbox + "_ocr_results_with_words.json")
249
 
250
  if os.path.exists(local_ocr_output_path):
251
- print("Existing local OCR analysis output file found.")
252
- return True
253
-
254
  else:
255
  return False
256
 
 
244
  else:
245
  return False
246
 
247
+ def check_for_relevant_ocr_output_with_words(doc_file_name_no_extension_textbox:str, text_extraction_method:str, output_folder:str=OUTPUT_FOLDER):
248
+ if text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION: file_ending = "_ocr_results_with_words_local_text.json"
249
+ elif text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION: file_ending = "_ocr_results_with_words_local_ocr.json"
250
+ elif text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION: file_ending = "_ocr_results_with_words_textract.json"
251
+ else:
252
+ print("No valid text extraction method found. Returning False")
253
+ return False
254
+
255
+ print("doc_file_name_no_extension_textbox:", doc_file_name_no_extension_textbox)
256
+
257
+ doc_file_with_ending = doc_file_name_no_extension_textbox + file_ending
258
+
259
+ print("doc_file_with_ending:", doc_file_with_ending)
260
+
261
+ local_ocr_output_path = os.path.join(output_folder, doc_file_with_ending)
262
+
263
+ print("looking for file path:", local_ocr_output_path)
264
 
265
  if os.path.exists(local_ocr_output_path):
266
+ print("Existing OCR with words analysis output file found.")
267
+ return True
 
268
  else:
269
  return False
270
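Usage is a straightforward existence check keyed on the chosen extraction method. A minimal sketch (the document name and output folder are placeholders):

relevant_ocr_output_with_words_found = check_for_relevant_ocr_output_with_words(
    "example_document",
    TESSERACT_TEXT_EXTRACT_OPTION,
    output_folder="output/",
)
if relevant_ocr_output_with_words_found:
    print("Existing word-level OCR output can be reused instead of re-running extraction.")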