import argparse
import os
import sys

import pandas as pd

from tools.config import get_or_create_env_var, LOCAL_PII_OPTION, AWS_PII_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION
from tools.helper_functions import ensure_output_folder_exists
from tools.file_conversion import get_input_file_names, prepare_image_or_pdf
from tools.file_redaction import choose_and_run_redactor
from tools.anonymisation import anonymise_files_with_open_text


# --- Default locations and settings ---
# INPUT_FOLDER is not referenced by the code visible in this script.
INPUT_FOLDER = 'input/'
# Default value of the --output_dir option.
OUTPUT_FOLDER = 'output/'
# Default value of the --language option.
DEFAULT_LANGUAGE = 'en'


# Entity labels handed to both workflows as `chosen_redact_comprehend_entities`.
# NOTE(review): these look like AWS Comprehend PII entity type names - confirm
# against the service documentation before adding or renaming entries.
chosen_comprehend_entities = [
    'BANK_ACCOUNT_NUMBER', 'BANK_ROUTING', 'CREDIT_DEBIT_NUMBER',
    'CREDIT_DEBIT_CVV', 'CREDIT_DEBIT_EXPIRY', 'PIN', 'EMAIL', 'ADDRESS',
    'NAME', 'PHONE', 'PASSPORT_NUMBER', 'DRIVER_ID', 'USERNAME', 'PASSWORD',
    'IP_ADDRESS', 'MAC_ADDRESS', 'LICENSE_PLATE', 'VEHICLE_IDENTIFICATION_NUMBER',
    'UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER',
    'SWIFT_CODE', 'UK_NATIONAL_HEALTH_SERVICE_NUMBER',
]

# Entity labels handed to both workflows as `chosen_redact_entities`.
chosen_redact_entities = [
    "TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE",
]


# File extensions routed to each workflow. The "unsupported file type" message
# is built from these tuples so the dispatch and the message cannot drift apart.
_REDACTION_EXTENSIONS = ('.pdf', '.png', '.jpg', '.jpeg')
_ANONYMISATION_EXTENSIONS = ('.docx', '.xlsx', '.xls', '.csv', '.parquet')


def _build_parser() -> argparse.ArgumentParser:
    """Create the argument parser with its general, PDF/image and tabular option groups."""
    parser = argparse.ArgumentParser(
        description='A versatile CLI for redacting PII from PDF/image files and anonymising Word/tabular data.',
        formatter_class=argparse.RawTextHelpFormatter,
    )

    # Options shared by both workflows.
    general_group = parser.add_argument_group('General Options')
    general_group.add_argument('--input_file', required=True, help='Path to the input file to process.')
    general_group.add_argument('--output_dir', default=OUTPUT_FOLDER, help='Directory for all output files.')
    general_group.add_argument('--language', default=DEFAULT_LANGUAGE, help='Language of the document content.')
    general_group.add_argument('--allow_list', help='Path to a CSV file with words to exclude from redaction.')
    general_group.add_argument('--pii_detector',
                               choices=[LOCAL_PII_OPTION, AWS_PII_OPTION],
                               default=LOCAL_PII_OPTION,
                               help='Core PII detection method (Local or AWS).')
    general_group.add_argument('--aws_access_key', default='', help='Your AWS Access Key ID.')
    general_group.add_argument('--aws_secret_key', default='', help='Your AWS Secret Access Key.')

    # Options only consumed by the PDF/image redaction workflow.
    pdf_group = parser.add_argument_group('PDF/Image Redaction Options (.pdf, .png, .jpg)')
    pdf_group.add_argument('--ocr_method',
                           choices=[SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION],
                           default=TESSERACT_TEXT_EXTRACT_OPTION,
                           help='OCR method for text extraction from images.')
    pdf_group.add_argument('--page_min', type=int, default=0, help='First page to redact.')
    pdf_group.add_argument('--page_max', type=int, default=999, help='Last page to redact.')
    pdf_group.add_argument('--prepare_for_review', action='store_true', help='Prepare files for reviewing redactions.')
    # store_false: `args.prepare_images` is True unless --no_images is given.
    pdf_group.add_argument('--no_images', action='store_false', dest='prepare_images', help='Disable image creation for PDF pages.')

    # Options only consumed by the Word/tabular anonymisation workflow.
    tabular_group = parser.add_argument_group('Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)')
    tabular_group.add_argument('--anon_strat', choices=['redact', 'encrypt', 'hash'], default='redact', help='The anonymisation strategy to apply.')
    tabular_group.add_argument('--columns', nargs='+', default=[], help='A list of column names to anonymise in tabular data.')
    tabular_group.add_argument('--excel_sheets', nargs='+', default=[], help='Specific Excel sheet names to process.')
    tabular_group.add_argument('--deny_list', help='Path to a CSV file with specific terms/phrases to redact.')
    tabular_group.add_argument('--fuzzy_mistakes', type=int, default=1, help='Number of allowed spelling mistakes for fuzzy matching.')

    return parser


def _print_results(heading: str, output_summary, output_files, log_files, output_dir: str) -> None:
    """Print the completion banner, the summary and the sorted output/log file lists."""
    print(f"\n--- {heading} Process Complete ---")
    print(f"Summary: {output_summary}")
    print(f"\nOutput files saved to: {output_dir}")
    print("Generated Files:", sorted(output_files))
    if log_files:
        print("Log Files:", sorted(log_files))


def _run_redaction(args: argparse.Namespace, allow_list: pd.DataFrame) -> None:
    """Prepare a PDF/image input, run the redactor on it and report the outputs."""
    print("\nStep 1: Preparing document...")
    # Only the leading fields used below are named; the starred target absorbs
    # the rest so a change in the helper's trailing return values does not
    # break the unpacking.
    (
        prep_summary, prepared_pdf_paths, image_file_paths, _, _, pdf_doc,
        image_annotations, _, original_cropboxes, page_sizes, *_
    ) = prepare_image_or_pdf(
        file_paths=[args.input_file], text_extract_method=args.ocr_method,
        all_line_level_ocr_results_df=pd.DataFrame(), all_page_line_level_ocr_results_with_words_df=pd.DataFrame(),
        first_loop_state=True, prepare_for_review=args.prepare_for_review,
        output_folder=args.output_dir, prepare_images=args.prepare_images
    )
    print(f"Preparation complete. {prep_summary}")

    print("\nStep 2: Running redaction...")
    output_summary, output_files, _, _, log_files, *_ = choose_and_run_redactor(
        file_paths=[args.input_file], prepared_pdf_file_paths=prepared_pdf_paths,
        pdf_image_file_paths=image_file_paths, chosen_redact_entities=chosen_redact_entities,
        chosen_redact_comprehend_entities=chosen_comprehend_entities, text_extraction_method=args.ocr_method,
        in_allow_list=allow_list, first_loop_state=True, page_min=args.page_min, page_max=args.page_max,
        pymupdf_doc=pdf_doc, annotations_all_pages=image_annotations, page_sizes=page_sizes,
        document_cropboxes=original_cropboxes, pii_identification_method=args.pii_detector,
        aws_access_key_textbox=args.aws_access_key, aws_secret_key_textbox=args.aws_secret_key,
        language=args.language, output_folder=args.output_dir
    )

    _print_results("Redaction", output_summary, output_files, log_files, args.output_dir)


def _run_anonymisation(args: argparse.Namespace, allow_list: pd.DataFrame, deny_list: list) -> None:
    """Run the Word/tabular anonymiser on the input file and report the outputs."""
    output_summary, output_files, _, _, log_files, *_ = anonymise_files_with_open_text(
        file_paths=[args.input_file],
        in_text="",
        anon_strat=args.anon_strat,
        chosen_cols=args.columns,
        chosen_redact_entities=chosen_redact_entities,
        in_allow_list=allow_list,
        in_excel_sheets=args.excel_sheets,
        first_loop_state=True,
        output_folder=args.output_dir,
        in_deny_list=deny_list,
        max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes,
        pii_identification_method=args.pii_detector,
        chosen_redact_comprehend_entities=chosen_comprehend_entities,
        aws_access_key_textbox=args.aws_access_key,
        aws_secret_key_textbox=args.aws_secret_key,
        language=args.language
    )

    _print_results("Anonymisation", output_summary, output_files, log_files, args.output_dir)


def main() -> None:
    """
    A unified command-line interface to prepare, redact, and anonymise various document types.

    Parses the command line, then dispatches on the input file's extension:
    PDF/image files go through the redaction workflow, Word/tabular files
    through the anonymisation workflow.

    Raises:
        SystemExit: with status 2 for an unsupported file type or a missing
            input file (reported through argparse), and with status 1 when
            the selected workflow raises an exception.
    """
    parser = _build_parser()
    args = parser.parse_args()

    file_extension = os.path.splitext(args.input_file)[1].lower()

    # Reject bad invocations before touching the filesystem or loading CSVs.
    if file_extension not in _REDACTION_EXTENSIONS + _ANONYMISATION_EXTENSIONS:
        parser.error(
            f"Unsupported file type '{file_extension}'.\n"
            f"Supported types for redaction: {', '.join(_REDACTION_EXTENSIONS)}\n"
            f"Supported types for anonymisation: {', '.join(_ANONYMISATION_EXTENSIONS)}"
        )
    if not os.path.isfile(args.input_file):
        parser.error(f"Input file not found: {args.input_file}")

    ensure_output_folder_exists(args.output_dir)

    # NOTE(review): pd.read_csv infers a header from the first row by default,
    # so the first line of these word-list files is not treated as a term -
    # confirm the CSVs carry a header row.
    allow_list = pd.read_csv(args.allow_list) if args.allow_list else pd.DataFrame()
    # Only the first CSV column is used for the deny list.
    deny_list = pd.read_csv(args.deny_list).iloc[:, 0].tolist() if args.deny_list else []

    is_redaction = file_extension in _REDACTION_EXTENSIONS
    workflow = 'PDF/Image redaction' if is_redaction else 'Word/Tabular anonymisation'
    try:
        if is_redaction:
            print("--- Detected PDF/Image file. Starting Redaction Workflow... ---")
            _run_redaction(args, allow_list)
        else:
            print("--- Detected Word/Tabular file. Starting Anonymisation Workflow... ---")
            _run_anonymisation(args, allow_list, deny_list)
    except Exception as e:
        # Top-level CLI boundary: report on stderr and signal failure through
        # the exit status so calling scripts can detect it.
        print(f"\nAn error occurred during the {workflow} workflow: {e}", file=sys.stderr)
        sys.exit(1)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()