# document_redaction / tools / cli_redact.py
# seanpedrickcase - Updated command line redaction script with more options (commit 3bff849)
import argparse
import os
import pandas as pd
from tools.config import get_or_create_env_var, LOCAL_PII_OPTION, AWS_PII_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION
from tools.helper_functions import ensure_output_folder_exists
from tools.file_conversion import get_input_file_names, prepare_image_or_pdf
from tools.file_redaction import choose_and_run_redactor
from tools.anonymisation import anonymise_files_with_open_text
# --- Constants and Configuration ---
# Default folder names and document language used by the CLI.
INPUT_FOLDER = "input/"
OUTPUT_FOLDER = "output/"
DEFAULT_LANGUAGE = "en"

# Entity labels passed to the redactor as `chosen_redact_comprehend_entities`.
chosen_comprehend_entities = [
    "BANK_ACCOUNT_NUMBER",
    "BANK_ROUTING",
    "CREDIT_DEBIT_NUMBER",
    "CREDIT_DEBIT_CVV",
    "CREDIT_DEBIT_EXPIRY",
    "PIN",
    "EMAIL",
    "ADDRESS",
    "NAME",
    "PHONE",
    "PASSPORT_NUMBER",
    "DRIVER_ID",
    "USERNAME",
    "PASSWORD",
    "IP_ADDRESS",
    "MAC_ADDRESS",
    "LICENSE_PLATE",
    "VEHICLE_IDENTIFICATION_NUMBER",
    "UK_NATIONAL_INSURANCE_NUMBER",
    "INTERNATIONAL_BANK_ACCOUNT_NUMBER",
    "SWIFT_CODE",
    "UK_NATIONAL_HEALTH_SERVICE_NUMBER",
]

# Entity labels passed to the redactor as `chosen_redact_entities`.
chosen_redact_entities = [
    "TITLES",
    "PERSON",
    "PHONE_NUMBER",
    "EMAIL_ADDRESS",
    "STREETNAME",
    "UKPOSTCODE",
]
# --- Main CLI Function ---
def main(argv=None):
    """
    A unified command-line interface to prepare, redact, and anonymise various document types.

    The extension of ``--input_file`` selects the workflow: PDF/image files are
    prepared and then redacted; Word/tabular files are anonymised.

    Args:
        argv: Optional list of command-line arguments. When None (the default),
            argparse reads them from ``sys.argv[1:]``.

    Returns:
        int: 0 when the selected workflow completed, 1 when the workflow raised
        an error or the file type is unsupported. Invalid command-line
        arguments still terminate through argparse's own ``SystemExit``.
    """
    args = _build_parser().parse_args(argv)

    # --- Initial Setup ---
    ensure_output_folder_exists(args.output_dir)
    _, file_extension = os.path.splitext(args.input_file)
    file_extension = file_extension.lower()

    # Load allow/deny lists
    # NOTE(review): pd.read_csv treats the first row as a header by default, so
    # the first entry of a header-less list file would not be read as data -
    # confirm the expected layout of these CSV files.
    allow_list = pd.read_csv(args.allow_list) if args.allow_list else pd.DataFrame()
    deny_list = pd.read_csv(args.deny_list).iloc[:, 0].tolist() if args.deny_list else []

    # --- Route to the Correct Workflow Based on File Type ---
    # Workflow 1: PDF/Image Redaction
    if file_extension in ['.pdf', '.png', '.jpg', '.jpeg']:
        return _run_redaction_workflow(args, allow_list)

    # Workflow 2: Word/Tabular Data Anonymisation
    if file_extension in ['.docx', '.xlsx', '.xls', '.csv', '.parquet']:
        return _run_anonymisation_workflow(args, allow_list, deny_list)

    print(f"Error: Unsupported file type '{file_extension}'.")
    print("Supported types for redaction: .pdf, .png, .jpg, .jpeg")
    print("Supported types for anonymisation: .docx, .xlsx, .xls, .csv, .parquet")
    return 1


# --- Private helpers used by main() ---
def _build_parser():
    """Create the argument parser holding the general, PDF/image and tabular option groups."""
    parser = argparse.ArgumentParser(
        description='A versatile CLI for redacting PII from PDF/image files and anonymising Word/tabular data.',
        formatter_class=argparse.RawTextHelpFormatter
    )

    # --- General Arguments (apply to all file types) ---
    general_group = parser.add_argument_group('General Options')
    general_group.add_argument('--input_file', required=True, help='Path to the input file to process.')
    general_group.add_argument('--output_dir', default=OUTPUT_FOLDER, help='Directory for all output files.')
    general_group.add_argument('--language', default=DEFAULT_LANGUAGE, help='Language of the document content.')
    general_group.add_argument('--allow_list', help='Path to a CSV file with words to exclude from redaction.')
    general_group.add_argument('--pii_detector',
                               choices=[LOCAL_PII_OPTION, AWS_PII_OPTION],
                               default=LOCAL_PII_OPTION,
                               help='Core PII detection method (Local or AWS).')
    general_group.add_argument('--aws_access_key', default='', help='Your AWS Access Key ID.')
    general_group.add_argument('--aws_secret_key', default='', help='Your AWS Secret Access Key.')

    # --- PDF/Image Redaction Arguments ---
    pdf_group = parser.add_argument_group('PDF/Image Redaction Options (.pdf, .png, .jpg)')
    pdf_group.add_argument('--ocr_method',
                           choices=[SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION],
                           default=TESSERACT_TEXT_EXTRACT_OPTION,
                           help='OCR method for text extraction from images.')
    pdf_group.add_argument('--page_min', type=int, default=0, help='First page to redact.')
    pdf_group.add_argument('--page_max', type=int, default=999, help='Last page to redact.')
    pdf_group.add_argument('--prepare_for_review', action='store_true', help='Prepare files for reviewing redactions.')
    # store_false: args.prepare_images is True unless --no_images is given.
    pdf_group.add_argument('--no_images', action='store_false', dest='prepare_images', help='Disable image creation for PDF pages.')

    # --- Word/Tabular Anonymisation Arguments ---
    tabular_group = parser.add_argument_group('Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)')
    tabular_group.add_argument('--anon_strat', choices=['redact', 'encrypt', 'hash'], default='redact', help='The anonymisation strategy to apply.')
    tabular_group.add_argument('--columns', nargs='+', default=[], help='A list of column names to anonymise in tabular data.')
    tabular_group.add_argument('--excel_sheets', nargs='+', default=[], help='Specific Excel sheet names to process.')
    tabular_group.add_argument('--deny_list', help='Path to a CSV file with specific terms/phrases to redact.')
    tabular_group.add_argument('--fuzzy_mistakes', type=int, default=1, help='Number of allowed spelling mistakes for fuzzy matching.')

    return parser


def _run_redaction_workflow(args, allow_list):
    """Prepare and redact a PDF/image file; return 0 on success, 1 if any step raised."""
    print("--- Detected PDF/Image file. Starting Redaction Workflow... ---")
    try:
        # Step 1: Prepare the document
        print("\nStep 1: Preparing document...")
        # Only the values named here are used; the rest of the returned
        # sequence is discarded, so the tail is star-unpacked rather than
        # tied to an exact number of return values.
        (
            prep_summary, prepared_pdf_paths, image_file_paths, _, _, pdf_doc,
            image_annotations, _, original_cropboxes, page_sizes, *_
        ) = prepare_image_or_pdf(
            file_paths=[args.input_file], text_extract_method=args.ocr_method,
            all_line_level_ocr_results_df=pd.DataFrame(), all_page_line_level_ocr_results_with_words_df=pd.DataFrame(),
            first_loop_state=True, prepare_for_review=args.prepare_for_review,
            output_folder=args.output_dir, prepare_images=args.prepare_images
        )
        print(f"Preparation complete. {prep_summary}")

        # Step 2: Redact the prepared document
        print("\nStep 2: Running redaction...")
        output_summary, output_files, _, _, log_files, *_ = choose_and_run_redactor(
            file_paths=[args.input_file], prepared_pdf_file_paths=prepared_pdf_paths,
            pdf_image_file_paths=image_file_paths, chosen_redact_entities=chosen_redact_entities,
            chosen_redact_comprehend_entities=chosen_comprehend_entities, text_extraction_method=args.ocr_method,
            in_allow_list=allow_list, first_loop_state=True, page_min=args.page_min, page_max=args.page_max,
            pymupdf_doc=pdf_doc, annotations_all_pages=image_annotations, page_sizes=page_sizes,
            document_cropboxes=original_cropboxes, pii_identification_method=args.pii_detector,
            aws_access_key_textbox=args.aws_access_key, aws_secret_key_textbox=args.aws_secret_key,
            language=args.language, output_folder=args.output_dir
        )

        print("\n--- Redaction Process Complete ---")
        print(f"Summary: {output_summary}")
        print(f"\nOutput files saved to: {args.output_dir}")
        print("Generated Files:", sorted(output_files))
        if log_files:
            print("Log Files:", sorted(log_files))
    except Exception as e:
        # Top-level CLI boundary: report the failure and signal it through the
        # return code instead of letting the process exit with status 0.
        print(f"\nAn error occurred during the PDF/Image redaction workflow: {e}")
        return 1
    return 0


def _run_anonymisation_workflow(args, allow_list, deny_list):
    """Anonymise a Word/tabular file; return 0 on success, 1 if the run raised."""
    print("--- Detected Word/Tabular file. Starting Anonymisation Workflow... ---")
    try:
        # Run the anonymisation function directly; only the summary, output
        # files and log files are used from the returned sequence.
        output_summary, output_files, _, _, log_files, *_ = anonymise_files_with_open_text(
            file_paths=[args.input_file],
            in_text="",  # Not used for file-based operations
            anon_strat=args.anon_strat,
            chosen_cols=args.columns,
            chosen_redact_entities=chosen_redact_entities,
            in_allow_list=allow_list,
            in_excel_sheets=args.excel_sheets,
            first_loop_state=True,
            output_folder=args.output_dir,
            in_deny_list=deny_list,
            max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes,
            pii_identification_method=args.pii_detector,
            chosen_redact_comprehend_entities=chosen_comprehend_entities,
            aws_access_key_textbox=args.aws_access_key,
            aws_secret_key_textbox=args.aws_secret_key,
            language=args.language
        )

        print("\n--- Anonymisation Process Complete ---")
        print(f"Summary: {output_summary}")
        print(f"\nOutput files saved to: {args.output_dir}")
        print("Generated Files:", sorted(output_files))
        if log_files:
            print("Log Files:", sorted(log_files))
    except Exception as e:
        # Top-level CLI boundary: report the failure and signal it through the
        # return code instead of letting the process exit with status 0.
        print(f"\nAn error occurred during the Word/Tabular anonymisation workflow: {e}")
        return 1
    return 0


if __name__ == "__main__":
    # Propagate main()'s status so shells and pipelines can detect failures.
    raise SystemExit(main())