Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

document_redaction / tools /cli_redact.py

seanpedrickcase

Updated command line redaction script with more options

3bff849 about 1 month ago

raw

history blame

9.08 kB

	import argparse
	import os
	import pandas as pd
	from tools.config import get_or_create_env_var, LOCAL_PII_OPTION, AWS_PII_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION
	from tools.helper_functions import ensure_output_folder_exists
	from tools.file_conversion import get_input_file_names, prepare_image_or_pdf
	from tools.file_redaction import choose_and_run_redactor
	from tools.anonymisation import anonymise_files_with_open_text

	# --- Constants and Configuration ---
	# Default folder names and document language used by the CLI.
	INPUT_FOLDER = 'input/'
	OUTPUT_FOLDER = 'output/'
	DEFAULT_LANGUAGE = 'en'

	# Entity labels passed as `chosen_redact_comprehend_entities` to the
	# redaction/anonymisation functions.
	chosen_comprehend_entities = [
	    'BANK_ACCOUNT_NUMBER',
	    'BANK_ROUTING',
	    'CREDIT_DEBIT_NUMBER',
	    'CREDIT_DEBIT_CVV',
	    'CREDIT_DEBIT_EXPIRY',
	    'PIN',
	    'EMAIL',
	    'ADDRESS',
	    'NAME',
	    'PHONE',
	    'PASSPORT_NUMBER',
	    'DRIVER_ID',
	    'USERNAME',
	    'PASSWORD',
	    'IP_ADDRESS',
	    'MAC_ADDRESS',
	    'LICENSE_PLATE',
	    'VEHICLE_IDENTIFICATION_NUMBER',
	    'UK_NATIONAL_INSURANCE_NUMBER',
	    'INTERNATIONAL_BANK_ACCOUNT_NUMBER',
	    'SWIFT_CODE',
	    'UK_NATIONAL_HEALTH_SERVICE_NUMBER',
	]

	# Entity labels passed as `chosen_redact_entities` to the
	# redaction/anonymisation functions.
	chosen_redact_entities = [
	    "TITLES",
	    "PERSON",
	    "PHONE_NUMBER",
	    "EMAIL_ADDRESS",
	    "STREETNAME",
	    "UKPOSTCODE",
	]

	# --- Main CLI Function ---

	# File extensions handled by each workflow (compared after lower-casing).
	_REDACTION_EXTENSIONS = ('.pdf', '.png', '.jpg', '.jpeg')
	_ANONYMISATION_EXTENSIONS = ('.docx', '.xlsx', '.xls', '.csv', '.parquet')


	def _build_parser():
	    """Build the argument parser with general, PDF/image and Word/tabular option groups."""
	    parser = argparse.ArgumentParser(
	        description='A versatile CLI for redacting PII from PDF/image files and anonymising Word/tabular data.',
	        formatter_class=argparse.RawTextHelpFormatter,
	    )

	    # --- General Arguments (apply to all file types) ---
	    general_group = parser.add_argument_group('General Options')
	    general_group.add_argument('--input_file', required=True, help='Path to the input file to process.')
	    general_group.add_argument('--output_dir', default=OUTPUT_FOLDER, help='Directory for all output files.')
	    general_group.add_argument('--language', default=DEFAULT_LANGUAGE, help='Language of the document content.')
	    general_group.add_argument('--allow_list', help='Path to a CSV file with words to exclude from redaction.')
	    general_group.add_argument(
	        '--pii_detector',
	        choices=[LOCAL_PII_OPTION, AWS_PII_OPTION],
	        default=LOCAL_PII_OPTION,
	        help='Core PII detection method (Local or AWS).',
	    )
	    # NOTE(review): secrets given on the command line are visible in shell
	    # history and process listings; consider an environment-based alternative.
	    general_group.add_argument('--aws_access_key', default='', help='Your AWS Access Key ID.')
	    general_group.add_argument('--aws_secret_key', default='', help='Your AWS Secret Access Key.')

	    # --- PDF/Image Redaction Arguments ---
	    pdf_group = parser.add_argument_group('PDF/Image Redaction Options (.pdf, .png, .jpg)')
	    pdf_group.add_argument(
	        '--ocr_method',
	        choices=[SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION],
	        default=TESSERACT_TEXT_EXTRACT_OPTION,
	        help='OCR method for text extraction from images.',
	    )
	    pdf_group.add_argument('--page_min', type=int, default=0, help='First page to redact.')
	    pdf_group.add_argument('--page_max', type=int, default=999, help='Last page to redact.')
	    pdf_group.add_argument('--prepare_for_review', action='store_true', help='Prepare files for reviewing redactions.')
	    # store_false: `prepare_images` is True unless --no_images is given.
	    pdf_group.add_argument('--no_images', action='store_false', dest='prepare_images', help='Disable image creation for PDF pages.')

	    # --- Word/Tabular Anonymisation Arguments ---
	    tabular_group = parser.add_argument_group('Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)')
	    tabular_group.add_argument('--anon_strat', choices=['redact', 'encrypt', 'hash'], default='redact', help='The anonymisation strategy to apply.')
	    tabular_group.add_argument('--columns', nargs='+', default=[], help='A list of column names to anonymise in tabular data.')
	    tabular_group.add_argument('--excel_sheets', nargs='+', default=[], help='Specific Excel sheet names to process.')
	    tabular_group.add_argument('--deny_list', help='Path to a CSV file with specific terms/phrases to redact.')
	    tabular_group.add_argument('--fuzzy_mistakes', type=int, default=1, help='Number of allowed spelling mistakes for fuzzy matching.')

	    return parser


	def _load_csv(parser, path, label):
	    """Read the CSV at `path`; on failure report a usage error via `parser` and exit."""
	    try:
	        return pd.read_csv(path)
	    except (OSError, ValueError) as e:
	        # OSError covers missing/unreadable files; pandas' parser and
	        # empty-data errors derive from ValueError.
	        parser.error(f"Could not read {label} file '{path}': {e}")


	def _print_results(process_name, output_summary, output_files, log_files, output_dir):
	    """Print the completion banner, summary and sorted output/log file lists."""
	    print(f"\n--- {process_name} Process Complete ---")
	    print(f"Summary: {output_summary}")
	    print(f"\nOutput files saved to: {output_dir}")
	    print("Generated Files:", sorted(output_files))
	    if log_files:
	        print("Log Files:", sorted(log_files))


	def _redact_pdf_or_image(args, allow_list, deny_list):
	    """Workflow 1: prepare a PDF/image file, redact it and print the results.

	    `deny_list` is accepted only so both workflows share one signature; it is
	    not passed to the redaction functions.
	    """
	    print("--- Detected PDF/Image file. Starting Redaction Workflow... ---")

	    # Step 1: Prepare the document
	    print("\nStep 1: Preparing document...")
	    # Starred unpacking keeps only the leading values that are used and
	    # tolerates extra trailing return values.
	    (
	        prep_summary, prepared_pdf_paths, image_file_paths, _, _, pdf_doc,
	        image_annotations, _, original_cropboxes, page_sizes, *_
	    ) = prepare_image_or_pdf(
	        file_paths=[args.input_file],
	        text_extract_method=args.ocr_method,
	        all_line_level_ocr_results_df=pd.DataFrame(),
	        all_page_line_level_ocr_results_with_words_df=pd.DataFrame(),
	        first_loop_state=True,
	        prepare_for_review=args.prepare_for_review,
	        output_folder=args.output_dir,
	        prepare_images=args.prepare_images,
	    )
	    print(f"Preparation complete. {prep_summary}")

	    # Step 2: Redact the prepared document
	    print("\nStep 2: Running redaction...")
	    output_summary, output_files, _, _, log_files, *_ = choose_and_run_redactor(
	        file_paths=[args.input_file],
	        prepared_pdf_file_paths=prepared_pdf_paths,
	        pdf_image_file_paths=image_file_paths,
	        chosen_redact_entities=chosen_redact_entities,
	        chosen_redact_comprehend_entities=chosen_comprehend_entities,
	        text_extraction_method=args.ocr_method,
	        in_allow_list=allow_list,
	        first_loop_state=True,
	        page_min=args.page_min,
	        page_max=args.page_max,
	        pymupdf_doc=pdf_doc,
	        annotations_all_pages=image_annotations,
	        page_sizes=page_sizes,
	        document_cropboxes=original_cropboxes,
	        pii_identification_method=args.pii_detector,
	        aws_access_key_textbox=args.aws_access_key,
	        aws_secret_key_textbox=args.aws_secret_key,
	        language=args.language,
	        output_folder=args.output_dir,
	    )

	    _print_results("Redaction", output_summary, output_files, log_files, args.output_dir)


	def _anonymise_word_or_tabular(args, allow_list, deny_list):
	    """Workflow 2: anonymise a Word/tabular file and print the results."""
	    print("--- Detected Word/Tabular file. Starting Anonymisation Workflow... ---")

	    # Run the anonymisation function directly
	    output_summary, output_files, _, _, log_files, *_ = anonymise_files_with_open_text(
	        file_paths=[args.input_file],
	        in_text="",  # Not used for file-based operations
	        anon_strat=args.anon_strat,
	        chosen_cols=args.columns,
	        chosen_redact_entities=chosen_redact_entities,
	        in_allow_list=allow_list,
	        in_excel_sheets=args.excel_sheets,
	        first_loop_state=True,
	        output_folder=args.output_dir,
	        in_deny_list=deny_list,
	        max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes,
	        pii_identification_method=args.pii_detector,
	        chosen_redact_comprehend_entities=chosen_comprehend_entities,
	        aws_access_key_textbox=args.aws_access_key,
	        aws_secret_key_textbox=args.aws_secret_key,
	        language=args.language,
	    )

	    _print_results("Anonymisation", output_summary, output_files, log_files, args.output_dir)


	def main():
	    """
	    A unified command-line interface to prepare, redact, and anonymise various document types.

	    The workflow is chosen from the lower-cased extension of --input_file:
	    PDF/image files are redacted, Word/tabular files are anonymised.

	    Exit status (raised as SystemExit through the parser):
	        2 -- invalid arguments, unsupported file type, or an unreadable
	             allow/deny list CSV.
	        1 -- the selected workflow raised an exception.
	    On success the function returns None.
	    """
	    parser = _build_parser()
	    args = parser.parse_args()

	    # --- Route to the Correct Workflow Based on File Type ---
	    # Done before any side effects so an unsupported file neither creates the
	    # output folder nor reads the allow/deny lists.
	    file_extension = os.path.splitext(args.input_file)[1].lower()
	    if file_extension in _REDACTION_EXTENSIONS:
	        workflow, workflow_name = _redact_pdf_or_image, "PDF/Image redaction"
	    elif file_extension in _ANONYMISATION_EXTENSIONS:
	        workflow, workflow_name = _anonymise_word_or_tabular, "Word/Tabular anonymisation"
	    else:
	        # parser.exit writes the message to stderr and raises SystemExit.
	        parser.exit(
	            status=2,
	            message=(
	                f"Error: Unsupported file type '{file_extension}'.\n"
	                "Supported types for redaction: .pdf, .png, .jpg, .jpeg\n"
	                "Supported types for anonymisation: .docx, .xlsx, .xls, .csv, .parquet\n"
	            ),
	        )

	    # --- Initial Setup ---
	    ensure_output_folder_exists(args.output_dir)

	    # Load allow/deny lists
	    # NOTE(review): read_csv treats the first row as a header by default, so
	    # the first line of each file is not part of the data -- confirm this
	    # matches the expected file format.
	    allow_list = _load_csv(parser, args.allow_list, "allow list") if args.allow_list else pd.DataFrame()
	    deny_list = (
	        _load_csv(parser, args.deny_list, "deny list").iloc[:, 0].tolist()
	        if args.deny_list else []
	    )

	    # Top-level boundary: report any workflow failure on stderr and exit
	    # non-zero so shells and schedulers can detect it.
	    try:
	        workflow(args, allow_list, deny_list)
	    except Exception as e:
	        parser.exit(
	            status=1,
	            message=f"\nAn error occurred during the {workflow_name} workflow: {e}\n",
	        )

	# Script entry point: run the CLI only when this file is executed directly,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()