|
import os |
|
import tempfile |
|
import socket |
|
import logging |
|
from datetime import datetime |
|
from dotenv import load_dotenv |
|
from tldextract import TLDExtract |
|
|
|
# Machine host name and today's date (YYYYMMDD), both captured once at import
# time. They are combined further down to build the default log subfolders.
HOST_NAME = socket.gethostname()
today_rev = f"{datetime.now():%Y%m%d}"
|
|
|
|
|
|
|
def get_or_create_env_var(var_name: str, default_value: str, print_val: bool = False) -> str:
    """
    Get an environment variable, setting it to a default value if it doesn't exist.

    Args:
        var_name: Name of the environment variable to read.
        default_value: Value written to ``os.environ`` and returned when the
            variable is not currently set.
        print_val: When true, print the variable name and its resolved value.

    Returns:
        The existing value of the variable, or ``default_value`` if it was unset.
    """
    value = os.environ.get(var_name)

    # Only a missing variable triggers the default; a variable that is set to
    # an empty string is returned unchanged.
    if value is None:
        os.environ[var_name] = default_value
        value = default_value

    if print_val:
        print(f'The value of {var_name} is {value}')

    return value
|
|
|
def ensure_folder_exists(output_folder: str):
    """Create ``output_folder`` (including parents) unless the path already exists.

    Prints a one-line message saying whether the folder was created or was
    already there. Returns nothing.
    """
    if os.path.exists(output_folder):
        print(f"The {output_folder} folder already exists.")
        return

    os.makedirs(output_folder, exist_ok=True)
    print(f"Created the {output_folder} folder.")
|
|
|
def add_folder_to_path(folder_path: str):
    """
    Prepend an existing folder to the PATH environment variable.

    If ``folder_path`` exists and is a directory, its absolute path is placed
    at the front of ``os.environ['PATH']`` unless it is already one of the
    PATH entries. Otherwise PATH is left untouched and a message is printed.

    This is only relevant for locally-created executables based on this app:
    pyinstaller creates an ``_internal`` folder containing tesseract and
    poppler, which must be on the system path for the app to run.

    Args:
        folder_path: Relative or absolute path of the folder to add.
    """
    # isdir() is already False for paths that do not exist.
    if not os.path.isdir(folder_path):
        print(f"Folder not found at {folder_path} - not added to PATH")
        return

    print(folder_path, "folder exists.")

    absolute_path = os.path.abspath(folder_path)

    # PATH can be missing in stripped-down environments; treat that as empty
    # instead of raising KeyError.
    current_path = os.environ.get('PATH', '')
    path_entries = current_path.split(os.pathsep) if current_path else []

    if absolute_path in path_entries:
        print(f"Directory {folder_path} already exists in PATH.")
        return

    # Joining the entries avoids a dangling separator when PATH was empty.
    os.environ['PATH'] = os.pathsep.join([absolute_path] + path_entries)
|
|
|
|
|
|
|
|
|
|
|
|
|
# Make sure the config folder is present before looking for files inside it.
ensure_folder_exists("config/")

# Path of the optional .env file holding app-level settings. When the file is
# present its contents are loaded with load_dotenv before the remaining
# settings in this module are read.
APP_CONFIG_PATH = get_or_create_env_var("APP_CONFIG_PATH", "config/app_config.env")

if APP_CONFIG_PATH and os.path.exists(APP_CONFIG_PATH):
    print(f"Loading app variables from config file {APP_CONFIG_PATH}")
    load_dotenv(APP_CONFIG_PATH)
elif APP_CONFIG_PATH:
    print("App config file not found at location:", APP_CONFIG_PATH)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Path of an optional .env file with AWS settings. Empty by default, in which
# case nothing is loaded and no message is printed.
AWS_CONFIG_PATH = get_or_create_env_var("AWS_CONFIG_PATH", "")

if AWS_CONFIG_PATH and os.path.exists(AWS_CONFIG_PATH):
    print(f"Loading AWS variables from config file {AWS_CONFIG_PATH}")
    load_dotenv(AWS_CONFIG_PATH)
elif AWS_CONFIG_PATH:
    print("AWS config file not found at location:", AWS_CONFIG_PATH)

# AWS-related settings. All are plain strings; every one except
# RUN_AWS_FUNCTIONS (default "0") defaults to an empty string.
RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
AWS_REGION = get_or_create_env_var("AWS_REGION", "")
AWS_CLIENT_ID = get_or_create_env_var("AWS_CLIENT_ID", "")
AWS_CLIENT_SECRET = get_or_create_env_var("AWS_CLIENT_SECRET", "")
AWS_USER_POOL_ID = get_or_create_env_var("AWS_USER_POOL_ID", "")

# Only the presence of the keys is reported; their values are never printed.
AWS_ACCESS_KEY = get_or_create_env_var("AWS_ACCESS_KEY", "")
if AWS_ACCESS_KEY:
    print("AWS_ACCESS_KEY found in environment variables")

AWS_SECRET_KEY = get_or_create_env_var("AWS_SECRET_KEY", "")
if AWS_SECRET_KEY:
    print("AWS_SECRET_KEY found in environment variables")

DOCUMENT_REDACTION_BUCKET = get_or_create_env_var("DOCUMENT_REDACTION_BUCKET", "")
|
|
|
|
|
|
|
# Optional custom header name and value; both default to an empty string.
# NOTE(review): how these are consumed is not visible in this file.
CUSTOM_HEADER = get_or_create_env_var("CUSTOM_HEADER", "")
CUSTOM_HEADER_VALUE = get_or_create_env_var("CUSTOM_HEADER_VALUE", "")

# Image handling settings. The values stay as strings here (e.g. "300.0",
# "True"); any numeric/boolean conversion must happen where they are used.
IMAGES_DPI = get_or_create_env_var("IMAGES_DPI", "300.0")
LOAD_TRUNCATED_IMAGES = get_or_create_env_var("LOAD_TRUNCATED_IMAGES", "True")
MAX_IMAGE_PIXELS = get_or_create_env_var("MAX_IMAGE_PIXELS", "")
|
|
|
|
|
|
|
|
|
|
|
# Kept as the string "True"/"False".
SESSION_OUTPUT_FOLDER = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'False')

# Input/output folders, read from the GRADIO_* environment variables. The
# special value "TEMP" asks for a temporary directory instead of a fixed path.
OUTPUT_FOLDER = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
INPUT_FOLDER = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')

# Resolve "TEMP" BEFORE creating folders, so that no folder literally named
# "TEMP" is created in the working directory.
if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
    # Using TemporaryDirectory as a context manager would delete the directory
    # as soon as the `with` block ends, leaving the folder variables pointing
    # at a path that no longer exists. Holding the object in a module-level
    # name keeps the directory alive for as long as this module is loaded; it
    # is removed when the object is destroyed.
    _TEMP_FOLDER_HANDLE = tempfile.TemporaryDirectory()
    temp_dir = _TEMP_FOLDER_HANDLE.name
    print(f'Temporary directory created at: {temp_dir}')

    # Both folders share the same temporary directory when both are "TEMP".
    if OUTPUT_FOLDER == "TEMP":
        OUTPUT_FOLDER = temp_dir + "/"
    if INPUT_FOLDER == "TEMP":
        INPUT_FOLDER = temp_dir + "/"

ensure_folder_exists(OUTPUT_FOLDER)
ensure_folder_exists(INPUT_FOLDER)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Logging switches are kept as the strings "True"/"False".
SAVE_LOGS_TO_CSV = get_or_create_env_var("SAVE_LOGS_TO_CSV", "True")
USE_LOG_SUBFOLDERS = get_or_create_env_var("USE_LOG_SUBFOLDERS", "True")

# With subfolders enabled, default log locations get a "<YYYYMMDD>/<host>/"
# suffix; otherwise the suffix is empty.
full_log_subfolder = ""
if USE_LOG_SUBFOLDERS == "True":
    day_log_subfolder = f"{today_rev}/"
    host_name_subfolder = f"{HOST_NAME}/"
    full_log_subfolder = f"{day_log_subfolder}{host_name_subfolder}"

# The suffix only affects the defaults: a folder set explicitly through its
# environment variable is used exactly as given.
FEEDBACK_LOGS_FOLDER = get_or_create_env_var("FEEDBACK_LOGS_FOLDER", f"feedback/{full_log_subfolder}")
ACCESS_LOGS_FOLDER = get_or_create_env_var("ACCESS_LOGS_FOLDER", f"logs/{full_log_subfolder}")
USAGE_LOGS_FOLDER = get_or_create_env_var("USAGE_LOGS_FOLDER", f"usage/{full_log_subfolder}")

# Create the three log folders up front.
ensure_folder_exists(FEEDBACK_LOGS_FOLDER)
ensure_folder_exists(ACCESS_LOGS_FOLDER)
ensure_folder_exists(USAGE_LOGS_FOLDER)

DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var("DISPLAY_FILE_NAMES_IN_LOGS", "False")
|
|
|
|
|
|
|
# Column headers for the CSV logs, stored as strings. Only the usage log has
# a non-empty default, written as a JSON-style list of column names.
CSV_ACCESS_LOG_HEADERS = get_or_create_env_var("CSV_ACCESS_LOG_HEADERS", "")
CSV_FEEDBACK_LOG_HEADERS = get_or_create_env_var("CSV_FEEDBACK_LOG_HEADERS", "")
CSV_USAGE_LOG_HEADERS = get_or_create_env_var(
    "CSV_USAGE_LOG_HEADERS",
    '["session_hash_textbox", "doc_full_file_name_textbox", "data_full_file_name_textbox", "actual_time_taken_number", "total_page_count", "textract_query_number", "pii_detection_method", "comprehend_query_number", "cost_code", "textract_handwriting_signature", "host_name_textbox", "text_extraction_method", "is_this_a_textract_api_call"]',
)

# DynamoDB logging: an on/off switch plus a table name and header string for
# each of the access, feedback and usage logs.
SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var("SAVE_LOGS_TO_DYNAMODB", "False")

ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var("ACCESS_LOG_DYNAMODB_TABLE_NAME", "redaction_access_log")
DYNAMODB_ACCESS_LOG_HEADERS = get_or_create_env_var("DYNAMODB_ACCESS_LOG_HEADERS", "")

FEEDBACK_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var("FEEDBACK_LOG_DYNAMODB_TABLE_NAME", "redaction_feedback")
DYNAMODB_FEEDBACK_LOG_HEADERS = get_or_create_env_var("DYNAMODB_FEEDBACK_LOG_HEADERS", "")

USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var("USAGE_LOG_DYNAMODB_TABLE_NAME", "redaction_usage")
DYNAMODB_USAGE_LOG_HEADERS = get_or_create_env_var("DYNAMODB_USAGE_LOG_HEADERS", "")

# Python logging is configured (INFO level, timestamped format) only when
# LOGGING is exactly the string "True".
LOGGING = get_or_create_env_var("LOGGING", "False")

if LOGGING == "True":
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(message)s",
        level=logging.INFO,
    )

LOG_FILE_NAME = get_or_create_env_var("LOG_FILE_NAME", "log.csv")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Optional folders containing the Tesseract and Poppler binaries. Each one
# that is set (non-empty) is handed to add_folder_to_path.
TESSERACT_FOLDER = get_or_create_env_var("TESSERACT_FOLDER", "")
POPPLER_FOLDER = get_or_create_env_var("POPPLER_FOLDER", "")

if TESSERACT_FOLDER:
    add_folder_to_path(TESSERACT_FOLDER)

if POPPLER_FOLDER:
    add_folder_to_path(POPPLER_FOLDER)
|
|
|
|
|
|
|
# Labels for the text extraction methods.
SELECTABLE_TEXT_EXTRACT_OPTION = get_or_create_env_var(
    "SELECTABLE_TEXT_EXTRACT_OPTION", "Local model - selectable text"
)
TESSERACT_TEXT_EXTRACT_OPTION = get_or_create_env_var(
    "TESSERACT_TEXT_EXTRACT_OPTION", "Local OCR model - PDFs without selectable text"
)
TEXTRACT_TEXT_EXTRACT_OPTION = get_or_create_env_var(
    "TEXTRACT_TEXT_EXTRACT_OPTION", "AWS Textract service - all PDF types"
)

# Labels for the PII detection methods.
NO_REDACTION_PII_OPTION = get_or_create_env_var(
    "NO_REDACTION_PII_OPTION", "Only extract text (no redaction)"
)
LOCAL_PII_OPTION = get_or_create_env_var("LOCAL_PII_OPTION", "Local")
AWS_PII_OPTION = get_or_create_env_var("AWS_PII_OPTION", "AWS Comprehend")
|
|
|
# --- Text extraction model list -------------------------------------------
SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS = get_or_create_env_var("SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS", "True")
SHOW_AWS_TEXT_EXTRACTION_OPTIONS = get_or_create_env_var("SHOW_AWS_TEXT_EXTRACTION_OPTIONS", "True")

# At least one family of options must be shown; fall back to the local ones
# when neither flag is "True".
if not (SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS == "True" or SHOW_AWS_TEXT_EXTRACTION_OPTIONS == "True"):
    SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS = "True"

local_model_options = (
    [SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION]
    if SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS == "True"
    else []
)
aws_model_options = (
    [TEXTRACT_TEXT_EXTRACT_OPTION]
    if SHOW_AWS_TEXT_EXTRACTION_OPTIONS == "True"
    else []
)
text_extraction_models = []  # never filled in this file; kept as a module-level name

# Local options first, then AWS.
TEXT_EXTRACTION_MODELS = local_model_options + aws_model_options

# --- PII detection model list ---------------------------------------------
SHOW_LOCAL_PII_DETECTION_OPTIONS = get_or_create_env_var("SHOW_LOCAL_PII_DETECTION_OPTIONS", "True")
SHOW_AWS_PII_DETECTION_OPTIONS = get_or_create_env_var("SHOW_AWS_PII_DETECTION_OPTIONS", "True")

# Same fallback rule as for text extraction.
if not (SHOW_LOCAL_PII_DETECTION_OPTIONS == "True" or SHOW_AWS_PII_DETECTION_OPTIONS == "True"):
    SHOW_LOCAL_PII_DETECTION_OPTIONS = "True"

# The "no redaction" choice is always offered and always comes first.
local_model_options = [NO_REDACTION_PII_OPTION] + (
    [LOCAL_PII_OPTION] if SHOW_LOCAL_PII_DETECTION_OPTIONS == "True" else []
)
aws_model_options = (
    [AWS_PII_OPTION] if SHOW_AWS_PII_DETECTION_OPTIONS == "True" else []
)
pii_detection_models = []  # never filled in this file; kept as a module-level name

PII_DETECTION_MODELS = local_model_options + aws_model_options

# --- Defaults ---------------------------------------------------------------
# The AWS option is the fallback default whenever AWS options are shown; an
# explicitly set environment variable still wins in either case.
DEFAULT_TEXT_EXTRACTION_MODEL = get_or_create_env_var(
    "DEFAULT_TEXT_EXTRACTION_MODEL",
    TEXTRACT_TEXT_EXTRACT_OPTION
    if SHOW_AWS_TEXT_EXTRACTION_OPTIONS == "True"
    else SELECTABLE_TEXT_EXTRACT_OPTION,
)

DEFAULT_PII_DETECTION_MODEL = get_or_create_env_var(
    "DEFAULT_PII_DETECTION_MODEL",
    AWS_PII_OPTION if SHOW_AWS_PII_DETECTION_OPTIONS == "True" else LOCAL_PII_OPTION,
)

# The tabular list is the PII list without the "no redaction" choice (first
# occurrence removed from a copy, leaving PII_DETECTION_MODELS untouched).
TABULAR_PII_DETECTION_MODELS = list(PII_DETECTION_MODELS)
if NO_REDACTION_PII_OPTION in TABULAR_PII_DETECTION_MODELS:
    TABULAR_PII_DETECTION_MODELS.remove(NO_REDACTION_PII_OPTION)
|
|
|
|
|
# Entity lists are stored as strings holding a Python-style list literal; they
# are not parsed in this file.

# AWS Comprehend entities: the default selection and the full set of choices.
CHOSEN_COMPREHEND_ENTITIES = get_or_create_env_var(
    "CHOSEN_COMPREHEND_ENTITIES",
    "['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']",
)

FULL_COMPREHEND_ENTITY_LIST = get_or_create_env_var(
    "FULL_COMPREHEND_ENTITY_LIST",
    "['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER', 'CUSTOM_FUZZY']",
)

# Entities for the non-Comprehend (local) redaction path: the default
# selection and the full set of choices.
CHOSEN_REDACT_ENTITIES = get_or_create_env_var(
    "CHOSEN_REDACT_ENTITIES",
    "['TITLES', 'PERSON', 'PHONE_NUMBER', 'EMAIL_ADDRESS', 'STREETNAME', 'UKPOSTCODE', 'CUSTOM']",
)

FULL_ENTITY_LIST = get_or_create_env_var(
    "FULL_ENTITY_LIST",
    "['TITLES', 'PERSON', 'PHONE_NUMBER', 'EMAIL_ADDRESS', 'STREETNAME', 'UKPOSTCODE', 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM', 'CUSTOM_FUZZY']",
)
|
|
|
|
|
|
|
# Redaction settings. Every value stays a string here (e.g. "99999", "True");
# any conversion is left to the code that uses them.
PAGE_BREAK_VALUE = get_or_create_env_var("PAGE_BREAK_VALUE", "99999")
MAX_TIME_VALUE = get_or_create_env_var("MAX_TIME_VALUE", "999999")
CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
REDACTION_LANGUAGE = get_or_create_env_var("REDACTION_LANGUAGE", "en")
RETURN_PDF_END_OF_REDACTION = get_or_create_env_var("RETURN_PDF_END_OF_REDACTION", "True")
COMPRESS_REDACTED_PDF = get_or_create_env_var("COMPRESS_REDACTED_PDF", "False")
|
|
|
|
|
|
|
|
|
|
|
# Location handed to TLDExtract as its cache directory.
TLDEXTRACT_CACHE = get_or_create_env_var('TLDEXTRACT_CACHE', 'tld/.tld_set_snapshot')

# Build the shared extractor. If the configured cache location cannot be used,
# fall back to cache_dir=None rather than failing at import time. Catch
# Exception, not a bare except, so KeyboardInterrupt and SystemExit still
# propagate.
try:
    extract = TLDExtract(cache_dir=TLDEXTRACT_CACHE)
except Exception as e:
    print(f"Could not initialise TLDExtract with cache_dir {TLDEXTRACT_CACHE} ({e}); retrying with cache_dir=None")
    extract = TLDExtract(cache_dir=None)
|
|
|
|
|
# App run settings. Values are strings unless explicitly converted below.
COGNITO_AUTH = get_or_create_env_var("COGNITO_AUTH", "0")
RUN_DIRECT_MODE = get_or_create_env_var("RUN_DIRECT_MODE", "0")

# Converted to int at import time; a non-numeric value raises ValueError here.
MAX_QUEUE_SIZE = int(get_or_create_env_var("MAX_QUEUE_SIZE", "5"))

MAX_FILE_SIZE = get_or_create_env_var("MAX_FILE_SIZE", "250mb")

# Also converted to int at import time.
GRADIO_SERVER_PORT = int(get_or_create_env_var("GRADIO_SERVER_PORT", "7860"))

ROOT_PATH = get_or_create_env_var("ROOT_PATH", "")

# NOTE(review): left as a string, unlike MAX_QUEUE_SIZE above.
DEFAULT_CONCURRENCY_LIMIT = get_or_create_env_var("DEFAULT_CONCURRENCY_LIMIT", "3")

# Allow-list settings; all empty by default.
GET_DEFAULT_ALLOW_LIST = get_or_create_env_var("GET_DEFAULT_ALLOW_LIST", "")
ALLOW_LIST_PATH = get_or_create_env_var("ALLOW_LIST_PATH", "")
S3_ALLOW_LIST_PATH = get_or_create_env_var("S3_ALLOW_LIST_PATH", "")

# Use the configured allow-list path when one is given, else a fixed default.
OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH if ALLOW_LIST_PATH else "config/default_allow_list.csv"

FILE_INPUT_HEIGHT = get_or_create_env_var("FILE_INPUT_HEIGHT", "200")
|
|
|
|
|
|
|
|
|
|
|
|
|
# Cost code settings. Flags are kept as the strings "True"/"False".
SHOW_COSTS = get_or_create_env_var("SHOW_COSTS", "False")
GET_COST_CODES = get_or_create_env_var("GET_COST_CODES", "False")
DEFAULT_COST_CODE = get_or_create_env_var("DEFAULT_COST_CODE", "")
COST_CODES_PATH = get_or_create_env_var("COST_CODES_PATH", "")
S3_COST_CODES_PATH = get_or_create_env_var("S3_COST_CODES_PATH", "")

# Use the configured cost-codes path when one is given, else a fixed default.
OUTPUT_COST_CODES_PATH = COST_CODES_PATH if COST_CODES_PATH else "config/cost_codes.csv"

ENFORCE_COST_CODES = get_or_create_env_var("ENFORCE_COST_CODES", "False")

# Enforcing cost codes forces GET_COST_CODES on. Only this module-level name
# is overridden; the GET_COST_CODES environment variable is not rewritten.
if ENFORCE_COST_CODES == "True":
    GET_COST_CODES = "True"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Settings for whole-document Textract calls. All values are strings.
SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS = get_or_create_env_var(
    "SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS", "False"
)

# Bucket name (empty by default) and its input/output subfolder names.
TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET = get_or_create_env_var(
    "TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET", ""
)
TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER = get_or_create_env_var(
    "TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER", "input"
)
TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER = get_or_create_env_var(
    "TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER", "output"
)

# Previous-job settings: a load flag plus S3 and local location names.
LOAD_PREVIOUS_TEXTRACT_JOBS_S3 = get_or_create_env_var("LOAD_PREVIOUS_TEXTRACT_JOBS_S3", "False")
TEXTRACT_JOBS_S3_LOC = get_or_create_env_var("TEXTRACT_JOBS_S3_LOC", "output")
TEXTRACT_JOBS_S3_INPUT_LOC = get_or_create_env_var("TEXTRACT_JOBS_S3_INPUT_LOC", "input")
TEXTRACT_JOBS_LOCAL_LOC = get_or_create_env_var("TEXTRACT_JOBS_LOCAL_LOC", "output")

# Kept as the string "7"; not converted to a number in this file.
DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = get_or_create_env_var("DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS", "7")