presidio-de-identify

Sleeping

File size: 6,741 Bytes

import logging
import os
import base64
import datetime
import dotenv
import pandas as pd
import streamlit as st
from streamlit_tags import st_tags
from PyPDF2 import PdfReader, PdfWriter
from presidio_helpers import (
    analyzer_engine,
    get_supported_entities,
    analyze,
    anonymize,
)

# Environment overrides (.env) and the app logger are plain Python setup and
# do not touch the Streamlit UI.
dotenv.load_dotenv()
logger = logging.getLogger("presidio-streamlit")

# Browser-tab title, wide layout, an open sidebar, and an "About" menu entry
# pointing at the Presidio documentation. Kept ahead of the sidebar and
# main-panel widgets created further down.
st.set_page_config(
    menu_items={"About": "https://microsoft.github.io/presidio/"},
    initial_sidebar_state="expanded",
    layout="wide",
    page_title="Presidio PHI De-identification",
)

# Sidebar: NER model selection
st.sidebar.header("PHI De-identification with Presidio")

model_help_text = "Select Named Entity Recognition (NER) model for PHI detection."

# (display name, model card URL) pairs; the display name is "<package>/<model path>".
model_list = [
    ("spaCy/en_core_web_lg", "https://huggingface.co/spacy/en_core_web_lg"),
    ("HuggingFace/obi/deid_roberta_i2b2", "https://huggingface.co/obi/deid_roberta_i2b2"),
    ("flair/ner-english-large", "https://huggingface.co/flair/ner-english-large"),
    ("HuggingFace/StanfordAIMI/stanford-deidentifier-base", "https://huggingface.co/StanfordAIMI/stanford-deidentifier-base"),
]
model_url_by_name = dict(model_list)

st_model = st.sidebar.selectbox(
    "NER model package",
    list(model_url_by_name),
    index=1,
    help=model_help_text,
)

# Link to the model card of whichever entry is currently selected.
selected_model_url = model_url_by_name[st_model]
st.sidebar.markdown(f"[View model on HuggingFace]({selected_model_url})")

# The text before the first "/" names the package. For spaCy and HuggingFace
# entries only the remainder is kept as the model identifier; any other
# package (e.g. flair) keeps the full "<package>/<model>" string.
st_model_package, _, model_path = st_model.partition("/")
if st_model_package.lower() in ("spacy", "huggingface"):
    st_model = model_path

analyzer_params = (st_model_package, st_model)
st.sidebar.warning("Note: Models might take some time to download on first run.")

# Operator handed to anonymize() for rewriting detected spans.
st_operator = st.sidebar.selectbox(
    label="De-identification approach",
    options=["replace", "redact", "mask"],
    index=0,
    help="Select PHI manipulation method.",
)

# Score threshold handed to analyze(); the slider spans 0.0-1.0 and starts at 0.35.
st_threshold = st.sidebar.slider("Acceptance threshold", 0.0, 1.0, 0.35)

# When ticked, the findings table gains the analyzer's explanation columns.
st_return_decision_process = st.sidebar.checkbox(
    label="Add analysis explanations",
    value=False,
)

# Allow and deny lists, tucked into a collapsed expander
list_expander = st.sidebar.expander("Allowlists and denylists", expanded=False)
with list_expander:
    st_allow_list = st_tags(
        label="Add words to allowlist",
        text="Enter word and press enter.",
    )
    st_deny_list = st_tags(
        label="Add words to denylist",
        text="Enter word and press enter.",
    )

# Main panel: upload a PDF on the left, list the detected entities on the right.
#
# Flow: extract the PDF's text layer -> run the Presidio analyzer -> anonymize
# the text -> offer the de-identified text for download -> tabulate findings.
#
# The download is the anonymized *text*, served from memory. Copying the
# original pages into a new PDF (as this code used to do) leaves every piece
# of PHI in place, and PyPDF2 cannot rewrite page text; producing a redacted
# PDF would need a content-rewriting library. Nothing is written to the
# server's disk, so no PHI is left behind in the working directory.
col1, col2 = st.columns(2)

with col1:
    st.subheader("Input")
    uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])

    if uploaded_file:
        try:
            # Read PDF. Every page's text is followed by "\n"; the offsets the
            # analyzer reports index into this exact string. `or ""` guards
            # against a page yielding no text object at all.
            pdf_reader = PdfReader(uploaded_file)
            text = "".join(
                f"{page.extract_text() or ''}\n" for page in pdf_reader.pages
            )

            # A PDF without a text layer (e.g. scanned images) must not be
            # reported as "No PHI detected" - nothing was actually inspected.
            if not text.strip():
                raise ValueError(
                    "No extractable text found in the PDF "
                    "(it may contain only scanned images)."
                )

            # Initialize analyzer. The failure is wrapped rather than shown
            # here so the outer handler displays exactly one error message.
            try:
                analyzer = analyzer_engine(*analyzer_params)
            except Exception as exc:
                st.info("Ensure models are downloaded (e.g., 'python -m spacy download en_core_web_lg') and check network/permissions.")
                raise RuntimeError(f"Failed to load model: {exc}") from exc

            # Analyze
            st_analyze_results = analyze(
                analyzer=analyzer,
                text=text,
                entities=get_supported_entities(*analyzer_params),
                language="en",
                score_threshold=st_threshold,
                return_decision_process=st_return_decision_process,
                allow_list=st_allow_list,
                deny_list=st_deny_list,
            )

            # Anonymize
            anonymized_result = anonymize(
                text=text,
                operator=st_operator,
                analyze_results=st_analyze_results,
            )
            # NOTE(review): assumes anonymize() returns Presidio's anonymizer
            # result object, whose `.text` holds the de-identified string -
            # confirm against presidio_helpers.
            deidentified_text = anonymized_result.text

            # Report only after anonymization succeeded; sorted so the
            # message is stable across reruns.
            phi_types = sorted({res.entity_type for res in st_analyze_results})
            if phi_types:
                st.success(f"Removed PHI types: {', '.join(phi_types)}")
            else:
                st.info("No PHI detected")

            # Output filename: timestamp + the upload's base name. basename()
            # drops any directory part of the client-supplied name.
            timestamp = datetime.datetime.now().strftime("%I%M%p_%d-%m-%y")
            source_stem = os.path.splitext(os.path.basename(uploaded_file.name))[0]
            output_filename = f"{timestamp}_{source_stem}.txt"

            # Served from memory by Streamlit: no temp file, and the filename
            # is never interpolated into raw HTML.
            st.download_button(
                label="Download de-identified text",
                data=deidentified_text,
                file_name=output_filename,
                mime="text/plain",
            )

            # Display findings
            with col2:
                st.subheader("Findings")
                if st_analyze_results:
                    df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
                    # Matched substring for each finding, sliced from the analyzed text.
                    df["text"] = [text[res.start:res.end] for res in st_analyze_results]
                    df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
                        {
                            "entity_type": "Entity type",
                            "text": "Text",
                            "start": "Start",
                            "end": "End",
                            "score": "Confidence",
                        },
                        axis=1,
                    )
                    if st_return_decision_process:
                        # Append the per-finding explanation columns side by side.
                        analysis_explanation_df = pd.DataFrame.from_records(
                            [r.analysis_explanation.to_dict() for r in st_analyze_results]
                        )
                        df_subset = pd.concat([df_subset, analysis_explanation_df], axis=1)
                    st.dataframe(df_subset.reset_index(drop=True), use_container_width=True)
                else:
                    st.text("No findings")

        except Exception as e:
            # Top-level UI boundary: show the problem to the user and keep the
            # full traceback in the log.
            st.error(f"An error occurred: {str(e)}")
            logger.exception("Processing error: %s", e)