import logging
import os
import base64
import datetime
import dotenv
import pandas as pd
import streamlit as st
from streamlit_tags import st_tags
from PyPDF2 import PdfReader, PdfWriter
from presidio_helpers import (
    analyzer_engine,
    get_supported_entities,
    analyze,
    anonymize,
)

st.set_page_config(
    page_title="Presidio PHI De-identification",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={"About": "https://microsoft.github.io/presidio/"},
)
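
# Load environment variables from a local .env file, if one is present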
dotenv.load_dotenv()
logger = logging.getLogger("presidio-streamlit")
# Sidebar
st.sidebar.header("PHI De-identification with Presidio")
model_help_text = "Select Named Entity Recognition (NER) model for PHI detection."
model_list = [
("spaCy/en_core_web_lg", "https://huggingface.co/spacy/en_core_web_lg"),
("HuggingFace/obi/deid_roberta_i2b2", "https://huggingface.co/obi/deid_roberta_i2b2"),
("flair/ner-english-large", "https://huggingface.co/flair/ner-english-large"),
("HuggingFace/StanfordAIMI/stanford-deidentifier-base", "https://huggingface.co/StanfordAIMI/stanford-deidentifier-base"),
]
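# index=1 pre-selects the clinical de-identification model (obi/deid_roberta_i2b2)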
st_model = st.sidebar.selectbox(
"NER model package",
[model[0] for model in model_list],
index=1,
help=model_help_text,
)
# Display HuggingFace link for selected model
selected_model_url = next(url for model, url in model_list if model == st_model)
st.sidebar.markdown(f"[View model on HuggingFace]({selected_model_url})")
# Split the selection into package and model name; spaCy and HuggingFace entries
# carry a package prefix, while flair model names are used as-is
st_model_package = st_model.split("/")[0]
st_model = st_model if st_model_package.lower() not in ("spacy", "huggingface") else "/".join(st_model.split("/")[1:])
analyzer_params = (st_model_package, st_model)
st.sidebar.warning("Note: Models might take some time to download on first run.")
st_operator = st.sidebar.selectbox(
"De-identification approach",
["replace", "redact", "mask"],
index=0,
help="Select PHI manipulation method.",
)
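# Detections scoring below this threshold are dropped before anonymization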
st_threshold = st.sidebar.slider(
label="Acceptance threshold",
min_value=0.0,
max_value=1.0,
value=0.35,
)
st_return_decision_process = st.sidebar.checkbox(
"Add analysis explanations",
value=False,
)
# Allow and deny lists: allowlisted words are never flagged as PHI, denylisted words always are
with st.sidebar.expander("Allowlists and denylists", expanded=False):
    st_allow_list = st_tags(label="Add words to allowlist", text="Enter word and press enter.")
    st_deny_list = st_tags(label="Add words to denylist", text="Enter word and press enter.")
# Main panel: input on the left, findings on the right
col1, col2 = st.columns(2)
with col1:
st.subheader("Input")
uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
if uploaded_file:
try:
# Read PDF
pdf_reader = PdfReader(uploaded_file)
text = ""
for page in pdf_reader.pages:
text += page.extract_text() + "\n"
            # Initialize analyzer
            try:
                analyzer = analyzer_engine(*analyzer_params)
            except Exception as e:
                st.error(f"Failed to load model: {str(e)}")
                st.info("Ensure models are downloaded (e.g., 'python -m spacy download en_core_web_lg') and check network/permissions.")
                raise
            # Analyze
            st_analyze_results = analyze(
                analyzer=analyzer,
                text=text,
                entities=get_supported_entities(*analyzer_params),
                language="en",
                score_threshold=st_threshold,
                return_decision_process=st_return_decision_process,
                allow_list=st_allow_list,
                deny_list=st_deny_list,
            )
            # Process results; at this point PHI has only been detected, not yet removed
            phi_types = set(res.entity_type for res in st_analyze_results)
            if phi_types:
                st.success(f"Detected PHI types: {', '.join(phi_types)}")
            else:
                st.info("No PHI detected")
            # Anonymize
            anonymized_result = anonymize(
                text=text,
                operator=st_operator,
                analyze_results=st_analyze_results,
            )
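            # Show the de-identified text; otherwise the anonymizer output is never surfaced.
            # This assumes the anonymize() helper returns Presidio's EngineResult, whose
            # rewritten text is exposed via the .text attribute.
            st.text_area("De-identified text", value=anonymized_result.text, height=200)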
            # Create new PDF; note that PyPDF2 copies pages verbatim and cannot rewrite
            # text inside a page, so the downloaded PDF itself is not redacted
            pdf_writer = PdfWriter()
            for page in pdf_reader.pages:
                pdf_writer.add_page(page)
            # Generate output filename with timestamp (e.g., "0930AM_25-12-24_report.pdf")
            timestamp = datetime.datetime.now().strftime("%I%M%p_%d-%m-%y")
            output_filename = f"{timestamp}_{uploaded_file.name}"
            # Save modified PDF
            try:
                with open(output_filename, "wb") as f:
                    pdf_writer.write(f)
            except PermissionError as e:
                st.error(f"Permission denied when saving PDF: {str(e)}")
                st.info("Check write permissions in the current directory.")
                raise
            # Generate base64 download link
            try:
                with open(output_filename, "rb") as f:
                    pdf_bytes = f.read()
                b64 = base64.b64encode(pdf_bytes).decode()
                href = f'<a href="data:application/pdf;base64,{b64}" download="{output_filename}">Download de-identified PDF</a>'
                st.markdown(href, unsafe_allow_html=True)
            except Exception as e:
                st.error(f"Error generating download link: {str(e)}")
                raise
            # Display findings
            with col2:
                st.subheader("Findings")
                if st_analyze_results:
                    df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
                    df["text"] = [text[res.start:res.end] for res in st_analyze_results]
                    df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
                        {
                            "entity_type": "Entity type",
                            "text": "Text",
                            "start": "Start",
                            "end": "End",
                            "score": "Confidence",
                        },
                        axis=1,
                    )
                    if st_return_decision_process:
                        analysis_explanation_df = pd.DataFrame.from_records(
                            [r.analysis_explanation.to_dict() for r in st_analyze_results]
                        )
                        df_subset = pd.concat([df_subset, analysis_explanation_df], axis=1)
                    st.dataframe(df_subset.reset_index(drop=True), use_container_width=True)
                else:
                    st.text("No findings")
        except Exception as e:
            st.error(f"An error occurred: {str(e)}")
            logger.error(f"Processing error: {str(e)}")