phishing-detector-api / phishing_datasets.py
kokluch's picture
Mask PII
8286de5
raw
history blame contribute delete
917 Bytes
import pandas as pd
from datasets import load_dataset, Dataset
import os
from piiranha import mask_pii
# Dataset identifier passed to load_dataset() and push_to_hub(), taken from
# the DATASET_NAME environment variable (os.getenv returns None when unset).
DATASET_NAME = os.getenv("DATASET_NAME")
# Load the "train" split once, at import time.
dataset = load_dataset(DATASET_NAME, split="train")
# In-memory pandas copy of the split. submit_entry() rebinds this global when
# it appends a row, and reads it through its "sender" and "message" columns.
df = pd.DataFrame(dataset)
def submit_entry(sender: str, message: str) -> str:
    """Add a new SMS phishing report unless it is already in the dataset.

    The sender is stripped and has every space removed, and the message is
    passed through ``mask_pii`` and stripped, before the duplicate check.
    The comparison and the stored row therefore use the same normalised
    values.

    Args:
        sender: Sender identifier of the reported SMS.
        message: Raw text of the reported SMS.

    Returns:
        A status message: a warning when the same (sender, message) pair is
        already present in ``df``, otherwise a success confirmation once the
        updated dataset has been pushed.
    """
    global df

    sender = sender.strip().replace(" ", "")  # Remove all spaces inside sender
    message = mask_pii(message).strip()

    # Check for duplicates. The message must be *returned*: as a bare string
    # expression it was a no-op, so duplicates fell through and were appended
    # and pushed anyway.
    is_duplicate = ((df["sender"] == sender) & (df["message"] == message)).any()
    if is_duplicate:
        return "⚠️ This entry already exists in the dataset!"

    # Append new entry
    new_entry = pd.DataFrame([[sender, message]], columns=["sender", "message"])
    updated = pd.concat([df, new_entry], ignore_index=True)

    # Push before rebinding the global: if the upload raises, the in-memory
    # frame stays unchanged and a retry is not wrongly rejected as a duplicate.
    Dataset.from_pandas(updated).push_to_hub(DATASET_NAME)
    df = updated

    return "✅ Submission saved successfully!"