kokluch committed
Commit 8286de5 · Parent: d316383
Files changed (2):
  1. phishing_datasets.py +2 -1
  2. piiranha.py +65 -0
phishing_datasets.py CHANGED
@@ -1,6 +1,7 @@
 import pandas as pd
 from datasets import load_dataset, Dataset
 import os
+from piiranha import mask_pii
 
 DATASET_NAME = os.getenv("DATASET_NAME")
 
@@ -12,7 +13,7 @@ def submit_entry(sender, message):
     global df
 
     sender = sender.strip().replace(" ", "")  # Remove all spaces inside sender
-    message = message.strip()
+    message = mask_pii(message).strip()
 
     # Check for duplicates
     if ((df["sender"] == sender) & (df["message"] == message)).any():
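
With this commit, mask_pii runs on each message before the duplicate check, so the stored dataset never holds raw PII, and two reports of the same scam text that differ only in victim-specific details can collapse to a single entry. A minimal sketch of that effect (illustrative only; whether the two inputs actually collapse depends on the model's predictions):

from piiranha import mask_pii

# Two reports of the same phishing message, differing only in the embedded PII.
a = mask_pii("Your parcel is on hold. Call +1 555 0100 to release it.").strip()
b = mask_pii("Your parcel is on hold. Call +1 555 0199 to release it.").strip()

# If the model redacts both phone numbers, a == b and the duplicate check
# in submit_entry would reject the second submission.
print(a == b)
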
piiranha.py ADDED
@@ -0,0 +1,65 @@
+import torch
+from transformers import AutoTokenizer, AutoModelForTokenClassification
+
+model_name = "iiiorg/piiranha-v1-detect-personal-information"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForTokenClassification.from_pretrained(model_name)
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)
+
+def mask_pii(text, aggregate_redaction=False):
+    # Tokenize input text
+    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+
+    # Get the model predictions
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    # Get the predicted labels
+    predictions = torch.argmax(outputs.logits, dim=-1)
+
+    # Convert token predictions to word predictions
+    encoded_inputs = tokenizer.encode_plus(text, return_offsets_mapping=True, add_special_tokens=True)
+    offset_mapping = encoded_inputs['offset_mapping']
+
+    masked_text = list(text)
+    is_redacting = False
+    redaction_start = 0
+    current_pii_type = ''
+
+    for i, (start, end) in enumerate(offset_mapping):
+        if start == end:  # Special token
+            continue
+
+        label = predictions[0][i].item()
+        if label != model.config.label2id['O']:  # Non-O label
+            pii_type = model.config.id2label[label]
+            if not is_redacting:
+                is_redacting = True
+                redaction_start = start
+                current_pii_type = pii_type
+            elif not aggregate_redaction and pii_type != current_pii_type:
+                # End current redaction and start a new one
+                apply_redaction(masked_text, redaction_start, start, current_pii_type, aggregate_redaction)
+                redaction_start = start
+                current_pii_type = pii_type
+        else:
+            if is_redacting:
+                apply_redaction(masked_text, redaction_start, end, current_pii_type, aggregate_redaction)
+                is_redacting = False
+
+    # Handle case where PII is at the end of the text
+    if is_redacting:
+        apply_redaction(masked_text, redaction_start, len(masked_text), current_pii_type, aggregate_redaction)
+
+    return ''.join(masked_text)
+
+def apply_redaction(masked_text, start, end, pii_type, aggregate_redaction):
+    for j in range(start, end):
+        masked_text[j] = ''
+    if aggregate_redaction:
+        masked_text[start] = '[redacted]'
+    else:
+        masked_text[start] = f'[{pii_type}]'
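
A minimal usage sketch of the two redaction modes (the outputs in the comments are hypothetical; the exact spans and labels depend on the model's predictions):

from piiranha import mask_pii

text = "Contact Jane Doe at jane.doe@example.com or +1 555 0100."

# Default: each detected span is replaced by its predicted label,
# e.g. "Contact [I-GIVENNAME] ... at [I-EMAIL] or [I-TELEPHONENUM]."
print(mask_pii(text))

# Aggregated: every detected span becomes the generic token "[redacted]".
print(mask_pii(text, aggregate_redaction=True))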