Upload 4 files
Browse files- components.js +131 -0
- data.jsonl +2 -0
- index.html +129 -0
- inference.js +148 -0
components.js
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { loadModel, processText, isModelLoaded } from './inference.js';
|
2 |
+
|
3 |
+
// Page elements this module reads from or writes to, looked up once by id.
const inputText = document.getElementById('inputText');
const outputText = document.getElementById('outputText');
const statusElement = document.getElementById('processingStatus');
const privacyMaskDiv = document.getElementById('privacyMask');
const thresholdInput = document.getElementById('thresholdInput');
const settingsButton = document.getElementById('settingsButton');
const settingsPanel = document.getElementById('settingsPanel');

// Mutable UI state shared by the handlers in this module.
let currentInput = "";        // latest text taken from the input box or a sample
let settingsVisible = false;  // whether the settings panel is currently shown
let samples = {};             // sample texts keyed by their title

// Debounced typing handler: every keystroke records the new text and restarts
// a 300 ms timer, so updateOutput only runs once typing has paused.
let timeout;

function handleInput({ target }) {
  clearTimeout(timeout);
  currentInput = target.value;
  statusElement.textContent = 'Processing...';
  timeout = setTimeout(updateOutput, 300);
}

inputText.addEventListener('input', handleInput);
|
25 |
+
|
26 |
+
/**
 * Builds one "detected entity" tile for the side panel.
 *
 * The tile is assembled with textContent rather than an HTML string: the
 * placeholder and original text come from whatever the user typed or pasted,
 * so they must never be parsed as markup.
 *
 * @param {{placeholder: string, original: string, activation: number}} replacement
 *   One entry of the `replacements` array returned by processText.
 * @returns {HTMLDivElement} The populated tile element.
 */
function createReplacementTile({ placeholder, original, activation }) {
  const tile = document.createElement('div');
  tile.className = 'bg-gray-800 p-3 rounded-lg border border-white/10 hover:border-white/20 transition duration-200 hover:-translate-y-[2px] hover:shadow-[0_4px_6px_rgba(0,0,0,0.1)]';

  const rows = [
    ['text-xs text-white/60 mb-1', placeholder],
    ['text-sm text-white font-medium', original],
    ['text-xs text-white/40 mt-1', 'Sensitive Information'],
    ['text-xs text-white/40 mt-1', `Activation: ${Math.round(activation * 100)}%`],
  ];
  for (const [className, text] of rows) {
    const row = document.createElement('div');
    row.className = className;
    row.textContent = text;
    tile.appendChild(row);
  }
  return tile;
}

/**
 * Anonymises the current input and refreshes the output box, the status line
 * and the detected-entities panel.
 *
 * If the model is not loaded yet, only a "Loading model..." status is shown
 * and the output is cleared. Errors from processText are logged and reported
 * in the status line and output box; this function does not rethrow them.
 *
 * @returns {Promise<void>}
 */
async function updateOutput() {
  if (!isModelLoaded) {
    statusElement.textContent = 'Loading model...';
    outputText.value = "";
    return;
  }

  // Snapshot the text so a slower, older request can be recognised as stale
  // once its result finally arrives.
  const requestedInput = currentInput;

  try {
    // Only fall back when the field is not a number at all; 0 is a value the
    // user may legitimately choose and must not be replaced by the fallback.
    const parsedThreshold = Number.parseFloat(thresholdInput.value);
    const threshold = Number.isNaN(parsedThreshold) ? 0.01 : parsedThreshold;

    const processed = await processText(requestedInput, threshold);

    // The input changed while the model was running: a newer call owns the UI.
    if (requestedInput !== currentInput) {
      return;
    }

    statusElement.textContent = `Processed ${requestedInput.length} characters`;
    outputText.value = processed.maskedText;

    // Drop whatever the panel showed before.
    privacyMaskDiv.textContent = '';

    if (processed.replacements.length > 0) {
      for (const replacement of processed.replacements) {
        privacyMaskDiv.appendChild(createReplacementTile(replacement));
      }
    } else {
      const emptyState = document.createElement('div');
      emptyState.className = 'text-center text-white/40 py-4';
      emptyState.textContent = 'No sensitive information detected.';
      privacyMaskDiv.appendChild(emptyState);
    }
  } catch (err) {
    statusElement.textContent = 'Error processing text';
    console.error("Error processing text:", err);
    outputText.value = "Error processing text.";
  }
}
|
65 |
+
|
66 |
+
// Settings panel: the cog button flips its visibility, and a click anywhere
// outside the panel closes it again.
function toggleSettings(event) {
  // Keep this click away from the document-level handler below; otherwise the
  // panel would be closed again by the very click that opened it.
  event.stopPropagation();
  settingsVisible = !settingsVisible;
  settingsPanel.classList.toggle('hidden', !settingsVisible);
}

function closeSettingsOnOutsideClick({ target }) {
  if (!settingsVisible) {
    return;
  }
  if (settingsPanel.contains(target)) {
    return;
  }
  settingsVisible = false;
  settingsPanel.classList.add('hidden');
}

settingsButton.addEventListener('click', toggleSettings);
document.addEventListener('click', closeSettingsOnOutsideClick);
|
79 |
+
|
80 |
+
/**
 * Loads the sample texts from `data.jsonl` into the module-level `samples`
 * object, keyed by each record's `title`.
 *
 * The file is read as JSON Lines: one JSON object with `title` and `text`
 * per non-blank line. A malformed line is logged and skipped so the other
 * samples still load. `samples` is only replaced after the file was fetched
 * successfully, so a failed reload keeps the previously loaded samples.
 *
 * Never rejects: fetch/HTTP failures are logged to the console.
 *
 * @returns {Promise<void>}
 */
async function loadSamples() {
  try {
    const response = await fetch('data.jsonl');
    if (!response.ok) {
      // fetch only rejects on network errors; a 404/500 must be caught here,
      // otherwise the error page would be parsed as sample data.
      throw new Error(`Failed to fetch data.jsonl: HTTP ${response.status}`);
    }
    const text = await response.text();

    const loaded = {};
    for (const line of text.split('\n')) {
      if (line.trim() === '') {
        continue;
      }
      try {
        const { title, text: sampleText } = JSON.parse(line);
        loaded[title] = sampleText;
      } catch (parseErr) {
        console.error("Skipping malformed sample line:", parseErr);
      }
    }
    samples = loaded;
  } catch (err) {
    console.error("Error loading samples:", err);
  }
}
|
95 |
+
|
96 |
+
// Sample buttons: clicking one copies the matching sample into the input box
// and runs it through updateOutput; if that sample is missing (or empty) the
// user gets an alert instead.
function wireSampleButton(buttonId, sampleTitle, notFoundMessage) {
  document.getElementById(buttonId).addEventListener('click', () => {
    const sample = samples[sampleTitle];
    if (!sample) {
      alert(notFoundMessage);
      return;
    }
    inputText.value = sample;
    currentInput = sample;
    updateOutput();
  });
}

wireSampleButton('sampleEmailButton', "customer email", "Sample customer email not found.");
wireSampleButton('sampleDocumentButton', "document", "Sample document not found.");
|
116 |
+
|
117 |
+
/**
 * Boots the page: loads the model and the sample texts, then renders the
 * current input once.
 *
 * Both loads are started together. The samples do not depend on the model,
 * so there is no reason for the sample buttons to stay empty for the whole
 * model download. Progress and failure are reported through the status line;
 * a failure is also written to the output box and logged to the console.
 *
 * Never rejects.
 *
 * @returns {Promise<void>}
 */
async function init() {
  statusElement.textContent = 'Loading model and samples...';
  try {
    await Promise.all([loadModel(), loadSamples()]);
    statusElement.textContent = 'Model and samples loaded';
    // Not awaited on purpose: updateOutput reports its own errors in the UI,
    // and they must not be shown as a loading failure by the catch below.
    updateOutput();
  } catch (err) {
    console.error("Error loading model or samples:", err);
    statusElement.textContent = 'Error loading model or samples';
    outputText.value = "Error loading model or samples.";
  }
}
|
130 |
+
|
131 |
+
init();
|
data.jsonl
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
{"title":"customer email","text":"Customer Complaint:\n\n\"This is Sarah Gomez born on the 9th March 2002 in San-Fransisco with policy number #2301. I've been on hold for over 30 minutes trying to speak with a customer service representative about my claim on life insurance. It's incredibly frustrating. Is there a faster way to get the help I need?\"\n\nThis is the complaint and can you write a quick message saying we are on the case and I will review this as soon as possible."}
|
2 |
+
{"title": "document","text": "COMMUNITY CENTER MEMBERSHIP REGISTRATION FORM\n\nPersonal Information:\n\n -Full Name: Kelsang Rishi\n -Date of Birth: 1939-04-21T00:00:00\n - Gender: Non-binary\n - Street Address: 50 Farm-to-Market 3059\n \n - City: Marshfield\n - Postal/ZIP Code: 44691\n - Phone Number: +19-19-079.8396\n - Email Address: ZJuly@tutanota.com\n"}
|
index.html
ADDED
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
|
2 |
+
<html lang="en">
|
3 |
+
<head>
|
4 |
+
<meta charset="UTF-8">
|
5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
6 |
+
<title>Short Text and Open Source: Anonymiser</title>
|
7 |
+
<script src="https://cdn.tailwindcss.com"></script>
|
8 |
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/iconify/2.0.0/iconify.min.js"></script>
|
9 |
+
<style>
|
10 |
+
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;700&display=swap');
|
11 |
+
* {
|
12 |
+
font-family: 'Inter', sans-serif;
|
13 |
+
}
|
14 |
+
::-webkit-scrollbar {
|
15 |
+
width: 6px;
|
16 |
+
}
|
17 |
+
::-webkit-scrollbar-track {
|
18 |
+
background: #2d2d2d;
|
19 |
+
}
|
20 |
+
::-webkit-scrollbar-thumb {
|
21 |
+
background: #4a4a4a;
|
22 |
+
border-radius: 3px;
|
23 |
+
}
|
24 |
+
</style>
|
25 |
+
</head>
|
26 |
+
<body class="bg-gray-900 min-h-screen">
|
27 |
+
<!-- Branding Header -->
|
28 |
+
<div class="bg-black/30 py-4 border-b border-white/10">
|
29 |
+
<div class="max-w-7xl mx-auto px-4 flex items-center justify-between">
|
30 |
+
<div class="flex items-center space-x-3">
|
31 |
+
<img src="ai4privacy-logo.png" alt="Logo" class="h-8 w-8">
|
32 |
+
<div>
|
33 |
+
<span class="text-xl font-bold text-white">Ai4Privacy</span>
|
34 |
+
<span class="block text-xs text-white/60">Short Text Anonymization Locally in Your Browser</span>
|
35 |
+
</div>
|
36 |
+
</div>
|
37 |
+
<!-- Settings Button -->
|
38 |
+
<button id="settingsButton" class="text-white/60 hover:text-white transition-colors">
|
39 |
+
<span class="iconify" data-icon="mdi:cog" data-width="24"></span>
|
40 |
+
</button>
|
41 |
+
</div>
|
42 |
+
</div>
|
43 |
+
|
44 |
+
<!-- Settings Panel -->
|
45 |
+
<div id="settingsPanel" class="hidden absolute right-4 top-20 bg-gray-800 border border-white/10 rounded-xl p-4 w-64 space-y-4 z-50">
|
46 |
+
<div>
|
47 |
+
<label class="block text-sm text-white/80 mb-2">Detection Threshold</label>
|
48 |
+
<input type="number" id="thresholdInput" step="0.001" min="0" max="1" value="0.3"
|
49 |
+
class="w-full bg-gray-700 border border-white/10 rounded-lg px-3 py-2 text-white">
|
50 |
+
</div>
|
51 |
+
<div>
|
52 |
+
<label class="block text-sm text-white/80 mb-2">Language Model</label>
|
53 |
+
<select id="modelSelect" class="w-full bg-gray-700 border border-white/10 rounded-lg px-3 py-2 text-white">
|
54 |
+
<option value="english">English - ai4privacy/llama-ai4privacy-english-anonymiser-openpii</option>
|
55 |
+
</select>
|
56 |
+
</div>
|
57 |
+
</div>
|
58 |
+
|
59 |
+
<div class="max-w-7xl mx-auto px-4 py-8">
|
60 |
+
<div class="flex flex-col lg:flex-row gap-8">
|
61 |
+
<!-- Input/Output Section -->
|
62 |
+
<div class="flex-1 space-y-6">
|
63 |
+
<div>
|
64 |
+
<label class="block text-sm font-medium text-white/80 mb-2">Input Text</label>
|
65 |
+
<textarea
|
66 |
+
id="inputText"
|
67 |
+
class="w-full p-4 bg-gray-800 border border-white/10 rounded-xl text-white placeholder-white/30 focus:border-blue-500 focus:ring-2 focus:ring-blue-500/30 resize-none transition duration-200 ease-in-out"
|
68 |
+
rows="6"
|
69 |
+
placeholder="Enter sensitive text to anonymize..."
|
70 |
+
></textarea>
|
71 |
+
</div>
|
72 |
+
|
73 |
+
<div>
|
74 |
+
<label class="block text-sm font-medium text-white/80 mb-2">Anonymized Output</label>
|
75 |
+
<textarea
|
76 |
+
id="outputText"
|
77 |
+
class="w-full p-4 bg-gray-800 border border-white/10 rounded-xl text-white/80 resize-none transition duration-200 ease-in-out"
|
78 |
+
rows="6"
|
79 |
+
readonly
|
80 |
+
></textarea>
|
81 |
+
</div>
|
82 |
+
</div>
|
83 |
+
|
84 |
+
<!-- Privacy Mask Panel -->
|
85 |
+
<div class="lg:w-96">
|
86 |
+
<div class="sticky top-8">
|
87 |
+
<label class="block text-sm font-medium text-white/80 mb-2">Detected Entities</label>
|
88 |
+
<div class="bg-gray-800 border border-white/10 rounded-xl p-4">
|
89 |
+
<div class="mb-4">
|
90 |
+
<span id="processingStatus" class="text-xs text-white/40">Ready</span>
|
91 |
+
</div>
|
92 |
+
<div
|
93 |
+
id="privacyMask"
|
94 |
+
class="h-96 bg-gray-850 rounded-lg p-3 overflow-y-auto text-sm space-y-2 transition duration-200 ease-in-out"
|
95 |
+
>
|
96 |
+
<div class="text-center text-white/40 py-4">Processing results will appear here</div>
|
97 |
+
</div>
|
98 |
+
</div>
|
99 |
+
</div>
|
100 |
+
</div>
|
101 |
+
</div>
|
102 |
+
<!-- Sample Buttons -->
|
103 |
+
<div class="mt-8 flex justify-center space-x-4">
|
104 |
+
<button id="sampleEmailButton" class="bg-blue-600 text-white px-4 py-2 rounded-lg hover:bg-blue-700 transition">
|
105 |
+
Sample Customer Email
|
106 |
+
</button>
|
107 |
+
<button id="sampleDocumentButton" class="bg-blue-600 text-white px-4 py-2 rounded-lg hover:bg-blue-700 transition">
|
108 |
+
Sample Document
|
109 |
+
</button>
|
110 |
+
</div>
|
111 |
+
</div>
|
112 |
+
|
113 |
+
<!-- Branding Footer -->
|
114 |
+
<div class="fixed bottom-0 left-0 right-0 bg-black/30 border-t border-white/10 py-3">
|
115 |
+
<div class="max-w-7xl mx-auto px-4">
|
116 |
+
<div class="flex items-center justify-between">
|
117 |
+
<div class="text-sm text-white/50">© 2025 Ai4Privacy. All rights reserved. Use at your own risk. Ai4Privacy assumes no responsibility for implementation, accuracy, or any resulting damages.</div>
|
118 |
+
<div class="flex items-center space-x-4">
|
119 |
+
<span class="text-sm text-white/50">v2.1.0</span>
|
120 |
+
<div class="w-px h-4 bg-white/10"></div>
|
121 |
+
<img src="ai4privacy-logo.png" alt="Logo" class="h-6 w-6 opacity-70">
|
122 |
+
</div>
|
123 |
+
</div>
|
124 |
+
</div>
|
125 |
+
</div>
|
126 |
+
|
127 |
+
<script type="module" src="components.js"></script>
|
128 |
+
</body>
|
129 |
+
</html>
|
inference.js
ADDED
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { AutoModel, AutoTokenizer } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.4.0';
|
2 |
+
|
3 |
+
// Inference state shared by loadModel and processText. `isModelLoaded` is
// exported, so importers observe its current value.
let tokenizer, model;
let isModelLoaded = false;

/**
 * Loads the tokenizer and the model and flips `isModelLoaded` to true once
 * both are available.
 *
 * The two loads are independent, so they are started together instead of
 * one after the other. The model is requested with `{ dtype: "q8" }`
 * (presumably 8-bit quantised weights; confirm against the transformers.js
 * documentation).
 *
 * @returns {Promise<void>}
 * @throws Rethrows whatever `from_pretrained` rejected with, after logging it
 *   and leaving `isModelLoaded` false.
 */
async function loadModel() {
  const modelId = 'ai4privacy/llama-ai4privacy-english-anonymiser-openpii';
  try {
    const [loadedTokenizer, loadedModel] = await Promise.all([
      AutoTokenizer.from_pretrained(modelId),
      AutoModel.from_pretrained(modelId, { dtype: "q8" }),
    ]);
    tokenizer = loadedTokenizer;
    model = loadedModel;
    isModelLoaded = true;
  } catch (err) {
    console.error("Error loading model:", err);
    isModelLoaded = false;
    throw err;
  }
}
|
17 |
+
|
18 |
+
/**
 * Numerically stable softmax.
 *
 * The largest logit is subtracted before exponentiating. This leaves the
 * result mathematically unchanged but keeps Math.exp from overflowing to
 * Infinity (which would turn every probability into NaN).
 *
 * @param {number[]} logits Raw scores for one token.
 * @returns {number[]} Probabilities in the same order; empty for empty input.
 */
function stableSoftmax(logits) {
  const peak = Math.max(...logits);
  const exps = logits.map((value) => Math.exp(value - peak));
  const total = exps.reduce((sum, value) => sum + value, 0);
  return exps.map((value) => value / total);
}

/**
 * Runs the model over `text` and masks the words it considers private.
 *
 * Steps: tokenise, decode every token id back to its string, run the model,
 * split the flat logits into one triple per token, convert each triple to
 * probabilities, then hand the per-token predictions to
 * aggregatePrivacyTokens and maskText.
 *
 * @param {string} text Text to anonymise.
 * @param {number} [threshold=0.3] Minimum sensitive-class probability for a
 *   word to be masked (forwarded to aggregatePrivacyTokens).
 * @returns {Promise<{maskedText: string, replacements: Array}>} The result of
 *   maskText.
 * @throws {Error} 'Model not loaded' when called before loadModel succeeded.
 */
async function processText(text, threshold = 0.3) {
  if (!isModelLoaded) {
    throw new Error('Model not loaded');
  }
  const inputs = await tokenizer(text);
  const inputTokens = inputs.input_ids.data;
  // Decode ids one at a time so each prediction can be paired with the exact
  // string piece it belongs to (special tokens are kept).
  const tokenStrings = Array.from(inputTokens).map(id =>
    tokenizer.decode([id], { skip_special_tokens: false })
  );

  const { logits } = await model(inputs);
  const logitsData = Array.from(logits.data);
  const numTokens = tokenStrings.length;
  // The logits are read as consecutive triples, labelled below as
  // B-PRIVATE / I-PRIVATE / O.
  const numClasses = 3;

  const logitsPerToken = [];
  for (let i = 0; i < numTokens; i++) {
    logitsPerToken.push(logitsData.slice(i * numClasses, (i + 1) * numClasses));
  }

  const tokenPredictions = tokenStrings.map((token, i) => {
    const probs = stableSoftmax(logitsPerToken[i]);
    // A token counts as sensitive through either private class.
    const maxSensitive = Math.max(probs[0], probs[1]);
    return {
      token: token,
      start: i,
      end: i + 1,
      probabilities: {
        "B-PRIVATE": probs[0],
        "I-PRIVATE": probs[1],
        "O": probs[2]
      },
      maxSensitiveScore: maxSensitive
    };
  });

  const aggregated = aggregatePrivacyTokens(tokenPredictions, threshold);
  const { maskedText, replacements } = maskText(tokenPredictions, aggregated);
  return { maskedText, replacements };
}
|
64 |
+
|
65 |
+
/**
 * Groups token predictions into words and keeps the words that look private.
 *
 * A word starts at a token with a leading space, or at the first token after
 * the start of the sequence or after a special token ('[CLS]' / '[SEP]'),
 * since such a word carries no leading space. The word then absorbs every
 * following token until the next space-prefixed or special token. A word is
 * kept when the highest `maxSensitiveScore` among its tokens is at least
 * `threshold`.
 *
 * @param {Array<{token: string, maxSensitiveScore: number}>} tokenPredictions
 *   Per-token predictions in sequence order.
 * @param {number} threshold Inclusive minimum score for a word to be kept.
 * @returns {Array<{tokens: Array, indices: number[], scores: number[], startsWithSpace: boolean}>}
 *   The kept words, in order; `indices` are positions in `tokenPredictions`.
 */
function aggregatePrivacyTokens(tokenPredictions, threshold) {
  const specialTokens = ['[CLS]', '[SEP]'];
  const isSpecial = (prediction) => specialTokens.includes(prediction.token);

  const aggregated = [];
  const n = tokenPredictions.length;
  let i = 0;

  while (i < n) {
    const currentToken = tokenPredictions[i];
    if (isSpecial(currentToken)) {
      i++;
      continue;
    }

    // Every non-special token reached here opens a word: the inner loop below
    // consumes all continuation pieces, so this token is either space-prefixed
    // or the first piece after the sequence start / a special token. Requiring
    // index 0 for the latter would drop the first word whenever a special
    // token precedes it.
    const group = {
      tokens: [currentToken],
      indices: [i],
      scores: [currentToken.maxSensitiveScore],
      startsWithSpace: currentToken.token.startsWith(' ')
    };
    i++;

    while (i < n &&
           !tokenPredictions[i].token.startsWith(' ') &&
           !isSpecial(tokenPredictions[i])) {
      group.tokens.push(tokenPredictions[i]);
      group.indices.push(i);
      group.scores.push(tokenPredictions[i].maxSensitiveScore);
      i++;
    }

    if (Math.max(...group.scores) >= threshold) {
      aggregated.push(group);
    }
  }
  return aggregated;
}
|
103 |
+
|
104 |
+
/**
 * Rebuilds the text from its tokens, replacing each private word with a
 * numbered placeholder (`[PII_1]`, `[PII_2]`, ...).
 *
 * Special tokens ('[CLS]' / '[SEP]') are dropped. For a masked word only its
 * first token emits output (the placeholder, prefixed with a space when the
 * word had one); its remaining tokens emit nothing. Afterwards runs of
 * spaces/tabs are collapsed to a single space per line, each line is trimmed,
 * and line breaks are preserved.
 *
 * @param {Array<{token: string}>} tokenPredictions Per-token predictions in
 *   sequence order.
 * @param {Array<{tokens: Array<{token: string}>, indices: number[], scores: number[], startsWithSpace: boolean}>} aggregatedGroups
 *   Words to mask, as produced by aggregatePrivacyTokens.
 * @returns {{maskedText: string, replacements: Array<{original: string, placeholder: string, activation: number}>}}
 *   The masked text and one record per placeholder; `activation` is the
 *   highest score within the word.
 */
function maskText(tokenPredictions, aggregatedGroups) {
  const specialTokens = ['[CLS]', '[SEP]'];
  const maskedTokens = [];
  const replacements = [];
  const maskedIndices = new Set();
  // Index each group by the position of its first token so the per-token loop
  // does a constant-time lookup instead of scanning all groups every time.
  const groupByFirstIndex = new Map();
  let redactedCounter = 1;

  for (const group of aggregatedGroups) {
    const firstIndex = group.indices[0];
    // Keep the earliest group for a given start position.
    if (!groupByFirstIndex.has(firstIndex)) {
      groupByFirstIndex.set(firstIndex, group);
    }
    for (const idx of group.indices) {
      maskedIndices.add(idx);
    }
  }

  tokenPredictions.forEach((token, idx) => {
    if (specialTokens.includes(token.token)) return;

    if (!maskedIndices.has(idx)) {
      maskedTokens.push(token.token);
      return;
    }

    // Only the first token of a masked word produces output.
    const group = groupByFirstIndex.get(idx);
    if (!group) return;

    const originalText = group.tokens
      .map((piece, i) => (i === 0 && group.startsWithSpace ? piece.token.trimStart() : piece.token))
      .join('');
    const placeholder = `[PII_${redactedCounter}]`;
    replacements.push({
      original: originalText,
      placeholder: placeholder,
      activation: Math.max(...group.scores)
    });
    redactedCounter++;
    maskedTokens.push(group.startsWithSpace ? ` ${placeholder}` : placeholder);
  });

  // Join first, then tidy whitespace line by line so newlines survive.
  const joinedText = maskedTokens.join('');
  const processedLines = joinedText
    .split('\n')
    .map(line => line.replace(/[ \t]+/g, ' ').trim());
  const maskedText = processedLines.join('\n').trim();

  return { maskedText, replacements };
}
|
146 |
+
|
147 |
+
|
148 |
+
export { loadModel, processText, isModelLoaded };
|