Spaces:

ai4privacy
/

general-english-anonymiser-openpii-500k

Running

App Files Community

MikeDoes committed on Apr 14

Commit

d46aa0f

verified ·

1 Parent(s): 6e118b7

Upload 4 files

Browse files

Files changed (4) hide show

components.js +131 -0
data.jsonl +2 -0
index.html +129 -0
inference.js +148 -0

components.js ADDED Viewed

	@@ -0,0 +1,131 @@

+import { loadModel, processText, isModelLoaded } from './inference.js';
// Cached references to the page elements this module reads and updates.
const byId = (id) => document.getElementById(id);
const inputText = byId('inputText');
const outputText = byId('outputText');
const statusElement = byId('processingStatus');
const privacyMaskDiv = byId('privacyMask');
const thresholdInput = byId('thresholdInput');
const settingsButton = byId('settingsButton');
const settingsPanel = byId('settingsPanel');

// Mutable UI state shared by the handlers in this module.
let currentInput = "";
let settingsVisible = false;
let samples = {};

// Handle of the pending debounce timer; cleared and re-armed on every keystroke.
let timeout;

/**
 * Records the latest textarea content and schedules `updateOutput` to run
 * 300 ms after the most recent input event.
 *
 * @param {Event} event - The `input` event fired by the textarea.
 */
function handleInputChange({ target }) {
  currentInput = target.value;
  statusElement.textContent = 'Processing...';
  clearTimeout(timeout);
  timeout = setTimeout(updateOutput, 300);
}

inputText.addEventListener('input', handleInputChange);
// Sequence number of the most recently started run. A run that finishes after
// a newer one has started compares against this and leaves the UI alone.
let latestUpdateRun = 0;

/**
 * Builds the tile shown in the "Detected Entities" panel for one replacement.
 *
 * Every value is assigned through `textContent`, never `innerHTML`:
 * `replacement.original` is text the user typed, so it must not be parsed
 * as markup.
 *
 * @param {{placeholder: string, original: string, activation: number}} replacement
 * @returns {HTMLElement} The populated tile element.
 */
function createReplacementTile(replacement) {
  const tile = document.createElement('div');
  tile.className = 'bg-gray-800 p-3 rounded-lg border border-white/10 hover:border-white/20 transition duration-200 hover:-translate-y-[2px] hover:shadow-[0_4px_6px_rgba(0,0,0,0.1)]';
  const rows = [
    ['text-xs text-white/60 mb-1', replacement.placeholder],
    ['text-sm text-white font-medium', replacement.original],
    ['text-xs text-white/40 mt-1', 'Sensitive Information'],
    ['text-xs text-white/40 mt-1', `Activation: ${Math.round(replacement.activation * 100)}%`],
  ];
  rows.forEach(([className, text]) => {
    const row = document.createElement('div');
    row.className = className;
    row.textContent = text;
    tile.appendChild(row);
  });
  return tile;
}

/**
 * Runs the anonymiser over `currentInput` and refreshes the output textarea,
 * the status line and the detected-entities panel.
 *
 * If the model is not loaded yet, only a "Loading model..." status is shown.
 * Processing errors are logged and reported in the UI; they are not rethrown.
 * Results of a run that has been superseded by a newer call are discarded.
 *
 * @returns {Promise<void>}
 */
async function updateOutput() {
  if (!isModelLoaded) {
    statusElement.textContent = 'Loading model...';
    outputText.value = "";
    return;
  }
  const runId = ++latestUpdateRun;
  // Snapshot the input so the status line describes the text actually processed.
  const text = currentInput;
  try {
    // An empty, non-numeric or zero threshold falls back to 0.01.
    const threshold = parseFloat(thresholdInput.value) || 0.01;
    const processed = await processText(text, threshold);
    if (runId !== latestUpdateRun) {
      return;
    }
    statusElement.textContent = `Processed ${text.length} characters`;
    outputText.value = processed.maskedText;
    privacyMaskDiv.innerHTML = '';
    if (processed.replacements.length > 0) {
      processed.replacements.forEach((replacement) => {
        privacyMaskDiv.appendChild(createReplacementTile(replacement));
      });
    } else {
      const emptyState = document.createElement('div');
      emptyState.className = 'text-center text-white/40 py-4';
      emptyState.textContent = 'No sensitive information detected.';
      privacyMaskDiv.appendChild(emptyState);
    }
  } catch (err) {
    console.error("Error processing text:", err);
    if (runId !== latestUpdateRun) {
      return;
    }
    statusElement.textContent = 'Error processing text';
    outputText.value = "Error processing text.";
  }
}
// Settings panel: the cog button toggles it, any click outside closes it.

/**
 * Flips the panel's visibility. Propagation is stopped so this same click
 * does not reach the document-level handler that closes the panel.
 *
 * @param {MouseEvent} event - Click on the settings button.
 */
function toggleSettingsPanel(event) {
  settingsVisible = !settingsVisible;
  settingsPanel.classList.toggle('hidden', !settingsVisible);
  event.stopPropagation();
}

/**
 * Hides the panel when it is open and the click landed outside of it.
 *
 * @param {MouseEvent} event - Click anywhere in the document.
 */
function closeSettingsOnOutsideClick({ target }) {
  if (!settingsVisible || settingsPanel.contains(target)) {
    return;
  }
  settingsPanel.classList.add('hidden');
  settingsVisible = false;
}

settingsButton.addEventListener('click', toggleSettingsPanel);
document.addEventListener('click', closeSettingsOnOutsideClick);
/**
 * Loads the sample texts from `data.jsonl` into the module-level `samples`
 * lookup, keyed by each record's `title`.
 *
 * The file is read as one JSON object per line. Blank lines are ignored and a
 * line that fails to parse is skipped with a warning, so one bad record does
 * not discard the others. `samples` is replaced only after the file was
 * fetched successfully; on a failed request the previous value is kept.
 * Errors are logged, never thrown.
 *
 * @returns {Promise<void>}
 */
async function loadSamples() {
  try {
    const response = await fetch('data.jsonl');
    // fetch() resolves for 4xx/5xx too; do not try to parse an error page.
    if (!response.ok) {
      throw new Error(`Failed to fetch data.jsonl (HTTP ${response.status})`);
    }
    const text = await response.text();
    const loaded = {};
    text.split('\n').forEach((line, index) => {
      if (line.trim() === '') {
        return;
      }
      try {
        const { title, text: sampleText } = JSON.parse(line);
        loaded[title] = sampleText;
      } catch (err) {
        console.warn(`Skipping malformed sample on line ${index + 1}:`, err);
      }
    });
    samples = loaded;
  } catch (err) {
    console.error("Error loading samples:", err);
  }
}
/**
 * Wires one sample button: on click, copies the named sample into the input
 * textarea and `currentInput`, then re-runs the anonymiser. If the sample is
 * missing or empty, an alert with `missingMessage` is shown instead.
 *
 * @param {string} buttonId - Id of the button element.
 * @param {string} sampleTitle - Key to look up in `samples`.
 * @param {string} missingMessage - Alert text used when the sample is absent.
 */
function bindSampleButton(buttonId, sampleTitle, missingMessage) {
  document.getElementById(buttonId).addEventListener('click', () => {
    const sampleText = samples[sampleTitle];
    if (!sampleText) {
      alert(missingMessage);
      return;
    }
    inputText.value = sampleText;
    currentInput = sampleText;
    updateOutput();
  });
}

bindSampleButton('sampleEmailButton', "customer email", "Sample customer email not found.");
bindSampleButton('sampleDocumentButton', "document", "Sample document not found.");
/**
 * Application entry point: loads the model and the sample texts, then renders
 * the initial output. The two loads do not depend on each other, so they are
 * started together. A failure is logged and reported in the status line and
 * the output textarea.
 *
 * @returns {Promise<void>}
 */
async function init() {
  statusElement.textContent = 'Loading model and samples...';
  try {
    await Promise.all([loadModel(), loadSamples()]);
    statusElement.textContent = 'Model and samples loaded';
    updateOutput();
  } catch (err) {
    // Keep the cause visible in the console; the UI only shows a generic message.
    console.error("Error loading model or samples:", err);
    statusElement.textContent = 'Error loading model or samples';
    outputText.value = "Error loading model or samples.";
  }
}
init();

data.jsonl ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ {"title":"customer email","text":"Customer Complaint:\n\n\"This is Sarah Gomez born on the 9th March 2002 in San-Fransisco with policy number #2301. I've been on hold for over 30 minutes trying to speak with a customer service representative about my claim on life insurance. It's incredibly frustrating. Is there a faster way to get the help I need?\"\n\nThis is the complaint and can you write a quick message saying we are on the case and I will review this as soon as possible."}
2	+ {"title": "document","text": "COMMUNITY CENTER MEMBERSHIP REGISTRATION FORM\n\nPersonal Information:\n\n -Full Name: Kelsang Rishi\n -Date of Birth: 1939-04-21T00:00:00\n - Gender: Non-binary\n - Street Address: 50 Farm-to-Market 3059\n \n - City: Marshfield\n - Postal/ZIP Code: 44691\n - Phone Number: +19-19-079.8396\n - Email Address: ZJuly@tutanota.com\n"}

index.html ADDED Viewed

	@@ -0,0 +1,129 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  <title>Short Text and Open Source: Anonymiser</title>
+  <script src="https://cdn.tailwindcss.com"></script>
+  <script src="https://cdnjs.cloudflare.com/ajax/libs/iconify/2.0.0/iconify.min.js"></script>
+  <style>
+    @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;700&display=swap');
+    * {
+      font-family: 'Inter', sans-serif;
+    }
+    ::-webkit-scrollbar {
+      width: 6px;
+    }
+    ::-webkit-scrollbar-track {
+      background: #2d2d2d;
+    }
+    ::-webkit-scrollbar-thumb {
+      background: #4a4a4a;
+      border-radius: 3px;
+    }
+  </style>
+</head>
+<body class="bg-gray-900 min-h-screen">
+  <!-- Branding Header -->
+  <div class="bg-black/30 py-4 border-b border-white/10">
+    <div class="max-w-7xl mx-auto px-4 flex items-center justify-between">
+      <div class="flex items-center space-x-3">
+        <img src="ai4privacy-logo.png" alt="Logo" class="h-8 w-8">
+        <div>
+          <span class="text-xl font-bold text-white">Ai4Privacy</span>
+          <span class="block text-xs text-white/60">Short Text Anonymization Locally in Your Browser</span>
+        </div>
+      </div>
+      <!-- Settings Button -->
+      <button id="settingsButton" class="text-white/60 hover:text-white transition-colors">
+        <span class="iconify" data-icon="mdi:cog" data-width="24"></span>
+      </button>
+    </div>
+  </div>
+  <!-- Settings Panel -->
+  <div id="settingsPanel" class="hidden absolute right-4 top-20 bg-gray-800 border border-white/10 rounded-xl p-4 w-64 space-y-4 z-50">
+    <div>
+      <label class="block text-sm text-white/80 mb-2">Detection Threshold</label>
+      <input type="number" id="thresholdInput" step="0.001" min="0" max="1" value="0.3"
+             class="w-full bg-gray-700 border border-white/10 rounded-lg px-3 py-2 text-white">
+    </div>
+    <div>
+      <label class="block text-sm text-white/80 mb-2">Language Model</label>
+      <select id="modelSelect" class="w-full bg-gray-700 border border-white/10 rounded-lg px-3 py-2 text-white">
+        <option value="english">English - ai4privacy/llama-ai4privacy-english-anonymiser-openpii</option>
+      </select>
+    </div>
+  </div>
+  <div class="max-w-7xl mx-auto px-4 py-8">
+    <div class="flex flex-col lg:flex-row gap-8">
+      <!-- Input/Output Section -->
+      <div class="flex-1 space-y-6">
+        <div>
+          <label class="block text-sm font-medium text-white/80 mb-2">Input Text</label>
+          <textarea
+            id="inputText"
+            class="w-full p-4 bg-gray-800 border border-white/10 rounded-xl text-white placeholder-white/30 focus:border-blue-500 focus:ring-2 focus:ring-blue-500/30 resize-none transition duration-200 ease-in-out"
+            rows="6"
+            placeholder="Enter sensitive text to anonymize..."
+          ></textarea>
+        </div>
+        <div>
+          <label class="block text-sm font-medium text-white/80 mb-2">Anonymized Output</label>
+          <textarea
+            id="outputText"
+            class="w-full p-4 bg-gray-800 border border-white/10 rounded-xl text-white/80 resize-none transition duration-200 ease-in-out"
+            rows="6"
+            readonly
+          ></textarea>
+        </div>
+      </div>
+      <!-- Privacy Mask Panel -->
+      <div class="lg:w-96">
+        <div class="sticky top-8">
+          <label class="block text-sm font-medium text-white/80 mb-2">Detected Entities</label>
+          <div class="bg-gray-800 border border-white/10 rounded-xl p-4">
+            <div class="mb-4">
+              <span id="processingStatus" class="text-xs text-white/40">Ready</span>
+            </div>
+            <div
+              id="privacyMask"
+              class="h-96 bg-gray-850 rounded-lg p-3 overflow-y-auto text-sm space-y-2 transition duration-200 ease-in-out"
+            >
+              <div class="text-center text-white/40 py-4">Processing results will appear here</div>
+            </div>
+          </div>
+        </div>
+      </div>
+    </div>
+    <!-- Sample Buttons -->
+    <div class="mt-8 flex justify-center space-x-4">
+      <button id="sampleEmailButton" class="bg-blue-600 text-white px-4 py-2 rounded-lg hover:bg-blue-700 transition">
+        Sample Customer Email
+      </button>
+      <button id="sampleDocumentButton" class="bg-blue-600 text-white px-4 py-2 rounded-lg hover:bg-blue-700 transition">
+        Sample Document
+      </button>
+    </div>
+  </div>
+  <!-- Branding Footer -->
+  <div class="fixed bottom-0 left-0 right-0 bg-black/30 border-t border-white/10 py-3">
+    <div class="max-w-7xl mx-auto px-4">
+      <div class="flex items-center justify-between">
+        <div class="text-sm text-white/50">© 2025 Ai4Privacy. All rights reserved. Use at your own risk. Ai4Privacy assumes no responsibility for implementation, accuracy, or any resulting damages.</div>
+        <div class="flex items-center space-x-4">
+          <span class="text-sm text-white/50">v2.1.0</span>
+          <div class="w-px h-4 bg-white/10"></div>
+          <img src="ai4privacy-logo.png" alt="Logo" class="h-6 w-6 opacity-70">
+        </div>
+      </div>
+    </div>
+  </div>
+  <script type="module" src="components.js"></script>
+</body>
+</html>

inference.js ADDED Viewed

	@@ -0,0 +1,148 @@

+import { AutoModel, AutoTokenizer } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.4.0';
// Hub repository that provides both the tokenizer and the model weights.
const MODEL_REPO = 'ai4privacy/llama-ai4privacy-english-anonymiser-openpii';

// Populated by loadModel(); read by processText().
let tokenizer;
let model;
let isModelLoaded = false;

/**
 * Loads the tokenizer and then the model (requested with dtype "q8") from
 * `MODEL_REPO`, and sets `isModelLoaded` to reflect the outcome.
 *
 * @returns {Promise<void>}
 * @throws Rethrows the loading error after logging it.
 */
async function loadModel() {
  try {
    tokenizer = await AutoTokenizer.from_pretrained(MODEL_REPO);
    model = await AutoModel.from_pretrained(MODEL_REPO, { dtype: "q8" });
    isModelLoaded = true;
  } catch (err) {
    console.error("Error loading model:", err);
    isModelLoaded = false;
    throw err;
  }
}
/**
 * Numerically stable softmax. The largest logit is subtracted before
 * exponentiating so that large values cannot overflow `Math.exp` to
 * `Infinity`, which would turn the probabilities into `NaN`.
 *
 * @param {number[]} logits - Raw scores for one token.
 * @returns {number[]} Probabilities in the same order as `logits`.
 */
function stableSoftmax(logits) {
  const maxLogit = Math.max(...logits);
  const exps = logits.map((value) => Math.exp(value - maxLogit));
  const total = exps.reduce((sum, value) => sum + value, 0);
  return exps.map((value) => value / total);
}

/**
 * Tokenizes `text`, runs the model, and masks every word group whose
 * sensitivity score reaches `threshold`.
 *
 * The flat logits array is read as three consecutive values per token, in the
 * order B-PRIVATE, I-PRIVATE, O. A token's sensitivity score is the larger of
 * its B-PRIVATE and I-PRIVATE probabilities.
 *
 * @param {string} text - Text to anonymise.
 * @param {number} [threshold=0.3] - Minimum score for a word group to be masked.
 * @returns {Promise<{maskedText: string, replacements: Array<{original: string, placeholder: string, activation: number}>}>}
 * @throws {Error} If the model has not been loaded.
 */
async function processText(text, threshold = 0.3) {
  if (!isModelLoaded) {
    throw new Error('Model not loaded');
  }
  const inputs = await tokenizer(text);
  const inputTokens = inputs.input_ids.data;
  // Decode ids one at a time so every token keeps its own surface string
  // (including any leading space, which later marks word boundaries).
  const tokenStrings = Array.from(inputTokens).map(id =>
    tokenizer.decode([id], { skip_special_tokens: false })
  );
  const { logits } = await model(inputs);
  const logitsData = Array.from(logits.data);
  const numClasses = 3;
  const tokenPredictions = tokenStrings.map((token, i) => {
    const tokenLogits = logitsData.slice(i * numClasses, (i + 1) * numClasses);
    const probs = stableSoftmax(tokenLogits);
    return {
      token: token,
      start: i,
      end: i + 1,
      probabilities: {
        "B-PRIVATE": probs[0],
        "I-PRIVATE": probs[1],
        "O": probs[2]
      },
      maxSensitiveScore: Math.max(probs[0], probs[1])
    };
  });
  const aggregated = aggregatePrivacyTokens(tokenPredictions, threshold);
  const { maskedText, replacements } = maskText(tokenPredictions, aggregated);
  return { maskedText, replacements };
}
/**
 * Groups sub-word tokens into words and returns the words that should be masked.
 *
 * A word starts at a token whose text begins with a space, or at the first
 * token that follows the start of the sequence or a special token
 * (`[CLS]` / `[SEP]`). It then absorbs every following token up to the next
 * space-prefixed or special token. A word is kept when the highest
 * `maxSensitiveScore` among its tokens is at least `threshold`.
 *
 * Starting a word after a leading `[CLS]` matters: the first word of the text
 * has no leading space, and requiring it to sit at index 0 would drop it
 * whenever the tokenizer prepends a special token.
 *
 * @param {Array<{token: string, maxSensitiveScore: number}>} tokenPredictions
 * @param {number} threshold - Minimum score for a word to be kept.
 * @returns {Array<{tokens: object[], indices: number[], scores: number[], startsWithSpace: boolean}>}
 */
function aggregatePrivacyTokens(tokenPredictions, threshold) {
  const specialTokens = ['[CLS]', '[SEP]'];
  const isSpecial = (prediction) => specialTokens.includes(prediction.token);
  const startsWord = (prediction) => prediction.token.startsWith(' ');

  const aggregated = [];
  const n = tokenPredictions.length;
  let i = 0;
  while (i < n) {
    const first = tokenPredictions[i];
    if (isSpecial(first)) {
      i++;
      continue;
    }
    // Any token reached here opens a word: it is either space-prefixed or the
    // first token after the sequence start / a special token.
    const group = {
      tokens: [first],
      indices: [i],
      scores: [first.maxSensitiveScore],
      startsWithSpace: startsWord(first)
    };
    i++;
    while (i < n && !startsWord(tokenPredictions[i]) && !isSpecial(tokenPredictions[i])) {
      group.tokens.push(tokenPredictions[i]);
      group.indices.push(i);
      group.scores.push(tokenPredictions[i].maxSensitiveScore);
      i++;
    }
    if (Math.max(...group.scores) >= threshold) {
      aggregated.push(group);
    }
  }
  return aggregated;
}
/**
 * Rebuilds the text from its tokens, replacing each aggregated group with a
 * numbered placeholder (`[PII_1]`, `[PII_2]`, ...).
 *
 * `[CLS]` and `[SEP]` tokens are omitted. A group's placeholder is emitted at
 * the group's first token (keeping the leading space if the group had one) and
 * the group's remaining tokens are dropped. After joining, runs of spaces and
 * tabs are collapsed to one space and each line is trimmed; newlines are kept.
 *
 * @param {Array<{token: string}>} tokenPredictions - All tokens, in order.
 * @param {Array<{tokens: object[], indices: number[], scores: number[], startsWithSpace: boolean}>} aggregatedGroups
 * @returns {{maskedText: string, replacements: Array<{original: string, placeholder: string, activation: number}>}}
 */
function maskText(tokenPredictions, aggregatedGroups) {
  const specialTokens = ['[CLS]', '[SEP]'];

  // Index the groups once so the per-token loop does constant-time lookups
  // instead of scanning every group for every token.
  const groupByFirstIndex = new Map();
  const maskedIndices = new Set();
  aggregatedGroups.forEach((group) => {
    const firstIndex = group.indices[0];
    if (!groupByFirstIndex.has(firstIndex)) {
      groupByFirstIndex.set(firstIndex, group);
    }
    group.indices.forEach((idx) => maskedIndices.add(idx));
  });

  const maskedTokens = [];
  const replacements = [];
  tokenPredictions.forEach((prediction, idx) => {
    if (specialTokens.includes(prediction.token)) {
      return;
    }
    if (!maskedIndices.has(idx)) {
      maskedTokens.push(prediction.token);
      return;
    }
    const group = groupByFirstIndex.get(idx);
    if (!group) {
      // Continuation token of a masked group: already covered by its placeholder.
      return;
    }
    const originalText = group.tokens
      .map((t, pos) => (pos === 0 && group.startsWithSpace ? t.token.trimStart() : t.token))
      .join('');
    const placeholder = `[PII_${replacements.length + 1}]`;
    replacements.push({
      original: originalText,
      placeholder: placeholder,
      activation: Math.max(...group.scores)
    });
    maskedTokens.push(group.startsWithSpace ? ` ${placeholder}` : placeholder);
  });

  // Collapse spaces and tabs line by line so the original line breaks survive.
  const maskedText = maskedTokens
    .join('')
    .split('\n')
    .map((line) => line.replace(/[ \t]+/g, ' ').trim())
    .join('\n')
    .trim();
  return { maskedText, replacements };
}
+export { loadModel, processText, isModelLoaded };