In [11]:
import numpy as np
import re


In [45]:
from presidio_anonymizer.entities import (RecognizerResult,
    OperatorResult,
    OperatorConfig)
from privacy.service.service import PrivacyService

In [12]:
text = "John Doe's Social Security number is 123-45-6789 and his email is johndoe@example.com."

# Define regular expressions for different types of PII
ssn_pattern = r"\d{3}-\d{2}-\d{4}"
email_pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"


In [3]:
# Find matches for Social Security numbers
ssn_matches = re.findall(ssn_pattern, text)

# Find matches for email addresses
email_matches = re.findall(email_pattern, text)

In [13]:
# Apply differential privacy to the detected PII counts
epsilon = 0.1 

In [16]:
def add_noise(value):
    scale = 1 / epsilon
    laplace_noise = np.random.laplace(loc=0, scale=scale)
    print(value)
    print(laplace_noise)
    return value + laplace_noise

In [17]:

# Find matches for Social Security numbers
ssn_matches = re.findall(ssn_pattern, text)

# Add differential privacy to the Social Security numbers
noisy_ssn_matches = [add_noise(ssn) for ssn in ssn_matches]

123-45-6789
-6.338424074647873


TypeError: can only concatenate str (not "float") to str

In [18]:
# Find matches for email addresses
email_matches = re.findall(email_pattern, text)

# Add differential privacy to the email addresses
noisy_email_matches = [add_noise(email) for email in email_matches]

johndoe@example.com
21.357718997606124


TypeError: can only concatenate str (not "float") to str

In [23]:
import numpy as np
from presidio_anonymizer import AnonymizerEngine

# Initialize the anonymizer engine
anonymizer = AnonymizerEngine()

# Define the text containing potential PII
text = "John Doe's Social Security number is 123-45-6789 and his email is johndoe@example.com."

# Apply differential privacy to the PII detection process
epsilon = 0.1  # Privacy parameter for differential privacy
sensitivity = 1  # Sensitivity of the PII detection result
delta = 1e-6  # Privacy parameter for differential privacy

# Calculate the noise to be added
scale = sensitivity / epsilon
laplace_noise = np.random.laplace(loc=0, scale=scale)

# Detect PII in the text
# Example rule-based matching for SSN and email
ssn_pattern = r"\d{3}-\d{2}-\d{4}"
email_pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"

# Apply noise to the PII detection results
noisy_ssn_count = len(re.findall(ssn_pattern, text)) + laplace_noise
noisy_email_count = len(re.findall(email_pattern, text)) + laplace_noise

# Anonymize the PII detection results using Presidio
anonymized_ssn_count = anonymizer.anonymize(
    str(noisy_ssn_count),
    analyzer_results=[],
    operators={"anonymizer_config": {"type": "replace", "value": "<COUNT>"}},
)

anonymized_email_count = anonymizer.anonymize(
    str(noisy_email_count),
    analyzer_results=[],
    operators={"anonymizer_config": {"type": "replace", "value": "<COUNT>"}},
)

# Print the anonymized PII detection results
print("Noisy SSN count:", noisy_ssn_count)
print("Anonymized SSN count:", anonymized_ssn_count)

print("Noisy email count:", noisy_email_count)
print("Anonymized email count:", anonymized_email_count)

Noisy SSN count: 1.5109753118487679
Anonymized SSN count: text: 1.5109753118487679
items:
[
    
]

Noisy email count: 1.5109753118487679
Anonymized email count: text: 1.5109753118487679
items:
[
    
]



In [89]:
import numpy as np
from presidio_anonymizer import AnonymizerEngine

# Initialize the anonymizer engine
anonymizer = AnonymizerEngine()

# Define the text containing PII
text = "My email is john.doe@example.com"

# Apply differential privacy to the PII value
epsilon = 0.1  # Privacy parameter for differential privacy

# Generate Laplace noise for each character in the email
laplace_noise = np.random.laplace(loc=0, scale=1/epsilon, size=len(text))

# Add the noise to each character in the email
noisy_email = ''.join(chr(ord(c) + int(round(n))) for c, n in zip(text, laplace_noise))

# Anonymize the noisy email using Presidio
anonymized_text = anonymizer.anonymize(
    noisy_email,
    analyzer_results=[],
    operators=
        {"Email": {"type": "replace", "value": "<EMAIL>"}}
    ,
)

# Print the anonymized text
print(anonymized_text)

text: Px(Z~fmÂ‚#[nXndl.kZ^@j{lplsX(`gx
items:
[
    
]



In [92]:
import numpy as np
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_anonymizer import AnonymizerEngine

# Initialize the anonymizer engine
anonymizer = AnonymizerEngine()

# Define the text containing PII
text = "John Doe's Social Security number is 123-45-6789 and his email is johndoe@example.com."

# Apply differential privacy to the PII value
epsilon = 0.1  # Privacy parameter for differential privacy
sensitivity = 2  # Sensitivity of the PII value
delta = 1e-6  # Privacy parameter for differential privacy

# Calculate the noise to be added
def run():
    scale = sensitivity / epsilon
    laplace_noise = np.random.laplace(loc=0, scale=scale)

# Add the noise to the PII value
    noisy_value =laplace_noise
    return noisy_value
# results = PrivacyService.__analyze(text=text)
# Anonymize the noisy value using Presidio

registry = RecognizerRegistry()
analyzer = AnalyzerEngine(registry=registry)
registry.load_predefined_recognizers()

results = analyzer.analyze(text=text, language="en")
        
print(results)
op={}
for i in results:
    print(i)
    op[i.entity_type]=OperatorConfig("replace", {"new_value": str(run())})
anonymized_text = anonymizer.anonymize(
    text,
    analyzer_results=results,
    operators=op
              

    ,
)

# Print the anonymized text
print(anonymized_text)

John Doe'S Social Security Number Is 123-45-6789 And His Email Is Johndoe@Example.Com.
[type: EMAIL_ADDRESS, start: 66, end: 85, score: 1.0, type: PERSON, start: 0, end: 10, score: 0.85, type: URL, start: 74, end: 85, score: 0.5]
type: EMAIL_ADDRESS, start: 66, end: 85, score: 1.0
type: PERSON, start: 0, end: 10, score: 0.85
type: URL, start: 74, end: 85, score: 0.5
text: -6.311321244615104 Social Security number is 123-45-6789 and his email is -11.671955800130334.
items:
[
    {'start': 74, 'end': 93, 'entity_type': 'EMAIL_ADDRESS', 'text': '-11.671955800130334', 'operator': 'replace'},
    {'start': 0, 'end': 18, 'entity_type': 'PERSON', 'text': '-6.311321244615104', 'operator': 'replace'}
]



In [10]:
import numpy as np
import pandas as pd


# Sample dataset with PII
data = pd.DataFrame({
    'Name': ['John Doe', 'Jane Smith', 'Alice Johnson'],
    'Age': [25, 30, 50],
    "s":[1,2,3],
    'Email': ['john.doe@example.com', 'jane.smith@example.com', 'alice.johnson@example.com']
})
print(data)

# Generalization
# Generalize age into age ranges
data['Age1'] = pd.cut(data['Age'], bins=[0, 30, 40,50], labels=['20-30', '30-40','40-50'])

# Suppression
# Suppress or remove email column
data = data.drop('Email', axis=1)

# Perturbation
# Perturb age values by adding Laplace noise
epsilon = 1.0  # Privacy parameter for differential privacy
sensitivity = 1  # Sensitivity of the age values
scale = sensitivity / epsilon
laplace_noise = np.random.laplace(loc=0, scale=scale, size=len(data))
# data['Age','s'] += laplace_noise

# print(data['Age'])

print(data)

            Name  Age  s                      Email
0       John Doe   25  1       john.doe@example.com
1     Jane Smith   30  2     jane.smith@example.com
2  Alice Johnson   50  3  alice.johnson@example.com
['20-30', '30-40', '40-50']
[0, 30, 40, 50]
            Name  Age  s   Age1
0       John Doe   25  1  20-30
1     Jane Smith   30  2  20-30
2  Alice Johnson   50  3  40-50


In [82]:
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry

registry = RecognizerRegistry()
analyzer = AnalyzerEngine(registry=registry)
registry.load_predefined_recognizers()

print(str(data["Name"]))
results = analyzer.analyze(text=str(data["Name"]), language="en")
print(results)


0         John Doe
1       Jane Smith
2    Alice Johnson
Name: Name, dtype: object
0         John Doe
1       Jane Smith
2    Alice Johnson
Name: Name, Dtype: Object
[type: PERSON, start: 10, end: 18, score: 0.85, type: PERSON, start: 27, end: 37, score: 0.85, type: PERSON, start: 43, end: 56, score: 0.85]


In [86]:
s=data["Name"].to_string()
print(s)
p=pd.Series(s)
print(p)

0         John Doe
1       Jane Smith
2    Alice Johnson
0    0         John Doe\n1       Jane Smith\n2    A...
dtype: object


In [88]:
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()
dataset = [
    {"text": "John Doe's email is john.doe@example.com and his phone number is 555-123-4567."},
    {"text": "Alice Smith's social security number is 123-45-6789."},
]
analyzed_dataset = analyzer.analyze(dataset,language='en')
masked_dataset = anonymizer.anonymize(analyzed_dataset, dataset)
for item in masked_dataset:
    print(item["text"])



ValueError: [E1041] Expected a string, Doc, or bytes as input, but got: <class 'list'>

In [None]:
off
hashyfy
diffrential_pryivacy

In [101]:
class A:
    def x():
        print("x")
        return "x"

    def y():
        return "y"

# def fun(s):
    
#     print(s())
    
# fun("x")
s=getattr(A,"x")
# s=globals()["x"]
print(s())
    

x
x


In [7]:
!pip install diffprivlib

Looking in indexes: https://infyartifactory.ad.infosys.com/artifactory/api/pypi/pypi-remote/simple, https://infyartifactory.ad.infosys.com/artifactory/api/pypi/pypi-remote/simple


[notice] A new release of pip is available: 23.0.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip



Collecting diffprivlib
  Downloading https://infyartifactory.ad.infosys.com/artifactory/api/pypi/pypi-remote/packages/packages/a9/10/200015b77240c50f6f438e2b9e54a7179fdbf56f6ca9f40a11d90fd2c8f9/diffprivlib-0.6.3-py3-none-any.whl (176 kB)
     ---------------------------------------- 0.0/176.0 kB ? eta -:--:--
     -------------------------------------  174.1/176.0 kB 5.1 MB/s eta 0:00:01
     -------------------------------------  174.1/176.0 kB 5.1 MB/s eta 0:00:01
     -------------------------------------- 176.0/176.0 kB 1.8 MB/s eta 0:00:00
Installing collected packages: diffprivlib
Successfully installed diffprivlib-0.6.3


In [1]:
from diffprivlib.mechanisms import binary
import pandas as pd
df=pd.read_csv(r"C:\WORK\GIT\responsible-ai-admin\responsible-ai-admin\src\rai_admin\temp\emplist.csv")

In [53]:
print(df)

  Employee_ID Gender  Age  Education_Level Relationship_Status     Hometown  \
0   EID_22713      F   32                5              Single  Springfield   
1    EID_9658      M   65                2              Single      Lebanon   
2   EID_22203      M   52                3             Married  Springfield   
3    EID_7652      M   50                5              Single   Washington   
4    EID_6516      F   44                3             Married     Franklin   
5   EID_20283      F   22                4             Married     Franklin   
6   EID_21014      M   42                3             Married   Washington   
7    EID_7693      F   41                2             Married  Springfield   
8   EID_13232      M   31                1              Single  Springfield   

         Unit Decision_skill_possess  Time_of_service  Time_since_promotion  \
0         R&D             Conceptual                7                     4   
1          IT              Directive               

In [24]:
b=binary.Binary(epsilon=0.1,value0="F",value1="M",random_state=None)
b.randomise("F")

'M'

In [38]:
binaryList=[]
for c in df.columns:
    # print(s)
    if(len(df[c].unique())==2):
        binaryList.append(c)
print(binaryList)

['Gender', 'Relationship_Status']


In [40]:
mechanism = binary.Binary(epsilon=1.0,value0="F",value1="M")
df["Gender"] = mechanism.randomise(df["Gender"])

TypeError: Value to be randomised must be a string

In [41]:
df["Gender"].unique()

array(['F', 'M'], dtype=object)

In [54]:
def binaryCheck(df,col):
        data=list(df[col].unique())
        # print(data)
        mechanism = binary.Binary(epsilon=1.0,value0=data[0],value1=data[1])
        for d in range(len(df[col])):
            temp=df.loc[d,col]
            # print("==/",temp)
            df.loc[d,col]=mechanism.randomise(temp)
            # print("=====",temp,df.loc[d,col])

In [55]:
binaryCheck(df,"Gender")

In [2]:
df

Unnamed: 0,Employee_ID,Gender,Age,Education_Level,Relationship_Status,Hometown,Unit,Decision_skill_possess,Time_of_service,Time_since_promotion,growth_rate,Travel_Rate,Post_Level,Pay_Scale,Compensation_and_Benefits,Work_Life_balance
0,EID_22713,F,32,5,Single,Springfield,R&D,Conceptual,7,4,30,1,5,4,type2,1
1,EID_9658,M,65,2,Single,Lebanon,IT,Directive,41,2,72,1,1,1,type2,1
2,EID_22203,M,52,3,Married,Springfield,Sales,Directive,21,3,25,0,1,8,type3,1
3,EID_7652,M,50,5,Single,Washington,Marketing,Analytical,11,4,28,1,1,2,type0,4
4,EID_6516,F,44,3,Married,Franklin,R&D,Conceptual,12,4,47,1,3,2,type2,4
5,EID_20283,F,22,4,Married,Franklin,IT,Behavioral,3,1,53,0,3,6,type2,1
6,EID_21014,M,42,3,Married,Washington,Purchasing,Analytical,6,4,35,1,3,4,type2,1
7,EID_7693,F,41,2,Married,Springfield,Sales,Conceptual,4,4,35,1,4,8,type2,1
8,EID_13232,M,31,1,Single,Springfield,IT,Analytical,7,3,73,2,3,8,type2,3


In [3]:
minv=df.Age.min()
maxv=df.Age.max()



In [4]:
import math

base=10
maxrange=math.ceil(maxv / base) * base
minrange=round(minv/base)*base

print(maxrange,minrange)
diff=maxrange-minrange



70 20


In [18]:
range_magnitude = abs(maxrange - minrange)
# print(range_magnitude)
# Determine the number of ranges based on the magnitude``
num_ranges = max(range_magnitude // 10, 1)  # Assuming a minimum range size of 10

# Calculate the interval
interval = range_magnitude / num_ranges

ranges = []
binlist=set()
lablelist=[]

for i in range(num_ranges):
    start = minrange + i * interval
    end = minrange + (i + 1) * interval
    if(i==num_ranges-1):
        # print(i)
        end=maxrange
    binlist.add(start)
    binlist.add(end)
    lablelist.append(f"{start}-{end}")
    # ranges.append((start, end))
binlist=sorted(list(binlist))
print(lablelist)
print(binlist)

50
4
['20.0-30.0', '30.0-40.0', '40.0-50.0', '50.0-60.0', '60.0-70']
[20.0, 30.0, 40.0, 50.0, 60.0, 70]


In [16]:
df['Age1'] = pd.cut(df['Age'], bins=binlist, labels=lablelist)
df

['20.0-30.0', '30.0-40.0', '40.0-50.0', '50.0-60.0', '60.0-70.0']
[20.0, 30.0, 40.0, 50.0, 60.0, 70.0]


Unnamed: 0,Employee_ID,Gender,Age,Education_Level,Relationship_Status,Hometown,Unit,Decision_skill_possess,Time_of_service,Time_since_promotion,growth_rate,Travel_Rate,Post_Level,Pay_Scale,Compensation_and_Benefits,Work_Life_balance,Age1
0,EID_22713,F,32,5,Single,Springfield,R&D,Conceptual,7,4,30,1,5,4,type2,1,30.0-40.0
1,EID_9658,M,65,2,Single,Lebanon,IT,Directive,41,2,72,1,1,1,type2,1,60.0-70.0
2,EID_22203,M,52,3,Married,Springfield,Sales,Directive,21,3,25,0,1,8,type3,1,50.0-60.0
3,EID_7652,M,50,5,Single,Washington,Marketing,Analytical,11,4,28,1,1,2,type0,4,40.0-50.0
4,EID_6516,F,44,3,Married,Franklin,R&D,Conceptual,12,4,47,1,3,2,type2,4,40.0-50.0
5,EID_20283,F,22,4,Married,Franklin,IT,Behavioral,3,1,53,0,3,6,type2,1,20.0-30.0
6,EID_21014,M,42,3,Married,Washington,Purchasing,Analytical,6,4,35,1,3,4,type2,1,40.0-50.0
7,EID_7693,F,41,2,Married,Springfield,Sales,Conceptual,4,4,35,1,4,8,type2,1,40.0-50.0
8,EID_13232,M,31,1,Single,Springfield,IT,Analytical,7,3,73,2,3,8,type2,3,30.0-40.0


In [82]:
max(1,2)

TypeError: 'numpy.int64' object is not callable