Created
September 8, 2023 15:58
-
-
Save fsndzomga/f6fca781f243eff89f215bc7aad8f1ad to your computer and use it in GitHub Desktop.
Anonymize without LangChain
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import hashlib
import re

import spacy
from faker import Faker
# Generator for both the sample records and their replacement values.
fake = Faker()

# spaCy pipeline whose entity recognizer is used to spot PERSON mentions.
nlp = spacy.load("en_core_web_sm")

# Raw sentences and, in the same order, their anonymized counterparts.
DATA, DATA_ANON = [], []

# Replacement caches: digest of the original value -> fake substitute, so a
# value that occurs more than once is always replaced by the same substitute.
email_map, name_map, phone_map = {}, {}, {}
# Hash any string (names, emails, phone numbers) into a stable lookup key.
def encrypt_string(text: str) -> str:
    """Return the SHA-256 hex digest of *text*.

    Despite the name, this is a one-way hash, not reversible encryption:
    the original value cannot be recovered from the result. Equal inputs
    always yield equal digests.

    Args:
        text: The string to hash.

    Returns:
        The digest as a 64-character lowercase hexadecimal string.
    """
    # Encoding is spelled out so the digest does not hinge on a default.
    return hashlib.sha256(text.encode("utf-8")).hexdigest()
# Build the sample corpus: two sentences, each stating the same name, phone
# number and email address twice.
for _ in range(2):
    # Tuple elements are evaluated left to right: name, then phone, then email.
    name, phone, email = fake.name(), fake.phone_number(), fake.email()
    record = f"My name is {name}, call me at {phone} or email me at {email}. I say it again, my name is {name}, call me at {phone} or email me at {email}"
    DATA.append(record)
# Anonymize data
#
# For every sentence in DATA, swap each detected name, phone number and email
# address for a fake one and append the result to DATA_ANON. The *_map caches
# (keyed by the digest of the original value) guarantee that a value seen
# again -- in the same sentence or a later one -- gets the same substitute.

# Patterns are compiled once instead of being rebuilt for every sentence.
#
# Phone numbers: optional "+1" / "1" / "001" prefix, a 3-digit area code with
# or without parentheses, "-", "." or whitespace as separators, and an
# optional "x" extension. NOTE(review): this is meant to cover the layouts
# Faker's default locale emits -- confirm against the installed Faker version.
PHONE_RE = re.compile(
    r"(?<![\w.])"                    # must not start in the middle of a token
    r"(?:\+?1[-.\s]?|001[-.\s]?)?"   # optional country prefix
    r"(?:\(\d{3}\)|\d{3})[-.\s]?"    # area code
    r"\d{3}[-.\s]?\d{4}"             # subscriber number
    r"(?:x\d{1,5})?"                 # optional extension
    r"(?![\w@])"                     # must not run into a word or an email
)
# Email addresses: the TLD class is [A-Za-z]; the former [A-Z|a-z] also
# accepted a literal "|".
EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")

for sentence in DATA:
    doc = nlp(sentence)
    anon_sentence = sentence

    # Detect and replace names
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            encrypted_name = encrypt_string(ent.text)
            if encrypted_name not in name_map:
                name_map[encrypted_name] = fake.name()
            anon_sentence = anon_sentence.replace(ent.text, name_map[encrypted_name])

    # Detect and replace phone numbers. Matches are taken from the original
    # sentence; dict.fromkeys drops repeats while keeping first-seen order,
    # since str.replace already rewrites every occurrence at once.
    for phone in dict.fromkeys(PHONE_RE.findall(sentence)):
        encrypted_phone = encrypt_string(phone)
        if encrypted_phone not in phone_map:
            phone_map[encrypted_phone] = fake.phone_number()
        anon_sentence = anon_sentence.replace(phone, phone_map[encrypted_phone])

    # Detect and replace emails
    for email in dict.fromkeys(EMAIL_RE.findall(sentence)):
        encrypted_email = encrypt_string(email)
        if encrypted_email not in email_map:
            email_map[encrypted_email] = fake.email()
        anon_sentence = anon_sentence.replace(email, email_map[encrypted_email])

    DATA_ANON.append(anon_sentence)
# Print the original sentences, then the anonymized ones, each list under its
# own heading (the second heading starts with a blank line as a separator).
for heading, sentences in (
    ("Original Data:", DATA),
    ("\nAnonymized Data:", DATA_ANON),
):
    print(heading)
    for sentence in sentences:
        print(sentence)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment