Skip to content

Instantly share code, notes, and snippets.

@fsndzomga
Created September 8, 2023 15:58
Show Gist options
  • Save fsndzomga/f6fca781f243eff89f215bc7aad8f1ad to your computer and use it in GitHub Desktop.
Save fsndzomga/f6fca781f243eff89f215bc7aad8f1ad to your computer and use it in GitHub Desktop.
Anonymize without LangChain
import re
import spacy
import hashlib
from faker import Faker
# Shared Faker instance: generates both the sample PII and its replacements
fake = Faker()
# spaCy pipeline; its entity recognizer is used below to find PERSON names
nlp = spacy.load("en_core_web_sm")
# Generated input sentences and their anonymized counterparts
DATA = []
DATA_ANON = []
# Replacement lookups keyed by the SHA-256 hex digest of the original value,
# so a value that appears more than once always maps to the same fake one
email_map = {}
name_map = {}
phone_map = {}
# Function to hash any string (names, emails, phone numbers)
def encrypt_string(text):
    """Return the SHA-256 hex digest of *text*.

    Despite the name, this is a one-way hash rather than reversible
    encryption. Equal inputs always produce equal digests, which is what
    lets the digest serve as a lookup key for replacement values.
    """
    hasher = hashlib.sha256()
    hasher.update(text.encode())
    return hasher.hexdigest()
# Generate fake data: two sentences, each stating one fake identity twice
for _ in range(2):
    # Tuple elements are evaluated left to right: name, then phone, then email
    name, phone, email = fake.name(), fake.phone_number(), fake.email()
    # Both halves of the sentence share everything after "My " / "my "
    details = f"name is {name}, call me at {phone} or email me at {email}"
    DATA.append(f"My {details}. I say it again, my {details}")
# Anonymize data

# Patterns are compiled once, outside the per-sentence loop.
# The phone pattern covers the US-style formats Faker's default provider
# emits: "(###)###-####", "###-###-####", "###.###.####", "##########",
# an optional "+1-" / "001-" prefix and an optional "x###" extension.
_PHONE_RE = re.compile(
    r'(?<!\d)'                   # do not start inside a longer digit run
    r'(?:(?:\+1|001)[-.\s]?)?'   # optional country prefix
    r'\(?\d{3}\)?[-.\s]?'        # area code, optionally parenthesised
    r'\d{3}[-.\s]?\d{4}'         # exchange and line number
    r'(?:x\d{1,5})?'             # optional extension
    r'(?!\d)'                    # do not stop inside a longer digit run
)
# TLD class is [A-Za-z]; the previous [A-Z|a-z] also accepted a literal "|".
_EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')


def _pseudonymize(text, originals, mapping, make_fake):
    """Replace every value of *originals* occurring in *text* with a fake.

    Args:
        text: The string to rewrite.
        originals: Sensitive substrings detected in the source sentence.
        mapping: Dict from the hash of an original value to its replacement;
            updated in place so a value seen again reuses the same fake.
        make_fake: Zero-argument callable producing a new replacement.

    Returns:
        *text* with all occurrences of each original value substituted.
    """
    # Deduplicate (keeping first-seen order), then handle longer values
    # first so a value that is a substring of another cannot break it.
    for original in sorted(dict.fromkeys(originals), key=len, reverse=True):
        key = encrypt_string(original)
        if key not in mapping:
            mapping[key] = make_fake()
        text = text.replace(original, mapping[key])
    return text


for sentence in DATA:
    doc = nlp(sentence)
    anon_sentence = sentence
    # Emails go first: they are the most specific tokens, and replacing a
    # name or digit run inside an address beforehand would stop the exact
    # address from being found afterwards.
    anon_sentence = _pseudonymize(
        anon_sentence, _EMAIL_RE.findall(sentence), email_map, fake.email
    )
    # Detect and replace phone numbers
    anon_sentence = _pseudonymize(
        anon_sentence, _PHONE_RE.findall(sentence), phone_map, fake.phone_number
    )
    # Detect and replace names (entities spaCy labelled PERSON)
    person_names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
    anon_sentence = _pseudonymize(
        anon_sentence, person_names, name_map, fake.name
    )
    DATA_ANON.append(anon_sentence)
# Print each data set under its heading; the second heading starts with a
# newline so a blank line separates the two listings.
for heading, sentences in (
    ("Original Data:", DATA),
    ("\nAnonymized Data:", DATA_ANON),
):
    print(heading)
    for sentence in sentences:
        print(sentence)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment