Skip to content

Instantly share code, notes, and snippets.

@steniowagner
Last active May 4, 2018 20:14
Show Gist options
  • Save steniowagner/a5b0397a4dedccf6b9aa6567d52af847 to your computer and use it in GitHub Desktop.
Save steniowagner/a5b0397a4dedccf6b9aa6567d52af847 to your computer and use it in GitHub Desktop.
import pandas
import datetime
import numpy
import math
import csv
# Table that the anonymized groups are matched against in
# apply_delta_presence. The file's own first line is skipped and the column
# names below are used instead.
dataset_background = pandas.read_csv(
    'background.csv',
    names=['nome', 'genero', 'data_nascimento', 'cidade', 'estado'],
    header=None,
    skiprows=1,
)

# Table that gets grouped and anonymized; loaded the same way (first line
# skipped, explicit column names).
dataset_diseases = pandas.read_csv(
    'doencas.csv',
    names=['genero', 'data_nascimento', 'cidade', 'estado', 'doenca'],
    header=None,
    skiprows=1,
)
dataset_diseases_size = len(dataset_diseases)

# Distinct city/state values; get_city_value / get_state_value turn a value
# into its position within these collections.
cities = set(dataset_diseases['cidade'])
states = set(dataset_diseases['estado'])
def get_gender_value(raw_gender):
    """Encode a gender as 0 or 1.

    Args:
        raw_gender: pandas Series; its ``to_string()`` rendering is inspected
            and the text after the last space is taken as the gender label.

    Returns:
        0 when that label is exactly ``'Female'``, otherwise 1.
    """
    rendered = raw_gender.to_string()
    # Everything after the final space is the value (the part before it is
    # the index label and padding).
    label = rendered[rendered.rfind(' ') + 1:]
    if label == 'Female':
        return 0
    return 1
def get_city_value(raw_city):
    """Return an integer code for the city held in a one-row Series.

    The code is the position of the city among the module-level ``cities``
    after sorting them by their text form.

    Args:
        raw_city: pandas Series whose first element is the city value.

    Returns:
        Zero-based position of the city in the sorted distinct cities.

    Raises:
        ValueError: if the city is not present in ``cities``.
    """
    # Read the value itself rather than slicing the Series' printed form,
    # which depends on pandas' display layout.
    city = raw_city.iloc[0]
    # A set has no guaranteed iteration order, so sort it to get the same
    # numbering on every run. key=str keeps the sort from failing if the
    # column mixes value types (e.g. strings and missing values).
    return sorted(cities, key=str).index(city)
def get_state_value(raw_state):
    """Return an integer code for the state held in a one-row Series.

    The code is the position of the state among the module-level ``states``
    after sorting them by their text form.

    Args:
        raw_state: pandas Series whose first element is the state value.

    Returns:
        Zero-based position of the state in the sorted distinct states.

    Raises:
        ValueError: if the state is not present in ``states``.
    """
    # Use the stored value directly: taking "the text after the last space"
    # of the printed Series truncates names that contain spaces.
    state = raw_state.iloc[0]
    # Sort the set so the numbering does not depend on set iteration order;
    # key=str avoids comparison errors on mixed value types.
    return sorted(states, key=str).index(state)
def get_days_from_date(raw_birth_date):
    """Return the number of days between a birth date and today.

    Args:
        raw_birth_date: pandas Series whose first element is a date written
            as three '/'-separated numbers, read as month/day/year.

    Returns:
        Whole days elapsed from the birth date to the current local date
        (0 for today, negative for a future date).

    Raises:
        ValueError: if the text does not split into exactly three parts or
            the parts do not form a valid calendar date.
    """
    birth_date_text = str(raw_birth_date.iloc[0]).strip()
    month, day, year = birth_date_text.split('/')
    birth_date = datetime.date(int(year), int(month), int(day))
    # timedelta.days is used instead of parsing str(timedelta): the string
    # form of a zero-length delta is "0:00:00", which has no "N days" prefix.
    return (datetime.date.today() - birth_date).days
def get_register_value(register):
    """Collapse one record into a single number.

    The result is the sum of the gender, state and city codes plus the age
    in days; the module-level loop stores it as ``register_value`` and sorts
    the records by it.

    Args:
        register: one-row DataFrame with the columns 'genero', 'estado',
            'cidade' and 'data_nascimento'.

    Returns:
        The summed numeric value of the four fields.
    """
    total = get_gender_value(register['genero'])
    total = total + get_state_value(register['estado'])
    total = total + get_city_value(register['cidade'])
    total = total + get_days_from_date(register['data_nascimento'])
    return total
def define_groups(dataset, k):
    """Split a DataFrame into consecutive groups of at most ``k`` rows.

    Args:
        dataset: DataFrame to partition, in its current row order.
        k: maximum number of rows per group; must be at least 1.

    Returns:
        A list of groups. Each group is a list of one-row DataFrames
        (``dataset.iloc[[i]]``); every group has ``k`` rows except possibly
        the last one, which holds the remainder.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError('k must be a positive integer')
    # Size the partition from the frame that was actually passed in rather
    # than from a module-level row count.
    total_rows = len(dataset)
    rows = [dataset.iloc[[position]] for position in range(total_rows)]
    return [rows[start:start + k] for start in range(0, total_rows, k)]
def anonymize_gender(group, k):
    """Mask the gender of every record in a group when the genders differ.

    If the records do not all share the same 'genero' value, the column is
    overwritten with '*' in each record; otherwise nothing changes.

    Args:
        group: list of one-row DataFrames; modified in place.
        k: group size requested by the caller. Not used here; kept so all
            anonymize_* functions share one signature.
    """
    # Compare by text so missing values count as one value, not many.
    distinct_genders = {str(record['genero'].iloc[0]) for record in group}
    if len(distinct_genders) <= 1:
        return
    for record in group:
        # NOTE(review): presumably a legacy switch to silence pandas'
        # chained-assignment warning on older releases; confirm whether it
        # is still needed with the pandas version in use.
        record.is_copy = False
        record['genero'] = '*'
def anonymize_state(group, k):
    """Mask the state of every record in a group when the states differ.

    If the records do not all share the same 'estado' value, the column is
    overwritten with '*' in each record; otherwise nothing changes.

    Args:
        group: list of one-row DataFrames; modified in place.
        k: group size requested by the caller. Not used here; kept so all
            anonymize_* functions share one signature.
    """
    # Local name deliberately differs from the module-level ``states``.
    distinct_states = {str(record['estado'].iloc[0]) for record in group}
    if len(distinct_states) <= 1:
        return
    for record in group:
        # NOTE(review): presumably a legacy switch to silence pandas'
        # chained-assignment warning on older releases; confirm whether it
        # is still needed with the pandas version in use.
        record.is_copy = False
        record['estado'] = '*'
def anonymize_city(group, k):
    """Generalize the city of every record in a group when the cities differ.

    If the records do not all share the same 'cidade' value, each record's
    city is replaced by that record's own 'estado' value (which is '*' if
    the state has already been masked); otherwise nothing changes.

    Args:
        group: list of one-row DataFrames; modified in place.
        k: group size requested by the caller. Not used here; kept so all
            anonymize_* functions share one signature.
    """
    # Local name deliberately differs from the module-level ``cities``.
    distinct_cities = {str(record['cidade'].iloc[0]) for record in group}
    if len(distinct_cities) <= 1:
        return
    for record in group:
        # NOTE(review): presumably a legacy switch to silence pandas'
        # chained-assignment warning on older releases; confirm whether it
        # is still needed with the pandas version in use.
        record.is_copy = False
        record['cidade'] = record['estado']
def anonymize_birth_date(group, k):
    """Mask the parts of the birth date on which a group's records differ.

    Each date is split on '/' into three parts. A part that is identical
    across the whole group is kept; a part that varies is replaced by
    '**' (first and second part) or '****' (third part). The resulting
    string is written to 'data_nascimento' of every record.

    Args:
        group: list of one-row DataFrames; modified in place. An empty
            group is left untouched.
        k: group size requested by the caller. Not used here; kept so all
            anonymize_* functions share one signature.
    """
    if not group:
        return

    first_parts, second_parts, third_parts = [], [], []
    for record in group:
        birth_date = str(record['data_nascimento'].iloc[0])
        # First part: before the first '/'; third part: after the last '/';
        # second part: whatever lies between them.
        first, _, remainder = birth_date.partition('/')
        second, _, third = remainder.rpartition('/')
        first_parts.append(first)
        second_parts.append(second)
        third_parts.append(third)

    masked_parts = []
    for values, placeholder in (
        (first_parts, '**'),
        (second_parts, '**'),
        (third_parts, '****'),
    ):
        masked_parts.append(placeholder if len(set(values)) > 1 else values[0])
    masked_birth_date = '/'.join(masked_parts)

    for record in group:
        # NOTE(review): presumably a legacy switch to silence pandas'
        # chained-assignment warning on older releases; confirm whether it
        # is still needed with the pandas version in use.
        record.is_copy = False
        record['data_nascimento'] = masked_birth_date
def compare_birth_date(first_date, second_date):
    """Tell whether a (possibly masked) date is compatible with a real date.

    Both dates are '/'-separated strings with three parts. A part of
    ``first_date`` equal to '**' (or '****' in the third position) is a
    wildcard; every other part must equal the corresponding part of
    ``second_date``.

    Args:
        first_date: date that may contain masked parts.
        second_date: date to compare against.

    Returns:
        True when the strings are equal or every non-masked part matches;
        False otherwise, including when either date has fewer than three
        parts.
    """
    if first_date == second_date:
        return True

    first_parts = first_date.split('/')
    second_parts = second_date.split('/')
    if len(first_parts) < 3 or len(second_parts) < 3:
        return False

    for position in range(3):
        part = first_parts[position]
        is_masked = part == '**' or (position == 2 and part == '****')
        # A single mismatching part is decisive; later parts must not be
        # allowed to overwrite the verdict.
        if not is_masked and part != second_parts[position]:
            return False
    return True
def compare_gender(annonimized_gender, background_gender):
    """Tell whether an anonymized gender is compatible with a known one.

    Returns:
        True when the anonymized value is the mask '*' or equals
        ``background_gender``; False otherwise.
    """
    masked = annonimized_gender == '*'
    identical = annonimized_gender == background_gender
    if masked or identical:
        return True
    return False
def compare_city(annonimized_city, anninimized_state, background_city, background_state):
    """Tell whether an anonymized city is compatible with a known one.

    The city counts as generalized when it is '*' or when it equals the
    anonymized state passed alongside it. ``background_state`` is accepted
    but not consulted.

    Returns:
        True when the city is generalized or equals ``background_city``;
        False otherwise.
    """
    generalized = (
        annonimized_city == '*' or annonimized_city == anninimized_state
    )
    identical = annonimized_city == background_city
    if generalized or identical:
        return True
    return False
def compare_state(annonimized_state, background_state):
    """Tell whether an anonymized state is compatible with a known one.

    Returns:
        True when the anonymized value is the mask '*' or equals
        ``background_state``; False otherwise.
    """
    masked = annonimized_state == '*'
    identical = annonimized_state == background_state
    if masked or identical:
        return True
    return False
def apply_delta_presence(dataset, max_delta, min_delta, k):
    """Anonymize ``dataset`` in groups of ``k`` and check the delta bounds.

    The rows are split into groups of ``k``; gender, state, city and birth
    date are anonymized per group. For each group, the rows of the
    module-level ``dataset_background`` compatible with the group's first
    (anonymized) record are counted, and ``k / matches`` is compared with
    the bounds.

    Args:
        dataset: DataFrame with the columns 'genero', 'estado', 'cidade'
            and 'data_nascimento'.
        max_delta: upper bound for ``k / matches``.
        min_delta: lower bound for ``k / matches``.
        k: group size.

    Returns:
        0 if every group lies within the bounds; 1 if the first offending
        group is below ``min_delta``; 2 if it is above ``max_delta``.
    """
    groups = define_groups(dataset, k)

    for group in groups:
        anonymize_gender(group, k)
        anonymize_state(group, k)
        anonymize_city(group, k)
        anonymize_birth_date(group, k)

    matches = []
    for group in groups:
        # After anonymization every record of a group carries the same
        # generalized values, so the first record stands for the group.
        representative = group[0]
        group_birth_date = representative['data_nascimento'].values[0]
        group_gender = representative['genero'].values[0]
        group_city = representative['cidade'].values[0]
        group_state = representative['estado'].values[0]

        number_matches = 0
        for _, row in dataset_background.iterrows():
            # Columns are read by label: integer keys on a label-indexed
            # row are not reliably treated as positions by pandas.
            if (
                compare_birth_date(group_birth_date, row['data_nascimento'])
                and compare_gender(group_gender, row['genero'])
                and compare_city(group_city, group_state, row['cidade'], row['estado'])
                and compare_state(group_state, row['estado'])
            ):
                number_matches += 1
        matches.append(number_matches)

    for number_matches in matches:
        # No compatible background row: the ratio is unbounded, so it is
        # treated as exceeding max_delta instead of dividing by zero.
        # NOTE(review): the numerator is k even for a shorter final group;
        # confirm whether the actual group length is intended.
        if number_matches:
            probability_existence = k / number_matches
        else:
            probability_existence = math.inf
        if probability_existence < min_delta:
            return 1
        if probability_existence > max_delta:
            return 2
    return 0
# The sort key of each record does not depend on k, so it is computed once
# instead of on every iteration of the search below.
registers_values = [
    get_register_value(dataset_diseases.iloc[[i]])
    for i in range(dataset_diseases_size)
]
dataset_diseases['register_value'] = registers_values
dataset = dataset_diseases.sort_values(['register_value'])

# Search for a group size k whose groups all satisfy the delta bounds:
# grow k when a group falls below the lower bound, shrink it when a group
# exceeds the upper bound.
enough = False
k = 1
tried_group_sizes = set()
while not enough:
    print('K = ', k)

    # Checked before the evaluation: groups of size zero cannot be built.
    if k == 0:
        print('Tamanho Mínimo do K é Zero.')
        break

    # The outcome for a given k never changes, so coming back to a k that
    # was already evaluated means the search would cycle forever.
    if k in tried_group_sizes:
        print('A busca por K não converge.')
        break
    tried_group_sizes.add(k)

    result = apply_delta_presence(dataset, 0.4, 0.1, k)

    if k == dataset_diseases_size:
        print('K = Tamanho do Dataset Anonimizado')
    if k > dataset_diseases_size:
        print('K = É maior que o Dataset Anonimizado')

    if result == 0:
        print('Objetivo Atingido')
        enough = True
    elif result == 1:
        print('Aumentar o Tamanho do Grupo')
        k += 1
    elif result == 2:
        print('Diminuir o Tamanho do Grupo')
        k -= 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment