Last active
May 4, 2018 20:14
-
-
Save steniowagner/a5b0397a4dedccf6b9aa6567d52af847 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas | |
import datetime | |
import numpy | |
import math | |
import csv | |
# Table named "background": the first line of the CSV is skipped and the
# column names are supplied explicitly.
dataset_background = pandas.read_csv(
    'background.csv',
    skiprows=1,
    header=None,
    names=['nome', 'genero', 'data_nascimento', 'cidade', 'estado'],
)

# Table named "diseases": same loading scheme, with a 'doenca' column and no
# 'nome' column.
dataset_diseases = pandas.read_csv(
    'doencas.csv',
    skiprows=1,
    header=None,
    names=['genero', 'data_nascimento', 'cidade', 'estado', 'doenca'],
)

# Row count of the diseases table, plus the distinct cities and states found
# in it; the encoding helpers below look values up in these two sets.
dataset_diseases_size = dataset_diseases.shape[0]
cities = set(dataset_diseases['cidade'])
states = set(dataset_diseases['estado'])
def get_gender_value(raw_gender):
    """Encode a gender cell as an integer.

    Args:
        raw_gender: object exposing ``to_string()`` (the callers in this file
            pass a one-row pandas Series).

    Returns:
        0 when the last space-separated token of the rendered text is
        ``'Female'``, otherwise 1.
    """
    # The rendered text carries the row label in front of the value, so only
    # the trailing token is inspected.
    last_token = raw_gender.to_string().rsplit(' ', 1)[-1]
    if last_token == 'Female':
        return 0
    return 1
def get_city_value(raw_city):
    """Encode a city cell as its position within the module-level ``cities``.

    Args:
        raw_city: object exposing ``to_string()`` (the callers in this file
            pass a one-row pandas Series).

    Returns:
        Index of the city inside ``list(cities)``.

    Raises:
        ValueError: if the extracted text is not a member of ``cities``.
    """
    rendered = raw_city.to_string()
    # Drop everything before the first blank (the row label) ...
    after_label = rendered[rendered.find(' '):]
    # ... then skip four characters; this assumes the rendered text separates
    # label and value with exactly four blanks.
    city_name = after_label[4:]
    known_cities = list(cities)
    return known_cities.index(city_name)
def get_state_value(raw_state):
    """Encode a state cell as its position within the module-level ``states``.

    Args:
        raw_state: pandas Series holding the state (the callers in this file
            pass the one-row ``'estado'`` column of a record).

    Returns:
        Index of the state inside ``list(states)``.

    Raises:
        ValueError: if the state is not a member of ``states``.
    """
    # Read the cell itself instead of the last space-separated token of the
    # rendered text: slicing after the final blank kept only "Paulo" out of
    # "Sao Paulo", so every multi-word state failed the lookup below.
    state = raw_state.iloc[-1]
    # NOTE(review): the position depends on the iteration order of the
    # ``states`` set, which is only guaranteed to be stable within one run.
    return list(states).index(state)
def get_days_from_date(raw_birth_date):
    """Return the number of days between a birth date and today.

    Args:
        raw_birth_date: object exposing ``to_string()`` whose last
            space-separated token is a ``month/day/year`` date (the callers in
            this file pass a one-row pandas Series).

    Returns:
        Whole days elapsed from the birth date to the current local date;
        0 for today, negative for a date in the future.

    Raises:
        ValueError: if the token does not have three ``/``-separated integer
            parts or does not form a valid calendar date.
    """
    # The rendered text carries the row label in front of the value, so only
    # the trailing token is parsed.
    date_text = raw_birth_date.to_string().rsplit(' ', 1)[-1]
    month, day, year = date_text.split('/')
    birth_date = datetime.date(int(year), int(month), int(day))
    current_date = datetime.datetime.now().date()
    # Use the timedelta's day count directly. Parsing str(timedelta) broke for
    # a zero-day difference, which renders as "0:00:00" with no blank to cut at.
    return (current_date - birth_date).days
def get_register_value(register):
    """Collapse one record into a single integer used as a sort key.

    Args:
        register: record indexable by the column names ``'genero'``,
            ``'estado'``, ``'cidade'`` and ``'data_nascimento'``.

    Returns:
        Sum of the gender, state, city and birth-date encodings.
    """
    # Column name paired with its encoder, evaluated in this order.
    encoders = (
        ('genero', get_gender_value),
        ('estado', get_state_value),
        ('cidade', get_city_value),
        ('data_nascimento', get_days_from_date),
    )
    return sum(encode(register[column]) for column, encode in encoders)
def define_groups(dataset, k):
    """Split ``dataset`` into consecutive groups of at most ``k`` rows.

    Args:
        dataset: pandas DataFrame, already in the order to be grouped.
        k: group size; must be a positive integer.

    Returns:
        List of groups, each a list of one-row DataFrames. Every group holds
        ``k`` rows except possibly the last one, which holds the remainder.
        An empty ``dataset`` yields an empty list.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError('k must be a positive integer, got {!r}'.format(k))
    # The row count comes from the argument itself rather than from a
    # module-level size, so any frame can be grouped.
    # .copy() makes every row an independent frame, so column assignments done
    # on a group member later are plain writes on that frame.
    elements = [dataset.iloc[[i]].copy() for i in range(len(dataset))]
    return [elements[start:start + k] for start in range(0, len(elements), k)]
def anonymize_gender(group, k):
    """Suppress the gender of every record in ``group`` unless all agree.

    Args:
        group: list of one-row DataFrames with a ``'genero'`` column. The list
            is updated in place: when the genders differ, each entry is
            replaced by a copy whose ``'genero'`` is ``'*'``.
        k: unused; kept so all anonymizers share one signature.

    Returns:
        None.
    """
    # Read each cell directly instead of slicing its rendered text.
    distinct_genders = {str(frame['genero'].iloc[0]) for frame in group}
    if len(distinct_genders) <= 1:
        return
    for position, frame in enumerate(group):
        # assign() builds a new frame, so no write ever lands on a slice of
        # the source table and the obsolete ``is_copy`` switch is not needed.
        group[position] = frame.assign(genero='*')
def anonymize_state(group, k):
    """Suppress the state of every record in ``group`` unless all agree.

    Args:
        group: list of one-row DataFrames with an ``'estado'`` column. The
            list is updated in place: when the states differ, each entry is
            replaced by a copy whose ``'estado'`` is ``'*'``.
        k: unused; kept so all anonymizers share one signature.

    Returns:
        None.
    """
    # Read each cell directly instead of slicing its rendered text. The local
    # name also no longer shadows the module-level ``states`` set.
    distinct_states = {str(frame['estado'].iloc[0]) for frame in group}
    if len(distinct_states) <= 1:
        return
    for position, frame in enumerate(group):
        # assign() builds a new frame, so no write ever lands on a slice of
        # the source table and the obsolete ``is_copy`` switch is not needed.
        group[position] = frame.assign(estado='*')
def anonymize_city(group, k):
    """Generalize the city of every record in ``group`` unless all agree.

    Args:
        group: list of one-row DataFrames with ``'cidade'`` and ``'estado'``
            columns. The list is updated in place: when the cities differ,
            each entry is replaced by a copy whose ``'cidade'`` holds that
            record's current ``'estado'`` value.
        k: unused; kept so all anonymizers share one signature.

    Returns:
        None.
    """
    # Read each cell directly instead of slicing its rendered text. The local
    # name also no longer shadows the module-level ``cities`` set.
    distinct_cities = {str(frame['cidade'].iloc[0]) for frame in group}
    if len(distinct_cities) <= 1:
        return
    for position, frame in enumerate(group):
        # assign() builds a new frame, so no write ever lands on a slice of
        # the source table and the obsolete ``is_copy`` switch is not needed.
        group[position] = frame.assign(cidade=frame['estado'])
def anonymize_birth_date(group, k):
    """Mask the birth-date components on which the records of ``group`` differ.

    Each date is treated as three ``/``-separated components. A component on
    which all records agree is kept; otherwise the first and second components
    become ``'**'`` and the third becomes ``'****'``. Every record receives the
    same resulting string.

    Args:
        group: list of one-row DataFrames with a ``'data_nascimento'`` column.
            The list is updated in place: each entry is replaced by a copy
            carrying the masked date. An empty list is left untouched.
        k: unused; kept so all anonymizers share one signature.

    Returns:
        None.
    """
    if not group:
        # Nothing to generalize; avoids indexing into empty component lists.
        return

    firsts, middles, lasts = [], [], []
    for frame in group:
        # Read the cell directly instead of slicing its rendered text.
        birth_date = str(frame['data_nascimento'].iloc[0])
        # Text before the first slash, between first and last slash, and
        # after the last slash.
        first, _, remainder = birth_date.partition('/')
        middle, _, last = remainder.rpartition('/')
        firsts.append(first)
        middles.append(middle)
        lasts.append(last)

    masked_date = '/'.join(
        values[0] if len(set(values)) == 1 else mask
        for values, mask in ((firsts, '**'), (middles, '**'), (lasts, '****'))
    )

    for position, frame in enumerate(group):
        # assign() builds a new frame, so no write ever lands on a slice of
        # the source table and the obsolete ``is_copy`` switch is not needed.
        group[position] = frame.assign(data_nascimento=masked_date)
def compare_birth_date(first_date, second_date):
    """Tell whether a possibly masked date is compatible with a concrete date.

    Args:
        first_date: ``/``-separated date string whose components may be masked
            (``'**'`` in any position, ``'****'`` in the third position).
        second_date: ``/``-separated date string to compare against.

    Returns:
        True when the strings are equal, or when every unmasked component of
        ``first_date`` equals the component of ``second_date`` at the same
        position. False otherwise, including when either string does not have
        exactly three components.
    """
    if first_date == second_date:
        return True

    first_parts = first_date.split('/')
    second_parts = second_date.split('/')
    if len(first_parts) != 3 or len(second_parts) != 3:
        return False

    for position, (masked_part, plain_part) in enumerate(zip(first_parts, second_parts)):
        is_masked = masked_part == '**' or (position == 2 and masked_part == '****')
        # Fail on the first real mismatch. Keeping only the latest comparison
        # let the last component alone decide, so dates that differed in an
        # earlier component still matched.
        if not is_masked and masked_part != plain_part:
            return False
    return True
def compare_gender(annonimized_gender, background_gender):
    """Tell whether an anonymized gender is compatible with a background one.

    Args:
        annonimized_gender: gender value of the anonymized record.
        background_gender: gender value of the background record.

    Returns:
        True when the anonymized value is the ``'*'`` mask or equals the
        background value, otherwise False.
    """
    if annonimized_gender == '*':
        return True
    return bool(annonimized_gender == background_gender)
def compare_city(annonimized_city, anninimized_state, background_city, background_state):
    """Tell whether an anonymized city is compatible with a background one.

    Args:
        annonimized_city: city value of the anonymized record.
        anninimized_state: state value of the anonymized record.
        background_city: city value of the background record.
        background_state: not read by this function.

    Returns:
        True when the anonymized city is the ``'*'`` mask, equals the
        anonymized state, or equals the background city; otherwise False.
    """
    # A city equal to its own record's state is treated as generalized.
    if annonimized_city == '*' or annonimized_city == anninimized_state:
        return True
    return bool(annonimized_city == background_city)
def compare_state(annonimized_state, background_state):
    """Tell whether an anonymized state is compatible with a background one.

    Args:
        annonimized_state: state value of the anonymized record.
        background_state: state value of the background record.

    Returns:
        True when the anonymized value is the ``'*'`` mask or equals the
        background value, otherwise False.
    """
    if annonimized_state == '*':
        return True
    return bool(annonimized_state == background_state)
def apply_delta_presence(dataset, max_delta, min_delta, k):
    """Anonymize ``dataset`` in groups of ``k`` and check the presence bounds.

    The rows are grouped with ``define_groups``, every group is generalized on
    gender, state, city and birth date, and the first record of each group is
    then compared with every row of the module-level ``dataset_background``.
    For each group the ratio ``k / matching background rows`` is tested
    against the bounds.

    Args:
        dataset: pandas DataFrame to anonymize, already in grouping order.
        max_delta: upper bound for the ratio.
        min_delta: lower bound for the ratio.
        k: group size.

    Returns:
        0 when every group lies within the bounds; otherwise the code of the
        first offending group: 1 if its ratio is below ``min_delta``, 2 if it
        is above ``max_delta``.
    """
    groups = define_groups(dataset, k)

    for group in groups:
        anonymize_gender(group, k)
        anonymize_state(group, k)
        anonymize_city(group, k)
        anonymize_birth_date(group, k)

    # Groups are checked in order and the first violation decides the result.
    for group in groups:
        # After anonymization the first record is used as the representative
        # of the whole group.
        representative = group[0]
        birth_date = representative['data_nascimento'].values[0]
        gender = representative['genero'].values[0]
        city = representative['cidade'].values[0]
        state = representative['estado'].values[0]

        number_matches = 0
        # Background cells are read by column label; integer keys on a
        # label-indexed row are a deprecated positional fallback in pandas.
        for _, row in dataset_background.iterrows():
            if (compare_birth_date(birth_date, row['data_nascimento'])
                    and compare_gender(gender, row['genero'])
                    and compare_city(city, state, row['cidade'], row['estado'])
                    and compare_state(state, row['estado'])):
                number_matches += 1

        # With no background match the ratio is unbounded; treat it as
        # exceeding any upper bound instead of dividing by zero.
        # NOTE(review): the numerator is k even for a last group that holds
        # fewer than k records; confirm whether len(group) is intended.
        if number_matches:
            probability_existence = k / number_matches
        else:
            probability_existence = math.inf

        if probability_existence < min_delta:
            return 1
        if probability_existence > max_delta:
            return 2
    return 0
# Encode and sort the records once, before the search. Nothing these steps
# read changes between iterations (apart from the current date consulted by
# get_days_from_date), so repeating them for every k only repeated work.
dataset_diseases['register_value'] = [
    get_register_value(dataset_diseases.iloc[[i]])
    for i in range(dataset_diseases_size)
]
dataset = dataset_diseases.sort_values(['register_value'])

# Search for a group size k that apply_delta_presence accepts with the bounds
# 0.1 and 0.4, starting at 1 and moving up or down as it indicates.
enough = False
tried_sizes = set()
k = 1
while not enough:
    print('K = ', k)

    # Validate k before it is used: checking afterwards never triggered,
    # because grouping with k == 0 fails on a division by zero first.
    if k < 1:
        print('Tamanho Mínimo do K é Zero.')
        break

    # The outcome for a given k does not change, so coming back to a size that
    # was already tried means the search would bounce between sizes forever.
    if k in tried_sizes:
        print('Nenhum K satisfaz os limites: tamanho de grupo já testado')
        break
    tried_sizes.add(k)

    result = apply_delta_presence(dataset, 0.4, 0.1, k)

    if k == dataset_diseases_size:
        print('K = Tamanho do Dataset Anonimizado')
    if k > dataset_diseases_size:
        print('K = É maior que o Dataset Anonimizado')

    if result == 0:
        print('Objetivo Atingido')
        enough = True
    elif result == 1:
        print('Aumentar o Tamanho do Grupo')
        k += 1
    elif result == 2:
        print('Diminuir o Tamanho do Grupo')
        k -= 1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment