Skip to content

Instantly share code, notes, and snippets.

@steniowagner
Created April 25, 2018 03:17
Show Gist options
  • Save steniowagner/a0538571ebffdbc1393a937d8c34177a to your computer and use it in GitHub Desktop.
Save steniowagner/a0538571ebffdbc1393a937d8c34177a to your computer and use it in GitHub Desktop.
import pandas
import datetime
import numpy
import math
import csv
from random import randint
# Load the records. The first line of the file is skipped and the six
# columns are labelled explicitly rather than taken from a header row.
dataset = pandas.read_csv(
    'epidemias.csv',
    skiprows=1,
    header=None,
    names=['nome', 'genero', 'data_nascimento', 'cidade', 'estado', 'doencax'],
)
dataset_size = len(dataset)

# Distinct values of each categorical column, used further down to turn a
# city/state into a number and to pick replacement diseases.
cities, states, diseases = (
    set(dataset[column]) for column in ('cidade', 'estado', 'doencax')
)
def get_gender_value(raw_gender):
    """Encode the gender of a single record as an integer.

    Args:
        raw_gender: One-element pandas Series holding the record's
            ``genero`` value.

    Returns:
        0 when the value is ``'Female'`` (surrounding whitespace ignored),
        1 for any other value.
    """
    # Read the scalar itself instead of parsing the Series' printed form,
    # which misreads values carrying stray whitespace.
    gender = str(raw_gender.iloc[0]).strip()
    return 0 if gender == 'Female' else 1
def get_city_value(raw_city):
    """Encode the city of a single record as an integer.

    Args:
        raw_city: One-element pandas Series holding the record's
            ``cidade`` value.

    Returns:
        The position of the city among the module-level ``cities``,
        ordered by their string form.

    Raises:
        ValueError: If the city is not one of ``cities``.
    """
    # Read the scalar directly; slicing the Series' printed form depends on
    # the exact spacing pandas puts between index and value.
    city = raw_city.iloc[0]
    # ``cities`` is a set, whose iteration order for strings can change
    # from one interpreter run to the next. Sorting makes the code stable.
    return sorted(cities, key=str).index(city)
def get_state_value(raw_state):
    """Encode the state of a single record as an integer.

    Args:
        raw_state: One-element pandas Series holding the record's
            ``estado`` value.

    Returns:
        The position of the state among the module-level ``states``,
        ordered by their string form.

    Raises:
        ValueError: If the state is not one of ``states``.
    """
    # Read the whole value; keeping only the text after the last space
    # would cut a multi-word state name down to its final word.
    state = raw_state.iloc[0]
    # ``states`` is a set, whose iteration order for strings can change
    # from one interpreter run to the next. Sorting makes the code stable.
    return sorted(states, key=str).index(state)
def get_days_from_date(raw_birth_date):
    """Return the number of days between a birth date and today.

    Args:
        raw_birth_date: One-element pandas Series holding a date written
            as ``month/day/year``.

    Returns:
        Whole days elapsed from the birth date to the current local date
        (0 for today, negative for a future date).

    Raises:
        ValueError: If the value does not have three ``/``-separated
            integer parts or they do not form a valid date.
    """
    birth_date_string = str(raw_birth_date.iloc[0]).strip()
    month, day, year = birth_date_string.split('/')
    birth_date = datetime.date(int(year), int(month), int(day))
    # Use the timedelta's own day count: its text form has no "N days"
    # prefix when the difference is zero, so parsing it fails for today.
    return (datetime.date.today() - birth_date).days
def get_register_value(register):
    """Collapse one record into a single numeric score.

    Args:
        register: One-row DataFrame with the ``genero``, ``estado``,
            ``cidade`` and ``data_nascimento`` columns.

    Returns:
        The sum of the gender, state and city codes plus the record's age
        in days scaled by 0.001.
    """
    categorical_score = (
        get_gender_value(register['genero'])
        + get_state_value(register['estado'])
        + get_city_value(register['cidade'])
    )
    # The age is scaled down so it contributes far less per day than a
    # one-step change in any of the categorical codes.
    age_score = get_days_from_date(register['data_nascimento']) * 0.001
    return categorical_score + age_score
def define_groups(dataset, k):
    """Split a dataset into consecutive groups of at most ``k`` records.

    Args:
        dataset: DataFrame whose rows are grouped in their current order.
        k: Group size; must be a positive integer.

    Returns:
        A list of groups, each a list of one-row DataFrames. The last
        group holds the remainder and may contain fewer than ``k`` rows.
    """
    # Size the split from the frame actually passed in rather than from a
    # module-level length, and hand out explicit copies so that editing a
    # record later cannot touch ``dataset``.
    records = [
        dataset.iloc[[position]].copy() for position in range(len(dataset))
    ]
    return [records[start:start + k] for start in range(0, len(records), k)]
def anonymize_gender(group, k):
    """Suppress the gender of a group whose members do not all share it.

    Args:
        group: List of one-row DataFrames with a ``genero`` column. The
            list is updated in place.
        k: Upper bound on how many leading members are processed; a group
            with fewer members is processed in full.
    """
    # The trailing group of a dataset may be shorter than k.
    size = min(k, len(group))
    genders = {str(group[i]['genero'].iloc[0]) for i in range(size)}
    if len(genders) > 1:
        for i in range(size):
            # Store a rewritten copy back into the list instead of
            # assigning into a frame that was sliced from another one.
            group[i] = group[i].assign(genero='*')
def anonymize_state(group, k):
    """Suppress the state of a group whose members do not all share it.

    Args:
        group: List of one-row DataFrames with an ``estado`` column. The
            list is updated in place.
        k: Upper bound on how many leading members are processed; a group
            with fewer members is processed in full.
    """
    # The trailing group of a dataset may be shorter than k.
    size = min(k, len(group))
    group_states = {str(group[i]['estado'].iloc[0]) for i in range(size)}
    if len(group_states) > 1:
        for i in range(size):
            # Store a rewritten copy back into the list instead of
            # assigning into a frame that was sliced from another one.
            group[i] = group[i].assign(estado='*')
def anonymize_city(group, k):
    """Generalize the city of a group whose members do not all share it.

    When the cities differ, each member's ``cidade`` is overwritten with
    that member's current ``estado`` value.

    Args:
        group: List of one-row DataFrames with ``cidade`` and ``estado``
            columns. The list is updated in place.
        k: Upper bound on how many leading members are processed; a group
            with fewer members is processed in full.
    """
    # The trailing group of a dataset may be shorter than k.
    size = min(k, len(group))
    group_cities = {str(group[i]['cidade'].iloc[0]) for i in range(size)}
    if len(group_cities) > 1:
        for i in range(size):
            member = group[i]
            # Store a rewritten copy back into the list instead of
            # assigning into a frame that was sliced from another one.
            group[i] = member.assign(cidade=member['estado'])
def anonymize_birth_date(group, k):
    """Mask the birth-date components that differ inside a group.

    Each date is read as three ``/``-separated components. A component
    that is identical for every member is kept; one that varies becomes
    ``**`` (first and second component) or ``****`` (third component).
    Every processed member receives the same resulting string.

    Args:
        group: List of one-row DataFrames with a ``data_nascimento``
            column. The list is updated in place.
        k: Upper bound on how many leading members are processed; a group
            with fewer members is processed in full.
    """
    # The trailing group of a dataset may be shorter than k, and there is
    # nothing to mask when no member is processed.
    size = min(k, len(group))
    if size <= 0:
        return

    components = []
    for i in range(size):
        birth_date = str(group[i]['data_nascimento'].iloc[0]).strip()
        # Split at the first and at the last slash.
        first, _, remainder = birth_date.partition('/')
        middle, _, last = remainder.rpartition('/')
        components.append((first, middle, last))

    masks = ('**', '**', '****')
    birth_date_anonymized = '/'.join(
        mask if len(set(values)) > 1 else values[0]
        for values, mask in zip(zip(*components), masks)
    )

    for i in range(size):
        # Store a rewritten copy back into the list instead of assigning
        # into a frame that was sliced from another one.
        group[i] = group[i].assign(data_nascimento=birth_date_anonymized)
def get_random_unique_disease(group_diseases, diseases_dataset):
    """Pick a random disease that does not yet occur in the group.

    Args:
        group_diseases: Diseases currently present in the group.
        diseases_dataset: Collection of diseases to choose from.

    Returns:
        One element of ``diseases_dataset`` that is absent from
        ``group_diseases``, chosen uniformly among the eligible entries.

    Raises:
        ValueError: If every entry of ``diseases_dataset`` already occurs
            in the group (retrying random draws would never terminate).
    """
    already_present = set(group_diseases)
    # Filter once up front instead of drawing and rejecting repeatedly.
    candidates = [
        disease for disease in diseases_dataset
        if disease not in already_present
    ]
    if not candidates:
        raise ValueError('no disease available that is absent from the group')
    return candidates[randint(0, len(candidates) - 1)]
def is_group_diversified(group_diseases, l):
    """Tell whether a group holds at least ``l`` distinct diseases.

    Args:
        group_diseases: Diseases of the group's members.
        l: Required number of distinct diseases.

    Returns:
        True when the number of distinct diseases is ``l`` or more.
    """
    distinct_count = len(frozenset(group_diseases))
    return distinct_count >= l
def diversify_diseases(group, k, l):
    """Replace diseases in a group until it holds ``l`` distinct ones.

    Starting from the first member, a member's disease is overwritten with
    a randomly chosen disease not yet present in the group. This repeats
    until the group has ``l`` distinct diseases or ``l`` members have been
    rewritten, whichever comes first.

    Args:
        group: List of one-row DataFrames with a ``doencax`` column. The
            list is updated in place.
        k: Upper bound on how many leading members are considered; a
            group with fewer members is processed in full.
        l: Required number of distinct diseases.
    """
    # The trailing group of a dataset may be shorter than k.
    size = min(k, len(group))
    group_diseases = [group[i]['doencax'].values[0] for i in range(size)]

    # Only ``size`` members exist to rewrite, so never index past them
    # even when ``l`` asks for more distinct values than the group has.
    rewritable = min(l, size)
    position = 0
    while position < rewritable and not is_group_diversified(group_diseases, l):
        replacement = get_random_unique_disease(group_diseases, diseases)
        group_diseases[position] = replacement
        # Store a rewritten copy back into the list instead of assigning
        # into a frame that was sliced from another one.
        group[position] = group[position].assign(doencax=replacement)
        position += 1
def analyse(dataset, k, l):
    """Build an anonymized copy of the dataset.

    The rows are split, in their current order, into groups of at most
    ``k`` records. Within each group the gender, state, city and birth
    date are generalized where members differ, and diseases are replaced
    until the group holds ``l`` distinct ones.

    Args:
        dataset: DataFrame with the ``nome``, ``genero``,
            ``data_nascimento``, ``cidade``, ``estado``, ``doencax`` and
            ``register_value`` columns.
        k: Group size.
        l: Required number of distinct diseases per group.

    Returns:
        A new DataFrame with the same seven columns, one row per input
        record, in group order.
    """
    columns = ['nome', 'genero', 'data_nascimento', 'cidade', 'estado',
               'doencax', 'register_value']
    groups = define_groups(dataset, k)

    for group in groups:
        # Pass the group's real length: the last group is shorter than k
        # whenever the row count is not a multiple of k.
        # NOTE(review): such a trailing group has fewer than k members and
        # can hold at most len(group) distinct diseases; confirm whether
        # it should instead be merged into the previous group.
        size = len(group)
        anonymize_gender(group, size)
        anonymize_state(group, size)
        anonymize_city(group, size)
        anonymize_birth_date(group, size)
        diversify_diseases(group, size, min(l, size))

    # Collect the rows first and build the frame once. Writing cell by
    # cell through ``frame.loc[row][column] = value`` is chained
    # assignment, which may modify a temporary copy of the row instead of
    # the frame itself.
    rows = [
        [record[column].values[0] for column in columns]
        for group in groups
        for record in group
    ]
    return pandas.DataFrame(rows, columns=columns, dtype=object)
# Score every record, hide the names, and order the records by score so
# that consecutive rows (and therefore each group) have close scores.
registers_values = [
    get_register_value(dataset.iloc[[row]]) for row in range(len(dataset))
]
dataset['nome'] = '*'
dataset['register_value'] = registers_values
dataset = dataset.sort_values(['register_value'])

# One anonymized export per (k, l) combination.
for group_size, diversity, output_path in (
    (10, 2, 'Trabalho2_output1.csv'),
    (10, 5, 'Trabalho2_output2.csv'),
    (20, 2, 'Trabalho2_output3.csv'),
    (20, 5, 'Trabalho2_output4.csv'),
):
    dataset_annonimizated = analyse(dataset, group_size, diversity)
    dataset_annonimizated.to_csv(output_path, sep=',', encoding='utf-8')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment