Datamining UBA Add Publication Years Script
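Merges the yearly departamentos-en-venta (apartments for sale) datasets for 2005-2014, fingerprints each listing with an MD5 hash of its street, number, surface and orientation so the same property can be tracked across years, computes the maximum number of consecutive years each listing was published, and writes the merged data plus that count to a semicolon-separated CSV.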
import hashlib
import logging

import pandas as pd
import matplotlib.pyplot as plt

from sklearn import preprocessing
from itertools import groupby
from operator import itemgetter
logging.basicConfig(
    format='%(asctime)s (%(levelname)s) %(message)s',
    datefmt='[%Y-%m-%d %H:%M:%S]',
    level=logging.INFO
)
# 2005 through 2014 inclusive
YEARS = range(2005, 2015)
DATA_FILENAME = 'datasets/departamentos-en-venta-{}.csv'
def gen_key_from_fields(e):
    # Fingerprint a listing by street, number, surface and orientation so
    # the same property can be matched across the yearly datasets.
    return hashlib.md5(':'.join(map(str, [
        e.CALLE, e.NUMERO, e.M2, e.ORIENT
        # e.BAULERA, e.COCHERA, e.LAVADERO, e.TERRAZA
    ])).encode('utf-8')).hexdigest()
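# As an illustration (field values made up, not taken from the datasets):
# a row with CALLE='CABILDO', NUMERO='2040', M2=55, ORIENT='N' is keyed as
# hashlib.md5(b'CABILDO:2040:55:N').hexdigest()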
def merge_datasets(years, remove_dupl_keys=False):
    dfs = []
    for year in years:
        logging.info(DATA_FILENAME.format(year))
        try:
            data = pd.read_csv(DATA_FILENAME.format(year), sep=';')
            convert_attribute_types(data)
            add_year_published(data, year)
            add_key(data)
            if remove_dupl_keys:
                data = data.drop_duplicates(subset=['HASH_ID'])
            dfs.append(data)
        except Exception:
            logging.exception('Error opening year {}'.format(year))
    df = pd.concat(dfs)
    # Keep the column order of the first dataset
    return df[dfs[0].columns]
def sanitize_geo(value):
    # The source files carry LAT/LON without a decimal point (or with a
    # misplaced one): strip any dots and re-insert one after the first
    # three characters.
    v = str(value).replace('.', '')
    a = v[0:3]
    b = v[3:]
    return '{}.{}'.format(a, b)
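# For example, sanitize_geo('-34587412') -> '-34.587412' (a made-up sample;
# Buenos Aires latitudes are around -34.6 and longitudes around -58.4)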
def convert_attribute_types(data):
    """Convert attribute types so they match what the values represent."""
    data['NUMERO'] = data['NUMERO'].astype(str)
    data['COMUNA'] = data['COMUNA'].astype(str)
    for field in ['LAT', 'LON']:
        data[field] = data[field].map(sanitize_geo)
def show_types(data):
    print('Total Columns: {}'.format(len(data.columns)))
    for col in data:
        print(' ', col, data[col].dtype)
def boxplot(data):
    data.boxplot()
    plt.show()
def std_normalize(data):
    # Z-score scale the numeric columns ((x - mean) / std), zero-filling
    # NaNs first.
    data = data.select_dtypes(['int64', 'float64'])
    data = data.fillna(0)
    scaler = preprocessing.StandardScaler()
    x_scaled = scaler.fit_transform(data.values)
    return pd.DataFrame(x_scaled)
def add_year_published(data, year):
    data.loc[:, 'YEAR_PUB'] = str(year)
def add_key(data):
    logging.info('Adding publication ID...')
    data.loc[:, 'HASH_ID'] = data.apply(gen_key_from_fields, axis=1)
    return data
def get_duplicated_hashes(data):
    # A HASH_ID counted more than once means several rows share the same key
    hashes = [
        (key, val) for key, val in data.groupby(['HASH_ID']).count()['CALLE'].items()
        if val > 1
    ]
    logging.info('total: {} collisions: {}'.format(len(data), len(hashes)))
    for key, val in hashes:
        logging.info(' h:{{{}}}'.format(key))
    return hashes
# def remove_duplicated(data):
#     pre_len = len(data)
#     hashes = [e[0] for e in get_duplicated_hashes(data)]
#     data = data[~data.HASH_ID.isin(hashes)]
#     logging.info('{} => {}'.format(pre_len, len(data)))
#     return data
def consecutives(l):
    # Split a sorted list of integers into runs of consecutive values:
    # within a run, index - value is constant, so groupby on that
    # difference yields one group per run.
    for k, g in groupby(enumerate(l), lambda pair: pair[0] - pair[1]):
        yield list(map(itemgetter(1), g))
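# For example: list(consecutives([2005, 2006, 2008])) == [[2005, 2006], [2008]]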
def years_published(data):
    """
    Calculate the maximum number of consecutive years each property has
    been published: build a HASH_ID -> [year1, year2, ...] mapping, then
    split each year list into consecutive runs and keep the longest.
    """
    pivot = data.pivot(index='HASH_ID', columns='YEAR_PUB', values='YEAR_PUB')
    d = {}
    for e in pivot.itertuples():
        d[e.Index] = []
    for year in YEARS:
        for index, e in pivot[str(year)].items():
            if e == str(year):
                d[index].append(year)
    return [
        (key, max(len(e) for e in consecutives(year_list)))
        for key, year_list in d.items()
    ]
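# For example, a listing seen in 2005, 2006 and 2008 has the runs
# [2005, 2006] and [2008], so its tuple is (hash_id, 2)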
def main():
    # Merge all files, dropping rows with a duplicated HASH_ID
    data = merge_datasets(YEARS, remove_dupl_keys=True)
    show_types(data)

    # Build a dataframe with key HASH_ID and value YEAR_PUB_CONT
    yp = pd.DataFrame(years_published(data), columns=['HASH_ID', 'YEAR_PUB_CONT'])
    yp = yp.set_index(['HASH_ID'])

    # Join the original merged dataset (duplicates kept) with years_published
    data = merge_datasets(YEARS, remove_dupl_keys=False)
    data = data.join(yp, on='HASH_ID')
    data.to_csv('pubyear_2005_2007_20160512.csv', sep=';', index=False)

    # print(data.describe())
    # data = std_normalize(data)
    # boxplot(data)


if __name__ == '__main__':
    main()