"""Datamining UBA: add publication-year features to merged property-listing datasets.

Gist by @abarmat, created May 13, 2016.
"""
import hashlib
import logging
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn import preprocessing
from itertools import groupby
from operator import itemgetter
logging.basicConfig(
format='%(asctime)s (%(levelname)s) %(message)s',
datefmt='[%Y-%m-%d %H:%M:%S]',
level=logging.INFO
)
YEARS = range(2005, 2015)  # 2005..2014 inclusive
DATA_FILENAME = 'datasets/departamentos-en-venta-{}.csv'
def gen_key_from_fields(e):
    """Build a stable row key by hashing a few identifying fields."""
    return hashlib.md5(':'.join(map(str, [
        e.CALLE, e.NUMERO, e.M2, e.ORIENT
        # e.BAULERA, e.COCHERA, e.LAVADERO, e.TERRAZA
    ])).encode('utf-8')).hexdigest()
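# Rows that repeat the same street, number, surface and orientation across the
# yearly files therefore share one HASH_ID (an MD5 hexdigest, a fixed
# 32-character hex string), which is what lets years_published() track
# re-publications of the same property.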
def merge_datasets(years, remove_dupl_keys=False):
    """Read one CSV per year, tag each row with its year and hash key,
    and concatenate everything into a single dataframe."""
    dfs = []
    for year in years:
        logging.info(DATA_FILENAME.format(year))
        try:
            data = pd.read_csv(DATA_FILENAME.format(year), sep=';')
            convert_attribute_types(data)
            add_year_published(data, year)
            add_key(data)
            if remove_dupl_keys:
                data = data.drop_duplicates(['HASH_ID'])
            dfs.append(data)
        except Exception:
            logging.exception('Error opening year {}'.format(year))
    df = pd.concat(dfs)
    # Keep the column order of the first file read
    return df[dfs[0].columns]
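# Usage sketch (assumes the yearly CSVs exist under datasets/):
#   merge_datasets(range(2005, 2015), remove_dupl_keys=True)
# reads datasets/departamentos-en-venta-2005.csv through -2014.csv, dropping
# duplicate HASH_IDs within each yearly file; the same key can still appear
# across years, which years_published() relies on.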
def sanitize_geo(value):
    """Normalize a coordinate: strip any decimal point and reinsert it
    after the first three characters (sign plus two integer digits)."""
    v = str(value).replace('.', '')
    return '{}.{}'.format(v[0:3], v[3:])
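# Buenos Aires coordinates carry a sign plus two integer digits, so e.g.
# sanitize_geo('-34123456') -> '-34.123456' (the value is illustrative,
# not taken from the datasets).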
def convert_attribute_types(data):
    """Convert attribute types so they match the actual data."""
    data['NUMERO'] = data['NUMERO'].astype(str)
    data['COMUNA'] = data['COMUNA'].astype(str)
    for field in ['LAT', 'LON']:
        data[field] = data[field].apply(sanitize_geo)
def show_types(data):
    """Print the number of columns and the dtype of each one."""
    print('Total Columns: {}'.format(len(data.columns)))
    for col in data:
        print('  {} {}'.format(col, data[col].dtype))
def boxplot(data):
data.boxplot()
plt.show()
def std_normalize(data):
    """Standardize numeric columns to zero mean and unit variance."""
    data = data.select_dtypes(['int64', 'float64'])
    data = data.fillna(0)
    scaler = preprocessing.StandardScaler()
    x_scaled = scaler.fit_transform(data.values)
    # Keep the original index and column labels instead of discarding them
    return pd.DataFrame(x_scaled, index=data.index, columns=data.columns)
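# Exploratory helper; main() keeps the matching calls commented out, e.g.:
#   boxplot(std_normalize(data))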
def add_year_published(data, year):
    """Tag every row with the year of the source file."""
    data.loc[:, 'YEAR_PUB'] = str(year)

def add_key(data):
    logging.info('Adding publication ID...')
    data.loc[:, 'HASH_ID'] = data.apply(gen_key_from_fields, axis=1)
    return data
def get_duplicated_hashes(data):
    """Return the (HASH_ID, count) pairs that occur more than once."""
    hashes = [
        (key, val) for key, val in data.groupby(['HASH_ID']).count()['CALLE'].items()
        if val > 1
    ]
    logging.info('total: {} collisions: {}'.format(len(data), len(hashes)))
    for key, val in hashes:
        logging.info('  h:{{{}}}'.format(key))
    return hashes
# def remove_duplicated(data):
# pre_len = len(data)
# hashes = [e[0] for e in get_duplicated_hashes(data)]
# data = data[~data.HASH_ID.isin(hashes)]
# logging.info('{} => {}'.format(pre_len, len(data)))
# return data
def consecutives(l):
    """Split a sorted list of integers into runs of consecutive values."""
    for k, g in groupby(enumerate(l), lambda pair: pair[0] - pair[1]):
        yield [x for _, x in g]
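# The groupby key works because index - value stays constant inside a
# consecutive run, e.g. list(consecutives([1, 2, 3, 7, 8])) -> [[1, 2, 3], [7, 8]].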
def years_published(data):
    """
    Calculate the longest run of consecutive years each property was published.
    Builds a HASH_ID -> [year1, year2, ...] mapping, then finds the
    consecutive runs and returns the length of the longest one per key.
    """
    pivot = data.pivot(index='HASH_ID', columns='YEAR_PUB', values='YEAR_PUB')
    d = {}
    for e in pivot.itertuples():
        d[e.Index] = []
    for year in YEARS:
        for index, e in pivot[str(year)].items():
            if e == str(year):
                d[index].append(year)
    return [
        (key, max(len(run) for run in consecutives(year_list)))
        for key, year_list in d.items()
    ]
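# For example, a property published in 2005, 2006, 2008, 2009 and 2010 splits
# into runs [2005, 2006] and [2008, 2009, 2010], so its longest streak is 3.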
def main():
    # Merge all files, dropping duplicated HASH_IDs within each year
    data = merge_datasets(YEARS, remove_dupl_keys=True)
    show_types(data)

    # Build a dataframe indexed by HASH_ID with the longest publication
    # streak per property (column YEAR_PUB_CONT)
    yp = pd.DataFrame(years_published(data), columns=['HASH_ID', 'YEAR_PUB_CONT'])
    yp = yp.set_index(['HASH_ID'])

    # Join the full merged dataset (duplicates kept) with years_published
    data = merge_datasets(YEARS, remove_dupl_keys=False)
    data = data.join(yp, on='HASH_ID')
    data.to_csv('pubyear_2005_2007_20160512.csv', sep=';', index=False)

    # print(data.describe())
    # data = std_normalize(data)
    # boxplot(data)
if __name__ == '__main__':
main()