jxnl · March 8, 2017 22:53
diff --git a/features.py b/features.py

 # coding: utf-8

 # # Loading and Transforming json data

 # In[1]:

 import pandas as pd
 import numpy as np
 import pickle
 from scipy import sparse

 # Locate the raw listing dumps relative to the working directory.
 data_path = "./"
 train_file = data_path + "train.json"
 test_file = data_path + "test.json"

 # Load both splits.  The test listing ids are captured here because the
 # listing_id column is dropped from the frame further down the script.
 train_df, test_df = (pd.read_json(path) for path in (train_file, test_file))
 listing_id = test_df["listing_id"].values


 # In[2]:

 # Encode the target: 'high' -> 0, 'medium' -> 1, 'low' -> 2.
 y_map = {'low': 2, 'medium': 1, 'high': 0}
 # Indexing y_map directly (rather than Series.map) keeps the behaviour of
 # raising KeyError on an unexpected label.  The codes are taken straight from
 # the Series, so no temporary column has to be added and dropped again.
 y_train = train_df['interest_level'].apply(lambda level: y_map[level]).values

 # Drop listing_id from both frames so it is not carried into the features.
 # `inplace` must be a real bool: pandas validates the argument and rejects the
 # integer 1 with a ValueError.
 train_df.drop('listing_id', axis=1, inplace=True)
 test_df.drop('listing_id', axis=1, inplace=True)


 # In[3]:

 # Row count of the training split, used later to split the stacked matrix.
 ntrain = train_df.shape[0]


 # ## Categoricals

 # In[4]:

 categorical_id = ["building_id", "manager_id"]


 # In[5]:

 # Weight of the global prior, expressed as a number of pseudo-observations
 # added to every category's per-level counts.
 n_fake_data = 5

 for cat in categorical_id:
     # Count training listings per (category value, interest level); pairs that
     # never occur count as zero.  size() avoids aggregating the grouping key
     # itself, which the original agg({cat: len}) relied on.
     counts = train_df.groupby([cat, "interest_level"]).size().unstack(fill_value=0)

     # Overall share of each interest level, used to smooth sparse categories.
     agg_prior = counts.sum(axis=0)
     agg_prior = agg_prior / agg_prior.sum()
     counts = counts + n_fake_data * agg_prior
     counts.columns = [cat + "_" + level for level in counts.columns]

     train_df = train_df.join(counts, on=cat)
     test_df = test_df.join(counts, on=cat)

     for c in counts.columns:
         # Category values absent from training have no counts after the join.
         # Fill each column with its own minimum.  The original called fillna
         # on the whole frame, which overwrote the NaNs of every column with
         # the first column's minimum.
         test_df[c] = test_df[c].fillna(test_df[c].min())

     # Normalise each frame by its own row totals.  Dividing the test columns
     # by the training totals aligns on the row index and mixes unrelated rows.
     train_total = train_df[counts.columns].sum(axis=1)
     test_total = test_df[counts.columns].sum(axis=1)

     for c in counts.columns:
         train_df[c + "_norm"] = train_df[c] / train_total
         test_df[c + "_norm"] = test_df[c] / test_total


 # In[6]:

 # Stack train on top of test so every later transform sees both splits; the
 # training rows stay first.
 train_test = pd.concat((train_df, test_df), axis=0).reset_index(drop=True)


 # In[7]:

 # Remove the raw label column.  `axis` is passed by keyword (the positional
 # form was removed in pandas 2.0) and `inplace` as a real bool (pandas rejects
 # the integer 1 with a ValueError).
 train_test.drop('interest_level', axis=1, inplace=True)


 # In[8]:

 # Flag listings whose building id is zero.
 # NOTE(review): the comparison is done on the string form so that both the
 # integer 0 and the string '0' match; `== 0` can never be true if the ids are
 # stored as strings -- confirm the dtype of building_id.
 train_test['Zero_building_id'] = (train_test.building_id.astype(str) == '0').astype(int)


 # In[9]:

 for cat in categorical_id:
     # Cumulative share of listings covered by the category values, accumulated
     # from the rarest value to the most frequent one.
     cat_percentile = train_test[cat].value_counts().sort_values().cumsum()
     cat_percentile = cat_percentile / cat_percentile.max()
     # Bucket the cumulative shares into 25 quantile bins and number the bins.
     cat_rank = pd.qcut(cat_percentile, 25).rank(method='dense')
     # Name the new columns explicitly.  The original relied on the joined
     # Series carrying the column's own name so that `rsuffix` applied;
     # value_counts() names its result "count" since pandas 2.0, which breaks
     # that.  Mapping by category value is the same left lookup as the join.
     train_test[cat + "_rank"] = train_test[cat].map(cat_rank)
     train_test[cat + "_percentile"] = train_test[cat].map(cat_percentile)


 # ## Descriptions

 # In[10]:

 import string

 # Work on a copy of the description with two markup remnants removed.
 train_test['desc'] = train_test['description'].apply(
     lambda text: text.replace('<p><a  website_redacted ', '').replace('!<br /><br />', '')
 )

 # Translation table that deletes every ASCII punctuation character.  The
 # original also called string.punctuation.__add__(...) three times; str is
 # immutable and the results were discarded, so those calls did nothing (and
 # '!', '(' and ')' are already part of string.punctuation).
 remove_punct_map = dict.fromkeys(map(ord, string.punctuation))

 train_test['desc'] = train_test['desc'].apply(lambda text: text.translate(remove_punct_map))
 # Character count is taken from the raw description, not the cleaned copy.
 train_test['desc_letters_count'] = train_test['description'].str.strip().apply(len)
 # split() without an argument splits on runs of whitespace and returns [] for
 # blank text, so repeated spaces and newlines no longer inflate the count.
 train_test['desc_words_count'] = train_test['desc'].apply(lambda text: len(text.split()))


 # In[11]:

 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

 # TF-IDF over lower-cased description unigrams and bigrams, English stop
 # words removed, vocabulary capped at 10000 terms.
 dsc_counts = TfidfVectorizer(ngram_range=(1, 2), stop_words='english', max_features=10000)
 dsc_sparse = dsc_counts.fit_transform(train_test.desc.str.lower())


 # In[12]:

 # The text columns are not needed in the frame any more.  `inplace` must be a
 # real bool: pandas rejects the integer 1 with a ValueError.
 train_test.drop(["desc", "description"], axis=1, inplace=True)


 # ## Features

 # In[13]:

 from sklearn.feature_extraction.text import CountVectorizer

 # The analyzer returns each row unchanged, so the row's own elements are the
 # counted tokens (assumes each row of `features` is an iterable of tags --
 # TODO confirm).
 feat_counts = CountVectorizer(analyzer=lambda tags: tags)
 feat_sparse = feat_counts.fit_transform(train_test.features)
 train_test['features_count'] = train_test['features'].apply(len)
 # `inplace` must be a real bool: pandas rejects the integer 1 with a ValueError.
 train_test.drop("features", axis=1, inplace=True)


 # ## Dates

 # In[14]:

 # Parse the creation timestamp, then fan its components out into columns.
 train_test['Date'] = pd.to_datetime(train_test['created'])

 # (new column, datetime accessor attribute), in the order the columns are added.
 date_parts = (
     ('Year', 'year'),
     ('Month', 'month'),
     ('Day', 'day'),
     ('Wday', 'dayofweek'),
     ('Yday', 'dayofyear'),
     ('hour', 'hour'),
 )
 for column, attribute in date_parts:
     train_test[column] = getattr(train_test['Date'].dt, attribute)

 # Only the derived parts are kept; both timestamp columns are removed.
 train_test = train_test.drop(['Date', 'created'], axis=1)


 # ## Address

 # In[15]:

 # Lower-cased working copy of the display address.
 train_test['address1'] = train_test['display_address'].map(lambda addr: addr.lower())

 # Single-token abbreviations and the full word each one expands to.
 address_map = {
     'w': 'west',
     'st.': 'street',
     'ave': 'avenue',
     'st': 'street',
     'e': 'east',
     'n': 'north',
     's': 'south'
 }


 def address_map_func(s):
     """Expand abbreviated tokens in an address string.

     The string is split on single spaces; tokens that are keys of
     ``address_map`` are replaced by their expansion, all other tokens are
     kept unchanged, and the tokens are re-joined with single spaces.
     """
     return ' '.join(address_map.get(token, token) for token in s.split(' '))


 # Strip punctuation first, then expand abbreviations token by token.
 train_test['address1'] = train_test['address1'].apply(
     lambda addr: address_map_func(addr.translate(remove_punct_map))
 )

 new_cols = ['street', 'avenue', 'east', 'west', 'north', 'south']

 # One 0/1 indicator per keyword; this is a substring test on the whole address.
 for col in new_cols:
     train_test[col] = train_test['address1'].apply(lambda addr, word=col: int(word in addr))

 # 1 when none of the keyword indicators is set for the row.
 train_test['other_address'] = (train_test[new_cols].sum(axis=1) == 0).astype(int)


 # ## Bedrooms

 # In[16]:

 from scipy.stats import boxcox

 # Box-Cox transform of price; the fitted lambda (second return value) is unused.
 bc_price, _fitted_lambda = boxcox(train_test.price)
 train_test['bc_price'] = bc_price

 # Bathrooms are turned into strings before factorizing, so their sorted codes
 # follow string order; bedrooms are factorized on their raw values.
 bathroom_codes, _ = pd.factorize(train_test['bathrooms'].apply(str).values, sort=True)
 train_test['bathrooms_cat'] = bathroom_codes

 bedroom_codes, _ = pd.factorize(train_test['bedrooms'].values, sort=True)
 train_test['bedroom_cat'] = bedroom_codes

 # The raw columns are replaced by the derived ones added above.
 train_test.drop(['price', 'bathrooms', 'bedrooms'], axis=1, inplace=True)


 # ## Photos

 # In[17]:

 # Keep only the length of each row's photos value, then remove the raw column.
 train_test["n_photos"] = train_test["photos"].map(len)
 del train_test["photos"]


 #
 # ## One hot

 # In[18]:

 from sklearn import preprocessing

 # Every column that is still object-typed at this point.
 categoricals = [x for x in train_test.columns if train_test[x].dtype == 'object']
 # NOTE(review): the original label-encoded each object column and dropped that
 # same column on the next line, so the encoding never reached the feature
 # matrix.  The net effect -- every object column is removed -- is kept here
 # without the discarded encoding work; if the encoded ids were meant to be
 # features, encode instead of dropping.  `inplace` is passed as a real bool
 # because pandas rejects the integer 1 with a ValueError.
 train_test.drop(categoricals, axis=1, inplace=True)

 print("[log] - training")

 # # XGBOOST

 # In[19]:

 # Column-wise concatenation of the frame with the two sparse text matrices,
 # converted to CSC.
 stacked = sparse.hstack((train_test, dsc_sparse, feat_sparse))
 train_test_cv1_sparse = stacked.tocsc()

 # Split the rows back at ntrain: training rows first, test rows after.
 x_train = train_test_cv1_sparse[:ntrain, :]
 x_test = train_test_cv1_sparse[ntrain:, :]

 for split in (x_train, x_test):
     print(split.shape)

 # In[20]:
 # Persist each artifact to its own pickle file.  Opening through `with`
 # closes every handle as soon as its dump finishes; the original left all
 # four files open.
 for filename, artifact in (
     ("x_train.m", x_train),
     ("y_train.m", y_train),
     ("x_test.m", x_test),
     ("listing_id.m", listing_id),
 ):
     with open(filename, "wb") as handle:
         pickle.dump(artifact, handle)

	# coding: utf-8

	# # Loading and Transforming json data

	# In[1]:

	import pandas as pd
	import numpy as np
	import pickle
	from scipy import sparse

	# Locate the raw listing dumps relative to the working directory.
	data_path = "./"
	train_file = data_path + "train.json"
	test_file = data_path + "test.json"

	# Load both splits.  The test listing ids are captured here because the
	# listing_id column is dropped from the frame further down the script.
	train_df, test_df = (pd.read_json(path) for path in (train_file, test_file))
	listing_id = test_df["listing_id"].values


	# In[2]:

	# Encode the target: 'high' -> 0, 'medium' -> 1, 'low' -> 2.
	y_map = {'low': 2, 'medium': 1, 'high': 0}
	# Indexing y_map directly (rather than Series.map) keeps the behaviour of
	# raising KeyError on an unexpected label.  The codes are taken straight from
	# the Series, so no temporary column has to be added and dropped again.
	y_train = train_df['interest_level'].apply(lambda level: y_map[level]).values

	# Drop listing_id from both frames so it is not carried into the features.
	# `inplace` must be a real bool: pandas validates the argument and rejects the
	# integer 1 with a ValueError.
	train_df.drop('listing_id', axis=1, inplace=True)
	test_df.drop('listing_id', axis=1, inplace=True)


	# In[3]:

	# Row count of the training split, used later to split the stacked matrix.
	ntrain = train_df.shape[0]


	# ## Categoricals

	# In[4]:

	categorical_id = ["building_id", "manager_id"]


	# In[5]:

	# Weight of the global prior, expressed as a number of pseudo-observations
	# added to every category's per-level counts.
	n_fake_data = 5

	# The loop bodies below are re-indented; in the original text they sat at
	# the same level as the `for` statements, which is a syntax error.
	for cat in categorical_id:
	    # Count training listings per (category value, interest level); pairs that
	    # never occur count as zero.  size() avoids aggregating the grouping key
	    # itself, which the original agg({cat: len}) relied on.
	    counts = train_df.groupby([cat, "interest_level"]).size().unstack(fill_value=0)

	    # Overall share of each interest level, used to smooth sparse categories.
	    agg_prior = counts.sum(axis=0)
	    agg_prior = agg_prior / agg_prior.sum()
	    counts = counts + n_fake_data * agg_prior
	    counts.columns = [cat + "_" + level for level in counts.columns]

	    train_df = train_df.join(counts, on=cat)
	    test_df = test_df.join(counts, on=cat)

	    for c in counts.columns:
	        # Category values absent from training have no counts after the join.
	        # Fill each column with its own minimum.  The original called fillna
	        # on the whole frame, which overwrote the NaNs of every column with
	        # the first column's minimum.
	        test_df[c] = test_df[c].fillna(test_df[c].min())

	    # Normalise each frame by its own row totals.  Dividing the test columns
	    # by the training totals aligns on the row index and mixes unrelated rows.
	    train_total = train_df[counts.columns].sum(axis=1)
	    test_total = test_df[counts.columns].sum(axis=1)

	    for c in counts.columns:
	        train_df[c + "_norm"] = train_df[c] / train_total
	        test_df[c + "_norm"] = test_df[c] / test_total


	# In[6]:

	# Stack train on top of test so every later transform sees both splits; the
	# training rows stay first.
	train_test = pd.concat((train_df, test_df), axis=0).reset_index(drop=True)


	# In[7]:

	# Remove the raw label column.  `axis` is passed by keyword (the positional
	# form was removed in pandas 2.0) and `inplace` as a real bool (pandas rejects
	# the integer 1 with a ValueError).
	train_test.drop('interest_level', axis=1, inplace=True)


	# In[8]:

	# Flag listings whose building id is zero.
	# NOTE(review): the comparison is done on the string form so that both the
	# integer 0 and the string '0' match; `== 0` can never be true if the ids are
	# stored as strings -- confirm the dtype of building_id.
	train_test['Zero_building_id'] = (train_test.building_id.astype(str) == '0').astype(int)


	# In[9]:

	# The loop body is re-indented; in the original text it sat at the same level
	# as the `for` statement, which is a syntax error.
	for cat in categorical_id:
	    # Cumulative share of listings covered by the category values, accumulated
	    # from the rarest value to the most frequent one.
	    cat_percentile = train_test[cat].value_counts().sort_values().cumsum()
	    cat_percentile = cat_percentile / cat_percentile.max()
	    # Bucket the cumulative shares into 25 quantile bins and number the bins.
	    cat_rank = pd.qcut(cat_percentile, 25).rank(method='dense')
	    # Name the new columns explicitly.  The original relied on the joined
	    # Series carrying the column's own name so that `rsuffix` applied;
	    # value_counts() names its result "count" since pandas 2.0, which breaks
	    # that.  Mapping by category value is the same left lookup as the join.
	    train_test[cat + "_rank"] = train_test[cat].map(cat_rank)
	    train_test[cat + "_percentile"] = train_test[cat].map(cat_percentile)


	# ## Descriptions

	# In[10]:

	import string

	# Work on a copy of the description with two markup remnants removed.
	train_test['desc'] = train_test['description'].apply(
	    lambda text: text.replace('<p><a website_redacted ', '').replace('!<br /><br />', '')
	)

	# Translation table that deletes every ASCII punctuation character.  The
	# original also called string.punctuation.__add__(...) three times; str is
	# immutable and the results were discarded, so those calls did nothing (and
	# '!', '(' and ')' are already part of string.punctuation).
	remove_punct_map = dict.fromkeys(map(ord, string.punctuation))

	train_test['desc'] = train_test['desc'].apply(lambda text: text.translate(remove_punct_map))
	# Character count is taken from the raw description, not the cleaned copy.
	train_test['desc_letters_count'] = train_test['description'].str.strip().apply(len)
	# split() without an argument splits on runs of whitespace and returns [] for
	# blank text, so repeated spaces and newlines no longer inflate the count.
	train_test['desc_words_count'] = train_test['desc'].apply(lambda text: len(text.split()))


	# In[11]:

	from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

	# TF-IDF over lower-cased description unigrams and bigrams, English stop
	# words removed, vocabulary capped at 10000 terms.
	dsc_counts = TfidfVectorizer(ngram_range=(1, 2), stop_words='english', max_features=10000)
	dsc_sparse = dsc_counts.fit_transform(train_test.desc.str.lower())


	# In[12]:

	# The text columns are not needed in the frame any more.  `inplace` must be a
	# real bool: pandas rejects the integer 1 with a ValueError.
	train_test.drop(["desc", "description"], axis=1, inplace=True)


	# ## Features

	# In[13]:

	from sklearn.feature_extraction.text import CountVectorizer

	# The analyzer returns each row unchanged, so the row's own elements are the
	# counted tokens (assumes each row of `features` is an iterable of tags --
	# TODO confirm).
	feat_counts = CountVectorizer(analyzer=lambda tags: tags)
	feat_sparse = feat_counts.fit_transform(train_test.features)
	train_test['features_count'] = train_test['features'].apply(len)
	# `inplace` must be a real bool: pandas rejects the integer 1 with a ValueError.
	train_test.drop("features", axis=1, inplace=True)


	# ## Dates

	# In[14]:

	# Parse the creation timestamp, then fan its components out into columns.
	train_test['Date'] = pd.to_datetime(train_test['created'])

	# (new column, datetime accessor attribute), in the order the columns are added.
	date_parts = (
	    ('Year', 'year'),
	    ('Month', 'month'),
	    ('Day', 'day'),
	    ('Wday', 'dayofweek'),
	    ('Yday', 'dayofyear'),
	    ('hour', 'hour'),
	)
	for column, attribute in date_parts:
	    train_test[column] = getattr(train_test['Date'].dt, attribute)

	# Only the derived parts are kept; both timestamp columns are removed.
	train_test = train_test.drop(['Date', 'created'], axis=1)


	# ## Address

	# In[15]:

	# Lower-cased working copy of the display address.
	train_test['address1'] = train_test['display_address'].map(lambda addr: addr.lower())

	# Single-token abbreviations and the full word each one expands to.
	address_map = {
	    'w': 'west',
	    'st.': 'street',
	    'ave': 'avenue',
	    'st': 'street',
	    'e': 'east',
	    'n': 'north',
	    's': 'south'
	}


	def address_map_func(s):
	    """Expand abbreviated tokens in an address string.

	    The string is split on single spaces; tokens that are keys of
	    ``address_map`` are replaced by their expansion, all other tokens are
	    kept unchanged, and the tokens are re-joined with single spaces.

	    The body is indented under the ``def``; in the original text it sat at
	    the same level as the ``def`` line, which is a syntax error.
	    """
	    return ' '.join(address_map.get(token, token) for token in s.split(' '))


	# Strip punctuation first, then expand abbreviations token by token.
	train_test['address1'] = train_test['address1'].apply(
	    lambda addr: address_map_func(addr.translate(remove_punct_map))
	)

	new_cols = ['street', 'avenue', 'east', 'west', 'north', 'south']

	# One 0/1 indicator per keyword; this is a substring test on the whole address.
	# The loop body is re-indented; in the original text it sat at the same level
	# as the `for` statement, which is a syntax error.
	for col in new_cols:
	    train_test[col] = train_test['address1'].apply(lambda addr, word=col: int(word in addr))

	# 1 when none of the keyword indicators is set for the row.
	train_test['other_address'] = (train_test[new_cols].sum(axis=1) == 0).astype(int)


	# ## Bedrooms

	# In[16]:

	from scipy.stats import boxcox

	# Box-Cox transform of price; the fitted lambda (second return value) is unused.
	bc_price, _fitted_lambda = boxcox(train_test.price)
	train_test['bc_price'] = bc_price

	# Bathrooms are turned into strings before factorizing, so their sorted codes
	# follow string order; bedrooms are factorized on their raw values.
	bathroom_codes, _ = pd.factorize(train_test['bathrooms'].apply(str).values, sort=True)
	train_test['bathrooms_cat'] = bathroom_codes

	bedroom_codes, _ = pd.factorize(train_test['bedrooms'].values, sort=True)
	train_test['bedroom_cat'] = bedroom_codes

	# The raw columns are replaced by the derived ones added above.
	train_test.drop(['price', 'bathrooms', 'bedrooms'], axis=1, inplace=True)


	# ## Photos

	# In[17]:

	# Keep only the length of each row's photos value, then remove the raw column.
	train_test["n_photos"] = train_test["photos"].map(len)
	del train_test["photos"]


	#
	# ## One hot

	# In[18]:

	from sklearn import preprocessing

	# Every column that is still object-typed at this point.
	categoricals = [x for x in train_test.columns if train_test[x].dtype == 'object']
	# NOTE(review): the original label-encoded each object column and dropped that
	# same column on the next line, so the encoding never reached the feature
	# matrix.  The net effect -- every object column is removed -- is kept here
	# without the discarded encoding work; if the encoded ids were meant to be
	# features, encode instead of dropping.  `inplace` is passed as a real bool
	# because pandas rejects the integer 1 with a ValueError.  This also replaces
	# the original loop, whose body had lost its indentation.
	train_test.drop(categoricals, axis=1, inplace=True)

	print("[log] - training")

	# # XGBOOST

	# In[19]:

	# Column-wise concatenation of the frame with the two sparse text matrices,
	# converted to CSC.
	stacked = sparse.hstack((train_test, dsc_sparse, feat_sparse))
	train_test_cv1_sparse = stacked.tocsc()

	# Split the rows back at ntrain: training rows first, test rows after.
	x_train = train_test_cv1_sparse[:ntrain, :]
	x_test = train_test_cv1_sparse[ntrain:, :]

	for split in (x_train, x_test):
	    print(split.shape)

	# In[20]:
	# Persist each artifact to its own pickle file.  Opening through `with`
	# closes every handle as soon as its dump finishes; the original left all
	# four files open.
	for filename, artifact in (
	    ("x_train.m", x_train),
	    ("y_train.m", y_train),
	    ("x_test.m", x_test),
	    ("listing_id.m", listing_id),
	):
	    with open(filename, "wb") as handle:
	        pickle.dump(artifact, handle)