Skip to content

Instantly share code, notes, and snippets.

@jxnl
Created March 8, 2017 22:53
Show Gist options
  • Save jxnl/802b20c44d1efb76b06b15340dc184c3 to your computer and use it in GitHub Desktop.
Save jxnl/802b20c44d1efb76b06b15340dc184c3 to your computer and use it in GitHub Desktop.
# coding: utf-8
# # Loading and Transforming json data
# In[1]:
import pandas as pd
import numpy as np
import pickle
from scipy import sparse
# Locations of the raw train/test JSON files, relative to the working directory.
data_path = "./"
train_file, test_file = (data_path + fname for fname in ("train.json", "test.json"))

# Parse both JSON files into DataFrames.
train_df, test_df = (pd.read_json(path) for path in (train_file, test_file))

# Keep the test listing ids as a plain array so they survive later column drops.
listing_id = test_df["listing_id"].values
# In[2]:
# Integer encoding of the target: 'high' -> 0, 'medium' -> 1, 'low' -> 2.
y_map = {'low': 2, 'medium': 1, 'high': 0}
# Index the dict (rather than Series.map) so an unexpected label raises KeyError
# instead of silently becoming NaN. No temporary column is added to train_df.
y_train = train_df['interest_level'].apply(lambda level: y_map[level]).values
# `inplace` has to be a real bool (the original passed the integer 1, which
# current pandas rejects); reassigning the result avoids the flag altogether.
train_df = train_df.drop('listing_id', axis=1)
test_df = test_df.drop('listing_id', axis=1)
# In[3]:
# Number of training rows; used later to split the stacked matrix back apart.
ntrain = train_df.shape[0]
# ## Categoricals
# In[4]:
# Id-like columns that get per-interest-level count features.
categorical_id = ["building_id", "manager_id"]
# In[5]:
# Number of pseudo-observations added to every category's counts (smoothing).
n_fake_data = 5
for cat in categorical_id:
    # Rows: category value; columns: interest level; cells: training-row counts.
    counts = (
        train_df.groupby([cat, "interest_level"])
        .size()
        .unstack(fill_value=0)
    )
    # Overall share of each interest level across all counted training rows.
    agg_prior = counts.sum(axis=0)
    agg_prior = agg_prior / agg_prior.sum()
    # Add n_fake_data pseudo-rows, distributed like the overall prior.
    smoothed = counts + n_fake_data * agg_prior
    smoothed.columns = [cat + "_" + level for level in smoothed.columns]
    count_cols = list(smoothed.columns)

    train_df = train_df.join(smoothed, on=cat)
    test_df = test_df.join(smoothed, on=cat)

    # Test rows whose category never occurs in training come out of the join
    # as NaN. Fill ONLY the count column at hand (the original filled every
    # NaN in the whole frame with the first column's value) and use the
    # NaN-skipping pandas min (builtin min() is unreliable when NaN is present).
    for col in count_cols:
        test_df[col] = test_df[col].fillna(test_df[col].min())

    # Each frame is normalised by its own row totals. The original divided the
    # test columns by the *training* totals, which pandas aligns by index
    # label and therefore pairs test rows with the wrong (or no) denominator.
    train_total = train_df[count_cols].sum(axis=1)
    test_total = test_df[count_cols].sum(axis=1)
    for col in count_cols:
        train_df[col + "_norm"] = train_df[col] / train_total
        test_df[col + "_norm"] = test_df[col] / test_total
# In[6]:
# Stack the training rows on top of the test rows and renumber the index so
# that later feature engineering is applied to both in one pass.
train_test = pd.concat((train_df, test_df), axis=0).reset_index(drop=True)
# In[7]:
# Remove the raw label column from the stacked frame. `axis` is passed by
# keyword and the result is reassigned: the original's positional axis and
# integer `inplace=1` are not accepted by current pandas.
train_test = train_test.drop('interest_level', axis=1)
# In[8]:
# 1 when the listing's building_id is the zero placeholder, else 0.
# NOTE(review): building_id looks like a string identifier here (it is treated
# as a categorical id column above), in which case comparing with the integer
# 0 never matches. Both spellings are accepted — confirm that '0' is the
# placeholder used in the data.
train_test['Zero_building_id'] = train_test['building_id'].isin([0, '0']).astype(int)
# In[9]:
for cat in categorical_id:
    # Cumulative share of rows covered by the categories, rarest first:
    # the most frequent category ends at 1.0.
    cat_percentile = train_test[cat].value_counts().sort_values().cumsum()
    cat_percentile = cat_percentile / cat_percentile.max()
    # Bucket the cumulative shares into 25 quantile bins and number the bins
    # densely in increasing order.
    cat_rank = pd.qcut(cat_percentile, 25).rank(method='dense')
    # Name the joined columns explicitly. The original relied on `rsuffix`
    # together with the name pandas gives the value_counts() result, which
    # differs between pandas versions and can yield wrong or clashing names.
    train_test = train_test.join(cat_rank.rename(cat + "_rank"), on=cat)
    train_test = train_test.join(cat_percentile.rename(cat + "_percentile"), on=cat)
# ## Descriptions
# In[10]:
import string

# Translation table that deletes every ASCII punctuation character.
# The original also called string.punctuation.__add__('!!'), __add__('(') and
# __add__(')') and discarded the results; strings are immutable, so those
# calls did nothing (and '(', ')' and '!' are already in string.punctuation).
remove_punct_map = dict.fromkeys(map(ord, string.punctuation))


def _clean_description(text):
    """Return *text* without the known HTML leftovers and ASCII punctuation."""
    text = text.replace('<p><a website_redacted ', '')
    text = text.replace('!<br /><br />', '')
    return text.translate(remove_punct_map)


# Cleaned copy of the description; the raw column is kept for the length count.
train_test['desc'] = train_test['description'].apply(_clean_description)
# Character count of the raw description after trimming surrounding whitespace.
train_test['desc_letters_count'] = train_test['description'].str.strip().apply(len)
# Number of single-space-separated tokens in the cleaned text (0 when blank).
train_test['desc_words_count'] = train_test['desc'].apply(
    lambda text: len(text.split(' ')) if text.strip() else 0
)
# In[11]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# TF-IDF over unigrams and bigrams of the cleaned, lower-cased descriptions,
# with English stop words removed and the vocabulary capped at 10000 terms.
dsc_counts = TfidfVectorizer(ngram_range=(1, 2), stop_words='english', max_features=10000)
dsc_sparse = dsc_counts.fit_transform(train_test.desc.str.lower())
# In[12]:
# The text columns are now represented by dsc_sparse and the count features.
# Reassign instead of `inplace=1`: `inplace` must be a bool in current pandas.
train_test = train_test.drop(["desc", "description"], axis=1)
# ## Features
# In[13]:
from sklearn.feature_extraction.text import CountVectorizer
# Identity analyzer: each row's `features` value is handed to the vectorizer
# unchanged as its token sequence.
# NOTE(review): presumably every value is a list of feature strings — confirm.
feat_counts = CountVectorizer(analyzer=lambda tokens: tokens)
feat_sparse = feat_counts.fit_transform(train_test.features)
# Number of entries in each listing's features value.
train_test['features_count'] = train_test['features'].apply(len)
# Reassign instead of `inplace=1`: `inplace` must be a bool in current pandas.
train_test = train_test.drop("features", axis=1)
# ## Dates
# In[14]:
# Parse the creation timestamp once, then fan it out into calendar features.
created_at = pd.to_datetime(train_test['created'])
date_parts = (
    ('Year', 'year'),
    ('Month', 'month'),
    ('Day', 'day'),
    ('Wday', 'dayofweek'),
    ('Yday', 'dayofyear'),
    ('hour', 'hour'),
)
for column, accessor in date_parts:
    train_test[column] = getattr(created_at.dt, accessor)
# Only the derived parts are kept; the raw timestamp column is removed.
train_test = train_test.drop('created', axis=1)
# ## Address
# In[15]:
# Lower-cased working copy of the display address.
train_test['address1'] = train_test['display_address'].map(lambda addr: addr.lower())
# Abbreviated address token -> full word.
address_map = {
    'w': 'west',
    'st.': 'street',
    'ave': 'avenue',
    'st': 'street',
    'e': 'east',
    'n': 'north',
    's': 'south',
}


def address_map_func(s):
    """Expand abbreviated tokens in a space-separated address string.

    The string is split on single spaces. Every token that is a key of
    ``address_map`` is replaced by its full word; all other tokens
    (including empty ones produced by consecutive spaces) are kept as they
    are. Matching is exact and case-sensitive. The tokens are re-joined
    with single spaces.
    """
    return ' '.join(address_map.get(token, token) for token in s.split(' '))
# Strip punctuation first, then expand abbreviations, in one pass per address.
train_test['address1'] = train_test['address1'].apply(
    lambda addr: address_map_func(addr.translate(remove_punct_map))
)
# One 0/1 flag per keyword: 1 when the keyword occurs anywhere in the address.
new_cols = ['street', 'avenue', 'east', 'west', 'north', 'south']
for col in new_cols:
    train_test[col] = train_test['address1'].apply(
        lambda addr, word=col: int(word in addr)
    )
# 1 when none of the keyword flags is set for the row, else 0.
train_test['other_address'] = (train_test[new_cols].sum(axis=1) == 0).astype(int)
# ## Bedrooms
# In[16]:
from scipy.stats import boxcox
# Box-Cox transform of the price; the fitted lambda (second return value) is
# not used afterwards.
bc_price, _ = boxcox(train_test.price)
train_test['bc_price'] = bc_price
train_test.drop('price', axis=1, inplace=True)
# Integer codes for the bathroom counts, numbered in increasing numeric order.
# The original converted the values to strings before the sorted factorize,
# which orders them lexicographically (e.g. '10.0' before '2.0'); factorizing
# the numbers directly matches how bedrooms are handled below.
train_test['bathrooms_cat'], _ = pd.factorize(train_test['bathrooms'].values, sort=True)
train_test.drop('bathrooms', axis=1, inplace=True)
# Integer codes for the bedroom counts, numbered in increasing numeric order.
train_test['bedroom_cat'], _ = pd.factorize(train_test['bedrooms'].values, sort=True)
train_test.drop('bedrooms', axis=1, inplace=True)
# ## Photos
# In[17]:
# Replace the photos column with the number of entries it holds per listing:
# pop() removes the column and hands it back for counting.
train_test["n_photos"] = train_test.pop("photos").apply(len)
#
# ## One hot
# In[18]:
from sklearn import preprocessing
# Every column still stored with object dtype at this point.
categoricals = [x for x in train_test.columns if train_test[x].dtype == 'object']
# Remove them so that only numeric columns remain in the frame.
# NOTE(review): the original fitted a LabelEncoder on each of these columns,
# wrote the codes back and then dropped the column in the same iteration, so
# the encoded values were never used. The net effect (columns removed) is
# kept here without the wasted encoding; if the codes were meant to be
# features, encode with preprocessing.LabelEncoder and do not drop.
# Reassigning also replaces `inplace=1`, which is not a valid bool flag.
train_test = train_test.drop(categoricals, axis=1)
print("[log] - training")
# # XGBOOST
# In[19]:
# Put the tabular columns, the description TF-IDF matrix and the features
# count matrix side by side, then convert to CSC format.
stacked = sparse.hstack((train_test, dsc_sparse, feat_sparse))
train_test_cv1_sparse = stacked.tocsc()
# The first ntrain rows are the training listings, the remainder the test ones.
x_train, x_test = train_test_cv1_sparse[:ntrain, :], train_test_cv1_sparse[ntrain:, :]
for split in (x_train, x_test):
    print(split.shape)
# In[20]:
# Persist the model inputs. Each file is opened in a `with` block so the
# handle is flushed and closed deterministically (the original left the
# handles returned by open() unclosed).
for payload, filename in (
    (x_train, "x_train.m"),
    (y_train, "y_train.m"),
    (x_test, "x_test.m"),
    (listing_id, "listing_id.m"),
):
    with open(filename, "wb") as handle:
        pickle.dump(payload, handle)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment