Created
March 8, 2017 22:53
-
-
Save jxnl/802b20c44d1efb76b06b15340dc184c3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # coding: utf-8 | |
| # # Loading and Transforming json data | |
| # In[1]: | |
| import pandas as pd | |
| import numpy as np | |
| import pickle | |
| from scipy import sparse | |
# Directory holding the input files, and the two JSON files read from it.
data_path = "./"
train_file = data_path + "train.json"
test_file = data_path + "test.json"

# Load the training frame first, then the test frame.
train_df, test_df = (pd.read_json(path) for path in (train_file, test_file))

# Keep the test listing ids as a plain array so they can be saved on their own.
listing_id = test_df["listing_id"].values
| # In[2]: | |
# Integer class label assigned to each interest level.
y_map = {'low': 2, 'medium': 1, 'high': 0}

# Encode the target directly instead of going through a temporary column.
# The explicit lookup still raises KeyError on an unexpected label.
y_train = train_df['interest_level'].apply(lambda level: y_map[level]).values

# 'interest_level' itself stays in train_df; only the id column is removed.
# `inplace` must be a real bool: pandas rejects `inplace=1`.
train_df.drop('listing_id', axis=1, inplace=True)
test_df.drop('listing_id', axis=1, inplace=True)

# In[3]:

# Number of training rows.
ntrain = train_df.shape[0]
| # ## Categoricals | |
| # In[4]: | |
# Identifier columns that get per-value interest-level statistics.
categorical_id = ["building_id", "manager_id"]

# In[5]:

# Weight of the global interest-level distribution added to every count row.
n_fake_data = 5
for cat in categorical_id:
    # Listings per (category value, interest level): rows are category
    # values, columns are interest levels, missing combinations count as 0.
    df = train_df.groupby([cat, "interest_level"]).size().unstack(fill_value=0)

    # Overall share of each interest level, used to smooth the raw counts.
    agg_prior = df.sum(0)
    agg_prior = agg_prior / agg_prior.sum()
    df = df + n_fake_data * agg_prior
    df.columns = [cat + "_" + c for c in df.columns]

    train_df = train_df.join(df, on=cat)
    test_df = test_df.join(df, on=cat)

    # Category values absent from the training data produce NaN in test_df.
    # Fill each column with its own NaN-aware minimum. The previous code
    # filled every NaN in the whole frame with the first column's value and
    # used the builtin min(), which is order-dependent when NaN is present.
    for c in df.columns:
        test_df[c] = test_df[c].fillna(test_df[c].min())

    # Row-normalise each frame by its own row sums. Dividing test rows by the
    # train sums aligned the two on unrelated index labels.
    normalization = train_df[df.columns].sum(1)
    test_normalization = test_df[df.columns].sum(1)
    for c in df.columns:
        train_df[c + "_norm"] = train_df[c] / normalization
        test_df[c + "_norm"] = test_df[c] / test_normalization
| # In[6]: | |
# Stack train rows on top of test rows and renumber them 0..n-1, so the first
# `ntrain` rows are the training rows.
train_test = pd.concat((train_df, test_df), axis=0).reset_index(drop=True)

# In[7]:

# `axis` and `inplace` are spelled out as a keyword and a real bool: current
# pandas rejects both the positional axis and `inplace=1`.
train_test.drop('interest_level', axis=1, inplace=True)

# In[8]:

# Flag rows whose building id is the zero placeholder.
# NOTE(review): building_id is presumably read as a string column, in which
# case `== 0` never matches; both the int and the string form are accepted
# here. Confirm against the data.
train_test['Zero_building_id'] = train_test.building_id.isin([0, '0']).astype(int)
| # In[9]: | |
for cat in categorical_id:
    # Cumulative share of rows covered by the category values, ordered from
    # the rarest value to the most frequent one (ends at 1.0).
    cat_percentile = train_test[cat].value_counts().sort_values().cumsum()
    cat_percentile = cat_percentile / cat_percentile.max()

    # Bucket the cumulative shares into 25 quantile bins and number the bins.
    cat_rank = pd.qcut(cat_percentile, 25).rank(method='dense')

    # Name the new columns explicitly. Relying on `rsuffix` only produced
    # "<cat>_rank" / "<cat>_percentile" when value_counts() named its result
    # after the column, which is no longer the case in pandas 2.x.
    train_test = train_test.join(cat_rank.rename(cat + "_rank"), on=cat)
    train_test = train_test.join(cat_percentile.rename(cat + "_percentile"), on=cat)
| # ## Descriptions | |
| # In[10]: | |
import string

# Cleaned copy of the description: strip two markup fragments first.
train_test['desc'] = train_test['description'].apply(
    lambda x: x.replace('<p><a website_redacted ', '').replace('!<br /><br />', '')
)

# Translation table that deletes every ASCII punctuation character.
# The former `string.punctuation.__add__(...)` statements were removed: they
# built new strings that were immediately discarded, so they had no effect.
remove_punct_map = dict.fromkeys(map(ord, string.punctuation))
train_test['desc'] = train_test['desc'].apply(lambda x: x.translate(remove_punct_map))

# Length of the raw description, ignoring surrounding whitespace.
train_test['desc_letters_count'] = train_test['description'].str.strip().apply(len)
# Space-separated pieces in the cleaned description; 0 for blank text.
train_test['desc_words_count'] = train_test['desc'].apply(
    lambda x: len(x.split(' ')) if x.strip() else 0
)
| # In[11]: | |
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# TF-IDF over unigrams and bigrams of the lower-cased, cleaned descriptions,
# limited to 10000 features with English stop words removed.
dsc_counts = TfidfVectorizer(ngram_range=(1, 2), stop_words='english', max_features=10000)
dsc_sparse = dsc_counts.fit_transform(train_test.desc.str.lower())

# In[12]:

# The text columns are no longer needed once vectorised.
# `inplace` must be a real bool: pandas rejects `inplace=1`.
train_test.drop(["desc", "description"], axis=1, inplace=True)
| # ## Features | |
| # In[13]: | |
from sklearn.feature_extraction.text import CountVectorizer

# The identity analyzer makes the vectorizer use each row's items as tokens
# as-is, with no tokenisation.
# NOTE(review): assumes every `features` entry is a list of strings - confirm.
feat_counts = CountVectorizer(analyzer=lambda tokens: tokens)
feat_sparse = feat_counts.fit_transform(train_test.features)

# Number of items in each row's feature list.
train_test['features_count'] = train_test['features'].apply(len)
# `inplace` must be a real bool: pandas rejects `inplace=1`.
train_test.drop("features", axis=1, inplace=True)
| # ## Dates | |
| # In[14]: | |
# Parse the creation timestamp once, then expose its calendar components as
# separate columns (output column name -> datetime accessor attribute).
created_at = pd.to_datetime(train_test['created'])
date_parts = (
    ('Year', 'year'),
    ('Month', 'month'),
    ('Day', 'day'),
    ('Wday', 'dayofweek'),
    ('Yday', 'dayofyear'),
    ('hour', 'hour'),
)
for column, attribute in date_parts:
    train_test[column] = getattr(created_at.dt, attribute)

# The raw timestamp column is dropped once the parts are extracted.
train_test = train_test.drop('created', axis=1)
| # ## Address | |
| # In[15]: | |
# Working copy of the display address, lower-cased for matching.
train_test['address1'] = train_test['display_address'].map(lambda address: address.lower())
# Lower-case address abbreviations and the full word each one expands to.
address_map = {
    'w': 'west',
    'st.': 'street',
    'ave': 'avenue',
    'st': 'street',
    'e': 'east',
    'n': 'north',
    's': 'south',
}


def address_map_func(s):
    """Expand abbreviated tokens in a space-separated address string.

    The string is split on single spaces; every token that is a key of
    ``address_map`` is replaced by its mapped value and all other tokens
    (including empty ones produced by repeated spaces) are kept unchanged.
    Matching is case-sensitive, so callers should lower-case the input first.

    Args:
        s: The address text to normalise.

    Returns:
        The tokens joined back together with single spaces.
    """
    return ' '.join(address_map.get(token, token) for token in s.split(' '))
# Strip punctuation from each address, then expand its abbreviations.
train_test['address1'] = train_test['address1'].apply(
    lambda text: address_map_func(text.translate(remove_punct_map))
)

# One 0/1 column per keyword: 1 when the keyword occurs anywhere in the
# normalised address (plain substring match).
new_cols = ['street', 'avenue', 'east', 'west', 'north', 'south']
for col in new_cols:
    train_test[col] = train_test['address1'].str.contains(col, regex=False).astype(int)

# 1 for addresses that matched none of the keywords above.
train_test['other_address'] = (train_test[new_cols].sum(axis=1) == 0).astype(int)
| # ## Price, bathrooms and bedrooms | |
| # In[16]: | |
from scipy.stats import boxcox

# Box-Cox transform of the price; `tmp` receives the second value returned by
# boxcox (the fitted lambda) and is not used afterwards.
bc_price, tmp = boxcox(train_test.price)
train_test['bc_price'] = bc_price

# Integer codes for the bathroom counts (factorised on their string form) and
# for the bedroom counts, both with sorted categories.
train_test['bathrooms_cat'], labels = pd.factorize(
    train_test['bathrooms'].map(str).values, sort=True
)
train_test['bedroom_cat'], labels = pd.factorize(train_test['bedrooms'].values, sort=True)

# Remove the raw columns now that their derived versions exist.
train_test.drop(['price', 'bathrooms', 'bedrooms'], axis=1, inplace=True)
| # ## Photos | |
| # In[17]: | |
# Replace the photo collection of each listing with its length.
train_test["n_photos"] = train_test.pop("photos").map(len)
| # | |
| # ## One hot | |
| # In[18]: | |
from sklearn import preprocessing

# Every remaining object-dtype column is label-encoded and then removed.
categoricals = [x for x in train_test.columns if train_test[x].dtype == 'object']
for feat in categoricals:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train_test[feat].values))
    train_test[feat] = lbl.transform(list(train_test[feat].values))
    # NOTE(review): the column encoded just above is dropped right away, so
    # the encoding is discarded. The original indentation was lost, so it is
    # also unclear whether this drop belongs inside the loop - confirm intent.
    # `inplace` must be a real bool: pandas rejects `inplace=1`.
    train_test.drop(feat, axis=1, inplace=True)
print("[log] - training")
| # # XGBOOST | |
| # In[19]: | |
# Column-wise concatenation of the dense frame with the description TF-IDF
# matrix and the feature count matrix, converted to CSC format.
stacked_blocks = (train_test, dsc_sparse, feat_sparse)
train_test_cv1_sparse = sparse.hstack(stacked_blocks).tocsc()

# The first `ntrain` rows are the training rows; the rest are test rows.
x_train, x_test = train_test_cv1_sparse[:ntrain, :], train_test_cv1_sparse[ntrain:, :]
for matrix in (x_train, x_test):
    print(matrix.shape)
| # In[20]: | |
# Persist the prepared arrays. Each file is opened in a `with` block so the
# handle is flushed and closed deterministically instead of being left open.
for filename, payload in (
    ("x_train.m", x_train),
    ("y_train.m", y_train),
    ("x_test.m", x_test),
    ("listing_id.m", listing_id),
):
    with open(filename, "wb") as handle:
        pickle.dump(payload, handle)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment