Last active
March 30, 2017 19:36
-
-
Save rmania/b1847215bc95c0f72056d9160133e676 to your computer and use it in GitHub Desktop.
machine learning preprocessing and feature generation code snippets
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
from sklearn.preprocessing import LabelEncoder | |
from sklearn.preprocessing import Imputer | |
# some functions executing some basic preprocessing steps for ml | |
predict_cols = ['x']
# Build the feature list in the frame's own column order. Iterating a set of
# strings gives an order that can differ between interpreter runs (string
# hashes are randomised), which would silently reorder the feature matrix.
# dict.fromkeys keeps the first occurrence of each name, so duplicates are
# still collapsed as they were with the set-based version.
# NOTE(review): `df` is not defined in this file; it must already exist in
# the session that runs this snippet.
feature_cols = list(dict.fromkeys(
    col for col in df.columns
    if col not in predict_cols and col != 'source'
))
# PREPROCESSING STEPS | |
def _preprocess(df, predict_cols, feature_cols, do_outlier_removal=False):
    """
    Label-encode, impute and optionally de-outlier a dataframe for modelling.

    The frame is modified in place and is also returned.

    Parameters:
    -----------
    df : pandas DataFrame holding both the feature and the target columns.
    predict_cols : list of target column names. They are excluded from the
        encoding and imputation steps; their NaNs are replaced by -1.
    feature_cols : not used by this function (kept so existing calls keep
        working). Every column that is not in predict_cols is processed.
    do_outlier_removal : when True, values flagged by _is_outlier are
        replaced by the median of their column.

    Returns:
    --------
    df : the same DataFrame object, transformed.
    """
    numeric_dtypes = ('int64', 'int32', 'float64')
    targets = set(predict_cols)

    cat_features, num_features = [], []
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for col, dtype in df.dtypes.items():
        if col in targets:
            continue
        if dtype in numeric_dtypes:
            num_features.append(col)
        else:
            cat_features.append(col)

    print("Encoding...")
    for col in cat_features:
        # Missing values become their own 'unknown' category. A sorted
        # factorisation numbers the categories 0..n-1 in sorted order, which
        # is the scheme LabelEncoder used here before.
        codes, _ = pd.factorize(df[col].fillna('unknown'), sort=True)
        df[col] = codes

    print("Imputing...") # outcomment if not needed
    if num_features:
        # The previous imputer returned a float array, so keep float64 output.
        numeric = df[num_features].astype('float64')
        # Column-wise median imputation; NaNs are ignored by median().
        numeric = numeric.fillna(numeric.median())
        # Columns that are entirely NaN have no median; give them a sentinel.
        df[num_features] = numeric.fillna(-1000)

    if do_outlier_removal:
        for col in df.columns:
            column = df[col]
            if not pd.api.types.is_numeric_dtype(column):
                continue
            # Pass a plain float array so the check does not depend on how
            # _is_outlier indexes pandas objects.
            flagged = np.asarray(
                _is_outlier(column.to_numpy(dtype=float)), dtype=bool)
            if flagged.any():
                # Replace the flagged values with the column median
                # (the original referenced an undefined name `median`).
                df[col] = column.mask(flagged, column.median())

    # Despite the message, no rows are dropped: missing targets are set to -1.
    print("Dropping NaN prediction rows...")
    df[predict_cols] = df[predict_cols].fillna(-1)
    return df
def _is_outlier(points, thresh=3.5):
    """
    Flag points based on their "median absolute deviation".

    Returns a boolean array with True if points are outliers and False
    otherwise.

    Parameters:
    -----------
    points : An numobservations by numdimensions array of observations.
        A 1-D sequence (list, numpy array or pandas Series) is treated as
        numobservations by 1.
    thresh : The modified z-score to use as a threshold. Observations with
        a modified z-score (based on the median absolute deviation) greater
        than this value will be classified as outliers.

    Returns:
    --------
    mask : A numobservations-length boolean array. It is all False when the
        input is empty or when the median absolute deviation is zero or NaN,
        because no modified z-score can be computed in those cases.

    References:
    ----------
    Boris Iglewicz and David Hoaglin (1993), "Volume 16: How to Detect and
    Handle Outliers", The ASQC Basic References in Quality Control:
    Statistical Techniques, Edward F. Mykytka, Ph.D., Editor.
    """
    # Convert first: indexing a pandas Series with [:, None] is not supported
    # by current pandas, while a numpy array handles it fine.
    points = np.asarray(points, dtype=float)
    if points.ndim == 1:
        points = points[:, None]
    if points.shape[0] == 0:
        return np.zeros(0, dtype=bool)

    median = np.median(points, axis=0)
    # Euclidean distance of every observation to the per-dimension median.
    diff = np.sqrt(np.sum((points - median) ** 2, axis=-1))
    med_abs_deviation = np.median(diff)

    # A zero MAD (more than half of the points identical) would divide by
    # zero and flag every other point; a NaN MAD cannot be compared at all.
    if not med_abs_deviation > 0:
        return np.zeros(diff.shape, dtype=bool)

    modified_z_score = 0.6745 * diff / med_abs_deviation # tweak if necessary
    return modified_z_score > thresh
# create datetime features | |
def add_datetime_features(df, date_col=None):
    """
    Adds features that are derived from a datetime column.

    Added columns: hour, time_qhour (quarter-hour index within the day),
    date, year, month, week (ISO week number), dayofweek and dayofyear.
    A pre-existing column with one of the day-level names (year, month,
    week, dayofweek, dayofyear) is kept with an '_old' suffix.

    Parameters:
    -----------
    df : dataframe containing the datetime column. Its date_col column is
        converted to datetime in place.
    date_col : name of the column the date features are extracted from.
        Required, despite the None default.

    Returns:
    --------
    df : a new dataframe sorted ascending by date_col, with a fresh index
        and the feature columns appended.

    Raises:
    -------
    ValueError : if date_col is not given.
    AssertionError : if merging the day-level features changed the row count.
    """
    if date_col is None:
        raise ValueError("date_col must name the datetime column of df")

    # Convert the column to datetime and order the rows chronologically.
    df[date_col] = pd.to_datetime(df[date_col])
    df = df.sort_values(date_col, ascending=True)

    # Time-of-day features.
    timestamps = df[date_col]
    df = df.assign(hour=timestamps.dt.hour)
    df = df.assign(
        time_qhour=(timestamps.dt.minute / 15).astype(int) + df['hour'] * 4)

    # Get date from datetime.
    df['date'] = timestamps.dt.date

    # Compute the day-level features once per distinct date and merge them
    # back onto the dataframe.
    date_features = pd.DataFrame(df['date'].unique(), columns=['date'])
    days = date_features['date']
    date_features = date_features.assign(
        year=[day.year for day in days],
        month=[day.month for day in days],
        week=[day.isocalendar()[1] for day in days],
        dayofweek=[day.weekday() for day in days],
        dayofyear=[day.timetuple().tm_yday for day in days],
    )

    n_rows_before = df.shape[0]
    df = pd.merge(df, date_features, on=['date'], suffixes=('_old', ''))

    # Check that no rows are dropped. Raised explicitly so the check is not
    # stripped when Python runs with -O.
    if df.shape[0] != n_rows_before:
        raise AssertionError(
            "merging date features changed the number of rows")
    return df
def add_holiday_features(df, holidays_file):
    """
    Adds holiday features by left-joining a holiday calendar onto df.

    The calendar is read from holidays_file with pandas.read_csv. Its 'dt'
    column is reduced to plain dates and matched against df['date']; the
    'dt' and 'regions' columns are removed from the result afterwards.

    According to the original author, the columns dc, dn and ds mean the
    days up until the next vacation in the c (Central), n (North) and
    s (South) regions in Holland.
    Source: https://gist.github.com/rok?direction=asc&sort=updated

    Parameters:
    -----------
    df : dataframe with a 'date' column holding date objects.
    holidays_file : path or file-like object accepted by pandas.read_csv.

    Returns:
    --------
    A new dataframe with the holiday columns appended. Rows without a
    matching calendar entry get NaN in those columns.
    """
    calendar = pd.read_csv(holidays_file, parse_dates=['dt'])
    # Reduce the parsed timestamps to dates so they compare equal to df['date'].
    calendar['dt'] = calendar['dt'].dt.date
    merged = df.merge(
        calendar,
        how='left',
        left_on=['date'],
        right_on=['dt'],
        suffixes=('old', ''),
    )
    return merged.drop(['dt', 'regions'], axis=1)
class Parameters:
    """
    Parameters for the ML model, notably the predefined train and test
    periods.

    Each period starts on (start_day, start_month) of its year and ends on
    the same day and month one year later.
    """

    def __init__(self):
        self.predict_cols = ['X']
        self.feature_cols = [] # fill with columns
        # possible train, validate , test splits
        self.year_train = 2015
        self.year_test = 2016
        self.start_day = 1
        self.start_month = 1

    def _period_date(self, period, year_offset=0):
        """Return the period's anchor date shifted by year_offset years."""
        # Imported locally: the module's import block does not provide `dt`.
        import datetime as dt

        if period == 'train':
            year = self.year_train
        elif period == 'test':
            year = self.year_test
        else:
            # An explicit error instead of `assert False`, which is stripped
            # under -O and would then silently return None.
            raise ValueError(
                "period must be 'train' or 'test', got %r" % (period,))
        return dt.date(year + year_offset, self.start_month, self.start_day)

    def get_start_date(self, period):
        """Return the first date of the 'train' or 'test' period."""
        return self._period_date(period)

    def get_end_date(self, period):
        """Return the start date of the period shifted one year forward."""
        return self._period_date(period, year_offset=1)

    def get_dates(self, period):
        """Return [start_date, end_date] for the 'train' or 'test' period."""
        return [self.get_start_date(period), self.get_end_date(period)]
p = Parameters()

# Example usage: cut `history` into the train and test periods, then take the
# feature matrix and the flattened target vector from each slice.
# NOTE(review): `ml` and `history` are not defined in this file; they are
# expected to be provided by the surrounding project or session.
hist_train = ml.select_period(history, *p.get_dates('train'))
hist_test = ml.select_period(history, *p.get_dates('test'))

X_train, y_train = (
    hist_train[p.feature_cols].values,
    hist_train[p.predict_cols].values.ravel(),
)
X_test, y_test = (
    hist_test[p.feature_cols].values,
    hist_test[p.predict_cols].values.ravel(),
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment