Skip to content

Instantly share code, notes, and snippets.

@Corwinpro
Last active March 9, 2019 12:31
Show Gist options
  • Save Corwinpro/334d20a5f7b1d415fbd59a62f22fc535 to your computer and use it in GitHub Desktop.
Save Corwinpro/334d20a5f7b1d415fbd59a62f22fc535 to your computer and use it in GitHub Desktop.
# Restrict each data set to the timestamp columns named in `times`.
train_times = train_df[times]
test_times = test_df[times]

# Hour of day read from the `time1` column of every session, exported as
# plain NumPy arrays.
train_session_start_hour = (
    train_times['time1'].apply(lambda stamp: stamp.hour).values
)
test_session_start_hour = (
    test_times['time1'].apply(lambda stamp: stamp.hour).values
)
def add_day_features(df):
    """Add part-of-day indicator columns to ``df`` in place.

    The hour is read from the ``time1`` column, which must have a datetime
    dtype (the ``.dt`` accessor is used).  Three 0/1 integer columns are
    written:

    * ``morning`` -- hour in 7..11
    * ``day``     -- hour in 12..18
    * ``evening`` -- hour in 19..23

    Rows whose hour is in 0..6 get 0 in all three columns.

    Returns ``None``; ``df`` itself is modified.
    """
    session_start_hour = df['time1'].dt.hour

    # Series.between is inclusive on both ends by default, matching the
    # original `>= lo` and `<= hi` comparisons.  `.values` keeps the
    # assignment positional, exactly as before.
    df['morning'] = session_start_hour.between(7, 11).astype('int').values
    df['day'] = session_start_hour.between(12, 18).astype('int').values
    df['evening'] = session_start_hour.between(19, 23).astype('int').values
# Part-of-day indicator columns (morning / day / evening) for both data sets.
add_day_features(train_df)
add_day_features(test_df)

# Week number of `time1`, as reported by Timestamp.week.
train_df['week'] = (
    train_df['time1'].apply(lambda stamp: stamp.week).astype(int)
)
test_df['week'] = (
    test_df['time1'].apply(lambda stamp: stamp.week).astype(int)
)

# Session length: latest minus earliest timestamp in the row, cast through
# 'timedelta64[ms]' to an integer.
train_df['session_durations'] = (
    (train_times.max(axis=1) - train_times.min(axis=1))
    .astype('timedelta64[ms]')
    .astype(int)
)
test_df['session_durations'] = (
    (test_times.max(axis=1) - test_times.min(axis=1))
    .astype('timedelta64[ms]')
    .astype(int)
)

# Standardise the durations; the scaler is fitted on the training data only
# and then re-used unchanged for the test data.
scaler = StandardScaler()
train_df['session_durations'] = scaler.fit_transform(
    train_df['session_durations'].values.reshape(-1, 1)
)
test_df['session_durations'] = scaler.transform(
    test_df['session_durations'].values.reshape(-1, 1)
)
# Date-threshold flags.  Despite the column names, no duration is involved:
# both flags compare a session timestamp, expressed as nanoseconds since the
# Unix epoch divided by 1e16 and shifted by 139, against a fixed cut-off.
#
#   short_visits = 1  when the EARLIEST timestamp gives a value below -0.2
#                     (i.e. before 1.388e18 ns, roughly 2013-12-25)
#   long_visits  = 1  when the LATEST timestamp gives a value above 0.5
#                     (i.e. after 1.395e18 ns, roughly 2014-03-16)
#
# The constants only make sense for nanosecond integers, so the unit is
# requested explicitly.  The previous 'datetime64[ms]' cast was silently
# coerced back to ns by older pandas, but pandas >= 2.0 honours it, which
# would make both flags constant.  'int64' (instead of the platform-dependent
# `int`) avoids overflow where the default integer is 32-bit.
train_df['short_visits'] = train_times.min(axis=1).astype('datetime64[ns]').astype('int64') / 1.e16 - 139
test_df['short_visits'] = test_times.min(axis=1).astype('datetime64[ns]').astype('int64') / 1.e16 - 139
train_df['short_visits'] = (train_df['short_visits'] < -0.2).astype(int)
test_df['short_visits'] = (test_df['short_visits'] < -0.2).astype(int)

train_df['long_visits'] = train_times.max(axis=1).astype('datetime64[ns]').astype('int64') / 1.e16 - 139
test_df['long_visits'] = test_times.max(axis=1).astype('datetime64[ns]').astype('int64') / 1.e16 - 139
train_df['long_visits'] = (train_df['long_visits'] > 0.5).astype(int)
test_df['long_visits'] = (test_df['long_visits'] > 0.5).astype(int)
# Calendar month of `time1` encoded as YYYYMM / 1e5
# (e.g. January 2014 -> 2.01401).
train_df['year_month'] = (
    train_times['time1']
    .apply(lambda stamp: 100 * stamp.year + stamp.month)
    .values.reshape(-1, 1)
    / 1e5
)
test_df['year_month'] = (
    test_times['time1']
    .apply(lambda stamp: 100 * stamp.year + stamp.month)
    .values.reshape(-1, 1)
    / 1e5
)

# Standardise with statistics fitted on the training data only.
scaler = StandardScaler()
train_df['year_month'] = scaler.fit_transform(
    train_df['year_month'].values.reshape(-1, 1)
)
test_df['year_month'] = scaler.transform(
    test_df['year_month'].values.reshape(-1, 1)
)

# Year of `time1` as an offset from 2013 (2013 -> 0, 2014 -> 1, ...).
train_df['year'] = (
    train_times['time1']
    .apply(lambda stamp: stamp.year - 2013)
    .values.reshape(-1, 1)
)
test_df['year'] = (
    test_times['time1']
    .apply(lambda stamp: stamp.year - 2013)
    .values.reshape(-1, 1)
)
# Names of the seven one-hot weekday columns: 'day0' ... 'day6'.
days = ['day%s' % idx for idx in range(7)]

# Weekday number of `time1` as returned by Timestamp.weekday().
train_df['day_of_week'] = (
    train_times['time1']
    .apply(lambda stamp: stamp.weekday())
    .values.reshape(-1, 1)
)
test_df['day_of_week'] = (
    test_df['time1']
    .apply(lambda stamp: stamp.weekday())
    .values.reshape(-1, 1)
)

# One indicator column per weekday.  This has to run BEFORE 'day_of_week'
# is standardised below, while the column still holds the raw weekday number.
for i, day in enumerate(days):
    train_df[day] = (train_df['day_of_week'] == i).astype('int')
    test_df[day] = (test_df['day_of_week'] == i).astype('int')

# Standardise the raw weekday number (fit on train, re-use for test).
scaler = StandardScaler()
train_df['day_of_week'] = scaler.fit_transform(
    train_df['day_of_week'].values.reshape(-1, 1)
)
test_df['day_of_week'] = scaler.transform(
    test_df['day_of_week'].values.reshape(-1, 1)
)

# 1 when weekday() is below 5, otherwise 0.
train_df['is_weekday'] = (
    train_times['time1']
    .apply(lambda stamp: stamp.weekday() < 5)
    .astype('int')
    .values.reshape(-1, 1)
)
test_df['is_weekday'] = (
    test_df['time1']
    .apply(lambda stamp: stamp.weekday() < 5)
    .astype('int')
    .values.reshape(-1, 1)
)
# Running quarter index: quarter of the year plus 4 for every year after
# 2013, so 2013 Q1 -> 1, 2013 Q4 -> 4, 2014 Q1 -> 5, ...
train_df['season'] = (
    train_df['time1']
    .apply(lambda stamp: stamp.quarter + (stamp.year - 2013) * 4)
    .astype('int')
)
test_df['season'] = (
    test_df['time1']
    .apply(lambda stamp: stamp.quarter + (stamp.year - 2013) * 4)
    .astype('int')
)

# Indicator columns 'quart0' ... 'quart6', one per season value 0..6.
# Season values above 6 get no indicator column of their own.  This has to
# run BEFORE 'season' is standardised below.
quarters = ['quart%s' % idx for idx in range(7)]
for i, quarter in enumerate(quarters):
    train_df[quarter] = (train_df['season'] == i).astype('int')
    test_df[quarter] = (test_df['season'] == i).astype('int')

# Standardise the raw season index (fit on train, re-use for test).
scaler = StandardScaler()
train_df['season'] = scaler.fit_transform(
    train_df['season'].values.reshape(-1, 1)
)
test_df['season'] = scaler.transform(
    test_df['season'].values.reshape(-1, 1)
)
# Number of distinct values among the `sites` columns of each row, divided
# by 10.
train_df['n_unique_sites'] = train_df[sites].nunique(axis=1) / 10.
test_df['n_unique_sites'] = test_df[sites].nunique(axis=1) / 10.

# Flag rows whose scaled count is below 0.3 (fewer than 3 distinct values).
train_df['few_sites'] = (train_df['n_unique_sites'] < 0.3).astype('int')
test_df['few_sites'] = (test_df['n_unique_sites'] < 0.3).astype('int')

# Flag rows whose scaled count is above 0.9 (10 or more distinct values).
train_df['lotsof_sites'] = (train_df['n_unique_sites'] > 0.9).astype('int')
test_df['lotsof_sites'] = (test_df['n_unique_sites'] > 0.9).astype('int')

# Ratio of the 'session_durations' column to the scaled distinct-site count.
train_df['time_per_site'] = (
    train_df['session_durations'] / train_df['n_unique_sites']
)
test_df['time_per_site'] = (
    test_df['session_durations'] / test_df['n_unique_sites']
)
# str values
# Text representations of every session, built from the `sites` columns:
#   str0       -- the raw cell values joined by spaces
#   str        -- the site names (looked up through id2site) joined by spaces
#   str_short  -- `str` with 'www.' / 'www2.' and all digit runs removed
#   has_videos -- 1 when `str_short` contains a video-related substring

# Empty site slots become 0 so they can be skipped with `a != 0` below.
# This is done for BOTH frames before any id -> site lookup: a NaN compares
# `!= 0` and would raise KeyError in `id2site[a]`.
train_df[sites] = train_df[sites].fillna(float(0))
test_df[sites] = test_df[sites].fillna(float(0))

path_to_site_dict = os.path.join(_path, 'site_dic.pkl')
# NOTE: unpickling can execute arbitrary code; only load a trusted file here.
with open(path_to_site_dict, 'rb') as f:
    site2id = pickle.load(f)
# create an inverse id -> site mapping
id2site = {v: k for (k, v) in site2id.items()}

for _frame in (train_df, test_df):
    _frame['str0'] = _frame[sites].apply(
        lambda x: str(" ".join([str(a) for a in x.values if a != 0])),
        axis=1,
    )
    _frame['str'] = _frame[sites].apply(
        lambda x: " ".join([id2site[a] for a in x.values if a != 0]),
        axis=1,
    )
    # Same order as before: drop 'www.', then 'www2.', then every digit run.
    # The pattern is a raw string so that `\d` is not an invalid escape.
    _frame['str_short'] = _frame['str'].apply(
        lambda site: re.sub(
            r"\d+", '', site.replace('www.', '').replace('www2.', '')
        )
    )
    # Substring search over the whole space-joined session text.
    _frame['has_videos'] = _frame['str_short'].apply(
        lambda site: any(
            marker in site
            for marker in ('youtube', 'ytimg', 'watch', 'video', 'film')
        )
    ).astype('int')
# Columns selected as model features, in this exact order.
# NOTE(review): 'has_vk' is not created in the visible part of this file --
# confirm it is defined elsewhere before this list is used.
features_list = [
    'morning', 'day', 'evening',
    'session_durations', 'day_of_week', 'season', 'year', 'few_sites',
    quarters[3], quarters[5],
    'has_vk', 'has_videos',
    'short_visits',
    days[6],
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment