Corwinpro · March 9, 2019 12:31
diff --git a/alice_mlcourse.py b/alice_mlcourse.py
 # Keep only the columns named in `times` (defined elsewhere; presumably the
 # per-visit timestamp columns -- 'time1' is one of them).
 train_times = train_df[times]
 test_times = test_df[times]

 # Hour component of `time1` for every row, as plain arrays.
 train_session_start_hour = train_times['time1'].apply(lambda stamp: stamp.hour).values
 test_session_start_hour = test_times['time1'].apply(lambda stamp: stamp.hour).values

 def add_day_features(df):
     """Add 0/1 part-of-day flags to ``df`` in place.

     Reads the hour of every value in ``df['time1']`` and writes three
     integer columns:

     * ``morning`` -- hour in 7..11
     * ``day``     -- hour in 12..18
     * ``evening`` -- hour in 19..23

     Hours 0..6 set none of the flags.  Nothing is returned.
     """
     hour = df['time1'].apply(lambda stamp: stamp.hour)
     # (column name, first hour, last hour); both bounds are inclusive.
     bands = (('morning', 7, 11), ('day', 12, 18), ('evening', 19, 23))
     for column, first, last in bands:
         in_band = (hour >= first) & (hour <= last)
         df[column] = in_band.astype('int').values.reshape(-1, 1)

 # Part-of-day flags and the week number of `time1`, for each frame.
 for frame in (train_df, test_df):
     add_day_features(frame)
     frame['week'] = frame['time1'].apply(lambda stamp: stamp.week).astype(int)

 # Session length: latest minus earliest timestamp in the row, cast to
 # integer milliseconds.
 train_span = train_times.max(axis=1) - train_times.min(axis=1)
 test_span = test_times.max(axis=1) - test_times.min(axis=1)
 train_df['session_durations'] = train_span.astype('timedelta64[ms]').astype(int)
 test_df['session_durations'] = test_span.astype('timedelta64[ms]').astype(int)

 # Standardise with statistics fitted on the training frame only; the test
 # frame reuses the fitted scaler.
 scaler = StandardScaler()
 train_df['session_durations'] = scaler.fit_transform(
     train_df['session_durations'].values.reshape(-1, 1))
 test_df['session_durations'] = scaler.transform(
     test_df['session_durations'].values.reshape(-1, 1))

 # Flags derived from the first and last timestamp of each session.
 #
 # The constants below (1e16, 139, -0.2, 0.5) are scaled for epoch
 # *nanoseconds*, so the timestamps are pinned to datetime64[ns] / int64
 # explicitly.  The earlier 'datetime64[ms]' cast only yielded nanosecond
 # integers on pandas versions that coerce every datetime to ns; on
 # pandas >= 2.0 it yields millisecond integers and both flags would
 # become constant.
 train_first_ns = train_times.min(axis=1).astype('datetime64[ns]').astype('int64')
 test_first_ns = test_times.min(axis=1).astype('datetime64[ns]').astype('int64')
 train_last_ns = train_times.max(axis=1).astype('datetime64[ns]').astype('int64')
 test_last_ns = test_times.max(axis=1).astype('datetime64[ns]').astype('int64')

 # NOTE(review): despite the name, 'short_visits' is 1 when the session's
 # first timestamp is earlier than a fixed instant (1.388e18 ns after the
 # epoch); it does not measure session length -- confirm the intent.
 train_df['short_visits'] = ((train_first_ns / 1.e16 - 139) < -0.2).astype(int)
 test_df['short_visits'] = ((test_first_ns / 1.e16 - 139) < -0.2).astype(int)

 # NOTE(review): likewise 'long_visits' is 1 when the session's last
 # timestamp is later than 1.395e18 ns after the epoch.
 train_df['long_visits'] = ((train_last_ns / 1.e16 - 139) > 0.5).astype(int)
 test_df['long_visits'] = ((test_last_ns / 1.e16 - 139) > 0.5).astype(int)

 # Year and month packed as (100 * year + month) / 1e5, then standardised
 # with a scaler fitted on the training frame.
 train_ym = train_times['time1'].apply(lambda stamp: 100 * stamp.year + stamp.month)
 test_ym = test_times['time1'].apply(lambda stamp: 100 * stamp.year + stamp.month)
 train_df['year_month'] = train_ym.values.reshape(-1, 1) / 1e5
 test_df['year_month'] = test_ym.values.reshape(-1, 1) / 1e5
 scaler = StandardScaler()
 train_df['year_month'] = scaler.fit_transform(
     train_df['year_month'].values.reshape(-1, 1))
 test_df['year_month'] = scaler.transform(
     test_df['year_month'].values.reshape(-1, 1))

 # Years elapsed since 2013.
 train_df['year'] = train_times['time1'].apply(
     lambda stamp: stamp.year - 2013).values.reshape(-1, 1)
 test_df['year'] = test_times['time1'].apply(
     lambda stamp: stamp.year - 2013).values.reshape(-1, 1)

 # Weekday index as returned by .weekday(), plus one indicator column
 # ('day0' .. 'day6') per index.
 days = ['day%s' % weekday for weekday in range(7)]
 train_df['day_of_week'] = train_times['time1'].apply(
     lambda stamp: stamp.weekday()).values.reshape(-1, 1)
 test_df['day_of_week'] = test_df['time1'].apply(
     lambda stamp: stamp.weekday()).values.reshape(-1, 1)

 for i, day in enumerate(days):
     train_df[day] = (train_df['day_of_week'] == i).astype('int')
     test_df[day] = (test_df['day_of_week'] == i).astype('int')

 # The indicators above are built from the raw index; only afterwards is
 # the column itself standardised.
 scaler = StandardScaler()
 train_df['day_of_week'] = scaler.fit_transform(
     train_df['day_of_week'].values.reshape(-1, 1))
 test_df['day_of_week'] = scaler.transform(
     test_df['day_of_week'].values.reshape(-1, 1))

 # 1 when .weekday() is below 5, else 0.
 train_df['is_weekday'] = train_times['time1'].apply(
     lambda stamp: stamp.weekday() < 5).astype('int').values.reshape(-1, 1)
 test_df['is_weekday'] = test_df['time1'].apply(
     lambda stamp: stamp.weekday() < 5).astype('int').values.reshape(-1, 1)

 # Running quarter number: the quarter of the year plus 4 per year after 2013.
 train_df['season'] = train_df['time1'].apply(
     lambda stamp: 4 * (stamp.year - 2013) + stamp.quarter).astype('int')
 test_df['season'] = test_df['time1'].apply(
     lambda stamp: 4 * (stamp.year - 2013) + stamp.quarter).astype('int')

 # One indicator column per season value 0..6.
 # NOTE(review): season is >= 1 for any date in 2013 or later, so 'quart0'
 # can only match earlier dates, and values above 6 get no column --
 # confirm the intended range.
 quarters = ['quart%s' % index for index in range(7)]
 for i, quarter in enumerate(quarters):
     train_df[quarter] = (train_df['season'] == i).astype('int')
     test_df[quarter] = (test_df['season'] == i).astype('int')

 scaler = StandardScaler()
 train_df['season'] = scaler.fit_transform(
     train_df['season'].values.reshape(-1, 1))
 test_df['season'] = scaler.transform(
     test_df['season'].values.reshape(-1, 1))

 # Per-frame features based on how many distinct values appear in the
 # `sites` columns of a row.
 for frame in (train_df, test_df):
     # Distinct count per row, divided by 10.
     frame['n_unique_sites'] = frame[sites].nunique(axis=1) / 10.
     # Indicators for a scaled count below 0.3 and above 0.9.
     frame['few_sites'] = (frame['n_unique_sites'] < 0.3).astype('int')
     frame['lotsof_sites'] = (frame['n_unique_sites'] > 0.9).astype('int')
     # Ratio of the stored 'session_durations' value to the scaled count.
     frame['time_per_site'] = frame['session_durations'] / frame['n_unique_sites']

 # Text features built from the visited-site ids.

 # Empty site slots become 0 so the joins below can skip them.
 train_df[sites] = train_df[sites].fillna(float(0))
 # 'str0': the non-zero site ids of the row, space-separated.
 train_df['str0'] = train_df[sites].apply(
     lambda row: " ".join([str(a) for a in row.values if a != 0]), axis=1)

 # NOTE: pickle.load can execute arbitrary code while loading; site_dic.pkl
 # must come from a trusted source.
 path_to_site_dict = os.path.join(_path, 'site_dic.pkl')
 with open(path_to_site_dict, 'rb') as f:
     site2id = pickle.load(f)
 # create an inverse id -> site mapping
 id2site = {v: k for (k, v) in site2id.items()}

 # 'str': the same visits as site names looked up in id2site.
 train_df['str'] = train_df[sites].apply(
     lambda row: " ".join([id2site[a] for a in row.values if a != 0]), axis=1)
 # 'str_short': 'www.' / 'www2.' fragments and all digit runs removed.
 train_df['str_short'] = train_df['str'].apply(
     lambda text: re.sub(r"\d+", '', text.replace('www.', '').replace('www2.', '')))

 # Fill the test frame *before* the id2site lookup, as is done for the
 # training frame: a NaN slot is != 0 and would otherwise raise KeyError.
 test_df[sites] = test_df[sites].fillna(float(0))
 test_df['str'] = test_df[sites].apply(
     lambda row: " ".join([id2site[a] for a in row.values if a != 0]), axis=1)
 test_df['str_short'] = test_df['str'].apply(
     lambda text: re.sub(r"\d+", '', text.replace('www.', '').replace('www2.', '')))
 test_df['str0'] = test_df[sites].apply(
     lambda row: " ".join([str(a) for a in row.values if a != 0]), axis=1)

 # 1 when any of these substrings occurs in the shortened site text.
 video_markers = ('youtube', 'ytimg', 'watch', 'video', 'film')
 train_df['has_videos'] = train_df['str_short'].apply(
     lambda text: any(marker in text for marker in video_markers)).astype('int')
 test_df['has_videos'] = test_df['str_short'].apply(
     lambda text: any(marker in text for marker in video_markers)).astype('int')

 # Names of the columns collected as the feature set.
 # NOTE(review): 'has_vk' is not created anywhere in the code visible here --
 # confirm it is added elsewhere before these columns are selected.
 features_list = [
     'morning', 'day', 'evening', 'session_durations', 'day_of_week',
     'season', 'year', 'few_sites',
     quarters[3], quarters[5],
     'has_vk', 'has_videos', 'short_visits',
     days[6],
 ]
	# Keep only the columns named in `times` (defined elsewhere; presumably the
	# per-visit timestamp columns -- 'time1' is one of them).
	train_times = train_df[times]
	test_times = test_df[times]

	# Hour component of `time1` for every row, as plain arrays.
	train_session_start_hour = train_times['time1'].apply(lambda stamp: stamp.hour).values
	test_session_start_hour = test_times['time1'].apply(lambda stamp: stamp.hour).values

	def add_day_features(df):
		"""Add 0/1 part-of-day flags to ``df`` in place.

		Reads the hour of every value in ``df['time1']`` and writes three
		integer columns:

		* ``morning`` -- hour in 7..11
		* ``day``     -- hour in 12..18
		* ``evening`` -- hour in 19..23

		Hours 0..6 set none of the flags.  Nothing is returned.
		"""
		# The body must be indented under the ``def``; without that the
		# function does not parse.
		hour = df['time1'].apply(lambda stamp: stamp.hour)
		# (column name, first hour, last hour); both bounds are inclusive.
		bands = (('morning', 7, 11), ('day', 12, 18), ('evening', 19, 23))
		for column, first, last in bands:
			in_band = (hour >= first) & (hour <= last)
			df[column] = in_band.astype('int').values.reshape(-1, 1)

	# Part-of-day flags and the week number of `time1`, for each frame.
	for frame in (train_df, test_df):
		add_day_features(frame)
		frame['week'] = frame['time1'].apply(lambda stamp: stamp.week).astype(int)

	# Session length: latest minus earliest timestamp in the row, cast to
	# integer milliseconds.
	train_span = train_times.max(axis=1) - train_times.min(axis=1)
	test_span = test_times.max(axis=1) - test_times.min(axis=1)
	train_df['session_durations'] = train_span.astype('timedelta64[ms]').astype(int)
	test_df['session_durations'] = test_span.astype('timedelta64[ms]').astype(int)

	# Standardise with statistics fitted on the training frame only; the test
	# frame reuses the fitted scaler.
	scaler = StandardScaler()
	train_df['session_durations'] = scaler.fit_transform(
		train_df['session_durations'].values.reshape(-1, 1))
	test_df['session_durations'] = scaler.transform(
		test_df['session_durations'].values.reshape(-1, 1))

	# Flags derived from the first and last timestamp of each session.
	#
	# The constants below (1e16, 139, -0.2, 0.5) are scaled for epoch
	# *nanoseconds*, so the timestamps are pinned to datetime64[ns] / int64
	# explicitly.  The earlier 'datetime64[ms]' cast only yielded nanosecond
	# integers on pandas versions that coerce every datetime to ns; on
	# pandas >= 2.0 it yields millisecond integers and both flags would
	# become constant.
	train_first_ns = train_times.min(axis=1).astype('datetime64[ns]').astype('int64')
	test_first_ns = test_times.min(axis=1).astype('datetime64[ns]').astype('int64')
	train_last_ns = train_times.max(axis=1).astype('datetime64[ns]').astype('int64')
	test_last_ns = test_times.max(axis=1).astype('datetime64[ns]').astype('int64')

	# NOTE(review): despite the name, 'short_visits' is 1 when the session's
	# first timestamp is earlier than a fixed instant (1.388e18 ns after the
	# epoch); it does not measure session length -- confirm the intent.
	train_df['short_visits'] = ((train_first_ns / 1.e16 - 139) < -0.2).astype(int)
	test_df['short_visits'] = ((test_first_ns / 1.e16 - 139) < -0.2).astype(int)

	# NOTE(review): likewise 'long_visits' is 1 when the session's last
	# timestamp is later than 1.395e18 ns after the epoch.
	train_df['long_visits'] = ((train_last_ns / 1.e16 - 139) > 0.5).astype(int)
	test_df['long_visits'] = ((test_last_ns / 1.e16 - 139) > 0.5).astype(int)

	# Year and month packed as (100 * year + month) / 1e5, then standardised
	# with a scaler fitted on the training frame.
	train_ym = train_times['time1'].apply(lambda stamp: 100 * stamp.year + stamp.month)
	test_ym = test_times['time1'].apply(lambda stamp: 100 * stamp.year + stamp.month)
	train_df['year_month'] = train_ym.values.reshape(-1, 1) / 1e5
	test_df['year_month'] = test_ym.values.reshape(-1, 1) / 1e5
	scaler = StandardScaler()
	train_df['year_month'] = scaler.fit_transform(
		train_df['year_month'].values.reshape(-1, 1))
	test_df['year_month'] = scaler.transform(
		test_df['year_month'].values.reshape(-1, 1))

	# Years elapsed since 2013.
	train_df['year'] = train_times['time1'].apply(
		lambda stamp: stamp.year - 2013).values.reshape(-1, 1)
	test_df['year'] = test_times['time1'].apply(
		lambda stamp: stamp.year - 2013).values.reshape(-1, 1)

	# Weekday index as returned by .weekday(), plus one indicator column
	# ('day0' .. 'day6') per index.
	days = ['day%s' % weekday for weekday in range(7)]
	train_df['day_of_week'] = train_times['time1'].apply(
		lambda stamp: stamp.weekday()).values.reshape(-1, 1)
	test_df['day_of_week'] = test_df['time1'].apply(
		lambda stamp: stamp.weekday()).values.reshape(-1, 1)

	# The loop body must be indented under the ``for``; without that the
	# loop does not parse.
	for i, day in enumerate(days):
		train_df[day] = (train_df['day_of_week'] == i).astype('int')
		test_df[day] = (test_df['day_of_week'] == i).astype('int')

	# The indicators above are built from the raw index; only afterwards is
	# the column itself standardised.
	scaler = StandardScaler()
	train_df['day_of_week'] = scaler.fit_transform(
		train_df['day_of_week'].values.reshape(-1, 1))
	test_df['day_of_week'] = scaler.transform(
		test_df['day_of_week'].values.reshape(-1, 1))

	# 1 when .weekday() is below 5, else 0.
	train_df['is_weekday'] = train_times['time1'].apply(
		lambda stamp: stamp.weekday() < 5).astype('int').values.reshape(-1, 1)
	test_df['is_weekday'] = test_df['time1'].apply(
		lambda stamp: stamp.weekday() < 5).astype('int').values.reshape(-1, 1)

	# Running quarter number: the quarter of the year plus 4 per year after 2013.
	train_df['season'] = train_df['time1'].apply(
		lambda stamp: 4 * (stamp.year - 2013) + stamp.quarter).astype('int')
	test_df['season'] = test_df['time1'].apply(
		lambda stamp: 4 * (stamp.year - 2013) + stamp.quarter).astype('int')

	# One indicator column per season value 0..6.
	# NOTE(review): season is >= 1 for any date in 2013 or later, so 'quart0'
	# can only match earlier dates, and values above 6 get no column --
	# confirm the intended range.
	quarters = ['quart%s' % index for index in range(7)]
	for i, quarter in enumerate(quarters):
		train_df[quarter] = (train_df['season'] == i).astype('int')
		test_df[quarter] = (test_df['season'] == i).astype('int')

	scaler = StandardScaler()
	train_df['season'] = scaler.fit_transform(
		train_df['season'].values.reshape(-1, 1))
	test_df['season'] = scaler.transform(
		test_df['season'].values.reshape(-1, 1))

	# Per-frame features based on how many distinct values appear in the
	# `sites` columns of a row.
	for frame in (train_df, test_df):
		# Distinct count per row, divided by 10.
		frame['n_unique_sites'] = frame[sites].nunique(axis=1) / 10.
		# Indicators for a scaled count below 0.3 and above 0.9.
		frame['few_sites'] = (frame['n_unique_sites'] < 0.3).astype('int')
		frame['lotsof_sites'] = (frame['n_unique_sites'] > 0.9).astype('int')
		# Ratio of the stored 'session_durations' value to the scaled count.
		frame['time_per_site'] = frame['session_durations'] / frame['n_unique_sites']

	# Text features built from the visited-site ids.

	# Empty site slots become 0 so the joins below can skip them.
	train_df[sites] = train_df[sites].fillna(float(0))
	# 'str0': the non-zero site ids of the row, space-separated.
	train_df['str0'] = train_df[sites].apply(
		lambda row: " ".join([str(a) for a in row.values if a != 0]), axis=1)

	# NOTE: pickle.load can execute arbitrary code while loading; site_dic.pkl
	# must come from a trusted source.
	path_to_site_dict = os.path.join(_path, 'site_dic.pkl')
	# The load must be indented under the ``with``; without that the
	# statement does not parse.
	with open(path_to_site_dict, 'rb') as f:
		site2id = pickle.load(f)
	# create an inverse id -> site mapping
	id2site = {v: k for (k, v) in site2id.items()}

	# 'str': the same visits as site names looked up in id2site.
	train_df['str'] = train_df[sites].apply(
		lambda row: " ".join([id2site[a] for a in row.values if a != 0]), axis=1)
	# 'str_short': 'www.' / 'www2.' fragments and all digit runs removed.
	train_df['str_short'] = train_df['str'].apply(
		lambda text: re.sub(r"\d+", '', text.replace('www.', '').replace('www2.', '')))

	# Fill the test frame *before* the id2site lookup, as is done for the
	# training frame: a NaN slot is != 0 and would otherwise raise KeyError.
	test_df[sites] = test_df[sites].fillna(float(0))
	test_df['str'] = test_df[sites].apply(
		lambda row: " ".join([id2site[a] for a in row.values if a != 0]), axis=1)
	test_df['str_short'] = test_df['str'].apply(
		lambda text: re.sub(r"\d+", '', text.replace('www.', '').replace('www2.', '')))
	test_df['str0'] = test_df[sites].apply(
		lambda row: " ".join([str(a) for a in row.values if a != 0]), axis=1)

	# 1 when any of these substrings occurs in the shortened site text.
	video_markers = ('youtube', 'ytimg', 'watch', 'video', 'film')
	train_df['has_videos'] = train_df['str_short'].apply(
		lambda text: any(marker in text for marker in video_markers)).astype('int')
	test_df['has_videos'] = test_df['str_short'].apply(
		lambda text: any(marker in text for marker in video_markers)).astype('int')

	# Names of the columns collected as the feature set.
	# NOTE(review): 'has_vk' is not created anywhere in the code visible here --
	# confirm it is added elsewhere before these columns are selected.
	features_list = [
		'morning', 'day', 'evening', 'session_durations', 'day_of_week',
		'season', 'year', 'few_sites',
		quarters[3], quarters[5],
		'has_vk', 'has_videos', 'short_visits',
		days[6],
	]