a-agmon · August 18, 2019 14:17
diff --git a/feature_engineering.py b/feature_engineering.py

 from sklearn import preprocessing
 # Remove the columns that are not used further on
 df_raw = df_raw.drop(
     columns=['SrcAddr', 'DstAddr', 'TotBytes', 'Sport', 'Dport', 'StartTime', 'sTos', 'dTos'])

 # Replace missing values: "none" for object columns, -1 for every other dtype
 for feature_name in df_raw.columns:
     if not df_raw[feature_name].isnull().values.any():
         # this column has no nulls, so there is nothing to fill
         continue
     if df_raw[feature_name].dtypes == object:
         val = "none"
     else:
         val = -1
     print(f"Filling nulls with value:{val} in column:{feature_name}")
     df_raw[feature_name] = df_raw[feature_name].fillna(value=val)
        

 # define some encoding functions
 def encode_field(df, field):
     """Label-encode a single column of ``df``.

     A new ``preprocessing.LabelEncoder`` is created, the name of the field
     is printed, and the encoder is fitted on ``df[field]``.

     Returns whatever ``LabelEncoder.fit_transform`` produces for that column.
     """
     label_encoder = preprocessing.LabelEncoder()
     print(f"encoding field:{field}")
     codes = label_encoder.fit_transform(df[field])
     return codes

 def hot_encode(df, feature):
     """One-hot encode ``df[feature]`` and return the dummy columns.

     The column is first label-encoded with ``encode_field`` and the integer
     codes are then expanded with ``pd.get_dummies``. Columns are named
     ``<feature>_<code>`` and the first level is dropped (``drop_first=True``).

     Args:
         df: DataFrame holding the column to encode.
         feature: Name of the column to encode; also used as the column prefix.

     Returns:
         A DataFrame of dummy columns that carries the same index as ``df``.
     """
     # The encoded codes are re-attached to df's index before building the
     # dummies. Without it the dummies get a fresh 0..n-1 index, and an
     # index-aligned pd.concat(axis=1) with df would misalign rows (and
     # insert NaNs) whenever df does not have the default index.
     codes = pd.Series(encode_field(df, feature), index=df.index)
     return pd.get_dummies(codes, prefix=feature, drop_first=True)

 def group_less_frequent_values(df, feature, min_prc):
     """Collapse the rare values of ``df[feature]`` into one placeholder, in place.

     Every value that occurs fewer than ``min_prc`` times in the column is
     replaced by ``99`` when the column is numeric and by ``"LF"`` otherwise.

     Args:
         df: DataFrame to modify in place.
         feature: Name of the column to process.
         min_prc: Minimum number of occurrences (an absolute count, not a
             fraction) a value needs in order to be kept as-is.

     Returns:
         None. ``df`` is mutated.
     """
     from pandas.api.types import is_numeric_dtype

     counts = df[feature].value_counts()
     rare_values = counts[counts < min_prc].index
     if len(rare_values) == 0:
         # nothing is below the threshold, leave the column untouched
         return

     # Choose the placeholder from the column's dtype. Checking for "numeric"
     # (rather than "not object") keeps pandas string-dtype columns on the
     # textual placeholder instead of forcing an integer into them.
     new_val = 99 if is_numeric_dtype(df[feature]) else "LF"

     # One vectorised assignment instead of one full-column scan per rare value.
     df.loc[df[feature].isin(rare_values), feature] = new_val
 
 # One-hot encode the categorical features
 categorical_features = ['State', 'Proto', 'Dir']
 # A class that covers less than 1% of the rows counts as infrequent and is grouped.
 one_p = len(df_raw.index) * 0.01
 # loop through the categorical features
 for feature in categorical_features:
     # merge the infrequent classes into a single placeholder value
     group_less_frequent_values(df_raw, feature, one_p)
     # label-encode, one-hot encode, and append the new columns to the frame
     df_raw = pd.concat([df_raw, hot_encode(df_raw, feature)], axis='columns')

 # The original categorical columns are no longer needed once they are encoded
 df_raw = df_raw.drop(categorical_features, axis=1)
 print("Done processing!")

	from sklearn import preprocessing
	# Drop unnecessary columns
	df_raw = df_raw.drop(columns=['SrcAddr','DstAddr','TotBytes','Sport','Dport','StartTime','sTos','dTos'])

	# Fill nulls: "none" for object columns, -1 for every other dtype.
	# (The nesting of the loop and the if-body was lost in this copy and is restored here.)
	for feature_name in df_raw.columns:
	    if df_raw[feature_name].isnull().values.sum() > 0:
	        val = "none" if df_raw[feature_name].dtypes == object else -1
	        print(f"Filling nulls with value:{val} in column:{feature_name}")
	        df_raw[feature_name] = df_raw[feature_name].fillna(value=val)


	# define some encoding functions
	def encode_field(df,field):
	    """Label-encode a single column of ``df``.

	    Creates a new ``preprocessing.LabelEncoder``, prints the name of the
	    field, fits the encoder on ``df[field]`` and returns the result of
	    ``fit_transform``.
	    """
	    encoder = preprocessing.LabelEncoder()
	    print(f"encoding field:{field}")
	    return encoder.fit_transform(df[field])

	def hot_encode(df, feature):
	    """One-hot encode ``df[feature]`` and return the dummy columns.

	    The column is label-encoded with ``encode_field`` and the codes are
	    expanded with ``pd.get_dummies``; columns are named ``<feature>_<code>``
	    and the first level is dropped (``drop_first=True``).

	    Args:
	        df: DataFrame holding the column to encode.
	        feature: Name of the column to encode; also used as the column prefix.

	    Returns:
	        A DataFrame of dummy columns that carries the same index as ``df``.
	    """
	    # Keep df's index on the encoded codes so that an index-aligned
	    # pd.concat(axis=1) with df lines the rows up even when df does not
	    # have the default 0..n-1 index.
	    codes = pd.Series(encode_field(df, feature), index=df.index)
	    return pd.get_dummies(codes, prefix=feature, drop_first=True)

	def group_less_frequent_values(df, feature, min_prc):
	    """Collapse the rare values of ``df[feature]`` into one placeholder, in place.

	    Every value that occurs fewer than ``min_prc`` times in the column is
	    replaced by ``99`` when the column is numeric and by ``"LF"`` otherwise.

	    Args:
	        df: DataFrame to modify in place.
	        feature: Name of the column to process.
	        min_prc: Minimum number of occurrences (an absolute count, not a
	            fraction) a value needs in order to be kept as-is.

	    Returns:
	        None. ``df`` is mutated.
	    """
	    from pandas.api.types import is_numeric_dtype

	    categories = df[feature].value_counts()
	    rare_values = categories[categories < min_prc].index
	    if len(rare_values) == 0:
	        # nothing is below the threshold, leave the column untouched
	        return

	    # Pick the placeholder from the column's dtype; testing for "numeric"
	    # keeps pandas string-dtype columns on the textual placeholder.
	    new_val = 99 if is_numeric_dtype(df[feature]) else "LF"

	    # One vectorised assignment instead of one full-column scan per rare value.
	    df.loc[df[feature].isin(rare_values), feature] = new_val

	# process the categorical features
	categorical_features = ['State','Proto','Dir']
	# classes that are not frequent in the data (less than 1%) will be grouped.
	one_p = 0.01 * len(df_raw.index)
	# loop through the categorical features
	# (the nesting of the loop body was lost in this copy and is restored here)
	for feature in categorical_features:
	    # group the less frequent ones
	    group_less_frequent_values(df_raw, feature, one_p)
	    # index and then hot encode
	    df_raw = pd.concat([df_raw, hot_encode(df_raw,feature)],axis=1)


	## now remove what we don't need
	df_raw = df_raw.drop(columns=categorical_features)
	print("Done processing!")