Created
August 18, 2019 14:17
-
-
Save a-agmon/01c5413b6a4f83ab2ae77a4ad799d3b1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn import preprocessing | |
# Remove the columns that are not used as features downstream.
df_raw = df_raw.drop(
    ['SrcAddr', 'DstAddr', 'TotBytes', 'Sport', 'Dport', 'StartTime', 'sTos', 'dTos'],
    axis="columns",
)

# Replace missing values column by column: object columns get the string
# "none", every other dtype gets the sentinel -1.
for feature_name in df_raw.columns:
    if not df_raw[feature_name].isnull().values.any():
        continue  # no nulls in this column, nothing to fill
    if df_raw[feature_name].dtypes == object:
        val = "none"
    else:
        val = -1
    print(f"Filling nulls with value:{val} in column:{feature_name}")
    df_raw[feature_name] = df_raw[feature_name].fillna(val)
# define some encoding functions | |
def encode_field(df, field):
    """Label-encode one column of ``df``.

    A fresh ``LabelEncoder`` is fitted on ``df[field]`` and the encoded
    values are returned; ``df`` itself is not modified. The field name is
    printed as a progress message.

    Args:
        df: DataFrame holding the column.
        field: Name of the column to encode.

    Returns:
        The result of ``LabelEncoder.fit_transform`` on ``df[field]``.
    """
    label_encoder = preprocessing.LabelEncoder()
    print(f"encoding field:{field}")
    codes = label_encoder.fit_transform(df[field])
    return codes
def hot_encode(df, feature):
    """One-hot encode ``df[feature]`` after label-encoding it.

    The column is first converted to codes by ``encode_field``; the resulting
    indicator columns are named ``<feature>_<code>`` and the first level is
    dropped (``drop_first=True``).

    Args:
        df: DataFrame holding the column.
        feature: Name of the column to encode.

    Returns:
        DataFrame of indicator columns carrying the same index as ``df``.
    """
    # The encoded values come back without df's index. Wrapping them in a
    # Series that reuses df.index keeps the dummies row-aligned with df, so a
    # later pd.concat(..., axis=1) does not misalign (and pad with NaN) when
    # df's index is not the default 0..n-1 range.
    codes = pd.Series(encode_field(df, feature), index=df.index)
    return pd.get_dummies(codes, prefix=feature, drop_first=True)
def group_less_frequent_values(df, feature, min_prc):
    """Collapse the rare categories of ``df[feature]`` into one placeholder.

    Every value whose occurrence count is strictly below ``min_prc`` is
    replaced, in place, by ``"LF"`` when the column has object dtype and by
    ``99`` otherwise. Missing values are not counted by ``value_counts`` and
    are therefore left untouched.

    Args:
        df: DataFrame to modify in place.
        feature: Name of the categorical column.
        min_prc: Minimum number of occurrences a category needs in order to
            be kept. Despite the name this is an absolute row count, not a
            percentage; it is compared directly against ``value_counts()``.

    Returns:
        None. ``df`` is mutated in place.
    """
    counts = df[feature].value_counts()
    rare_categories = counts[counts < min_prc].index
    if len(rare_categories) == 0:
        return  # every category is frequent enough

    new_val = "LF" if df[feature].dtypes == object else 99
    # One membership mask replaces all rare categories at once, instead of
    # scanning the whole column separately for each rare category.
    df.loc[df[feature].isin(rare_categories), feature] = new_val
# Process the categorical features.
categorical_features = ['State', 'Proto', 'Dir']

# Categories seen in fewer than 1% of the rows are grouped together; the
# threshold is expressed as an absolute row count.
one_p = len(df_raw) * 0.01

# Loop through the categorical features. Grouping must run before encoding
# because it rewrites the column in place.
for feature in categorical_features:
    # Group the less frequent values.
    group_less_frequent_values(df_raw, feature, one_p)
    # Label-encode, one-hot encode, and append the indicator columns.
    df_raw = pd.concat([df_raw, hot_encode(df_raw, feature)], axis="columns")

# The original categorical columns are no longer needed once encoded.
df_raw = df_raw.drop(categorical_features, axis="columns")
print("Done processing!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment