Skip to content

Instantly share code, notes, and snippets.

@a-agmon
Created August 18, 2019 14:17
Show Gist options
  • Save a-agmon/01c5413b6a4f83ab2ae77a4ad799d3b1 to your computer and use it in GitHub Desktop.
Save a-agmon/01c5413b6a4f83ab2ae77a4ad799d3b1 to your computer and use it in GitHub Desktop.
from sklearn import preprocessing
# Remove the columns that are not used in the rest of the processing.
df_raw = df_raw.drop(
    columns=[
        'SrcAddr', 'DstAddr', 'TotBytes', 'Sport',
        'Dport', 'StartTime', 'sTos', 'dTos',
    ]
)
# Replace missing values column by column: "none" for object-typed
# columns, -1 for every other dtype. Columns without nulls are untouched.
for feature_name in df_raw.columns:
    if not df_raw[feature_name].isnull().any():
        continue
    if df_raw[feature_name].dtypes == object:
        val = "none"
    else:
        val = -1
    print(f"Filling nulls with value:{val} in column:{feature_name}")
    df_raw[feature_name] = df_raw[feature_name].fillna(val)
# define some encoding functions
def encode_field(df, field):
    """Label-encode a single column of ``df``.

    A fresh ``preprocessing.LabelEncoder`` is fitted on ``df[field]`` and
    the transform of that same column is returned. The field name is
    printed before the encoding runs.

    Args:
        df: Object indexable by ``field`` (used as ``df[field]``).
        field: Name of the column to encode.

    Returns:
        Whatever ``LabelEncoder.fit_transform`` returns for ``df[field]``.
    """
    label_encoder = preprocessing.LabelEncoder()
    print(f"encoding field:{field}")
    codes = label_encoder.fit_transform(df[field])
    return codes
def hot_encode(df, feature):
    """One-hot encode ``df[feature]`` and return the dummy columns.

    The column is first converted to integer codes with ``encode_field``;
    ``pd.get_dummies`` then expands those codes into indicator columns
    named ``<feature>_<code>``, dropping the first level
    (``drop_first=True``).

    Args:
        df: DataFrame holding the column to encode.
        feature: Name of the column; also used as the dummy-column prefix.

    Returns:
        A DataFrame of indicator columns indexed like ``df``.
    """
    # Attach df's own index to the codes: get_dummies on a bare array
    # produces a fresh 0..n-1 index, which misaligns rows when the result
    # is concatenated column-wise with a df whose index is not the default.
    codes = pd.Series(encode_field(df, feature), index=df.index)
    return pd.get_dummies(codes, prefix=feature, drop_first=True)
def group_less_frequent_values(df, feature, min_prc):
    """Collapse rare values of ``df[feature]`` into one placeholder, in place.

    Every value whose occurrence count (per ``value_counts``) is strictly
    below ``min_prc`` is overwritten with ``"LF"`` when the column holds
    text (object or string dtype) and with ``99`` otherwise.

    Args:
        df: DataFrame to modify in place.
        feature: Name of the column to process.
        min_prc: Minimum number of occurrences a value needs in order to
            be kept. Despite the name it is compared against raw counts,
            not against a fraction.

    Returns:
        None. ``df`` is mutated.
    """
    from pandas.api.types import is_string_dtype

    column = df[feature]
    counts = column.value_counts()
    rare_values = counts[counts < min_prc].index
    if rare_values.empty:
        # Nothing falls below the threshold; leave the column untouched.
        return

    # Object columns keep the original behaviour; pandas string extension
    # dtypes are treated as text too, so they never receive the numeric 99.
    is_textual = column.dtype == object or is_string_dtype(column.dtype)
    new_val = "LF" if is_textual else 99

    # One mask for all rare values instead of one full scan per value.
    df.loc[column.isin(rare_values), feature] = new_val
# Categorical columns to be grouped, one-hot encoded and finally dropped.
categorical_features = ['State','Proto','Dir']
# Row-count threshold equal to 1% of the dataset: values that occur fewer
# times than this are merged into a single placeholder value.
one_p = len(df_raw) * 0.01
# Loop through the categorical columns.
for feature in categorical_features:
    # Merge the infrequent values first ...
    group_less_frequent_values(df_raw, feature, one_p)
    # ... then label-encode, one-hot encode and append the dummy columns.
    df_raw = pd.concat(
        [df_raw, hot_encode(df_raw, feature)],
        axis="columns",
    )
# The original categorical columns are no longer needed.
df_raw = df_raw.drop(categorical_features, axis=1)
print("Done processing!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment