import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression

def create_logistic_regressions(X_train, y_train, figsize=(10, 10)):
    # Fit a logistic regression and plot one bar per feature coefficient.
    logreg = LogisticRegression(solver='lbfgs')
    logreg.fit(X_train, y_train)
    coefficients = logreg.coef_
    intercept = logreg.intercept_
    df_logreg = pd.DataFrame({'Feature': X_train.columns, 'Coef': coefficients[0]})
    fig, ax = plt.subplots(figsize=figsize)
    sns.barplot(x="Coef", y="Feature", data=df_logreg, ax=ax)
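A minimal usage sketch, assuming X_train is a pandas DataFrame of numeric features and y_train the matching binary labels (placeholder names, not defined in this snippet):

# Hypothetical call: draws a horizontal bar chart of the learned coefficients.
create_logistic_regressions(X_train, y_train, figsize=(8, 6))
plt.show()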
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import pydotplus
from IPython.display import Image, display

def create_and_visualize_tree(X_train, y_train, max_depth=3):
    # Fit a shallow decision tree and render it as a PNG via graphviz.
    decision_tree = DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=1, random_state=1)
    decision_tree = decision_tree.fit(X_train, y_train)
    tree_str = export_graphviz(decision_tree, feature_names=X_train.columns,
                               filled=True, out_file=None)
    graph = pydotplus.graph_from_dot_data(tree_str)
    graph.write_png('dt.png')
    display(Image('dt.png'))
    return decision_tree
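The returned estimator can be reused downstream, for example (assuming X_train, y_train and X_test exist, and that the graphviz binaries are installed for the PNG rendering):

dt = create_and_visualize_tree(X_train, y_train, max_depth=4)
preds = dt.predict(X_test)  # hypothetical held-out set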
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm_list, target_names, title_list, cmap=None,
                          normalize=True, float_format_str='{:,.2f}'):
    # Plot one confusion matrix per model; row sums give the actual class counts.
    plt.figure(figsize=(10, 5))
    print('{}_count={:d}\n{}_count={:d}'.format(
        target_names[0], cm_list[0][0].sum(),
        target_names[1], cm_list[0][1].sum()))
    stats_list = []
    for i in range(len(cm_list)):
        model_name = title_list[i]
        cm = cm_list[i]
        actual_phishy = cm[0]
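One way to build the cm_list input, assuming two fitted classifiers and a held-out X_test/y_test; model_a, model_b and the titles below are placeholders, not names from the snippets above:

from sklearn.metrics import confusion_matrix

# model_a and model_b stand for any two fitted classifiers from the earlier steps.
cm_list = [confusion_matrix(y_test, m.predict(X_test)) for m in (model_a, model_b)]
plot_confusion_matrix(cm_list, target_names=['phishy', 'legit'],
                      title_list=['Model A', 'Model B'])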
from sklearn import metrics
import matplotlib.pyplot as plt

def take_roc_curve(X_test, model):
    # Score the positive class and compute ROC / precision-recall points.
    # Note: y_test is taken from the enclosing (notebook) scope.
    y_preds = model.predict_proba(X_test)
    preds = y_preds[:, 1]
    fpr, tpr, _ = metrics.roc_curve(y_test, preds)
    precision, recall, _ = metrics.precision_recall_curve(y_test, preds)
    auc_score = metrics.auc(fpr, tpr)
    plt.figure(figsize=(10, 5))
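A sketch of calling it with any fitted classifier that implements predict_proba, reusing the imports from the earlier snippets and a y_test defined in the notebook scope as noted above:

clf = LogisticRegression(solver='lbfgs').fit(X_train, y_train)  # any probabilistic classifier
take_roc_curve(X_test, clf)
plt.show()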
import pandas as pd

def generate_logs_from_classifiers(classifiers):
    # Fit each classifier and collect accuracy / log loss into one DataFrame.
    # Note: X_train, y_train and X_test are taken from the enclosing (notebook) scope.
    log_cols = ["Classifier", "Accuracy", "Log Loss"]
    log = pd.DataFrame(columns=log_cols)
    for clf in classifiers:
        name = clf.__class__.__name__
        print('Processing {} classifier'.format(name))
        clf.fit(X_train, y_train)
        train_predictions = clf.predict(X_test)
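It can be driven with any list of scikit-learn estimators; an illustrative selection (not necessarily the one used in the original notebook), assuming the full function returns the log DataFrame:

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

log = generate_logs_from_classifiers([
    LogisticRegression(solver='lbfgs'),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(n_estimators=100),
])
print(log)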
from tensorflow.keras.preprocessing.text import Tokenizer

sentences = [
    'I eat chicken',
    'I do not eat fish',
    'Did you eat fish?'
]

# Keep the 100 most frequent words and map anything unseen to "<OOV>".
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
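After fitting, the learned vocabulary can be inspected; index 1 is reserved for the OOV token and the remaining words are ranked by frequency:

print(tokenizer.word_index)
# e.g. {'<OOV>': 1, 'eat': 2, 'i': 3, 'fish': 4, ...}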
from tensorflow.keras.preprocessing.sequence import pad_sequences

word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentences)
# Pad with zeros (by default at the front) so every sequence has length 5.
padded = pad_sequences(sequences, maxlen=5)

print("\nThe Word Index = ", word_index)
print("\nThe Sequences = ", sequences)
print("\nThe Padded Sequences:")
print(padded)
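pad_sequences pre-pads by default; post-padding and truncation can be requested explicitly:

padded_post = pad_sequences(sequences, maxlen=5, padding='post', truncating='post')
print(padded_post)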
import tensorflow as tf

# Assumed values, chosen to match the tokenizer snippets above; embedding_dim is arbitrary.
vocab_size = 100     # num_words passed to the Tokenizer
embedding_dim = 16
max_length = 5       # maxlen used in pad_sequences

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
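A minimal training sketch, assuming the padded sequences from above and a hypothetical array of binary labels (training_labels is not defined in the original snippets):

import numpy as np

training_labels = np.array([0, 1, 1])  # placeholder labels, one per sentence
model.fit(padded, training_labels, epochs=10, verbose=0)
model.summary()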
import pandas as pd

def fixed_width_cut(df, feature, labels=['Low', 'Medium', 'High']):
    # Equal-width binning: pd.cut splits the value range into len(labels) intervals.
    feature_slice, retbins = pd.cut(df[feature], len(labels), retbins=True, labels=labels)
    retbins = ['%.2f' % elem for elem in retbins]
    return feature_slice, retbins

def quartile_cut(df, feature, labels=['Low', 'Medium', 'High']):
    # Equal-frequency binning: pd.qcut puts roughly the same number of rows in each bin.
    feature_slice, retbins = pd.qcut(df[feature], q=len(labels), retbins=True, labels=labels)
    retbins = ['%.2f' % elem for elem in retbins]
    return feature_slice, retbins
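For example, with a hypothetical df DataFrame holding a numeric url_length column (the column name is illustrative):

df['url_length_band'], bin_edges = quartile_cut(df, 'url_length')
print(bin_edges)  # bin edges formatted to two decimals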