Skip to content

Instantly share code, notes, and snippets.

View WillKoehrsen's full-sized avatar
🌆
building

Will Koehrsen WillKoehrsen

🌆
building
View GitHub Profile
@WillKoehrsen
WillKoehrsen / visualize_decision_tree.py
Last active May 24, 2024 23:59
How to visualize a single decision tree in Python
# Fit a random-forest to the iris data; for tree visualization a single
# DecisionTreeClassifier could be used the same way.
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

# Load the classic iris dataset (150 samples, 4 features, 3 classes)
iris = load_iris()

# Small ensemble of 10 trees
model = RandomForestClassifier(n_estimators=10)

# Fit the forest on the full dataset
model.fit(iris.data, iris.target)
# Extract single tree
# Same setup as above, but with a depth cap so a rendered tree stays legible.
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

# Reload the iris dataset
iris = load_iris()

# Limit every tree to depth 3; keep the 10-tree ensemble
model = RandomForestClassifier(max_depth=3, n_estimators=10)
# Train
# Reduce the train/test feature matrices to 3 dimensions with UMAP.
# BUG FIX: the original `import umap as UMAP` bound the *module* to the
# name UMAP, so `UMAP(n_components=...)` raised TypeError (a module is not
# callable). The estimator class must be imported from the package.
from umap import UMAP

n_components = 3

# Use default parameters apart from the output dimensionality
umap = UMAP(n_components=n_components)

# Learn the embedding on the training data, then apply the *same* learned
# mapping to the test data (never fit on the test set).
# NOTE(review): `train` and `test` are assumed to be defined earlier in the
# notebook this gist came from -- confirm against the caller.
train_reduced = umap.fit_transform(train)
test_reduced = umap.transform(test)
# Building blocks for recursive feature elimination with cross-validation.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.metrics import f1_score, make_scorer

# Macro-averaged F1 wrapped as a scorer object so CV routines can call it
scorer = make_scorer(f1_score, greater_is_better=True, average='macro')

# Random forest whose feature importances RFECV will use for ranking;
# n_jobs=-1 uses every available core.
estimator = RandomForestClassifier(n_estimators=100, n_jobs=-1)
import numpy as np

# Feature pairs correlated above this value are considered redundant
threshold = 0.95

# Pairwise correlations between all columns of `data`
# (assumed to be a pandas DataFrame defined earlier -- TODO confirm)
corr_matrix = data.corr()

# Keep only the strictly-upper triangle (k=1 excludes the diagonal) so each
# feature pair is considered exactly once and the 1.0 self-correlations are
# dropped.
# FIX: `np.bool` was deprecated in NumPy 1.20 and removed in 1.24; the
# builtin `bool` is the documented replacement.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
import pandas as pd

# One-column DataFrame: count of null entries per column of `data`
missing = data.isnull().sum().to_frame('total')

# Add the counts expressed as a fraction of the number of rows
missing['percent'] = missing['total'] / len(data)
# Directory holding the partitioned feature-matrix csv files
base = '../input/fm/'

# Full path of every partition file (names containing 'fm.csv')
fm_paths = [base + fname for fname in os.listdir(base) if 'fm.csv' in fname]

# Load each partition into its own dataframe
fms = [pd.read_csv(fm_path) for fm_path in fm_paths]

# Stack the partitions vertically into a single feature matrix
feature_matrix = pd.concat(fms, axis=0)
import featuretools as ft

def entityset_from_partition(path):
    """Create an EntitySet from a partition of data.

    Parameters
    ----------
    path : str
        Directory holding this partition's csv files (app.csv plus, per the
        placeholder below, six sibling files).

    NOTE(review): this gist excerpt is truncated -- the code that actually
    builds and returns the EntitySet is not shown here.
    """
    # Read in data from path
    app = pd.read_csv('%s/app.csv' % path)
    ... # Read in 6 other files
    # Create the entityset and add tables and relationships
# Split `clients` into 104 roughly equal sections. Floor division by 103
# means the final slice produced by the stepped range picks up any remainder.
chunk_size = len(clients) // 103

# One sub-list of clients per section
client_lists = [clients[start:start + chunk_size]
                for start in range(0, len(clients), chunk_size)]
def create_partition(client_list, partition_number):
    """Create and save a dataset containing the users in `client_list`.

    NOTE(review): truncated in this excerpt -- only the leading comment of
    the body is visible; the subsetting and saving logic is not shown.
    `partition_number` presumably names the output location -- confirm
    against the original gist.
    """
    # Subset data
from dask import delayed
import os

# Path of every partition directory/file under ../input/partitions/.
# FIX: the original comprehension was missing the `in` keyword
# (`for file os.listdir(...)`), which is a SyntaxError. Also renamed the
# loop variable away from `file` for clarity.
paths = ['../input/partitions/%s' % fname
         for fname in os.listdir('../input/partitions/')]
# Index of the first path in the current batch
start_index = 0

# Iterate through 8 paths (one batch) at a time
# NOTE(review): the excerpt ends here -- the loop body is not visible, and
# the range endpoints (start 9, stop len(paths) + 5) look garbled relative
# to the stated batch size of 8; confirm against the original gist.
for i, end_index in enumerate(range(9, len(paths) + 5, 8)):