Skip to content

Instantly share code, notes, and snippets.

View WillKoehrsen's full-sized avatar
🌆
building

Will Koehrsen WillKoehrsen

🌆
building
View GitHub Profile
@WillKoehrsen
WillKoehrsen / visualize_decision_tree.py
Last active May 24, 2024 23:59
How to visualize a single decision tree in Python
# Fit a random-forest to the iris data; for tree visualization a single
# DecisionTreeClassifier could be used the same way.
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

# Load the classic iris dataset (150 samples, 4 features, 3 classes)
iris = load_iris()

# Small ensemble of 10 trees
model = RandomForestClassifier(n_estimators=10)

# Fit the forest on the full dataset
model.fit(iris.data, iris.target)
# Extract single tree
# Same setup as above, but with a depth cap so a rendered tree stays legible.
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

# Reload the iris dataset
iris = load_iris()

# Limit every tree to depth 3; keep the 10-tree ensemble
model = RandomForestClassifier(max_depth=3, n_estimators=10)
# Train
# Reduce the train/test feature matrices to 3 dimensions with UMAP.
# BUG FIX: the original `import umap as UMAP` bound the *module* to the
# name UMAP, so `UMAP(n_components=...)` raised TypeError (a module is not
# callable). The estimator class must be imported from the package.
from umap import UMAP

n_components = 3

# Use default parameters apart from the output dimensionality
umap = UMAP(n_components=n_components)

# Learn the embedding on the training data, then apply the *same* learned
# mapping to the test data (never fit on the test set).
# NOTE(review): `train` and `test` are assumed to be defined earlier in the
# notebook this gist came from -- confirm against the caller.
train_reduced = umap.fit_transform(train)
test_reduced = umap.transform(test)
# Building blocks for recursive feature elimination with cross-validation.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.metrics import f1_score, make_scorer

# Macro-averaged F1 wrapped as a scorer object so CV routines can call it
scorer = make_scorer(f1_score, greater_is_better=True, average='macro')

# Random forest whose feature importances RFECV will use for ranking;
# n_jobs=-1 uses every available core.
estimator = RandomForestClassifier(n_estimators=100, n_jobs=-1)
import numpy as np

# Feature pairs correlated above this value are considered redundant
threshold = 0.95

# Pairwise correlations between all columns of `data`
# (assumed to be a pandas DataFrame defined earlier -- TODO confirm)
corr_matrix = data.corr()

# Keep only the strictly-upper triangle (k=1 excludes the diagonal) so each
# feature pair is considered exactly once and the 1.0 self-correlations are
# dropped.
# FIX: `np.bool` was deprecated in NumPy 1.20 and removed in 1.24; the
# builtin `bool` is the documented replacement.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
import pandas as pd

# One-column DataFrame: count of null entries per column of `data`
missing = data.isnull().sum().to_frame('total')

# Add the counts expressed as a fraction of the number of rows
missing['percent'] = missing['total'] / len(data)
# Directory holding the partitioned feature-matrix csv files
base = '../input/fm/'

# Full path of every partition file (names containing 'fm.csv')
fm_paths = [base + fname for fname in os.listdir(base) if 'fm.csv' in fname]

# Load each partition into its own dataframe
fms = [pd.read_csv(fm_path) for fm_path in fm_paths]

# Stack the partitions vertically into a single feature matrix
feature_matrix = pd.concat(fms, axis=0)
import featuretools as ft

def entityset_from_partition(path):
    """Create an EntitySet from a partition of data.

    Parameters
    ----------
    path : str
        Directory holding this partition's csv files (app.csv plus, per the
        placeholder below, six sibling files).

    NOTE(review): this gist excerpt is truncated -- the code that actually
    builds and returns the EntitySet is not shown here.
    """
    # Read in data from path
    app = pd.read_csv('%s/app.csv' % path)
    ... # Read in 6 other files
    # Create the entityset and add tables and relationships
# Split `clients` into 104 roughly equal sections. Floor division by 103
# means the final slice produced by the stepped range picks up any remainder.
chunk_size = len(clients) // 103

# One sub-list of clients per section
client_lists = [clients[start:start + chunk_size]
                for start in range(0, len(clients), chunk_size)]
def create_partition(client_list, partition_number):
    """Create and save a dataset containing the users in `client_list`.

    NOTE(review): truncated in this excerpt -- only the leading comment of
    the body is visible; the subsetting and saving logic is not shown.
    `partition_number` presumably names the output location -- confirm
    against the original gist.
    """
    # Subset data
from dask import delayed
import os

# Path of every partition directory/file under ../input/partitions/.
# FIX: the original comprehension was missing the `in` keyword
# (`for file os.listdir(...)`), which is a SyntaxError. Also renamed the
# loop variable away from `file` for clarity.
paths = ['../input/partitions/%s' % fname
         for fname in os.listdir('../input/partitions/')]
# Index of the first path in the current batch
start_index = 0

# Iterate through 8 paths (one batch) at a time
# NOTE(review): the excerpt ends here -- the loop body is not visible, and
# the range endpoints (start 9, stop len(paths) + 5) look garbled relative
# to the stated batch size of 8; confirm against the original gist.
for i, end_index in enumerate(range(9, len(paths) + 5, 8)):