This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

# Load the iris dataset bundled with scikit-learn
iris = load_iris()

# Model (can also use single decision tree)
model = RandomForestClassifier(n_estimators=10)

# Train
model.fit(iris.data, iris.target)

# Extract single tree
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

# Load the iris dataset bundled with scikit-learn
iris = load_iris()

# Limit max depth
model = RandomForestClassifier(max_depth=3, n_estimators=10)

# Train
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import the estimator class itself: `import umap as UMAP` would bind the
# *module* to the name UMAP, and calling a module raises TypeError.
from umap import UMAP

n_components = 3

# Use default parameters (apart from the output dimensionality)
umap = UMAP(n_components=n_components)

# Fit on the training data, then project the test data with the same embedding
train_reduced = umap.fit_transform(train)
test_reduced = umap.transform(test)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.metrics import f1_score, make_scorer

# Custom scorer for cross validation: macro-averaged F1, higher is better
scorer = make_scorer(f1_score, greater_is_better=True, average='macro')

# Create a model for feature selection (n_jobs=-1 uses every available core)
estimator = RandomForestClassifier(n_estimators=100, n_jobs=-1)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd

# Number of missing values in each column, as a one-column frame named 'total'
missing = data.isnull().sum().to_frame('total')

# Fraction of rows missing per column (0-1 scale; multiply by 100 for a
# true percentage)
missing['percent'] = missing['total'] / len(data)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# os is needed for the directory listing below
import os

base = '../input/fm/'

# Every file in the directory whose name contains 'fm.csv'
fm_paths = [base + p for p in os.listdir(base) if 'fm.csv' in p]

# List of dataframes
fms = [pd.read_csv(path) for path in fm_paths]

# Join rows together
feature_matrix = pd.concat(fms, axis=0)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import featuretools as ft | |
def entityset_from_partition(path): | |
"""Create an EntitySet from a partition of data""" | |
# Read in data from path | |
app = pd.read_csv('%s/app.csv' % path) | |
... # Read in 6 other files | |
# Create the entityset and add tables and relationships |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Chunk size: floor-dividing by 103 gives 103 full chunks plus however many
# extra chunks the remainder needs (about 104 sections for a large list).
# max(1, ...) keeps the step positive when there are fewer than 103 clients;
# range() raises ValueError on a step of 0.
chunk_size = max(1, len(clients) // 103)

# Create list of lists of clients in each section
client_lists = [clients[i:i + chunk_size] for i in range(0, len(clients), chunk_size)]
def create_partition(client_list, partition_number): | |
"""Creates and saves a dataset with users in user_list""" | |
# Subset data |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

from dask import delayed

# Create list of all partitions (one path per entry in the partitions directory)
paths = ['../input/partitions/%s' % name for name in os.listdir('../input/partitions/')]

# Start of the first batch of paths
start_index = 0
# Iterate through 8 paths (one batch) at a time | |
for i, end_index in enumerate(range(9, len(paths) + 5, 8)): |