Skip to content

Instantly share code, notes, and snippets.

@kristiewirth
Last active December 15, 2021 17:58
Show Gist options
  • Save kristiewirth/eb374fc7ae939158cd6dd0e059a8d09d to your computer and use it in GitHub Desktop.
Save kristiewirth/eb374fc7ae939158cd6dd0e059a8d09d to your computer and use it in GitHub Desktop.
Simplified bare bones script for building predictive models
import datto as dt
import pandas as pd
import spacy
from featurestore import FeatureStore
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.metrics import (
accuracy_score,
mean_squared_error,
median_absolute_error,
precision_score,
r2_score,
recall_score,
)
from sklearn.model_selection import train_test_split
from spacy.cli import download
# Internal package for Zapier for running SQL `pip install -i https://pypi.zapier.com/simple/ featurestore`
# Need to load in English language information into spacy the first time you run this
try:
nlp = spacy.load("en_core_web_sm")
except Exception:
download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")
def lematize(text: str):
# Lematizing means standardizing words to their root words, because they probably have similar meanings
# E.g. jumping -> jump / so that texts with jumping are listed the same as texts with jump
# More info: https://stackabuse.com/python-for-nlp-tokenization-stemming-and-lemmatization-with-spacy-library/
spacy_text = nlp(text)
return [token.lemma_ for token in spacy_text if not token.is_space]
# To use fs, you'll need to have the following variables defined in your .bash_profile file
# RS_DBNAME
# RS_HOST
# RS_PORT
# RS_USER
# RS_PASSWORD
fs = FeatureStore()
# Get data
# Non-Zapier equivalent is pd.read_sql
df = fs.read_redshift("""SELECT my_data FROM some_table""")
# https://docs.looker.com/sharing-and-publishing/downloading
# df = pd.read_csv("some_data_I_downloaded_maybe_looker_data.csv")
# Fill null values by type
df_nums = df.select_dtypes("number")
df[df_nums.columns] = df_nums.fillna(0)
df.fillna("", inplace=True)
###### CLASSIFICATION MODELS ######
# Models where you're predicting a category, yes/no
# Language processing - you can omit this step if doing a simpler problem
# E.g. using number of tasks for an account to predict upgrade or not
# Look into adding domain specific stop words as well as pre-built stop words
# Stop words -> words that are too common / not meaningful, so remove these from the dataset first
stop_words = set(list(ENGLISH_STOP_WORDS) + ["Zapier", "Zap"])
# Vectorize test (i.e. make adjusted count columns)
vectorizer = TfidfVectorizer(
stop_words=stop_words,
tokenizer=lematize,
# Not just word columns (1), also make columns of phrases (2 or 3 word phrases)
ngram_range=(1, 3),
# Means each word must appear in at least __ different texts to be included
min_df=2,
# Means each word can't appear in more than _% of texts to be included
max_df=0.4,
)
vectors = vectorizer.fit_transform(df["my_text_column"])
# Changing vectors back to a pandas DataFrame (easier to work with, preserves text column names)
vectors = pd.DataFrame(vectors.todense())
# Setting the word/phrase tokens as the column names
words = vectorizer.get_feature_names()
vectors.columns = words
cleaned_df = pd.concat([df, vectors], axis=1)
# Split into input data vs target (what you're predicting)
X = cleaned_df.drop("label_for_text", axis=1)
y = df["label_for_test"]
# Split further into training set (model learns from this)
# And testing set (model hasn't seen this, you can use it to make predictions and see how well it's doing by comparing predictions vs actual labels)
# This is called cross validation: https://machinelearningmastery.com/k-fold-cross-validation/
X_train, X_test, y_train, y_test = train_test_split(X, y)
# If you have any feature columns with categorical data (e.g. user role), you need to make these into dummy variables instead
# Basically all columns need to be numbers, you can't feed in text data
# You want to base this off your training set so that there isn't knowledge leaking into your test set (i.e. the model is cheating)
# Don't need to do this to text data you already vectorized -> it's already represented as numbers in your data
# More info: https://www.askpython.com/python/examples/creating-dummy-variables
X_train_dummies = pd.get_dummies(X_train)
X_test_dummies = X_test.reindex(columns=X_train_dummies.columns, fill_value=0)
# Train model on data
# Instantiate model
# This is just one common type of model, there's a whole bunch of models just within sklearn
# Available sklearn models: https://scikit-learn.org/stable/supervised_learning.html
model = RandomForestClassifier()
model.fit(X_train_dummies, y_train)
# Predict on your unseen test data
y_predicted = model.predict(X_test_dummies)
# See how well your model is doing
# Accuracy is a common measurement, but can be misleading
# Precision & recall can be a better summary of how well your model is doing
# Precision: out of everything I predicted to be spam, what percent is actually spam?
# Recall: out of all the spam text in the data, what percent did I catch (i.e. predicted as spam?)
# More info: https://towardsdatascience.com/whats-the-deal-with-accuracy-precision-recall-and-f1-f5d8b4db1021
print(accuracy_score(y_test, y_predicted))
print(precision_score(y_test, y_predicted))
print(recall_score(y_test, y_predicted))
###### REGRESSION MODELS ######
# Models where you're predicting a number
# E.g. predicting number of Zaps a user will activate
# Split into X, y as above
# Split into train and test sets
# Instead of a classifier, use a regression type model
# Make predictions as above
# Will need to measure different scoring metrics
print(mean_squared_error(y_test, y_predicted))
print(median_absolute_error(y_test, y_predicted))
print(r2_score(y_test, y_predicted))
###### datto METHODS ######
# Above are the basic steps, but I have lots of methods in my Python package that do similar things/simplify things too!
# https://github.com/kristiewirth/datto
# It can do way more stuff, but here's a few examples I use often
mr = dt.ModelResults()
# More detailed model results
mr.score_final_model("classification", X_test_dummies, y_test, model)
mr.score_final_model("regression", X_test_dummies, y_test, model)
# See coefficients for model, i.e. how each feature (piece of information) influences the predictions / which are most important
mr.coefficients_graph(
X_train_dummies, X_test_dummies, model, "classification", "filename"
)
# See some example individual predictions
mr.coefficients_individual_predictions(
model,
X_train_dummies,
X_train_dummies,
X_test_dummies,
"name_of_unique_id_per_row_like_account_id",
num_id_examples=5,
num_feature_examples=10,
model_type="classification",
class_names=y_test.columns,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment