Skip to content

Instantly share code, notes, and snippets.

View eugeneyan's full-sized avatar
👨‍💻
Learning. Building. Writing.

Eugene Yan eugeneyan

👨‍💻
Learning. Building. Writing.
View GitHub Profile
@eugeneyan
eugeneyan / test_train_time_and_latency.py
Created February 21, 2021 19:14
Test train time and latency
def test_dt_training_time(dummy_titanic):
    """Check that the 95th-percentile DecisionTree training time is under 1.0.

    Fits the same depth-10 tree 100 times on the training split supplied by
    the ``dummy_titanic`` fixture, collects one timing per fit, and asserts
    on the 0.95 quantile of those timings.
    """
    X_train, y_train, X_test, y_test = dummy_titanic

    # Depth is pinned at 10 so that timings are comparable between runs.
    tree = DecisionTree(depth_limit=10)

    n_runs = 100
    timings = []
    for _ in range(n_runs):
        # Element [1] of train_with_time's result is used as the timing
        # (presumably elapsed seconds -- confirm against the helper).
        outcome = train_with_time(tree, X_train, y_train)
        timings.append(outcome[1])

    time_p95 = np.quantile(np.array(timings), 0.95)
    assert time_p95 < 1.0, 'Training time at 95th percentile should be < 1.0 sec'
@eugeneyan
eugeneyan / test_rf_acc_increase.py
Created February 21, 2021 19:15
Test RandomForest accuracy increase
# Fits a RandomForest for each tree count in [1, 3, 7, 15] (depth_limit=7,
# column and row subsampling 0.7), predicts on X_test and rounds the
# predictions to the nearest integer.
# NOTE(review): this excerpt is truncated -- acc_list and auc_list are created
# but never filled in the visible lines, and no assertion is shown. Presumably
# the remainder scores each forest and checks the metrics increase; confirm
# against the full gist.
def test_dt_increase_acc(dummy_titanic):
X_train, y_train, X_test, y_test = dummy_titanic
acc_list = []
auc_list = []
for num_trees in [1, 3, 7, 15]:
rf = RandomForest(num_trees=num_trees, depth_limit=7, col_subsampling=0.7, row_subsampling=0.7)
rf.fit(X_train, y_train)
pred = rf.predict(X_test)
# Round predictions to the nearest integer.
pred_binary = np.round(pred)
@eugeneyan
eugeneyan / test_rf_better_at_same_depth.py
Created February 21, 2021 19:16
Test RandomForest performs better with same depth
# Fits a DecisionTree and a RandomForest (7 trees, column and row subsampling
# 0.8) on the same training split, both with depth_limit=10, then predicts on
# X_test with the single tree.
# NOTE(review): this excerpt is truncated -- the forest's predictions and the
# actual comparison/assertion are not visible here; confirm against the full
# gist.
def test_rf_better_than_dt(dummy_titanic):
X_train, y_train, X_test, y_test = dummy_titanic
# Both models share depth_limit=10.
dt = DecisionTree(depth_limit=10)
dt.fit(X_train, y_train)
rf = RandomForest(depth_limit=10, num_trees=7, col_subsampling=0.8, row_subsampling=0.8)
rf.fit(X_train, y_train)
pred_test_dt = dt.predict(X_test)
@eugeneyan
eugeneyan / testing_ml_setup.sh
Created February 21, 2021 19:17
testing-ml setup
# Clone and set up environment
git clone https://github.com/eugeneyan/testing-ml.git
cd testing-ml
make setup
# Run test suite
make check
@eugeneyan
eugeneyan / data-discovery-comparison.txt
Created February 28, 2021 05:04
Comparison of data discovery platforms
| | Search | Recommendations | Schemas & Description | Data Preview | Column Statistics | Space/cost metrics | Ownership | Top Users | Lineage | Change Notification | Open Source | Documentation | Supported Sources | Push or Pull |
|-----------------------------|--------|-----------------|-----------------------|--------------|-------------------|--------------------|-----------|-----------|---------|---------------------|-------------|---------------|-------------------------------------------------------|--------------|
| Amundsen (Lyft) | ✔ | ✔ | ✔ | ✔ | ✔ | | ✔ | ✔ | Todo | | ✔ | ✔ | Hive, Redshift, Druid, RDBMS, Presto, Snowflake, etc. | Pull |
| Datahub (LinkedIn) | ✔ | | ✔ | | |
# Accumulate a per-item score in `i2i` from every pair of users that share items.
#
# u2items = array of users and their items
# u2items[i] = items user i clicked on (must support `&` and len(); presumably a set)
# i2i = dict mapping product_id -> accumulated score; updated in place
#
# Uses range() rather than the Python-2-only xrange(), so the snippet also
# runs on Python 3 (where xrange raises NameError).
num_users = len(u2items)
for i in range(num_users):
    # Per-user weight: decreases as the user's item count grows.
    wi = math.pow(len(u2items[i]) + 5, -0.35)
    # Start at i + 1 so that each unordered user pair is visited exactly once.
    for j in range(i + 1, num_users):
        intersection = u2items[i] & u2items[j]
        wj = wi * math.pow(len(u2items[j]) + 5, -0.35)
        # The increment is identical for every shared item of this pair, so
        # compute it once; a larger overlap yields a smaller increment.
        pair_score = wj / (1 + len(intersection))
        for product_id in intersection:
            i2i[product_id] = i2i.get(product_id, 0.0) + pair_score
# Based on https://github.com/NVIDIA-Merlin/NVTabular/blob/main/conda/environments/nvtabular_dev_cuda11.0.yml
name: nvt_t4r
channels:
- rapidsai
- nvidia
- conda-forge
- defaults
dependencies:
- nvtabular
- python>=3.7
# Start a SageMaker notebook instance (ml.p3.2xlarge) and open a terminal
# Upload the conda yml from here: https://gist.github.com/eugeneyan/3435e05dd675b9ee2af164214536752d
# Install NVTabular
conda env create -f=SageMaker/nvt_t4r.yml
# Activate conda env
source anaconda3/etc/profile.d/conda.sh
conda activate nvt_t4r
"""
Iteratively loop through all files in DIR and add-commit-push them to REPO.
This script should sit in your obsidian vault.
"""
from pathlib import Path
from git import Repo
import os
DIR = '/Users/eugene/obsidian-vault/assets'
"""
Fixes tags that were converted to links during Obsidian import.
Specifically, if a line is among the first 3 lines and contains "tags:", convert all [[tag name]] to #tag-name
"""
import os
import re
from pathlib import Path
DIR = '/Users/eugeneya/obsidian-vault/'