Eugene Yan eugeneyan

👨‍💻

Learning. Building. Writing.

I build ML, RecSys, & LLM systems @amzn, and write @ eugeneyan.com.

eugeneyan / test_train_time_and_latency.py

Created February 21, 2021 19:14

Test train time and latency

	def test_dt_training_time(dummy_titanic):
	    """Assert that DecisionTree training time stays under 1.0 at the 95th percentile.

	    Fits the same depth-10 tree on the training split from the
	    ``dummy_titanic`` fixture 100 times and checks the 0.95 quantile of the
	    collected timings.
	    """
	    X_train, y_train, X_test, y_test = dummy_titanic

	    # Standardize to use depth = 10
	    tree = DecisionTree(depth_limit=10)

	    timings = []
	    for _ in range(100):
	        # Element [1] of the helper's return is taken as the elapsed time
	        # (presumably seconds, per the assertion message -- confirm in helper).
	        outcome = train_with_time(tree, X_train, y_train)
	        timings.append(outcome[1])

	    p95 = np.quantile(np.array(timings), 0.95)
	    assert p95 < 1.0, 'Training time at 95th percentile should be < 1.0 sec'

eugeneyan / test_rf_acc_increase.py

Created February 21, 2021 19:15

Test RandomForest accuracy increase

	def test_dt_increase_acc(dummy_titanic):
	X_train, y_train, X_test, y_test = dummy_titanic

	acc_list = []
	auc_list = []
	for num_trees in [1, 3, 7, 15]:
	rf = RandomForest(num_trees=num_trees, depth_limit=7, col_subsampling=0.7, row_subsampling=0.7)
	rf.fit(X_train, y_train)
	pred = rf.predict(X_test)
	pred_binary = np.round(pred)

eugeneyan / test_rf_better_at_same_depth.py

Created February 21, 2021 19:16

Test RandomForest performs better with same depth

	def test_rf_better_than_dt(dummy_titanic):
	X_train, y_train, X_test, y_test = dummy_titanic

	dt = DecisionTree(depth_limit=10)
	dt.fit(X_train, y_train)

	rf = RandomForest(depth_limit=10, num_trees=7, col_subsampling=0.8, row_subsampling=0.8)
	rf.fit(X_train, y_train)

	pred_test_dt = dt.predict(X_test)

eugeneyan / testing_ml_setup.sh

Created February 21, 2021 19:17

testing-ml setup

	# Clone the testing-ml repository and set up its environment
	git clone https://github.com/eugeneyan/testing-ml.git
	cd testing-ml
	make setup

	# Run the test suite via the repository's `check` make target
	make check

eugeneyan / data-discovery-comparison.txt

Created February 28, 2021 05:04

Comparison of data discovery platforms

	\| \| Search \| Recommendations \| Schemas & Description \| Data Preview \| Column Statistics \| Space/cost metrics \| Ownership \| Top Users \| Lineage \| Change Notification \| Open Source \| Documentation \| Supported Sources \| Push or Pull \|
	\|-----------------------------\|--------\|-----------------\|-----------------------\|--------------\|-------------------\|--------------------\|-----------\|-----------\|---------\|---------------------\|-------------\|---------------\|-------------------------------------------------------\|--------------\|
	\| Amundsen (Lyft) \| ✔ \| ✔ \| ✔ \| ✔ \| ✔ \| \| ✔ \| ✔ \| Todo \| \| ✔ \| ✔ \| Hive, Redshift, Druid, RDBMS, Presto, Snowflake, etc. \| Pull \|
	\| Datahub (LinkedIn) \| ✔ \| \| ✔ \| \| \|

eugeneyan / swing.py

Created April 26, 2021 02:15

	# Accumulate a per-item score over every unordered pair of users (i, j), i < j.
	# u2items[i] holds the items user i clicked on (presumably a set, since `&`
	# is used to intersect them); i2i maps product_id -> accumulated score.
	for i in range(len(u2items)):  # `range` instead of Python-2-only `xrange`
	    # Per-user weight: decreases as the user's item count grows.
	    wi = math.pow(len(u2items[i]) + 5, -0.35)
	    for j in range(i + 1, len(u2items)):
	        intersection = u2items[i] & u2items[j]
	        # Pair weight is the product of both users' weights.
	        wj = wi * math.pow(len(u2items[j]) + 5, -0.35)
	        # Every shared item receives the same increment for this pair, so
	        # compute it once instead of once per item.
	        increment = wj / (1 + len(intersection))
	        for product_id in intersection:
	            i2i[product_id] = i2i.get(product_id, 0.0) + increment

	# u2items = array of users and their items
	# u2items[i] = items user i clicked on

eugeneyan / nvt.yml

Created December 15, 2021 00:27

	# Based on https://github.com/NVIDIA-Merlin/NVTabular/blob/main/conda/environments/nvtabular_dev_cuda11.0.yml
	name: nvt_t4r
	channels:
	- rapidsai
	- nvidia
	- conda-forge
	- defaults
	dependencies:
	- nvtabular
	- python>=3.7

eugeneyan / setup_nvt_t4r_pytorch

Created December 15, 2021 00:35

	# Start a SageMaker notebook instance (ml.p3.2xlarge) and open a terminal

	# Upload the conda yml from here: https://gist.github.com/eugeneyan/3435e05dd675b9ee2af164214536752d

	# Install NVTabular
	conda env create -f=SageMaker/nvt_t4r.yml

	# Activate conda env
	source anaconda3/etc/profile.d/conda.sh
	conda activate nvt_t4r

eugeneyan / iterative_git.py

Last active January 16, 2023 17:48

	"""
	Iteratively loop through all files in DIR and add-commit-push them to REPO.

	This script should sit in your obsidian vault.
	"""
	from pathlib import Path
	from git import Repo
	import os

	DIR = '/Users/eugene/obsidian-vault/assets'

eugeneyan / convert_tag_format.py

Created January 19, 2023 00:04

	"""
	Fixes tags that were converted to links during Obsidian import.

	Specifically, if a line is within the first 3 lines and contains "tags:", convert all [[tag name]] to #tag-name.
	"""
	import os
	import re
	from pathlib import Path

	DIR = '/Users/eugeneya/obsidian-vault/'

	\| \| Search \| Recommendations \| Schemas & Description \| Data Preview \| Column Statistics \| Space/cost metrics \| Ownership \| Top Users \| Lineage \| Change Notification \| Open Source \| Documentation \| Supported Sources \| Push or Pull \|
	\|-----------------------------\|--------\|-----------------\|-----------------------\|--------------\|-------------------\|--------------------\|-----------\|-----------\|---------\|---------------------\|-------------\|---------------\|-------------------------------------------------------\|--------------\|
	\| Amundsen (Lyft) \| ✔ \| ✔ \| ✔ \| ✔ \| ✔ \| \| ✔ \| ✔ \| Todo \| \| ✔ \| ✔ \| Hive, Redshift, Druid, RDBMS, Presto, Snowflake, etc. \| Pull \|
	\| Datahub (LinkedIn) \| ✔ \| \| ✔ \| \| \|