Skip to content

Instantly share code, notes, and snippets.

View jxmorris12's full-sized avatar
🐳
just chilling

Jack Morris jxmorris12

🐳
just chilling
View GitHub Profile
#!/usr/bin/env python3
from flask import Flask, render_template_string
app = Flask(__name__)
@app.route('/')
def dashboard():
return render_template_string(r'''
<!DOCTYPE html>
<html lang="en">
import multiprocessing
# Create a multiprocessing manager and obtain a dict proxy from it.
# NOTE(review): despite the "_set" suffix this object is a dict, not a set.
# Presumably it is shared across worker processes to record hashes that have
# already been seen - confirm against the (truncated) deduplicate() below.
manager = multiprocessing.Manager()
all_hashes_set = manager.dict()
def deduplicate(examples, all_hashes_set):
print(len(all_hashes_set))
input_ids = examples['input_ids']
hashes = [
hash(tuple(input_ids[i]))
for i in range(len(input_ids))
@jxmorris12
jxmorris12 / torch_ddp_verify.py
Last active April 19, 2024 15:54
verify parameter weights & gradients in pytorch
def verify_ddp_weights_equal(model: torch.nn.Module, atol: float = 1e-5) -> None:
    """Assert that each named parameter of ``model`` agrees across ranks.

    For every parameter, the result of ``gather(param)`` is reshaped to
    ``(world_size, -1)`` and row 0 is compared element-wise against every row.
    An ``AssertionError`` naming the parameter and reporting the largest
    absolute difference is raised if any difference is not below ``atol``.

    Args:
        model: Module to check. If it exposes a ``module`` attribute, that
            inner object is checked instead.
        atol: Exclusive upper bound on the allowed absolute difference.

    Raises:
        AssertionError: If some parameter differs by ``atol`` or more.
    """
    # Unwrap wrappers that hold the real network under ``.module``.
    model = getattr(model, "module", model)

    num_ranks = get_world_size()
    for name, param in model.named_parameters():
        # NOTE(review): ``gather`` is defined elsewhere; presumably it returns
        # this parameter collected from every rank - confirm.
        per_rank = gather(param).reshape((num_ranks, -1))
        # Row 0 serves as the reference; the leading ``None`` axis lets it
        # broadcast against all rows.
        reference_row = per_rank[None, 0, :]
        absolute_diffs = (reference_row - per_rank).abs()
        assert (absolute_diffs < atol).all(), f"❌ param [{name}] not equal - got max_absolute_diff={absolute_diffs.max()}"
@jxmorris12
jxmorris12 / slice._sparse_tensor.py
Created March 4, 2024 21:34
pytorch sparse tensor slice
import torch
def slice_sparse_tensor_rows(t: torch.sparse.Tensor, min_row: int, max_row: int) -> torch.sparse.Tensor:
row_idxs = t.indices()[0]
index_mask = (min_row <= row_idxs) & (row_idxs < max_row)
num_rows = (max_row - min_row)
num_cols = t.shape[1]
idxs = t.indices()[:, index_mask]
@jxmorris12
jxmorris12 / datasets_fast_load_from_disk.py
Created January 19, 2024 23:24
datasets_fast_load_from_disk.py
from typing import Iterable
import concurrent
import datasets
import glob
import json
import multiprocessing
import os
def load_dataset_tables(
@jxmorris12
jxmorris12 / upload_dataset.py
Created October 25, 2023 17:49
load a dataset from JSON and upload it to huggingface
import argparse
import glob
import datasets
import pandas as pd
def load_datasets(data_folder):
train_file = glob.glob(f"{data_folder}/train*.jsonl")[0]
test_file = f"{data_folder}/test.jsonl"
dev_file = glob.glob(f"{data_folder}/dev*.jsonl")[0]
@jxmorris12
jxmorris12 / msmarco_corpus.py
Last active October 25, 2023 13:52
load msmarco corpus
from typing import Dict, Tuple
import logging
import os
import pathlib
import requests
import zipfile
import beir
import beir.datasets
Python
map a function over a list: map(f, list) — function first, NOT the other way around
set a breakpoint: import pdb; pdb.set_trace()
—> ACTUALLY starting in python 3.7 you can just do breakpoint() !
best way to profile any python code: pip install pyinstrument; python -m pyinstrument ./myprog.py
run a pytest test by pattern: pytest -k <pattern>
@jxmorris12
jxmorris12 / airpods.py
Created November 2, 2020 14:49
automatically connect Mac to Bluetooth headphones
#!/usr/bin/env python
# jm8wx 11/2/20
import subprocess
import re
# Name of the headphones (presumably the Bluetooth device name the script
# looks for - confirm against the rest of the script).
# NOTE: the apostrophe is the typographic ’ (U+2019), not an ASCII '.
airpods_name = "Jack’s AirPods Pro"
def _color(s):
return "\033[94m" + s + "\033[0m"
@jxmorris12
jxmorris12 / git-fatfiles
Created October 4, 2019 19:58
print large stuff in your git repo
git rev-list --all --objects | \
sed -n $(git rev-list --objects --all | \
cut -f1 -d' ' | \
git cat-file --batch-check | \
grep blob | \
sort -n -k 3 | \
tail -n40 | \
while read hash type size; do
echo -n "-e s/$hash/$size/p ";
done) | \