Jack Morris jxmorris12

🐳

just chilling

Computer Science PhD Student at Cornell University. Interested in artificial intelligence and natural language processing.

jxmorris12 / o3_pretraining_interface.py

Created February 18, 2025 23:27

	#!/usr/bin/env python3
	from flask import Flask, render_template_string

	app = Flask(__name__)

	@app.route('/')
	def dashboard():
	return render_template_string(r'''
	<!DOCTYPE html>
	<html lang="en">

jxmorris12 / hf_dedup.py

Created December 5, 2024 19:49

	import multiprocessing

	manager = multiprocessing.Manager()
	all_hashes_set = manager.dict()
	def deduplicate(examples, all_hashes_set):
	print(len(all_hashes_set))
	input_ids = examples['input_ids']
	hashes = [
	hash(tuple(input_ids[i]))
	for i in range(len(input_ids))

jxmorris12 / torch_ddp_verify.py

Last active April 19, 2024 15:54

verify parameter weights & gradients in pytorch

	def verify_ddp_weights_equal(model: torch.nn.Module, atol: float = 1e-5) -> None:
	if hasattr(model, "module"):
	model = model.module

	world_size = get_world_size()
	for name, param in model.named_parameters():
	gathered_param = gather(param).reshape((world_size, -1))
	absolute_diffs = (gathered_param[None, 0, :] - gathered_param).abs()
	rank_params_eq = (absolute_diffs < atol).all()
	assert rank_params_eq, f"❌ param [{name}] not equal - got max_absolute_diff={absolute_diffs.max()}"

jxmorris12 / slice._sparse_tensor.py

Created March 4, 2024 21:34

pytorch sparse tensor slice

	import torch

	def slice_sparse_tensor_rows(t: torch.sparse.Tensor, min_row: int, max_row: int) -> torch.sparse.Tensor:
	row_idxs = t.indices()[0]
	index_mask = (min_row <= row_idxs) & (row_idxs < max_row)

	num_rows = (max_row - min_row)
	num_cols = t.shape[1]

	idxs = t.indices()[:, index_mask]

jxmorris12 / datasets_fast_load_from_disk.py

Created January 19, 2024 23:24

datasets_fast_load_from_disk.py

jxmorris12 / upload_dataset.py

Created October 25, 2023 17:49

load a dataset from JSON and upload it to huggingface

	import argparse
	import glob

	import datasets
	import pandas as pd

	def load_datasets(data_folder):
	train_file = glob.glob(f"{data_folder}/train*.jsonl")[0]
	test_file = f"{data_folder}/test.jsonl"
	dev_file = glob.glob(f"{data_folder}/dev*.jsonl")[0]

jxmorris12 / msmarco_corpus.py

Last active October 25, 2023 13:52

load msmarco corpus

	from typing import Dict, Tuple

	import logging
	import os
	import pathlib
	import requests
	import zipfile

	import beir
	import beir.datasets

jxmorris12 / tips & tricks

Created May 24, 2023 19:45

	Python

	map a function over a list: map(f, list) — function first, then the list, NOT the other way around

	set a breakpoint: import pdb; pdb.set_trace()
	—> ACTUALLY, starting in Python 3.7 you can just call breakpoint()!

	best way to profile any python code: pip install pyinstrument; python -m pyinstrument ./myprog.py

	run a pytest test by pattern: pytest -k <pattern>

jxmorris12 / airpods.py

Created November 2, 2020 14:49

automatically connect Mac to Bluetooth headphones

	#!/usr/bin/env python
	# jm8wx 11/2/20

	import subprocess
	import re

	airpods_name = "Jack’s AirPods Pro"

	def _color(s):
	return "\033[94m" + s + "\033[0m"

jxmorris12 / git-fatfiles

Created October 4, 2019 19:58

print large stuff in your git repo