Skip to content

Instantly share code, notes, and snippets.

@cccntu
cccntu / csv.py
Created February 8, 2021 11:58
python mmap to concatenate csv files
❯ rm out.csv
❯ cat 1.py
from glob import glob
import mmap
# Collect every path directly under data/ (glob returns them in arbitrary order).
files = glob("data/*")
# Sort numerically by the part of the file name before the first ".", so that
# e.g. data/2.csv comes before data/10.csv; int() raises ValueError if that
# part is not a plain integer.
files.sort(key=lambda x: int(x.split("/")[-1].split(".")[0]))
# Open the output in binary read/write mode; "w+b" truncates an existing out.csv.
write_f = open("out.csv", "w+b")
import sys
# import project utilities and the remaining dependencies
from src import *
from dataclasses import dataclass
from typing import Optional
from omegaconf import OmegaConf
from tqdm.auto import tqdm
import time
class LengthedGeneratorWrapper:
    """Wrap an infinite generator together with a declared length for tqdm.

    Only construction and ``__len__`` are defined here: the wrapper stores
    the generator and reports the length it was given.
    """

    def __init__(self, infinite_generator, len):
        """Store the generator and the length to report.

        Args:
            infinite_generator: The generator being wrapped; kept unchanged
                on ``self.generator``.
            len: Value that ``len(wrapper)`` will return. The name shadows
                the builtin inside this method only and is kept so existing
                keyword callers (``len=...``) continue to work.
        """
        self.generator = infinite_generator
        self.len = len

    def __len__(self):
        """Return the length supplied at construction time."""
        return self.len
@cccntu
cccntu / mc4-timestamp-analysis.ipynb
Created September 17, 2021 15:22
mc4-timestamp-analysis.ipynb
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
@cccntu
cccntu / parse-c4-date-from-url.ipynb
Created September 21, 2021 07:35
parse-c4-date-from-url.ipynb
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
@cccntu
cccntu / number token.py
Created October 1, 2021 15:38
tokens that are numbers in gpt2 tokenizer
from transformers import AutoTokenizer, AutoModelWithLMHead
# Load the pretrained tokenizer registered under the name "gpt2".
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# model = AutoModelWithLMHead.from_pretrained("gpt2")
import re
# Starts empty; presumably filled with the numeric tokens later in the
# script — the remainder is not visible here, so confirm against the full file.
nums = []
@cccntu
cccntu / M1_GPU.md
Last active January 24, 2022 17:12
Run huggingface transformers on M1 Macs, on GPU!

Run huggingface transformers on M1 Macs, on GPU

  • Requirement: macOS 12 Monterey

  • First, install a conda env with M1 support

wget -P ~/Downloads https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh
chmod +x ~/Downloads/Miniforge3-MacOSX-arm64.sh
sh ~/Downloads/Miniforge3-MacOSX-arm64.sh
"""Generates a document causal attention mask based on a document ID tensor"""
from typing import List, Union
import torch
from torch import Tensor
from torch.nn.attention.flex_attention import _mask_mod_signature, or_masks
from attn_gym.masks import causal_mask
import torch
def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0, dtype=torch.float32, mod=True):
freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
t = torch.arange(end, device=freqs.device) # type: ignore
angles = torch.outer(t, freqs) # type: ignore
if mod:
angles = angles % (2 * torch.pi)
# Now convert angles to the desired lower precision
angles = angles.to(dtype)