Data prep script to fine-tune GPT-3 on my past writing
#!/usr/bin/env python

import os
import json

from tqdm import tqdm
from transformers import GPT2TokenizerFast

os.environ['TOKENIZERS_PARALLELISM'] = 'false'

FILENAME = './thesephist.jsonl'
CORPUS_DIRS = [
    '/Users/thesephist/src/www/content/posts',
    '/Users/thesephist/src/dotink/content/posts',
    '/Users/thesephist/src/coffee/content/note',
]
MAX_TOKENS_PER_SAMPLE = 2048  # davinci (pre-002) has a max context length of 2048, not 4096

tokenizer = GPT2TokenizerFast.from_pretrained('gpt2-xl')

def count_tokens(s: str) -> int:
    return len(tokenizer(s).input_ids)

# find all the files
files = []
for corpus_dir in CORPUS_DIRS:
    for draft in os.listdir(corpus_dir):
        files.append((
            os.path.join(corpus_dir, draft),
            corpus_dir.split(os.sep)[-2] + os.sep + draft,
        ))

# read all my writing and collect it in a list
drafts = []
docs = []
for (filepath, draft) in (bar := tqdm(files)):
    bar.set_description('Preprocessing files')
    with open(filepath, 'r') as f:
        lines = f.readlines()

    # get rid of all the Markdown front matter
    if lines[0] == '---\n':
        lines = lines[1:]
        while lines[0] != '---\n':
            lines = lines[1:]
        lines = lines[1:]

    content = ''.join(lines).strip()
    paras = [p.strip() for p in content.split('\n\n') if p.strip() != '']

    # split long documents into short ones that fit in one GPT-3 context each
    paras_sofar = []
    for p in paras:
        tokens_sofar = count_tokens('\n\n'.join(paras_sofar + [p]))
        if tokens_sofar > MAX_TOKENS_PER_SAMPLE:
            drafts.append(draft)
            docs.append('\n\n'.join(paras_sofar))
            paras_sofar = []
        paras_sofar.append(p)
    if len(paras_sofar) > 0:
        drafts.append(draft)
        docs.append('\n\n'.join(paras_sofar))

# tokenize using the GPT-2 tokenizer, which approximates GPT-3's tokenizer
# extremely well in my testing
docs_bar = tqdm(docs)
docs_bar.set_description('Tokenizing')
token_counts = [count_tokens(doc) for doc in docs_bar]
finetune_docs = [{'prompt': '', 'completion': text.strip()} for text in docs]

# report some stats
count = len(finetune_docs)
size = len(''.join(docs))
tokens_estimate = sum(token_counts)
mean_tokens = tokens_estimate / len(docs)
print(f'''Creating finetuning JSONL data with:
    {count} drafts
    {size//1000} KB
    {tokens_estimate} tokens ({mean_tokens:.2f} tok/sample)''')
print(f'''At this size, training for 4 epochs will cost:
    ada     ${0.0004 * tokens_estimate / 1000 * 4:.2f}
    babbage ${0.0006 * tokens_estimate / 1000 * 4:.2f}
    curie   ${0.003 * tokens_estimate / 1000 * 4:.2f}
    davinci ${0.03 * tokens_estimate / 1000 * 4:.2f}''')

# write dataset JSONL file
with open(FILENAME, 'w+') as f:
    for doc in finetune_docs:
        json.dump(doc, f)
        f.write('\n')
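Before uploading, it's worth a quick sanity check that every record in the generated JSONL has the shape the fine-tuning endpoint expects: a "prompt" key (empty here by design) and a non-empty "completion". This is a minimal sketch, separate from the script above; it just re-reads the file that FILENAME points to.

import json

with open('./thesephist.jsonl') as f:
    records = [json.loads(line) for line in f if line.strip()]

# every record should have exactly these two keys, with an empty prompt
assert all(r.keys() == {'prompt', 'completion'} for r in records)
assert all(r['prompt'] == '' and r['completion'].strip() != '' for r in records)
print(f'{len(records)} samples look well-formed')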
#!/bin/bash

# environment
export OPENAI_API_KEY=sk-XXXX
export FILENAME=thesephist.jsonl
export MODEL_NAME=davinci

# train
source venv/bin/activate
openai api fine_tunes.create -t $FILENAME -m $MODEL_NAME --suffix "text-davinci-thesephist"
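The legacy CLI prints a job ID that can be monitored with `openai api fine_tunes.follow -i <job id>`. Once the job completes, the resulting model can be sampled with the same legacy (pre-1.0) openai Python package that ships the CLI. The snippet below is only a sketch: the model name is a placeholder for whatever ID the fine-tune job reports, and the sampling parameters are illustrative.

import os
import openai

openai.api_key = os.environ['OPENAI_API_KEY']

resp = openai.Completion.create(
    model='<YOUR_FINE_TUNED_MODEL>',  # placeholder: use the model ID the job reports
    prompt='',        # the training data uses empty prompts, so generation starts cold
    max_tokens=256,
    temperature=0.9,
)
print(resp.choices[0].text)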