Skip to content

Instantly share code, notes, and snippets.

@GilesBathgate
Last active May 17, 2024 09:09
Show Gist options
  • Save GilesBathgate/a7a0a18276a2a79836cb6cb44d8656c2 to your computer and use it in GitHub Desktop.
Save GilesBathgate/a7a0a18276a2a79836cb6cb44d8656c2 to your computer and use it in GitHub Desktop.
Preprocess tinystories for Llama.c
import glob
import json
import os
import multiprocessing
from functools import partial
import random
import re
# Root directory of the local dataset cache; the JSON shards are looked up
# under <DATA_CACHE_DIR>/TinyStories_all_data.
DATA_CACHE_DIR = "data"
# Pool of replacement names used for stories about a boy. Entries are kept in
# their original order; one replacement is drawn at random per rewritten story.
BOYS_NAMES = [
    "Aaron", "Abel", "Abigail", "Acorn", "Adam", "Aidan", "Aiden", "Al", "Alan", "Albert",
    "Alby", "Alex", "Alexander", "Alfie", "Alfred", "Ali", "Allen", "Amos", "Andre", "Andrew",
    "Andy", "Anthony", "Antonio", "Archie", "Arthur", "Avi",
    "Bae", "Bailey", "Barry", "Bart", "Baxter", "Ben", "Benjamin", "Benji", "Bennie", "Benny",
    "Bert", "Bill", "Billy", "Blake", "Blaze", "Bo", "Bob", "Bobbie", "Bobby", "Bobo",
    "Boris", "Boy", "Brad", "Braden", "Brady", "Brayden", "Brent", "Brett", "Brian", "Brix",
    "Brody", "Bruce", "Bruno", "Bryan", "Bryce", "Bubba", "Bud", "Budd", "Bumpkin", "Burt",
    "Buster", "Buzzy",
    "Cai", "Caleb", "Calvin", "Cape", "Captain", "Carl", "Carlos", "Carter", "Cat", "Chad",
    "Charles", "Charlie", "Chen", "Chet", "Chip", "Chris", "Christopher", "Chuck", "Clifford",
    "Clive", "Clyde", "Cody", "Cole", "Colin", "Connor", "Conor", "Cooper", "Corey", "Craig",
    "Dale", "Damien", "Dan", "Daniel", "Danny", "Dare", "Darren", "Darryl", "Dave", "Davey",
    "David", "Davy", "Dean", "Dennis", "Denny", "Deon", "Derek", "Diego", "Dip", "Dirk",
    "Dizzy", "Donny", "Doug", "Drew", "Dustin", "Dusty", "Dylan",
    "Ed", "Eddie", "Edgar", "Edward", "Eli", "Elliot", "Elmo", "Emile", "Emmett", "Eppy",
    "Eric", "Erik", "Ernie", "Ethan", "Evan", "Everett", "Ezra",
    "Farley", "Felix", "Fergus", "Fernando", "Finley", "Finn", "Flint", "Flynn", "Francis",
    "Frank", "Frankie", "Fred", "Freddie", "Freddy", "Fritz",
    "Gabriel", "Gage", "Gary", "Geordie", "George", "Gerry", "Gil", "Gilbert", "Gill", "Gio",
    "Goofy", "Gordan", "Graham", "Grant", "Green", "Greg", "Gregory", "Gus", "Guy",
    "Hank", "Hans", "Happy", "Harold", "Harry", "Henry", "Hudson", "Hugo", "Hunter", "Hyeon",
    "Ian", "Infant", "Isaac", "Ivan",
    "Jack", "Jackie", "Jackson", "Jacky", "Jacob", "Jacoby", "Jacques", "Jaime", "Jak", "Jake",
    "Jakob", "James", "Jamie", "Jamison", "Jamon", "Jan", "Jane", "Jared", "Jase", "Jason",
    "Jasper", "Jax", "Jaxon", "Jay", "Jayden", "Jazz", "Jeb", "Jed", "Jeff", "Jeffrey",
    "Jeremiah", "Jeremy", "Jerry", "Jesse", "Jet", "Jhon", "Jill", "Jim", "Jimbo", "Jimmy",
    "Jimy", "Jin", "Jo", "Jobe", "Joe", "Joel", "Joey", "Joffrey", "Jog", "John",
    "Johnnie", "Johnny", "Johns", "Johny", "Jojo", "Jon", "Jonah", "Jonathan", "Jones", "Jonny",
    "Jordan", "Jorge", "Jose", "José", "Joseph", "Josh", "Joshua", "Juan", "Julian", "Julie",
    "Julius", "June", "Junior", "Justin",
    "Kaden", "Kai", "Kari", "Karl", "Kaya", "Keith", "Ken", "Kenny", "Kevin", "Kid",
    "Kim", "Kody", "Kurt", "Kyle",
    "Lance", "Landon", "Larry", "Lee", "Lenny", "Leo", "Leon", "Les", "Levi", "Lew",
    "Lewis", "Liam", "Lily", "Little", "Logan", "Lou", "Louie", "Louis", "Luca", "Lucas",
    "Lucky", "Lucy", "Luis", "Luka", "Luke",
    "Mac", "Marco", "Marcus", "Mario", "Mark", "Markus", "Marky", "Marley", "Martin", "Marty",
    "Marv", "Mason", "Mathew", "Matt", "Matthew", "Matthews", "Mattias", "Matty", "Max", "Maxim",
    "Michael", "Mick", "Mickey", "Micky", "Miguel", "Mike", "Mikey", "Miles", "Milo", "Mitch",
    "Mo", "Mon", "Mudd", "Murray",
    "Naan", "Nate", "Nathan", "Navy", "Ned", "Niall", "Nicey", "Nick", "Nicky", "Nigel",
    "Noah", "Norman",
    "Olaf", "Olive", "Oliver", "Ollie", "Olly", "Oscar", "Otis", "Owen",
    "Pablo", "Paddy", "Panda", "Parker", "Parry", "Pat", "Patrick", "Paul", "Peanut", "Pedro",
    "Pen", "Penny", "Percy", "Perry", "Pete", "Peter", "Phil", "Philip", "Phillip", "Pierre",
    "Ping", "Pio", "Poppy", "Poul", "Prince",
    "Quinton",
    "Raj", "Ralph", "Randall", "Randy", "Rascal", "Ravi", "Ray", "Raymond", "Red", "Reggie",
    "Rex", "Rey", "Rich", "Richard", "Richie", "Rick", "Ricky", "Riley", "Roam", "Rob",
    "Robbie", "Robby", "Robert", "Robin", "Rocky", "Roger", "Ron", "Ronnie", "Ronny", "Rory",
    "Ross", "Roy", "Rudi", "Rudy", "Rufus", "Rupert", "Ryan",
    "Sam", "Sammy", "Samuel", "Sandy", "Scott", "Sean", "Sebastian", "Seth", "Shane", "Shaun",
    "Shawn", "Sid", "Simon", "Small", "Spike", "Spud", "Stan", "Stanley", "Stefan", "Stephen",
    "Steve", "Steven", "Stevie", "Stu", "Stuart", "Sully",
    "Tad", "Tag", "Taidai", "Tango", "Tariq", "Taylor", "Ted", "Teddy", "Terrance", "Terry",
    "Than", "Theo", "Thom", "Thomas", "Tiki", "Tim", "Timmy", "Timothy", "Tin", "Tino",
    "Tiny", "Tobi", "Toby", "Tod", "Todd", "Toddy", "Tom", "Tomas", "Tommie", "Tommo",
    "Tommy", "Toni", "Tony", "Tooley", "Tramp", "Travis", "Trent", "Trevor", "Trey", "Tripp",
    "Troy", "True", "Truman", "Tucker", "Ty", "Tyle", "Tyler",
    "Victor", "Vince",
    "Wally", "Walter", "Warren", "Wednesday", "Wes", "Wiggy", "Wil", "Wilbur", "Wilf", "Will",
    "William", "Willie", "Willy", "Wyatt",
    "Yo", "Youth", "Yut",
    "Zac", "Zach", "Zack", "Zane", "Zeke", "Zero", "Zip",
]
# Pool of replacement names used for stories about a girl. Entries are kept in
# their original order; one replacement is drawn at random per rewritten story.
GIRLS_NAMES = [
    "Aaliyah", "Aaralyn", "Abbey", "Abbi", "Abbie", "Abby", "Abi", "Abigail", "Ada", "Addie",
    "Addy", "Adele", "Adelina", "Adelle", "Adri", "Agnes", "Aila", "Alberta", "Alee", "Alex",
    "Alexa", "Alexis", "Ali", "Alice", "Alicia", "Alina", "Alison", "Allie", "Ally", "Alta",
    "Alyssa", "Amanda", "Amara", "Amaya", "Amber", "Amelia", "Amelie", "Ami", "Amie", "Amira",
    "Amy", "Ana", "Andi", "Andrea", "Andy", "Angel", "Angela", "Angelica", "Angelina", "Angie",
    "Ann", "Anna", "Annabel", "Annabelle", "Anne", "Annie", "Anya", "April", "Ari", "Aria",
    "Arianna", "Ariel", "Arry", "Ash", "Ashley", "Ashlyn", "Audrey", "Auggy", "Aumana", "Ava",
    "Avery",
    "Baba", "Bailey", "Barb", "Basma", "Beatrice", "Becca", "Becky", "Bell", "Bella", "Belle",
    "Ben", "Benita", "Bertha", "Bess", "Bessie", "Beth", "Bethany", "Betsy", "Betty", "Bibi",
    "Billie", "Blue", "Blythe", "Bob", "Bobbi", "Bobbie", "Bonnie", "Bree", "Brianna", "Brianne",
    "Brie", "Britney", "Brittany", "Brooke", "Buffy", "Bunny",
    "Cali", "Callie", "Camilla", "Candice", "Cara", "Carla", "Carly", "Carol", "Carolina",
    "Caroline", "Carrie", "Cassandra", "Cassie", "Cassy", "Cat", "Cathy", "Catrina", "Cecilia",
    "Celeste", "Celia", "Chantelle", "Charlie", "Charlotte", "Charu", "Cherry", "Chika", "Chips",
    "Chloe", "Chloé", "Chri", "Chris", "Christina", "Cilla", "Cinderella", "Cindie", "Cindy",
    "Claire", "Clara", "Clare", "Clarice", "Clarissa", "Cleo", "Clover", "Coco", "Connie",
    "Cookie", "Cora", "Cupcake", "Cynthia",
    "Daisy", "Dana", "Dani", "Darla", "Darlene", "Dave", "Davey", "Dawn", "Debbie", "Debby",
    "Della", "Denise", "Diana", "Diane", "Dina", "Dinah", "Doe", "Dolly", "Dolores", "Donna",
    "Dora", "Doris", "Dorothy", "Dot",
    "Edie", "Eileen", "Elaine", "Eleanor", "Elena", "Elisa", "Elise", "Eliza", "Elizabeth",
    "Ella", "Elle", "Ellen", "Ellie", "Elly", "Eloise", "Elsa", "Elsie", "Ema", "Emi",
    "Emily", "Emma", "Emme", "Emmi", "Emmie", "Emmy", "Erica", "Erin", "Esther", "Eva",
    "Evelyn", "Evie",
    "Faith", "Fan", "Fanny", "Faye", "Felicity", "Filly", "Fiona", "Flo", "Flora", "Frances",
    "Freda", "Freya",
    "Gabby", "Gaby", "Gail", "Gemma", "Genie", "Genny", "George", "Georgia", "Georgie",
    "Georgina", "Gia", "Gilda", "Gina", "Ginnie", "Ginny", "Gloria", "Goldie", "Grace",
    "Gracie", "Grandma", "Greta", "Gretta", "Gwen", "Gwendolyn",
    "Hailey", "Haley", "Hanna", "Hannah", "Harriet", "Haru", "Hattie", "Haylee", "Hazel",
    "Heather", "Heidi", "Helen", "Helga", "Hilda", "Holly", "Honey", "Hope", "Hoppy",
    "Immy", "Ina", "Iona", "Irene", "Isabel", "Isabella", "Isabelle", "Isla", "Ivy", "Izzy",
    "Jack", "Jacki", "Jackie", "Jacklyn", "Jackson", "Jacky", "Jada", "Jade", "Jamie", "Jammie",
    "Jan", "Jana", "Jane", "Janee", "Janet", "Janey", "Janice", "Janie", "Janna", "Jaqueline",
    "Jaqui", "Jasmine", "Jay", "Jaya", "Jayda", "Jayla", "Jayne", "Jazmine", "Jazz", "Jean",
    "Jeannie", "Jeff", "Jelly", "Jemima", "Jemma", "Jen", "Jena", "Jenn", "Jenna", "Jennie",
    "Jennifer", "Jenny", "Jess", "Jessica", "Jessie", "Jessy", "Jet", "Jill", "Jillian", "Jilly",
    "Jimena", "Jin", "Jinny", "Jo", "Joan", "Joanna", "Joanne", "Jodi", "Jodie", "Jody",
    "Joe", "Joey", "Johanna", "Johna", "Jojo", "Josie", "Joy", "Joyce", "Juanita", "Judith",
    "Judy", "Julia", "Julianne", "Julie", "Juliet", "June", "Juniper",
    "Kain", "Kaitlyn", "Kala", "Kani", "Kara", "Karen", "Kari", "Karmen", "Kat", "Kata",
    "Kate", "Katherine", "Kathy", "Katie", "Katy", "Kay", "Kaya", "Kayla", "Kaylee", "Kaylie",
    "Kelli", "Kelly", "Kelsey", "Kelsie", "Kia", "Kiana", "Kiki", "Kiko", "Kim", "Kira",
    "Kit", "Kitty", "Krista", "Kristen", "Kumari", "Kya", "Kyla", "Kyra",
    "Lacey", "Lacy", "Lady", "Laia", "Laila", "Lala", "Lalla", "Lana", "Lani", "Laura",
    "Lauren", "Layla", "Leah", "Lee", "Leena", "Lena", "Lettie", "Letty", "Lexa", "Lexi",
    "Lexie", "Li", "Lia", "Liana", "Libby", "Lila", "Lili", "Lilian", "Lilla", "Lilli",
    "Lillia", "Lillian", "Lillie", "Lilly", "Lily", "Lin", "Lina", "Linda", "Lindsey", "Lisa",
    "Liss", "Lissa", "Lissie", "Lissy", "Little", "Liv", "Livy", "Liz", "Liza", "Lizzie",
    "Lizzy", "Lola", "Lolli", "Lori", "Lorna", "Lotta", "Lotte", "Lou", "Louisa", "Louise",
    "Lu", "Lucia", "Lucie", "Lucinda", "Lucky", "Lucy", "Luisa", "Lula", "Lulu", "Luna",
    "Lydia", "Lyla", "Lynn",
    "Mabel", "Macey", "Maci", "Maddie", "Maddy", "Madeline", "Madelyn", "Madison", "Mae",
    "Maggie", "Maggy", "Mai", "Maia", "Maisie", "Maisy", "Maja", "Makeeba", "Mala", "Malia",
    "Mallory", "Mandy", "Mara", "March", "Marcy", "Margaret", "Marge", "Margo", "Margret",
    "Maria", "Mariah", "Marian", "Mariana", "Marie", "Marigold", "Marina", "Marisa", "Mark",
    "Marley", "Marnie", "Marry", "Marta", "Martha", "Mary", "Matilda", "Mattie", "Matty", "Max",
    "Maxine", "May", "Maya", "Mayanna", "Meena", "Meera", "Meg", "Megan", "Mei", "Mel",
    "Melanie", "Melina", "Melinda", "Melissa", "Melody", "Melony", "Meredith", "Mia", "Michelle",
    "Midge", "Mika", "Mikey", "Miley", "Mill", "Milla", "Milli", "Millie", "Milly", "Mimi",
    "Mina", "Mindy", "Minnie", "Mira", "Mirabelle", "Miranda", "Mishu", "Missy", "Mita", "Moira",
    "Mollie", "Molly", "Momo", "Mona", "Monica", "Mustia",
    "Nadia", "Nami", "Nan", "Nancy", "Naomi", "Nasti", "Natasha", "Nate", "Neelam", "Nell",
    "Nella", "Nellie", "Nelly", "Nia", "Niki", "Nikki", "Nina", "Noelle", "Nora",
    "Olga", "Olive", "Oliver", "Olivia", "Ollie", "Olly", "Owen",
    "Page", "Paige", "Pam", "Parker", "Pat", "Patch", "Patrice", "Patricia", "Patty", "Paulina",
    "Pauline", "Peach", "Pearl", "Peggy", "Pene", "Penelope", "Penny", "Peppa", "Pepper",
    "Perfect", "Petal", "Peter", "Petunia", "Phoebe", "Pia", "Pinky", "Pip", "Piper", "Pippa",
    "Polly", "Poppy", "Precious", "Princess", "Priscilla", "Priya", "Pumpkin",
    "Rachel", "Rae", "Rainey", "Rani", "Rapunzel", "Rebecca", "Rebekah", "Red", "Ren", "Renee",
    "Ria", "Rita", "Riya", "Roberta", "Roberts", "Roni", "Rosa", "Rose", "Rosemary", "Rosie",
    "Rosy", "Roxy", "Ruby", "Ruth", "Ruthie",
    "Sabrina", "Sacha", "Sadie", "Salley", "Sally", "Salma", "Sam", "Sama", "Samantha", "Sammie",
    "Sammy", "Sana", "Sandra", "Sandy", "Sara", "Sarah", "Sari", "Sasha", "Savannah", "Savera",
    "Selena", "Selina", "Selma", "Senna", "Serena", "Sharon", "Shawn", "Shelby", "Shelley",
    "Shelly", "Sherry", "Shirley", "Sia", "Sienna", "Sina", "Sissy", "Sita", "Sofia", "Sofie",
    "Sona", "Sonya", "Sophia", "Sophie", "Stacey", "Stacy", "Star", "Stella", "Steph",
    "Stephanie", "Strawberry", "Sue", "Sugarpop", "Suki", "Summer", "Sun", "Sunflower", "Sunny",
    "Sunshine", "Super", "Susa", "Susan", "Susie", "Susy", "Suzi", "Suzie", "Suzy", "Sydney",
    "Sylvia", "Sylvie",
    "Tabby", "Tami", "Tammy", "Tania", "Tanya", "Tara", "Teddy", "Tedi", "Terra", "Terry",
    "Tess", "Tessa", "Thea", "Tia", "Tiffany", "Tilda", "Tilly", "Timmy", "Tina", "Tiny",
    "Tiya", "Toby", "Tom", "Tomi", "Tommy", "Tonya", "Tracy", "Trina", "Trixie", "Trudy",
    "True",
    "Uma",
    "Valia", "Vanessa", "Vera", "Vicki", "Vicky", "Violet", "Viv",
    "Wednesday", "Wendy", "Will", "Willa", "Willow", "Winnie",
    "Yet", "Youth", "Yuki", "Yvonne",
    "Zara", "Zelda", "Zen", "Zoe", "Zoey",
]
# Number of plain random draws tried before falling back to an exhaustive scan.
_MAX_RANDOM_DRAWS = 32


def _pick_unused_name(text, names):
    """Return a random entry of *names* that is not a substring of *text*, or None."""
    if not names:
        return None
    # Rejection sampling first: one draw almost always suffices.
    for _ in range(_MAX_RANDOM_DRAWS):
        candidate = random.choice(names)
        if candidate not in text:
            return candidate
    # Many collisions: scan the whole pool so the search is guaranteed to end.
    unused = [name for name in names if name not in text]
    return random.choice(unused) if unused else None


def process_shard(args, pattern, find, replace):
    """Rename one character in every matching story of a single JSON shard.

    The shard is a JSON list of objects with a ``"story"`` string. For each
    story whose stripped text matches ``pattern``, every whole-word occurrence
    of ``find`` is replaced by one name drawn at random from ``replace`` that
    does not already appear in the story. The shard file is rewritten in place
    only if at least one story changed.

    Args:
        args: ``(shard_id, shard_path)`` pair; ``shard_id`` seeds the RNG so
            each shard gets a reproducible sequence of draws.
        pattern: Regex deciding which stories are rewritten.
        find: Literal name to replace.
        replace: Sequence of candidate replacement names.
    """
    shard_id, shard = args
    random.seed(int(shard_id) + 1337)
    search_regex = re.compile(pattern)
    # Word boundaries on both sides: "Tim" must not rewrite "Timmy"/"Timothy",
    # and a name after an opening quote or at the start of the text still matches.
    replace_regex = re.compile(rf"\b{re.escape(find)}\b")

    with open(shard, "r", encoding="utf-8") as f:
        data = json.load(f)

    changed = False
    for example in data:
        text = example["story"]
        if not search_regex.search(text.strip()):
            continue
        newname = _pick_unused_name(text, replace)
        if newname is None:
            # Every candidate already occurs in this story; leave it untouched.
            continue
        # A callable replacement keeps the name from being parsed as a template.
        rewritten = replace_regex.sub(lambda _match, name=newname: name, text)
        if rewritten != text:
            example["story"] = rewritten
            changed = True

    if not changed:
        return

    # Write to a sibling file and swap it in so an interrupted run cannot leave
    # a truncated shard behind.
    tmp_path = f"{shard}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data, f)
    os.replace(tmp_path, shard)
def process(pattern, find, replace):
    """Run one name substitution over every TinyStories shard in parallel.

    Collects the ``*.json`` shards under
    ``DATA_CACHE_DIR/TinyStories_all_data`` in sorted order, hands each
    ``(index, path)`` pair to ``process_shard`` through a process pool, and
    prints a completion line for ``find`` once the pool has finished.
    """
    shard_glob = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data", "*.json")
    shard_paths = sorted(glob.glob(shard_glob))

    # Bind the substitution parameters so the pool only has to ship the
    # per-shard (index, path) tuple to each worker.
    worker = partial(process_shard, pattern=pattern, find=find, replace=replace)

    with multiprocessing.Pool() as pool:
        pool.map(worker, enumerate(shard_paths))

    print(f"{find} Done.")
# The guard is required because process() starts a multiprocessing.Pool: under
# the "spawn" start method each worker re-imports this module, and unguarded
# top-level calls would re-run the whole pipeline in every child.
if __name__ == "__main__":
    # Stories opening with one of the most common boy names get a random one.
    for name in ["Tom", "Jack", "Mark", "Sam", "Joe", "John", "Timmy", "Max", "Bob"]:
        prefix = f"^Once upon a time, there was a little boy named {name}[^a-z]"
        process(prefix, name, BOYS_NAMES)

    # Same treatment for the most common girl names.
    for name in ["Sue", "Mia", "Lucy", "Amy", "Sally", "Jane", "Emma", "Lisa", "Anna"]:
        prefix = f"^Once upon a time, there was a little girl named {name}[^a-z]"
        process(prefix, name, GIRLS_NAMES)

    # Still too many occurrences of Lily and Tim: rewrite them wherever they
    # appear, not only in the stock opening sentence.
    process(" Lily[^a-z]", "Lily", GIRLS_NAMES)
    process(" Tim[^a-z]", "Tim", BOYS_NAMES)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment