Skip to content

Instantly share code, notes, and snippets.

@eric-czech
Last active March 28, 2026 11:21
Show Gist options
  • Select an option

  • Save eric-czech/656b63dc78ac7792f5c5d824e0b5f103 to your computer and use it in GitHub Desktop.

Select an option

Save eric-czech/656b63dc78ac7792f5c5d824e0b5f103 to your computer and use it in GitHub Desktop.
Row counts for bolinas-dna genome interval datasets
"""Count rows in bolinas-dna HF datasets (training + validation)."""
from datasets import load_dataset
# Each entry is (display label, dataset name handed to load_dataset, split).
_TRAIN_DATASETS = [
    ("CDS train", "bolinas-dna/genomes-v5-genome_set-animals-intervals-v5_255_128", "train"),
    ("Upstream train", "bolinas-dna/genomes-v5-genome_set-animals-intervals-v1_255_128", "train"),
    ("Downstream train", "bolinas-dna/genomes-v5-genome_set-animals-intervals-v15_255_128", "train"),
]
_VALIDATION_DATASETS = [
    ("CDS val", "bolinas-dna/genomes-v5-validation-intervals-v5_255_255", "validation"),
    ("Upstream val", "bolinas-dna/genomes-v5-validation-intervals-v1_255_255", "validation"),
    ("Downstream val", "bolinas-dna/genomes-v5-validation-intervals-v15_255_255", "validation"),
]
# Training entries come first, then validation; this is the reporting order.
DATASETS = _TRAIN_DATASETS + _VALIDATION_DATASETS
# Count the rows of every dataset listed in DATASETS, print one line per
# dataset, then print the combined training row count and the token count
# derived from it.
total_train = 0
for label, name, split in DATASETS:
    # streaming=True lets us iterate the split batch by batch rather than
    # loading it as a regular in-memory/on-disk dataset first.
    ds = load_dataset(name, split=split, streaming=True)
    count = 0
    for batch in ds.iter(batch_size=50_000):
        # The batch is used as a mapping whose values are per-column
        # sequences; the length of the first one is taken as the number of
        # rows in the batch. The empty-tuple default keeps a batch with no
        # columns from raising StopIteration (it simply counts as 0 rows).
        count += len(next(iter(batch.values()), ()))
    print(f"{label}: {count:,}")
    # Decide train membership from the split itself, not from a substring of
    # the human-readable label, so relabelling an entry cannot skew the total.
    if split == "train":
        total_train += count
print(f"\nTotal train: {total_train:,}")
# 256 tokens are counted per training row, as stated in the message below.
print(f"Total tokens (x256 incl BOS): {total_train * 256:,}")
CDS train: 242,334,716
Upstream train: 68,286,166
Downstream train: 20,501,856
CDS val: 16,384
Upstream val: 16,384
Downstream val: 16,384
Total train: 331,122,738
Total tokens (x256 incl BOS): 84,767,420,928
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment