Skip to content

Instantly share code, notes, and snippets.

@mateu
Created August 21, 2025 19:35
Show Gist options
  • Save mateu/c75f042d038507b2275237f87228c162 to your computer and use it in GitHub Desktop.
Save mateu/c75f042d038507b2275237f87228c162 to your computer and use it in GitHub Desktop.
Split a JSONL file in a manner that is deterministic when the random_seed is fixed.
import orjson # pip install orjson
import hashlib
def deterministic_split_jsonl_orjson(input_file, train_file, test_file,
                                     id_field='id', test_ratio=0.2, random_seed=42):
    """Split a JSONL file into train and test files by hashing each record's ID.

    Every record is assigned to a split from the MD5 digest of
    ``"<id>_<random_seed>"``, so a given ID always lands in the same split
    for a fixed seed, independent of line order or of which other records
    are present. The split sizes therefore only approximate ``test_ratio``.

    Args:
        input_file: Path of the JSONL file to read (one JSON value per line).
        train_file: Path the training lines are written to (overwritten).
        test_file: Path the test lines are written to (overwritten).
        id_field: Key looked up in each parsed record; its value is
            converted with ``str()`` before hashing.
        test_ratio: Target fraction of records sent to ``test_file``.
        random_seed: Value mixed into the hash; changing it reshuffles the
            assignment.

    Returns:
        A ``(train_count, test_count)`` tuple with the number of records
        written to each output file.

    Raises:
        KeyError: If a record parsed as a mapping does not contain
            ``id_field``.
        orjson.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    train_count = 0
    test_count = 0
    # The hash below is truncated to 32 bits, so scale the ratio onto the
    # same [0, 2**32) range for a direct integer comparison.
    threshold = int(test_ratio * 2**32)
    # Loop-invariant part of the hash input.
    seed_suffix = f"_{random_seed}"

    with open(input_file, 'rb') as infile, \
            open(train_file, 'wb') as train_f, \
            open(test_file, 'wb') as test_f:
        # Count from 1 so the progress message reports lines actually read.
        for line_no, line in enumerate(infile, start=1):
            # Blank lines (e.g. a trailing empty line) are not records and
            # would otherwise make orjson.loads raise.
            if not line.strip():
                continue

            # orjson parses the raw bytes directly; no decoding step needed.
            data = orjson.loads(line)
            record_id = str(data[id_field])
            hash_input = f"{record_id}{seed_suffix}".encode('utf-8')
            # First 8 hex digits of the digest == a 32-bit unsigned integer.
            hash_value = int(hashlib.md5(hash_input).hexdigest()[:8], 16)

            # The original bytes are copied through untouched, so the
            # outputs keep the input's exact serialization.
            if hash_value < threshold:
                test_f.write(line)
                test_count += 1
            else:
                train_f.write(line)
                train_count += 1

            if line_no % 1000000 == 0:
                print(f"Processed {line_no:,} lines...")

    return train_count, test_count
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment