mateu · August 21, 2025 19:35
diff --git a/deterministic-split.py b/deterministic-split.py

 import orjson  # pip install orjson
 import hashlib

 def deterministic_split_jsonl_orjson(input_file, train_file, test_file,
                                      id_field='id', test_ratio=0.2,
                                      random_seed=42):
     """Split a JSONL file into train and test files by hashing record IDs.

     For every record, the string ``"<id>_<random_seed>"`` is hashed with
     MD5 and the first 32 bits of the digest are compared against
     ``test_ratio * 2**32``. The assignment therefore depends only on the
     record's ID and the seed, not on line order or on the other records.
     Selected lines are copied to the output files byte-for-byte.

     Args:
         input_file: Path of the JSONL file to read (opened in binary mode).
         train_file: Path of the train output file (overwritten).
         test_file: Path of the test output file (overwritten).
         id_field: Key whose value identifies a record; it is passed
             through ``str()`` before hashing.
         test_ratio: Approximate fraction of records sent to the test
             file; must lie in ``[0, 1]``.
         random_seed: Value mixed into the hash input; a different seed
             gives a different split.

     Returns:
         A ``(train_count, test_count)`` tuple of records written.

     Raises:
         ValueError: If ``test_ratio`` is outside ``[0, 1]``.
         KeyError: If a record has no ``id_field`` key.

     Errors raised by ``orjson.loads`` for malformed lines propagate
     unchanged.
     """
     # Reject bad ratios up front, before the output files are truncated.
     if not 0 <= test_ratio <= 1:
         raise ValueError(
             f"test_ratio must be between 0 and 1, got {test_ratio!r}")

     train_count = 0
     test_count = 0
     # Hash values are 32-bit unsigned integers; scale the ratio to match.
     threshold = int(test_ratio * 2**32)

     with open(input_file, 'rb') as infile, \
          open(train_file, 'wb') as train_f, \
          open(test_file, 'wb') as test_f:

         for line in infile:
             # Skip blank lines (e.g. a stray empty line at the end of the
             # file) instead of letting the JSON parser fail on them.
             if not line.strip():
                 continue

             # orjson works with bytes
             data = orjson.loads(line)
             record_id = str(data[id_field])

             hash_input = f"{record_id}_{random_seed}".encode('utf-8')
             # MD5 is used only for bucketing, not for security: the first
             # 8 hex digits give an integer in [0, 2**32).
             hash_value = int(hashlib.md5(hash_input).hexdigest()[:8], 16)

             if hash_value < threshold:
                 test_f.write(line)
                 test_count += 1
             else:
                 train_f.write(line)
                 train_count += 1

             # Report after every full million records actually processed
             # (never "0 lines" on the very first record).
             processed = train_count + test_count
             if processed % 1000000 == 0:
                 print(f"Processed {processed:,} lines...")

     return train_count, test_count

	import orjson # pip install orjson
	import hashlib

	def deterministic_split_jsonl_orjson(input_file, train_file, test_file,
	                                     id_field='id', test_ratio=0.2,
	                                     random_seed=42):
	    """Split a JSONL file into train and test files by hashing record IDs.

	    For every record, the string ``"<id>_<random_seed>"`` is hashed with
	    MD5 and the first 32 bits of the digest are compared against
	    ``test_ratio * 2**32``. The assignment therefore depends only on the
	    record's ID and the seed, not on line order or on the other records.
	    Selected lines are copied to the output files byte-for-byte.

	    Args:
	        input_file: Path of the JSONL file to read (opened in binary mode).
	        train_file: Path of the train output file (overwritten).
	        test_file: Path of the test output file (overwritten).
	        id_field: Key whose value identifies a record; it is passed
	            through ``str()`` before hashing.
	        test_ratio: Approximate fraction of records sent to the test
	            file; must lie in ``[0, 1]``.
	        random_seed: Value mixed into the hash input; a different seed
	            gives a different split.

	    Returns:
	        A ``(train_count, test_count)`` tuple of records written.

	    Raises:
	        ValueError: If ``test_ratio`` is outside ``[0, 1]``.
	        KeyError: If a record has no ``id_field`` key.

	    Errors raised by ``orjson.loads`` for malformed lines propagate
	    unchanged.
	    """
	    # Reject bad ratios up front, before the output files are truncated.
	    if not 0 <= test_ratio <= 1:
	        raise ValueError(
	            f"test_ratio must be between 0 and 1, got {test_ratio!r}")

	    train_count = 0
	    test_count = 0
	    # Hash values are 32-bit unsigned integers; scale the ratio to match.
	    threshold = int(test_ratio * 2**32)

	    with open(input_file, 'rb') as infile, \
	         open(train_file, 'wb') as train_f, \
	         open(test_file, 'wb') as test_f:

	        for line in infile:
	            # Skip blank lines (e.g. a stray empty line at the end of the
	            # file) instead of letting the JSON parser fail on them.
	            if not line.strip():
	                continue

	            # orjson works with bytes
	            data = orjson.loads(line)
	            record_id = str(data[id_field])

	            hash_input = f"{record_id}_{random_seed}".encode('utf-8')
	            # MD5 is used only for bucketing, not for security: the first
	            # 8 hex digits give an integer in [0, 2**32).
	            hash_value = int(hashlib.md5(hash_input).hexdigest()[:8], 16)

	            if hash_value < threshold:
	                test_f.write(line)
	                test_count += 1
	            else:
	                train_f.write(line)
	                train_count += 1

	            # Report after every full million records actually processed
	            # (never "0 lines" on the very first record).
	            processed = train_count + test_count
	            if processed % 1000000 == 0:
	                print(f"Processed {processed:,} lines...")

	    return train_count, test_count