Skip to content

Instantly share code, notes, and snippets.

@vchahun
Created November 13, 2012 03:32
Show Gist options
  • Select an option

  • Save vchahun/4063776 to your computer and use it in GitHub Desktop.

Select an option

Save vchahun/4063776 to your computer and use it in GitHub Desktop.
Deterministic dataset splitter
#!/usr/bin/env python
import sys
from hashlib import sha1
# Number of hash buckets: proportions are scaled to integer bucket counts
# that sum to MAX, and every input line is hashed into [0, MAX).
MAX = 1000000

USAGE = 'Usage: {0} prop1:fname1 prop2:fname2...\n'


def _parse_split(args):
    """Parse ``proportion:fname`` arguments.

    Returns a list of ``(proportion, fname)`` tuples sorted largest first.
    Raises ValueError for a malformed argument, a negative or non-finite
    proportion, or proportions that do not sum to a positive number.
    """
    pairs = []
    for arg in args:
        # Split on the first colon only, so file names may contain ':'.
        proportion, sep, fname = arg.partition(':')
        if not sep or not fname:
            raise ValueError('expected prop:fname, got {0!r}'.format(arg))
        value = float(proportion)
        # Written this way so that NaN is rejected as well.
        if not 0 <= value < float('inf'):
            raise ValueError('invalid proportion in {0!r}'.format(arg))
        pairs.append((value, fname))
    if not sum(value for value, _ in pairs) > 0:
        raise ValueError('proportions must sum to a positive number')
    return sorted(pairs, reverse=True)


def _bucket_sizes(proportions, buckets):
    """Scale proportions to integer bucket counts that sum to ``buckets``."""
    total = sum(proportions)
    sizes = [int(buckets * proportion / total) for proportion in proportions]
    # Truncation can lose a few buckets; give the remainder to the last entry.
    sizes[-1] = buckets - sum(sizes[:-1])
    return sizes


def _line_hash(line, index):
    """Return the bucket number in [0, MAX) for ``line`` at position ``index``."""
    # sha1 needs bytes: Python 3 text lines are encoded, Python 2 str is
    # already bytes and is hashed unchanged.
    data = line if isinstance(line, bytes) else line.encode('utf-8')
    # The line index is added to the digest, so the bucket depends on both
    # the content of the line and its position in the input.
    return (int(sha1(data).hexdigest(), 16) + index) % MAX


def _pick(bucket, sizes):
    """Return the index of the output whose bucket range contains ``bucket``."""
    for index, size in enumerate(sizes):
        if bucket < size:
            return index
        bucket -= size
    # Unreachable when 0 <= bucket < sum(sizes); fall back to the last output.
    return len(sizes) - 1


def main():
    """Split the lines of stdin across output files in fixed proportions.

    Command-line arguments have the form ``proportion:fname``. Proportions
    are normalized by their sum, the resulting percentages are printed, and
    each stdin line is written to exactly one output file chosen from the
    SHA-1 of the line plus its line index, so the same input always yields
    the same split.

    Exits with status 1 (after printing the usage line to stderr) when no
    arguments are given or an argument cannot be parsed.
    """
    if len(sys.argv) < 2:
        sys.stderr.write(USAGE.format(sys.argv[0]))
        sys.exit(1)

    # Read arguments
    try:
        split = _parse_split(sys.argv[1:])
    except ValueError as err:
        sys.stderr.write('Error: {0}\n'.format(err))
        sys.stderr.write(USAGE.format(sys.argv[0]))
        sys.exit(1)

    # Normalize proportions
    total = sum(proportion for proportion, _ in split)
    for proportion, fname in split:
        print('{0:.0%} -> {1}'.format(proportion / total, fname))
    sizes = _bucket_sizes([proportion for proportion, _ in split], MAX)

    files = []
    try:
        for _, fname in split:
            files.append(open(fname, 'w'))
        for i, line in enumerate(sys.stdin):
            files[_pick(_line_hash(line, i), sizes)].write(line)
    finally:
        # Close whatever was opened, even if opening or writing failed.
        for fp in files:
            fp.close()
# Run the splitter only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment