Created
November 13, 2012 03:32
-
-
Save vchahun/4063776 to your computer and use it in GitHub Desktop.
Deterministic dataset splitter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| import sys | |
| from hashlib import sha1 | |
# Number of hash buckets. Each output file owns a contiguous range of bucket
# values whose width is proportional to its requested share.
MAX = 1000000


def _usage_error(message=None):
    """Write an optional error and the usage line to stderr, then exit with status 1."""
    if message:
        sys.stderr.write('Error: {0}\n'.format(message))
    sys.stderr.write('Usage: {0} prop1:fname1 prop2:fname2...\n'.format(sys.argv[0]))
    sys.exit(1)


def _parse_split_args(args):
    """Parse ``proportion:fname`` strings into (proportion, fname) pairs, largest first."""
    pairs = []
    for arg in args:
        # Split on the first colon only, so file names may themselves contain ':'.
        raw_proportion, sep, fname = arg.partition(':')
        if not sep or not fname:
            _usage_error('expected proportion:fname, got {0!r}'.format(arg))
        try:
            proportion = float(raw_proportion)
        except ValueError:
            _usage_error('invalid proportion in {0!r}'.format(arg))
        # "not >=" also rejects NaN, which compares false against everything.
        if not proportion >= 0:
            _usage_error('proportion must be non-negative in {0!r}'.format(arg))
        pairs.append((proportion, fname))
    pairs.sort(reverse=True)
    return pairs


def _bucket_sizes(proportions, total):
    """Convert proportions into integer bucket counts that sum to exactly MAX."""
    sizes = [int(MAX * proportion / total) for proportion in proportions]
    # int() truncates, so hand the rounding remainder to the last entry.
    sizes[-1] = MAX - sum(sizes[:-1])
    return sizes


def _bucket_index(line, index, sizes):
    """Return the position in ``sizes`` of the bucket range that ``line`` hashes into."""
    # hashlib only accepts bytes; text lines (Python 3 stdin) must be encoded first.
    data = line if isinstance(line, bytes) else line.encode('utf-8')
    # Adding the line index makes identical lines at different positions
    # hash to different values.
    h = (int(sha1(data).hexdigest(), 16) + index) % MAX
    for position, size in enumerate(sizes):
        if h < size:
            return position
        h -= size
    # Unreachable while the sizes sum to MAX, because h < MAX.
    raise AssertionError('bucket sizes must sum to MAX')


def main():
    """Split the lines of stdin across several files in fixed proportions.

    Command-line arguments have the form ``proportion:fname``. Proportions
    are normalized by their sum, and the resulting shares are printed to
    stdout. Each input line is assigned to one output file based on the SHA-1
    of its content plus its line index, so the same input and arguments
    always produce the same split.

    Exits with status 1 (after printing usage to stderr) when no arguments
    are given or an argument is malformed.
    """
    if len(sys.argv) < 2:
        _usage_error()

    # Read arguments
    split = _parse_split_args(sys.argv[1:])

    # Normalize proportions
    total = sum(proportion for proportion, _ in split)
    if not 0 < total < float('inf'):
        _usage_error('proportions must sum to a positive, finite number')
    for proportion, fname in split:
        print('{0:.0%} -> {1}'.format(proportion / total, fname))

    sizes = _bucket_sizes([proportion for proportion, _ in split], total)

    outputs = []
    try:
        # Open inside the try block so files that were already opened are
        # closed even if a later open() or a write fails.
        for _, fname in split:
            outputs.append(open(fname, 'w'))
        for i, line in enumerate(sys.stdin):
            outputs[_bucket_index(line, i, sizes)].write(line)
    finally:
        for fp in outputs:
            fp.close()
# Run the splitter only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment