Last active
January 1, 2016 08:59
-
-
Save chenyanzhe/8121434 to your computer and use it in GitHub Desktop.
Split all files in the corresponding folder into 4 smaller parts accordingly by lines.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Usage: | |
# . | |
# ├── amazon | |
# │ ├── amazon_aa | |
# │ ├── amazon_ab | |
# │ ├── amazon_ac | |
# | ... | |
# │ ├── amazon_bu | |
# │ ├── amazon_bv | |
# ├── split.py | |
# ├── hollywood | |
# │ ├── hollywood_aa | |
# │ ├── hollywood_ab | |
# │ ├── hollywood_ac | |
# ... | |
# │ ├── hollywood_bu | |
# │ └── hollywood_bv | |
# └── wiki | |
# ├── wiki_aa | |
# ├── wiki_ab | |
# ├── wiki_ac | |
# ... | |
# ├── wiki_bu | |
# └── wiki_bv | |
# | |
# When running this script, it will split every data file into up to 4 smaller ones | |
# e.g. amazon_aa --> { amazon_aa_0, amazon_aa_26867, amazon_aa_53734, amazon_aa_80601 } | |
# the number appended at the end is the (0-based) starting line number of each newly split file | |
import os | |
def split_file(filepath, lines_per_file=100):
    """Split a text file into consecutive parts of at most N lines each.

    Each part is written next to the input file and is named
    ``<original name>_<i>``, where ``i`` is the 0-based index of the first
    line of the input that the part contains (``0``, ``lines_per_file``,
    ``2 * lines_per_file``, ...). The input file itself is left untouched.

    An empty input file still produces a single empty ``<original name>_0``.

    Args:
        filepath: Path of the file to split.
        lines_per_file: Maximum number of lines per part; must be >= 1.

    Raises:
        ValueError: If ``lines_per_file`` is smaller than 1.
        OSError: If the input cannot be read or a part cannot be written.
    """
    if lines_per_file < 1:
        raise ValueError(
            'lines_per_file must be >= 1, got {!r}'.format(lines_per_file))

    path, basename = os.path.split(filepath)

    def part_path(start_line):
        # Name of the part whose first line is input line `start_line`.
        return os.path.join(path, '{}_{}'.format(basename, start_line))

    with open(filepath, 'r') as reader:
        # `writer` starts as None so the `finally` clause is safe even when
        # opening the very first part fails.
        writer = None
        try:
            for i, line in enumerate(reader):
                if i % lines_per_file == 0:
                    # Chunk boundary: finish the current part, start the next.
                    if writer is not None:
                        writer.close()
                    writer = open(part_path(i), 'w')
                writer.write(line)
            if writer is None:
                # Empty input: still emit an (empty) first part.
                writer = open(part_path(0), 'w')
        finally:
            if writer is not None:
                writer.close()
if __name__ == '__main__':
    # Every sub-directory that sits next to this script is a "job" folder;
    # each regular file directly inside a job folder is split into parts of
    # (num_lines // 4 + 1) lines, i.e. at most 4 parts per file.
    base, scriptname = os.path.split(os.path.abspath(__file__))
    for job in sorted(os.listdir(base)):
        cwd = os.path.join(base, job)
        # Skip this script and any other stray regular file beside it;
        # only directories are job folders.
        if not os.path.isdir(cwd):
            continue
        # os.listdir returns a snapshot, so the parts created below are not
        # picked up again during this run.
        for f in sorted(os.listdir(cwd)):
            target = os.path.join(cwd, f)
            if not os.path.isfile(target):
                continue
            # Count lines with a context manager so the handle is closed
            # before the file is reopened for splitting.
            with open(target) as handle:
                num_lines = sum(1 for _ in handle)
            split_file(target, num_lines // 4 + 1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment