Skip to content

Instantly share code, notes, and snippets.

@myles
Last active August 14, 2017 15:31
Show Gist options
  • Save myles/5d0db95838aeecdcd0c57972f688be9e to your computer and use it in GitHub Desktop.
Save myles/5d0db95838aeecdcd0c57972f688be9e to your computer and use it in GitHub Desktop.
Quick Python script for splitting large CSV files using Pandas and NumPy.
#!/usr/bin/env python3
import argparse
import math
import os.path
import pandas as pd
import numpy as np
def main(filepath, rows_per_file=5000):
    """Split a CSV or Excel file into several smaller CSV files.

    The input is loaded with pandas and divided into
    ``ceil(row_count / rows_per_file)`` nearly equal chunks (the same
    partitioning ``numpy.array_split`` produces, so chunk sizes differ by at
    most one row). Each chunk is written next to the input file as
    ``<name>_<index>.csv``, where ``<name>`` is the input file name without
    its extension. As before, the DataFrame index is written as the first
    column of every output file.

    Args:
        filepath: Path to a ``.csv``, ``.xlsx`` or ``.xls`` file. The
            extension is matched case-insensitively.
        rows_per_file: Upper bound on the number of rows in each output
            file. Defaults to 5000.

    Returns:
        The list of output file paths, in chunk order.

    Raises:
        ValueError: If the extension is not recognised or ``rows_per_file``
            is smaller than 1.
    """
    if rows_per_file < 1:
        raise ValueError('rows_per_file must be a positive integer.')

    lowered = filepath.lower()
    if lowered.endswith('.csv'):
        df_org = pd.read_csv(filepath)
    elif lowered.endswith(('.xlsx', '.xls')):
        df_org = pd.read_excel(filepath)
    else:
        raise ValueError("I don't know what the file is.")

    row_count = len(df_org)
    # An input with no data rows still yields one (header-only) output file
    # instead of failing on a request for zero sections.
    chunk_count = max(1, math.ceil(row_count / rows_per_file))

    # splitext drops whichever extension the input has (.csv, .xlsx, .xls)
    # and leaves any '.csv' that appears earlier in the name untouched.
    filename = os.path.splitext(os.path.basename(filepath))[0]
    directory = os.path.dirname(filepath)

    # Split the row positions rather than the DataFrame itself: passing a
    # DataFrame to np.array_split goes through DataFrame.swapaxes, which
    # recent pandas releases deprecate.
    position_groups = np.array_split(np.arange(row_count), chunk_count)

    written = []
    for index, positions in enumerate(position_groups):
        target = os.path.join(directory,
                              '{0}_{1}.csv'.format(filename, index))
        df_org.iloc[positions].to_csv(target)
        written.append(target)
    return written
if __name__ == '__main__':
    # Script entry point: take the input file path as the single positional
    # command-line argument and hand it to main().
    cli = argparse.ArgumentParser()
    cli.add_argument('large_csv_file')
    main(cli.parse_args().large_csv_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment