Last active
March 30, 2019 08:13
-
-
Save vikeshsingh37/99640336325d4e5a3feb49823ddc2197 to your computer and use it in GitHub Desktop.
A sample script that processes a huge CSV file in Python in chunks, for files that otherwise cannot be loaded at once due to memory limitations.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd

# Process a huge CSV in fixed-size chunks so the whole file never has to fit
# in memory: aggregate each chunk independently, then combine the partial
# results and re-aggregate, summing counts for groups that spanned chunks.
chunk_size = 100000
chunk_results = []
for data in pd.read_csv(myfile, chunksize=chunk_size):
    # "timestamp" holds epoch seconds; convert to tz-aware local time.
    data["datetime"] = pd.to_datetime(data["timestamp"], unit="s")
    data["datetime"] = data["datetime"].dt.tz_localize("UTC").dt.tz_convert("Asia/Kolkata")
    data["date"] = data["datetime"].dt.date
    # Series.dt.week was deprecated/removed; isocalendar().week is the
    # supported replacement (ISO week number).
    data["week"] = data["datetime"].dt.isocalendar().week
    data["hour"] = data["datetime"].dt.hour
    data["weekday"] = data["datetime"].dt.day_name()
    # Per-chunk event count per (id, date, week, weekday, hour) group.
    required_data_chunk = (
        data.groupby(["id", "date", "week", "weekday", "hour"])["datetime"]
        .count()
        .reset_index()
    )
    chunk_results.append(required_data_chunk)

# DataFrame.append was removed in pandas 2.0; concatenate the per-chunk
# results once instead of growing a DataFrame inside the loop (quadratic).
required_data = pd.concat(chunk_results, ignore_index=True)

# Merge the processed splits: a group may span multiple chunks, so sum the
# per-chunk counts. BUG FIX: the original grouped on "driver_id", a column
# that was never created (chunks group on "id"), which raised a KeyError.
required_data_final = (
    required_data.groupby(["id", "date", "week", "weekday", "hour"])["datetime"]
    .sum()
    .reset_index()
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment