Created
February 22, 2016 10:20
-
-
Save sahuk/7a6ccd24708be8294c76 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Converts a large CSV into SQL, can process some of the smaller chunks | |
Based on https://plot.ly/python/big-data-analytics-with-pandas-and-sqlite/ | |
Original code probably from https://github.com/chriddyp | |
""" | |
import pandas | |
from sqlalchemy import create_engine | |
import tqdm | |
CSV1 = 'movies.csv' | |
CSV2 = 'countries.csv' | |
SQL = 'db.sqlite3' | |
engine = create_engine('sqlite:///{}'.format(SQL)):w | |
def to_sqlite(csvfile, disk_engine, tablename): | |
chunksize = 50000 | |
for df in tqdm(pandas.read_csv(csvfile, chunksize=chunksize, iterator=True, encoding='utf-8')): | |
df.to_sql(tablename, disk_engine, if_exists='append') | |
# some sample SQL statements | |
# it's important to filter the dataset with a 'where' clause, or use the 'limit' statement | |
# otherwise we might end up loading the entire dataset into memory, which we want to avoid | |
first5 = "select * from movies limit 5" # Everything we know about the first 5 entries | |
names_of_top10 = "select title, score from movies order by score DESC limit 10" # the first 10 entries, by DESCending scoref |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This is what I type in IPython. Every defined variable will tab complete, | |
# so I just type the first letters and hit the TAB key to get a suggestion of what I was trying to type. | |
%load csv2sql.py | |
to_sqlite(CSV1, engine, 'movies') | |
to_sqlite(CSV2, engine, 'countries') | |
df = pandas.read_sql_query(first5, disk_engine) | |
# display the dataframe etc | |
df = pandas.read_sql_query(names_of_top10, disk_engine) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment