Skip to content

Instantly share code, notes, and snippets.

@ssi-victoriano-eco
Created February 27, 2018 23:53
Show Gist options
  • Save ssi-victoriano-eco/f7fae5b1378454af5327ab5ef873c547 to your computer and use it in GitHub Desktop.
Save ssi-victoriano-eco/f7fae5b1378454af5327ab5ef873c547 to your computer and use it in GitHub Desktop.
How to push csv records to postgres with a million records with chunking (used majestic million public data)
name: majectic-1m
dependencies:
- python=3.6
- pip:
- pandas
- psycopg2
- boto3
- pandas
- sqlalchemy
- tqdm
import os, math, json
import pandas as pd
from tqdm import tqdm
from sqlalchemy import create_engine
data_types = {
'GlobalRank': int,
'TldRank': int,
'Domain': str,
'TLD': str,
'RefSubNets': int,
'RefIPs': int,
'IDN_Domain': str,
'IDN_TLD': str,
'PrevGlobalRank': int,
'PrevTldRank': int,
'PrevRefSubNets': int,
'PrevRefIPs': int,
}
def chunker(seq, size):
# from http://stackoverflow.com/a/434328
return (seq[pos:pos + size] for pos in range(0, len(seq), size))
def pg_engine():
hostname = os.getenv('POSTGRES_HOST')
database = os.getenv('POSTGRES_DB')
username = os.getenv('POSTGRES_USER')
password = os.getenv('POSTGRES_PASSWORD')
return create_engine('postgresql://{username}:{password}@{hostname}:5432/{database}'.format(
hostname = hostname,
username = username,
password = password,
database = database
))
def insert_with_progress(df, con):
# chunksize = int(len(df) / 10000) # 10%
chunksize = int(math.ceil(len(df) / 100)) # 10%
with tqdm(total=len(df)) as pbar:
for i, cdf in enumerate(chunker(df, chunksize)):
replace = "replace" if i == 0 else "append"
cdf.to_sql(
con=con,
name="majestic_millions",
if_exists=replace,
index=True,
index_label='id'
)
pbar.update(chunksize)
engine = pg_engine()
df = pd.read_csv(os.path.abspath('../majestic_million.csv'), dtype=data_types)
insert_with_progress(df, engine)
version: "3"
services:
postgres:
image: postgres:10.2-alpine
container_name: postgres
restart: always
ports:
- "5432:5432"
environment:
POSTGRES_HOST: "${POSTGRES_HOST}"
POSTGRES_PORT: "${POSTGRES_PORT}"
POSTGRES_USER: "${POSTGRES_USER}"
POSTGRES_PASSWORD: "${POSTGRES_PASSWORD}"
POSTGRES_DB: majestic_millions
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment