@vernondcole
Created May 22, 2015 18:45
How to do a Django data migration on a massive Postgres data table
#!/usr/bin/python3
# this migration will call itself as a command-line program.
from __future__ import unicode_literals, print_function
import datetime, subprocess, psycopg2, sys, time
from django.db import migrations
from django.conf import settings
CHUNK_SIZE = 1000
def call_yourself(apps, schema_editor):
    my_name = __file__
    connection_string = 'host={} dbname={} user={} password={}'.format(
        settings.DATABASES['default']['HOST'],
        settings.DATABASES['default']['NAME'],
        settings.DATABASES['default']['USER'],
        settings.DATABASES['default']['PASSWORD']
    )
    subprocess.call(['chmod', '+x', my_name])  # must have the execute bit set
    print('\n Calling myself as a subprocess...')
    subprocess.Popen(['{} "{}"'.format(my_name, connection_string)], shell=True)
    time.sleep(5.0)  # a short pause, so that a test database will have time to be actually processed before proceeding
def do_nothing(apps, schema_editor):
    pass  # we need a callable for the reverse direction, so we use this.
class Migration(migrations.Migration):
    dependencies = [
        ('phx', '0007_cdr_call_end_utc'),
    ]
    operations = [
        migrations.RunPython(call_yourself, do_nothing, atomic=False)
    ]
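# note: atomic=False asks Django not to wrap the RunPython operation in a
# transaction; the real work happens in the subprocess, which makes many
# small commits over its own connection.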
# Command-line code to perform our data migration, because we can't do it inside a migration's atomic transaction.
# This part operates as a command-line program, not as a migrate script.
# The Python program repeatedly calls an SQL function which does the migration one small chunk at a time.
# This works only in PostgreSQL.
# -- note: Python will fill in CHUNK_SIZE in the LOOP statement using a .format() call
FUNC_SQL = """create or replace function calc_end_times() returns int as $$
declare
    n int;
begin
    for loopid in 0..{} loop
        update phx_cdr set call_end_utc = call_utc + (exact_call_duration || ' second')::interval
            where id = (select id from phx_cdr where call_end_utc is null limit 1);
        exit when not FOUND;
    end loop;
    get diagnostics n = ROW_COUNT;
    return n;
end;
$$ LANGUAGE plpgsql;
"""
RUN_SQL = "select calc_end_times();" # the data migration is done as a side-effect of this SELECT statement. (Yech!)
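# How the stop signal works: calc_end_times() returns the ROW_COUNT of its
# final UPDATE -- 1 while unconverted rows remain, 0 once a chunk finds no
# row left to update -- so the "while" loop below runs until it reports 0.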
if __name__ == '__main__':
    print('Migration now running as a command line program to convert table phx_cdr...')
    conn = psycopg2.connect(sys.argv[1])  # create our own SQL connection without using django ORM.
    print(' Connected to', conn.dsn)
    c = conn.cursor()
    c.execute(FUNC_SQL.format(CHUNK_SIZE))  # note: this intentional SQL injection is safe from external attack.
    print(' Processing {:,} records per iteration.'.format(CHUNK_SIZE))
    conn.commit()
    c.execute('select 1')  # create a True value for the "while" statement below
    count = 0
    tick = datetime.datetime.now()
    while c.fetchone()[0] == 1:
        c.execute(RUN_SQL)
        conn.commit()
        now = datetime.datetime.now()
        elapsed = now - tick
        count += CHUNK_SIZE
        print('{:%H:%M:%S} last elapsed time={:6.2f} seconds. now done={:,}'.format(
            now, elapsed.seconds + elapsed.microseconds / 1000000.0, count))  # keep our user informed
        time.sleep(0.2)  # be nice. Let someone else have some system access for a moment.
        tick = datetime.datetime.now()
    c.close()
    conn.close()
    print('operation complete.')
@vernondcole
This is my solution for performing a data migration on a table which is too large to convert in one massive transaction. This ran for two days while normal access to the database continued.
The name of the table and the SQL that does the migration are hard-coded in FUNC_SQL. The database connection information is pulled from the Django settings.
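If you want to watch a long run like this from a second session, a simple count of the unconverted rows shows how much work remains. A minimal sketch, assuming the same phx_cdr table and the same 'host=... dbname=... user=... password=...' connection string passed on the command line (bear in mind that count(*) is itself not free on a huge table):

#!/usr/bin/python3
# progress monitor -- run from a separate session while the migration works
import sys, time
import psycopg2

conn = psycopg2.connect(sys.argv[1])
c = conn.cursor()
while True:
    c.execute('select count(*) from phx_cdr where call_end_utc is null')
    remaining = c.fetchone()[0]
    print('{:,} rows still to convert'.format(remaining))
    if remaining == 0:
        break
    time.sleep(60)  # check once a minute
c.close()
conn.close()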
