Created
May 22, 2015 18:45
-
-
Save vernondcole/9adedbab1899224a4eaf to your computer and use it in GitHub Desktop.
how to do a django data migration on a massive Postgres data table
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
# this migration will call itself as a command-line program. | |
from __future__ import unicode_literals, print_function | |
import datetime, subprocess, psycopg2, sys, time | |
from django.db import migrations | |
from django.conf import settings | |
CHUNK_SIZE = 1000 | |
def call_yourself(apps, schema_editor): | |
my_name = __file__ | |
connection_string = 'host={} dbname={} user={} password={}'.format( | |
settings.DATABASES['default']['HOST'], | |
settings.DATABASES['default']['NAME'], | |
settings.DATABASES['default']['USER'], | |
settings.DATABASES['default']['PASSWORD'] | |
) | |
subprocess.call(['chmod', '+x', my_name]) # must have the execute bit set | |
print('\n Calling myself as a subprocess...') | |
subprocess.Popen(['{} "{}"'.format(my_name, connection_string)], shell=True) | |
time.sleep(5.0) # a short pause, so that a test database will have time to be actually processed before proceeding | |
def do_nothing(apps, schema_editor): | |
pass # we need a callable for the reverse direction, so we use this. | |
class Migration(migrations.Migration): | |
dependencies = [ | |
('phx', '0007_cdr_call_end_utc'), | |
] | |
operations = [ | |
migrations.RunPython(call_yourself, do_nothing, atomic=False) | |
] | |
# Command-line code to perform our data migration, because we can't do it in a migration using an Atomic transaction | |
# this operates as a command-line program, not as a migrate script | |
# The python program repeatedly calls an SQL function which does the migration one small chunk at a time. | |
# this works only in PostgreSQL | |
# -- note: Python will fill in the CHUNK_SIZE in the LOOP statement using a .format() call | |
FUNC_SQL = """create or replace function calc_end_times() returns int as $$ | |
declare | |
n int; | |
begin | |
for loopid in 0..{} loop | |
update phx_cdr set call_end_utc = call_utc + (exact_call_duration || ' second')::interval | |
where id = (select id from phx_cdr where call_end_utc is null limit 1); | |
exit when not FOUND; | |
end loop; | |
get diagnostics n = ROW_COUNT; | |
return n; | |
end; | |
$$ LANGUAGE plpgsql; | |
""" | |
RUN_SQL = "select calc_end_times();" # the data migration is done as a side-effect of this SELECT statement. (Yech!) | |
if __name__ == '__main__': | |
print('Migration now running as a command line program to convert table phx_cdr...') | |
conn = psycopg2.connect(sys.argv[1]) # create our own SQL connection without using django ORM. | |
print(' Connected to', conn.dsn) | |
c = conn.cursor() | |
c.execute(FUNC_SQL.format(CHUNK_SIZE)) # note: this intentional SQL injection is safe from external attack. | |
print(' Processing {:,} records per iteration.'.format(CHUNK_SIZE)) | |
conn.commit() | |
c.execute('select 1') # create a True value for the "while" statement below | |
count = 0 | |
tick = datetime.datetime.now() | |
while c.fetchone()[0] == 1: | |
c.execute(RUN_SQL) | |
conn.commit() | |
now = datetime.datetime.now() | |
elapsed = now - tick | |
count += CHUNK_SIZE | |
print('{:%H:%M:%S} last elapsed time={:6.2f} seconds. now done={:,}'.format( | |
now, elapsed.seconds + elapsed.microseconds/1000000.0, count)) # keep our user informed | |
time.sleep(0.2) # be nice. Let someone else have some system access for a moment. | |
tick = datetime.datetime.now() | |
c.close() | |
conn.close() | |
print('operation complete.') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is my solution to performing a data migration on a table which is too large to do in one massive transaction. This ran for two days while normal access to the database continued.
The name of the table, and the SQL to do the migration are hard-coded in FUNC_SQL. The database connection information is pulled from the django SETTINGS.