-
-
Save jerch/fd0fae0107ce7b153b7540111b2e89ab to your computer and use it in GitHub Desktop.
import re | |
from io import StringIO | |
from decimal import Decimal | |
# TODO: | |
# - investigate: late column casts better than early? (make temp table all text?) | |
# - encoder/decoder for all django field types | |
# - sanitize method interfaces of CopyConverter / more django-like | |
# - Do we need a temp file shim instead of StringIO for very big data? | |
# - Better with ByteIO instead of StringIO? | |
# - parse_copydata as generator | |
# - temp model abstraction needed? | |
# COPY text escape sequences mapped back to their control characters,
# used when decoding data read from postgres (COPY TO output).
TXT_REPL = {
    '\\\\': '\\',
    '\\b': '\b',
    '\\f': '\f',
    '\\n': '\n',
    '\\r': '\r',
    '\\t': '\t',
    '\\v': '\v'
}
# Matches exactly one COPY escape sequence (escaped backslash first,
# so '\\\\b' is not misread as an escaped backspace).
REX_DECODE_TEXT = re.compile(r'\\\\|\\[bfnrtv]')
BOOLEAN_REPL = {
    't': True,
    'f': False
}
# Translation table escaping backslash and control characters for the COPY
# text format in a single pass (replaces 7 chained str.replace calls).
_TXT_ESCAPE = str.maketrans({
    '\\': '\\\\',
    '\b': '\\b',
    '\f': '\\f',
    '\n': '\\n',
    '\r': '\\r',
    '\t': '\\t',
    '\v': '\\v',
})
# python value --> COPY text representation, keyed by type name
ENCODERS = {
    'bytea': lambda v: '\\\\x' + v.hex(),
    'text': lambda v: v.translate(_TXT_ESCAPE),
    'int': str,
    'decimal': str,
    'float': lambda v: str(float(v)),
    'boolean': lambda v: 't' if v else 'f',
}
# COPY text representation --> python value, keyed by type name
DECODERS = {
    # strip the leading '\\x' marker (3 chars), the rest is plain hex
    'bytea': lambda v: bytes.fromhex(v[3:]),
    'text': lambda v: REX_DECODE_TEXT.sub(lambda m: TXT_REPL[m.group(0)], v),
    'int': int,
    'decimal': Decimal,
    'float': float,
    'boolean': lambda v: BOOLEAN_REPL[v],
}
class CopyConverter:
    """Convert python values to and from postgres' COPY text wire format.

    ``sep`` is the column separator and ``null`` the NULL marker, matching
    the defaults of postgres' COPY text format.
    """
    def __init__(self, sep='\t', null='\\N'):
        self.sep = sep
        self.null = null

    def encode(self, v, typename):
        """Encode a single value; None maps to the NULL marker."""
        return self.null if v is None else ENCODERS[typename](v)

    def decode(self, v, typename):
        """Decode a single cell; the NULL marker maps back to None."""
        return None if v == self.null else DECODERS[typename](v)

    def create_copydata(self, f, column_types, data, fields=None):
        """Write COPY lines for ``data`` into file-like ``f`` and rewind it.

        Rows are either iterables of values (``fields is None``) or objects
        whose attributes named in ``fields`` are read via getattr.
        """
        row_tmpl = self.sep.join(['{}'] * len(column_types)) + '\n'
        if fields is None:
            for record in data:
                cells = [self.encode(value, column_types[idx])
                         for idx, value in enumerate(record)]
                f.write(row_tmpl.format(*cells))
        else:
            for record in data:
                cells = [self.encode(getattr(record, fname), column_types[idx])
                         for idx, fname in enumerate(fields)]
                f.write(row_tmpl.format(*cells))
        f.seek(0)

    def copy_from(self, table, columns, cursor, data, column_types, fields=None):
        """Encode ``data`` and feed it to the cursor's COPY FROM."""
        buf = StringIO()
        self.create_copydata(buf, column_types, data, fields)
        cursor.copy_from(buf, table, sep=self.sep, null=self.null, size=8192, columns=columns)

    def parse_copydata(self, f, column_types):
        """Decode COPY text lines from ``f`` into a list of value tuples."""
        f.seek(0)
        rows = []
        for raw in f:
            cells = raw.rstrip('\n').split(self.sep)
            rows.append(tuple(self.decode(cell, column_types[idx])
                              for idx, cell in enumerate(cells)))
        return rows

    def copy_to(self, table, columns, cursor, decoders):
        """Run COPY TO on the cursor and return the decoded rows."""
        buf = StringIO()
        cursor.copy_to(buf, table, sep=self.sep, null=self.null, columns=columns)
        return self.parse_copydata(buf, decoders)
def copy_update(qs, objs, fieldnames, batch_size: int = 1000):
    """Draft of a COPY FROM based bulk update for django querysets.

    Copies pk + field values of ``objs`` into a temporary table, then updates
    the model table from it with a single UPDATE ... FROM statement (built by
    ``update_from_table``). Fields not local to the model fall back to
    django's ``bulk_update``.

    NOTE(review): early draft -- the temp table handling below is hardcoded
    and only correct for a single int field; see inline notes.
    """
    model = qs.model
    # filter all non model local fields --> still handled by bulk_update
    non_local_fieldnames = []
    local_fieldnames = []
    for f in fieldnames:
        if model._meta.get_field(f) not in model._meta.local_fields:
            non_local_fieldnames.append(f)
        else:
            local_fieldnames.append(f)
    # avoid more expensive doubled updates
    if non_local_fieldnames and len(local_fieldnames) < 2:
        return model.objects.bulk_update(objs, fieldnames, batch_size)
    if local_fieldnames:
        from django.db import connections
        tablename = model._meta.db_table
        pk_field = model._meta.pk
        if not pk_field:
            # no pk to join on --> temp table trick not possible
            return model.objects.bulk_update(objs, fieldnames, batch_size)
        fields = [model._meta.get_field(f) for f in local_fieldnames]
        # NOTE(review): uses qs.db, which for a read queryset may be the read
        # connection -- db routers are not respected here (known limitation).
        connection = connections[qs.db]
        with connection.cursor() as cur:
            # NOTE(review): schema hardcoded to (pk, f1) int columns -- breaks
            # for multiple fields or non-int types; update_from_table() below
            # iterates `fields`, so the temp schema must be generated to match.
            cur.execute('CREATE TEMPORARY TABLE my_temp (pk integer UNIQUE, f1 integer)')
            cc = CopyConverter()
            # NOTE(review): passes `fieldnames` here but `local_fieldnames`
            # (as `fields`) to the UPDATE -- confirm these are meant to match.
            # Also assumes objects expose the pk under the attribute `pk`.
            cc.copy_from('my_temp', ('pk', 'f1'), cur, objs, ('int', 'int'), ['pk'] + fieldnames)
            cur.execute('CREATE INDEX some_idx_on_temp ON my_temp (pk)')
            cur.execute(update_from_table(tablename, pk_field.column, fields))
            cur.execute('DROP TABLE my_temp')
    # TODO: apply left overs
def update_from_table(tname, pkname, fields):
    """Build the UPDATE ... FROM SQL joining table ``tname`` to my_temp on pk.

    ``fields`` is a sequence of objects exposing a ``column`` attribute
    (django field instances); each column is assigned from the equally
    named my_temp column.
    """
    assignments = ','.join(f'"{fld.column}"=my_temp.{fld.column}' for fld in fields)
    join_cond = f'"{tname}"."{pkname}"=my_temp.pk'
    return f'UPDATE "{tname}" SET {assignments} FROM my_temp WHERE {join_cond}'
Last but not least here are some numbers with a COPY FROM
variant of bulk_create
. Test case is inserting of 1M records with 10 populated int columns and with no further index (beside unique on pk):
- django's `bulk_create`: 80s
- django-postgres-copy: 9s (~9 times faster)
- `copy_create`: 5.8s (~14 times faster)
New numbers comparing the fastest COPY FROM speed I was able to find for psycopg2 and psycopg3:
def copy_insert2(cur, data):
    """Benchmark: COPY FROM insert via psycopg2 with an interim BytesIO buffer.

    Formats all rows into one in-memory buffer first, then hands the buffer
    to psycopg2's file-based copy_from.
    """
    buf = BytesIO()
    buf.writelines(
        f'{o.f1}\t{o.f2}\t{o.f3}\t{o.f4}\t{o.f5}\t{o.f6}\t{o.f7}\t{o.f8}\t{o.f9}\t{o.f10}\n'.encode('utf-8')
        for o in data
    )
    buf.seek(0)
    cur.execute('CREATE TEMPORARY TABLE temp_table (pk serial, f1 int,f2 int,f3 int,f4 int,f5 int,f6 int,f7 int,f8 int,f9 int,f10 int)')
    cur.copy_from(buf, 'temp_table', size=131072, columns=('f1','f2','f3','f4','f5','f6','f7','f8','f9','f10'))
    cur.execute('DROP TABLE temp_table')
    buf.close()
def copy_insert3(cur, data):
    """Benchmark: COPY FROM insert via psycopg3's file-like copy interface.

    Rows are formatted one by one and flushed to the copy stream in chunks
    of roughly 128 KiB to keep write-call overhead low.
    """
    cur.execute('CREATE TEMPORARY TABLE temp_table (pk serial, f1 int,f2 int,f3 int,f4 int,f5 int,f6 int,f7 int,f8 int,f9 int,f10 int)')
    with cur.copy("COPY temp_table (f1,f2,f3,f4,f5,f6,f7,f8,f9,f10) FROM STDIN") as copy:
        pending = []
        buffered = 0
        for o in data:
            row = f'{o.f1}\t{o.f2}\t{o.f3}\t{o.f4}\t{o.f5}\t{o.f6}\t{o.f7}\t{o.f8}\t{o.f9}\t{o.f10}\n'.encode('utf-8')
            pending.append(row)
            buffered += len(row)
            if buffered > 131072:
                copy.write(b''.join(pending))
                pending.clear()
                buffered = 0
        # flush the left-over chunk
        if pending:
            copy.write(b''.join(pending))
    cur.execute('DROP TABLE temp_table')
Inserting 1M records with ints populated with 4-digit numbers (50MB of textual data):
- psycopg2
- runtime: 3.05s average
- CPU usage: python 45-55%, postgres 50-55%
- RAM usage: python 520 MB, postgres <64 MB
- psycopg3
- runtime: 1.97s average (!!)
- CPU usage: python 100%, postgres 85-90%
- RAM usage: python 450 MB, postgres <64 MB
Whats remarkable here is the fact, that psycopg3 with its file-like interface on the copy cursor can almost saturate the postgres process (python is still the slow poke though), while psycopg2 basically runs in "tandem mode" with higher memory needs due to the interim BytesIO
buffering.
The f-string line formatting as shown above is the fastest string formatting scheme I was able to find. The performance roughly degrades as following:
- making the f-string customizable as `line_formatter=lambda o: f'...'` (~10% slower)
- f-string join list comprehensions (~20% slower)
- pretemplating the line with a final `str.format(...)` call (~20% slower)
- explicitly calling `str` on values (~60% slower)
- using `getattr` to access object attributes (~80% slower — with attrgetter, it's only 30% slower)
- doing all string handling in the bytestring domain (~150% slower — nope, Python 3 lacks fast bytestring formatting)
- explicit None-->NULL handling (~40% slower)
- custom field encoders, handling None-->NULL and type edge cases (3-10x slower, depending on the level of custom encoding)
Custom field encoders are very expensive, and so is psycopg3's copy.write_row
(takes ~17s to finish). The best take clearly is the f-string way, with no additional encoding. Ofc this is not always possible, but still can be used to an advantage, if your data does not contain nullish values and is guaranteed to translate into postgres' text representation correctly from __str__
. For a more general usage pluggable encoders might be helpful.
So psycopg3's copy cursor with custom f-string line formatting is it, almost pushing postgres to the CPU limit.
Edit - Btw, removing the pk column above (which is indexed) lowers runtime to 1.89s (psycopg3) and 2.47s (psycopg2).
Edit2:
The value for psycopg3's write_row
above was from the python version. The C version runs the same test ~3.3s, which is really awesome, as it already contains proper field encoders. We prolly never will reach that number with python based field/line encoding, though we are not that far off with ~5s.
Furthermore I found a much better performing variant for psycopg2 with the help of a thread as described here: psycopg/psycopg2#1438 (comment)
Made a new copy_update
impl trying to use some of the perf insights from the last comment. It is way cleaner and better structured than my early attempt at the top, and may find its way into a python package, once I am done with the testing.
But first more charts, this time trying to find the break even between copy_update
and fast_update
, comparing 1 int column vs. 10 int column updates.
Note on sequential vs. indexed scan:
To get the tests done in a timely fashion, they ran on a rather smallish tables of only 10k records. No additional ANALYZE
was done, which pollutes the results with sequential vs. index scan attempts by postgres in the update command (shows up as "steps" in raw values). Since ANALYZE itself can be very expensive, I went with the "let postgres decide" idea for now, and averaged things out (across 100 runs 3 times). The uncertainty range for the tested ranges is roughly 30-50% because of this, and tends to get lower for bigger update counts (postgres prolly settles earlier to one strategy). Later on an explicit analyze
setting on the function interface might be useful for very big update batches.
Regarding the break even - if we allow 50% ratio uncertainty, the real break even should be in these areas:
- for 1 int column: between 294 and 4096+ row updates
- for 10 int columns: between 79 and 416 row updates
which gives us a value of ~350±50 for 1 to 10 int column updates. Below that value copy_update
tends to perform worse, above better. For higher column counts the value probably will drop further down. Also other column types might behave much different (note that integer conversion is rather cheap for python, and prolly also for postgres).
Another interesting number is to compare 1 vs. 10 columns update speed directly:
To my surprise copy_update
does not show a clear trend in the tested range, which indicates that the data copying is very cheap, and the whole action still gets overly dominated by row count. Looking at the individual runtimes, that's indeed the case - for 4096 rows the update action does not move much for 1 vs. 10 columns, but takes 70-80% of the runtime. This suggests that further tweaking of the final UPDATE
from the temp table can save more runtime.
Update on ANALYZE - calling it after COPY FROM on the update "join" column of the temp table (id) lowers the initial copy/fast ratio down to 1.5 - 2. Break even moved down to ~150±50. So yes it is a good idea to do that, esp. for lowish row counts.
Decided not to continue working on a suitable COPY FROM update impl for django with psycopg2. The reason is quite simple - psycopg3's write_row
is really great in terms of speed, correctness and ease of use, which we never can reach with a python based implementation. Better to get django moved to psycopg3 first, then things be adopted from there with its superior copy support.
Hi @jerch
Thank you for your interesting results on the Django update speed. When I tried to update 380k rows in one dataset it took forever so I tried my own benchmark.
The x-axis shows the size of a dataset whereas the y-axis shows how many updates per second are performed. This is with and without the batch_size
parameter of the bulk_update()
.
I experience a major performance hit when using bulk_update()
without batch_size
at around 13-14k sized datasets.
Something feels really wrong here.
@fantasticle Which db backend have you tested here? Could you also test your dataset against my early fast_update
impl here? Note that this gist is about postgres, and further speed gains from using COPY FROM vs. fast_update
.
Also note that fast_update
is not yet feature complete for general purpose django usage:
- fk field support still missing (have not yet implemented/tested the
..._id
side stepping) - f-expressions most likely never gonna work, needs at least spotting and bypassing those to
.update
- db routers not yet respected (wrongly uses the read db connection)
Beside that, it already should work with all other concrete field types.
Originally I planned to integrate fast_update
more with existing ORM internals, but found it impossible to map to any existing sql compiler. Furthermore the backend distinction in the ORM does not offer different paths for mysql vs. mariadb, thus I stopped that endeavour and went with direct SQL formatting for now. A better way prolly would establish a new compiler for UPDATE FROM VALUES in the ORM and use its primitives, but thats a huge rewrite for rainy days.
If someone is interested, I've started packaging things more seriously in https://github.com/netzkolchose/django-fast-update. Still lacks all sorts of tests, from manual testing fast_update
should mostly work though.
I also added a copy_update
draft from my playground tests, but thats only half done (still misses several encoders and array support, have no priority on this, as psycopg3 will make it obsolete soon).
Two new charts with a partially optimized data encoder path in
CopyConverter
, effects:copy_update
separates much earlier fromfast_update
(break even at ~1024 for 1 int column update, ~512 for 5 int columns updates) with slower runtime progress (note that the needed update from the temp table holds the biggest runtime slice, thus faster data encoding does not simply multiply speed by 3)chart for updating 1 int column per row:
chart for updating 5 int columns per row:
[Removed django's
bulk_update
from the charts as its exploding runtime makes them unreadable.]Another chart showing 10k, 100k 1M row updates (1 and 5 int columns):
This indicates that the performance of
fast_update
degrades a lot with more columns per row to be updated, while copy_update
stays almost the same.