import time
import psycopg2
from iter_file import IteratorFile

conn = psycopg2.connect(host="localhost", database="test")

# args = [(1,2), (3,4), (5,6)]
args = [(i, i + 1) for i in range(1, 1 * 10**4, 2)]

def time_test(func):
    def f():
        truncate_table()
        t1 = time.time()
        func()
        t2 = time.time()
        print(str(t2 - t1) + ": " + func.__name__)
    return f

def truncate_table():
    with conn.cursor() as cur:
        cur.execute("truncate t")
    conn.commit()

@time_test
def query_builder_insert():
    with conn.cursor() as cur:
        records_list_template = ','.join(['%s'] * len(args))
        insert_query = 'insert into t (a, b) values {0}'.format(records_list_template)
        cur.execute(insert_query, args)
    conn.commit()

@time_test
def copy_from_insert():
    with conn.cursor() as cur:
        f = IteratorFile(("{}\t{}".format(x[0], x[1]) for x in args))
        cur.copy_from(f, 't', columns=('a', 'b'))
    conn.commit()

query_builder_insert()
copy_from_insert()
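For reference, the script assumes a table t already exists. A minimal setup, assuming two integer columns (the schema isn't shown in the gist), might be:

# assumed schema, not part of the original gist
with conn.cursor() as cur:
    cur.execute("create table if not exists t (a int, b int)")
conn.commit()

Timing results at various record counts: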
# original | |
0.008857011795043945: query_builder_insert | |
0.0029380321502685547: copy_from_insert | |
# 10 records | |
0.00867605209350586: query_builder_insert | |
0.003248929977416992: copy_from_insert | |
# 10k records | |
0.041108131408691406: query_builder_insert | |
0.010066032409667969: copy_from_insert | |
# 1M records | |
3.464181900024414: query_builder_insert | |
0.47070908546447754: copy_from_insert | |
# 10M records | |
38.96936798095703: query_builder_insert | |
5.955034017562866: copy_from_insert
You could replace "{}\t{}".format with a call to the string join method. Here's an example using a variable number of input records:
args = (
    (0, 1),
    (0, 1, 2),
    (0, 1, 2, 3),
    (0, 1)
)

# convert each column to a string before passing to join
generator = ("\t".join(map(str, x)) for x in args)

for row in generator:
    print(row)
I haven't tested this against Postgres, but I don't think it will accept a variable number of columns. If that is so, you'll have to scan the input data to find the longest record and then pad shorter records. In production, you might use a heuristic maximum number of columns to obviate this extra scan.
The generator expression was getting a little long, so I converted it to a generator function using the yield keyword. The tab separator is changed to a comma in this example so that the empty columns are visible on the terminal.
from iter_file import IteratorFile

args = (
    (0, 1),
    (0, 1, 2),
    (0, 1, 2, 3),
    (0, 1)
)

# get the maximum number of columns:
n_cols = max(map(len, args))

def generator():
    for record in args:
        # ensure that each record has the same number of columns
        # by padding the appropriate number of empty columns onto
        # short records
        padding = ("",) * (n_cols - len(record))
        record = record + padding
        yield ",".join(map(str, record))

f = IteratorFile(generator())
print(f.read())
This results in:
0,1,,
0,1,2,
0,1,2,3
0,1,,
I want to mention that execute_values goes real fast when you increase the page_size (default is 100).
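For illustration, a sketch of that approach against the same table and args as the benchmark above (execute_values lives in psycopg2.extras; the page_size of 10000 is just an example value, not a tuned recommendation):

from psycopg2.extras import execute_values

# execute_values batches rows into multi-row INSERT statements;
# page_size controls how many rows go into each statement.
with conn.cursor() as cur:
    execute_values(cur, "insert into t (a, b) values %s", args, page_size=10000)
conn.commit()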
For those confused like me: the time unit in the results above is seconds.
Hi there! Could you please tell me where to find the iter_file Python library?
@SergKS8 I didn't make a package for this; download the module from the gist link at the top and drop it into your project: https://gist.github.com/jsheedy/ed81cdf18190183b3b7d
It might be worth looking into @rileypeterson's suggestion above to use execute_values with a larger page_size instead. I haven't touched this in a while, and that suggestion is simpler and more flexible. I'd love to see benchmarks of the relative performance.
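For anyone who wants the idea behind iter_file without downloading it, here is a minimal sketch of a file-like wrapper over an iterator of lines. This is an illustration only, not the gist's actual code, which buffers more carefully:

import io

class IteratorFile(io.TextIOBase):
    """Sketch: expose an iterator of lines as a readable file-like
    object that cursor.copy_from() can consume."""

    def __init__(self, it):
        self._it = it
        self._buf = ""

    def readable(self):
        return True

    def read(self, size=-1):
        # Pull lines from the iterator until `size` characters are
        # buffered (or the iterator is exhausted), then hand them back.
        while size < 0 or len(self._buf) < size:
            try:
                self._buf += next(self._it) + "\n"
            except StopIteration:
                break
        if size < 0:
            data, self._buf = self._buf, ""
        else:
            data, self._buf = self._buf[:size], self._buf[size:]
        return data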
Hi j, very useful module. I am relatively new to Python and have a dumb question: using IteratorFile, how do you make the loop over x dynamic, given that each element of args may hold more than two values?
IteratorFile(("{}\t{}".format(x[0], x[1]) for x in args))
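The string join technique from the comment further up handles exactly this case; a minimal sketch:

# join adapts to however many values each record holds
f = IteratorFile(("\t".join(map(str, x)) for x in args))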