Last active
December 23, 2021 16:40
-
-
Save MattOates/6195743 to your computer and use it in GitHub Desktop.
Magical SQLite dumping directly over a pipe to the sqlite3 command line client, the .import command is roughly 5x faster at dumping data than using the Python SQLite bindings and doing looped inserts.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import os, sys, tempfile, csv | |
import re, inspect, ast | |
from subprocess import Popen, PIPE, STDOUT | |
""" | |
Class for dumping data into a SQLite database. | |
You will need the sqlite3 command line client in your PATH, | |
even if you are on Windows and not UNIX (maybe) | |
The simplest use case is as follows: | |
# import the class | |
from sqlitemagic import SQLiteMagician | |
# Your data should be a list of equal length tuples | |
data = [(x,) for x in range(100)] | |
# Create an SQLiteMagician to do stuff for you | |
wizzard = SQLiteMagician('my_database.db') | |
# Conjure a table with all your data populated | |
# and the table created with the same name as | |
# the variable passed in, so 'data' in this case. | |
wizzard.conjure(data) | |
2013-08-10 Matt Oates | |
MattOates[AT]gmail[DOT]com | |
>:3 | |
""" | |
class SQLiteMagician(object): | |
#Default command line client to use to access SQLite3 | |
command = "sqlite3" | |
#Python to SQLite type map, anything not in here will be mapped to NONE | |
types = {"int":"INTEGER","str":"TEXT","float":"REAL"} | |
#Constructor, what db to use | |
def __init__(self,database): | |
self.database = database | |
#Using an example data record infer the SQLite field types. | |
#Field names will be made up if a header list was not specified | |
def _get_field_spec(self,data,header=None,fieldname="field"): | |
types = [ | |
"NONE" if t == "NoneType" else t | |
for t in [ SQLiteMagician.types.get( type( d ).__name__ ) | |
for d in data[0] | |
] | |
] | |
#If no header was passed in create a load of dumby field titles | |
if header == None: | |
header = ["field%s" % n for n in range(len(data[0]))] | |
#Return the fieldspec e.g. "field1 INTEGER, field2 TEXT" | |
return ", ".join(["%s %s" % field for field in zip(header,types)]) | |
#Convoluted frame stack walk and source scrape to get what the calling statement to a function looked like. | |
#Specifically return the name of the variable passed as parameter found at position pos in the parameter list. | |
def _caller_param_name(pos): | |
#The parameter name to return | |
param = None | |
#Get the frame object for this function call | |
thisframe = inspect.currentframe() | |
try: | |
#Get the parent calling frames details | |
frames = inspect.getouterframes(thisframe) | |
#Function this function was just called from that we wish to find the calling parameter name for | |
function = frames[1][3] | |
#Get all the details of where the calling statement was | |
frame,filename,line_number,function_name,source,source_index = frames[2] | |
#Read in the source file in the parent calling frame upto where the call was made | |
with open(filename) as source_file: | |
head=[source_file.next() for x in xrange(line_number)] | |
source_file.close() | |
#Build all lines of the calling statement, this deals with when a function is called with parameters listed on each line | |
lines = [] | |
#Compile a regex for matching the start of the function being called | |
regex = re.compile(r'\.?\s*%s\s*\(' % (function)) | |
#Work backwards from the parent calling frame line number until we see the start of the calling statement (usually the same line!!!) | |
for line in reversed(head): | |
lines.append(line.strip()) | |
if re.search(regex, line): | |
break | |
#Put the lines we have groked back into sourcefile order rather than reverse order | |
lines.reverse() | |
#Join all the lines that were part of the calling statement | |
call = "".join(lines) | |
#Grab the parameter list from the calling statement for the function we were called from | |
match = re.search('\.?\s*%s\s*\((.*)\)' % (function), call) | |
paramlist = match.group(1) | |
#If the function was called with no parameters raise an exception | |
if paramlist == "": | |
raise LookupError("Function called with no parameters.") | |
#Use the Python abstract syntax tree parser to create a parsed form of the function parameter list 'Name' nodes are variable names | |
parameter = ast.parse(paramlist).body[0].value | |
#If there were multiple parameters get the positional requested | |
if type(parameter).__name__ == 'Tuple': | |
#If we asked for a parameter outside of what was passed complain | |
if pos >= len(parameter.elts): | |
raise LookupError("The function call did not have a parameter at postion %s" % pos) | |
parameter = parameter.elts[pos] | |
#If there was only a single parameter and another was requested raise an exception | |
elif pos != 0: | |
raise LookupError("There was only a single calling parameter found. Parameter indices start at 0.") | |
#If the parameter was the name of a variable we can use it otherwise pass back None | |
if type(parameter).__name__ == 'Name': | |
param = parameter.id | |
finally: | |
#Remove the frame reference to prevent cyclic references screwing the garbage collector | |
del thisframe | |
#Return the parameter name we found | |
return param | |
#Create a table given a single record of data and perhaps a header of field names | |
def create_table(self,table,data,header=None,header_included=False): | |
#Grab the header if its the first record of the data | |
if header_included: | |
header = data.pop(0) | |
#Get the field spec for this table | |
field_spec = self._get_field_spec(data,header) | |
#Run the query using a subprocess to the sqlite3 command | |
sqlite3 = Popen([SQLiteMagician.command, self.database], stdout=PIPE, stdin=PIPE, stderr=STDOUT) | |
db = sqlite3.communicate(input='CREATE TABLE IF NOT EXISTS %s(%s);\n.quit\n' % (table,field_spec))[0] | |
#Dump a variable to an SQLite database using the maximum amount of magic possible | |
def conjure(self,data,table=None,header=None,header_included=False): | |
#If the table name was not specified use stack inspection to see what the data variable was called! | |
#Magical and horrible, but I want an interface thats incredibly simple for the simplest case of dumping a variable. | |
if not table: | |
table = _caller_param_name(0) | |
#Create a table to put the data into | |
self.create_table(table,data,header,header_included) | |
#Dump as normal | |
self.dump(data,table) | |
def dump(self,data,table): | |
#Name of the pipe, use tempfile to create some random filename usually in /tmp | |
data_pipe = tempfile.mktemp('datapump') | |
#Create the actual pipe 'file' where the OS knows you wanted a pipe type thing | |
os.mkfifo( data_pipe, 0644 ) | |
#Create a child process to run in parallel | |
child = os.fork() | |
#If child is 0 we are the child | |
if child == 0: | |
#Have the child send the command to sqlite3 | |
#Create a forked process running sqlite3 with a pipe that we can send commands to | |
sqlite3 = Popen([SQLiteMagician.command, self.database], stdout=PIPE, stdin=PIPE, stderr=STDOUT) | |
#Tell sqlite3 to import from our named pipe | |
db = sqlite3.communicate(input='.separator "\\t"\n.import %s %s\n.quit\n' % (data_pipe,table))[0] | |
sys.exit(1) | |
#If we are the parent process stream data to the child | |
else: | |
#File handle to pipe to write data into table | |
data_pipe_write_handle = open(data_pipe,'w') | |
#Have the original program send data down the pipe | |
writer = csv.writer(data_pipe_write_handle, delimiter="\t") | |
writer.writerows(data) | |
#Close the pipe once we are done! | |
data_pipe_write_handle.close() | |
#Wait for SQLite3 to finish importing, waiting on all child processes | |
os.wait() | |
#Remove the named pipe file we created because its junk and we dont want a clash | |
os.unlink(data_pipe) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
If anyone has a robust (safe) way of getting the calling frames variable name for the parameter passed to the .conjure() method please drop me a hint.