Created
November 5, 2015 08:08
-
-
Save fabacab/fb62f19aa1d73b766dbc to your computer and use it in GitHub Desktop.
Convert a set of similarly-structured .xlsx files into a SQLite DB (using openpyxl and sqlite3).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# | |
# Convert a set of similarly-structured .xlsx files into a SQLite DB. | |
# | |
# For example, say you have hundreds of Excel files in a directory | |
# called "big-analysis-project" and that each of these Excel files | |
# has a worksheet containing the same set of columns. Rather than | |
# having hundreds of separate Excel files, it would be handy to have | |
# all their data inside one relational database management system. | |
# | |
# Pass this script the path to the directory containing said files, | |
# and this script will create a SQLite database file in the current | |
# directory. | |
# | |
# From there, most operations are a breeze. For instance, looking at | |
# only the most recent timestamped entry for each user in the dataset: | |
# | |
# SELECT MAX(Last_Updated) AS Latest_Updated, Column_1, Column_2 FROM records GROUP BY Column_2 | |
# | |
import sys, glob, openpyxl, sqlite3 | |
def main(): | |
# TODO: Replace this with getopt. | |
if 2 == len(sys.argv) and '--help' == sys.argv[1]: | |
print('USAGE:') | |
print(sys.argv[0], '[directory]') | |
# Choose a directory. | |
try: | |
where = sys.argv[1] | |
except: | |
where = '.' | |
# Create a SQLite database for the results. | |
db_filename = 'excel-data-database.db' # TODO: Set this from user input. | |
conn = sqlite3.connect(db_filename) | |
db_fields = [] | |
i = 0 | |
for filepath in glob.glob(where + '/*.xlsx'): | |
print('Loading workbook from ' + filepath, file=sys.stderr) | |
try: | |
wb = openpyxl.load_workbook(filepath) | |
except openpyxl.formula.tokenizer.TokenizerError as e: | |
print(e, file=sys.stderr) | |
continue | |
ws = wb.active | |
if 0 == i: # if the first file, create the SQL table | |
headers = [] | |
for cell in ws.rows[0]: | |
t = { | |
ord(' '): '_', | |
ord('"'): '', | |
ord('?'): '', | |
ord('/'): '', | |
ord('('): '', | |
ord(')'): '' | |
} | |
headers.append(cell.value.translate(t)) | |
db_fields = headers | |
try: | |
conn.execute('CREATE TABLE records (' + ','.join(headers) + ')') | |
except sqlite3.Error as e: | |
print(e, file=sys.stderr) | |
for row in ws.get_squared_range(ws.min_column, ws.min_row + 1, ws.max_column, ws.max_row): | |
placeholders = [] | |
vals = [] | |
for cell in row: | |
placeholders.append('?') | |
vals.append(cell.value) | |
sql = 'INSERT INTO records (' + ','.join(db_fields) + ') VALUES (' + ','.join(placeholders[:len(db_fields)]) + ')' | |
try: | |
conn.execute(sql, tuple(vals[:len(db_fields)])) # make sure only db_fields number of columns | |
except sqlite3.InterfaceError as e: | |
print(e, file=sys.stderr) | |
conn.commit() | |
i = i + 1 | |
if __name__ == '__main__': | |
main() |
I get this error
Traceback (most recent call last):
File "xl2sqlite.py", line 83, in
main()
File "xl2sqlite.py", line 51, in main
for cell in ws.rows[0]:
TypeError: 'generator' object is not subscriptable
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Quite useful, thanks