Created
October 4, 2013 19:31
-
-
Save lebedov/6831387 to your computer and use it in GitHub Desktop.
Create generator for reading consecutive chunks of data from a database table using pandas.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| """ | |
| Create generator for reading consecutive chunks of data from a database table using pandas. | |
| """ | |
| import pandas.io.sql | |
| import sqlite3 | |
| def read_db(conn, table, names=None, chunksize=None): | |
| """ | |
| Return generator for reading consecutive chunks of data from a table as | |
| DataFrames. | |
| Parameters | |
| ---------- | |
| conn : DB connection object | |
| table : str | |
| Table name. | |
| names : list of str | |
| Column names to read. | |
| chunksize : int | |
| Number of rows to return in each call to the generator. | |
| """ | |
| # Select all columns by default: | |
| if names is None: | |
| names_str = '*' | |
| else: | |
| names_str = ','.join(names) | |
| # Return all rows in table in first chunk by default: | |
| if chunksize is None: | |
| query = 'select %s from %s' % (names_str, table) | |
| done = False | |
| while True: | |
| if not done: | |
| result = pandas.io.sql.read_frame(query, conn) | |
| done = True | |
| yield result | |
| else: | |
| raise StopIteration | |
| else: | |
| offset = 0 | |
| while True: | |
| query = 'select %s from %s limit %i offset %i' % (names_str, table, chunksize, offset) | |
| result = pandas.io.sql.read_frame(query, conn) | |
| result.index = range(offset, offset+len(result)) | |
| if len(result): | |
| yield result | |
| else: | |
| raise StopIteration | |
| offset += chunksize | |
| if __name__ == '__main__': | |
| # Create a test database: | |
| conn = sqlite3.connect(':memory:') | |
| conn.execute("create table test (label str, val int)") | |
| data = [('a', 1), | |
| ('a', 2), | |
| ('b', 9), | |
| ('b', 10), | |
| ('b', 11), | |
| ('c', 12), | |
| ('c', 13), | |
| ('c', 14)] | |
| conn.executemany('insert into test values (?, ?)', data) | |
| conn.commit() | |
| # Extract all data in one chunk: | |
| rd = read_db(conn, 'test') | |
| for x in rd: | |
| print x | |
| # Extract data in several chunks: | |
| rd = read_db(conn, 'test', '*', 3) | |
| for x in rd: | |
| print x |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment