Created
April 18, 2016 22:03
-
-
Save JamesGardiner/0c2f5c884453664ec672cdd557b84fc3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "# Reading databases in Pandas" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "Here is a simple representation of how to read into a dataframe a (large) PostgreSQL database." | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "import json\nimport pandas as pd\nimport psycopg2", | |
"execution_count": 1, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "# Read in configuration options\n# Assumes a config.json file in the same\n# directory with minimum of host, database,\n# user, passw, schema and table key: value pairs\nwith open('config.json') as f:\n conf = json.load(f)", | |
"execution_count": 2, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "# Connection string with required parameters\nconn_str = \"host={} dbname={} user={} password={}\".format(conf['host'],\n conf['database'],\n conf['user'],\n conf['passw']\n )", | |
"execution_count": 3, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "# Get a connection object\nconn = psycopg2.connect(conn_str)", | |
"execution_count": 4, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "# This might fail if the DB is large and exceeds your machine's memory limitations\n# when being read in\ndf = pd.read_sql('select * from {}.{}'.format(conf['schema'],\n conf['table']),\n con=conn)", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "# A better approach when dealing with large databases\n# or a machine with low memory is to create a generator\n# that iterates over a table x rows at a time\ndf = pd.DataFrame()\n\n# Iterating is far better for large databases\nfor chunk in pd.read_sql('select * from {}.{}'.format(conf['schema'],\n conf['table']),\n con=conn,\n chunksize=5000):\n df = df.append(chunk)", | |
"execution_count": 5, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "# check it works\ndf.head()", | |
"execution_count": 6, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": "<div>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>pkey</th>\n <th>created</th>\n <th>first_name</th>\n <th>href</th>\n <th>id</th>\n <th>links</th>\n <th>surname</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>47024</td>\n <td>2016-03-31 19:16:25</td>\n <td>Candice Coker</td>\n <td>http://gtr.rcuk.ac.uk:80/gtr/api/persons/181FC...</td>\n <td>181FC03A-FB8E-4C3D-8952-1D0DA0902AB3</td>\n <td>{'link': [{'href': 'http://gtr.rcuk.ac.uk:80/g...</td>\n <td>Morey</td>\n </tr>\n <tr>\n <th>1</th>\n <td>47025</td>\n <td>2016-03-31 19:16:25</td>\n <td>Jaroslaw</td>\n <td>http://gtr.rcuk.ac.uk:80/gtr/api/persons/1FAD8...</td>\n <td>1FAD8549-7B84-4B12-9D6C-F6F2B9A85481</td>\n <td>{'link': [{'href': 'http://gtr.rcuk.ac.uk:80/g...</td>\n <td>Nowak</td>\n </tr>\n <tr>\n <th>2</th>\n <td>47026</td>\n <td>2016-03-31 19:16:25</td>\n <td>Derrick</td>\n <td>http://gtr.rcuk.ac.uk:80/gtr/api/persons/20FEB...</td>\n <td>20FEBA57-1959-40ED-927C-AC919542D3C9</td>\n <td>{'link': [{'href': 'http://gtr.rcuk.ac.uk:80/g...</td>\n <td>Bennett</td>\n </tr>\n <tr>\n <th>3</th>\n <td>47027</td>\n <td>2016-03-31 19:16:25</td>\n <td>Ross</td>\n <td>http://gtr.rcuk.ac.uk:80/gtr/api/persons/14AAC...</td>\n <td>14AACF47-BEE8-41E9-91A0-ADBE5B528AAA</td>\n <td>{'link': [{'href': 'http://gtr.rcuk.ac.uk:80/g...</td>\n <td>Forgan</td>\n </tr>\n <tr>\n <th>4</th>\n <td>47028</td>\n <td>2016-03-31 19:16:25</td>\n <td>Roxanne</td>\n <td>http://gtr.rcuk.ac.uk:80/gtr/api/persons/159AD...</td>\n <td>159ADF8A-1013-4C2E-BB96-93CDA266289C</td>\n <td>{'link': [{'href': 'http://gtr.rcuk.ac.uk:80/g...</td>\n <td>Guenette</td>\n </tr>\n </tbody>\n</table>\n</div>", | |
"text/plain": " pkey created first_name \\\n0 47024 2016-03-31 19:16:25 Candice Coker \n1 47025 2016-03-31 19:16:25 Jaroslaw \n2 47026 2016-03-31 19:16:25 Derrick \n3 47027 2016-03-31 19:16:25 Ross \n4 47028 2016-03-31 19:16:25 Roxanne \n\n href \\\n0 http://gtr.rcuk.ac.uk:80/gtr/api/persons/181FC... \n1 http://gtr.rcuk.ac.uk:80/gtr/api/persons/1FAD8... \n2 http://gtr.rcuk.ac.uk:80/gtr/api/persons/20FEB... \n3 http://gtr.rcuk.ac.uk:80/gtr/api/persons/14AAC... \n4 http://gtr.rcuk.ac.uk:80/gtr/api/persons/159AD... \n\n id \\\n0 181FC03A-FB8E-4C3D-8952-1D0DA0902AB3 \n1 1FAD8549-7B84-4B12-9D6C-F6F2B9A85481 \n2 20FEBA57-1959-40ED-927C-AC919542D3C9 \n3 14AACF47-BEE8-41E9-91A0-ADBE5B528AAA \n4 159ADF8A-1013-4C2E-BB96-93CDA266289C \n\n links surname \n0 {'link': [{'href': 'http://gtr.rcuk.ac.uk:80/g... Morey \n1 {'link': [{'href': 'http://gtr.rcuk.ac.uk:80/g... Nowak \n2 {'link': [{'href': 'http://gtr.rcuk.ac.uk:80/g... Bennett \n3 {'link': [{'href': 'http://gtr.rcuk.ac.uk:80/g... Forgan \n4 {'link': [{'href': 'http://gtr.rcuk.ac.uk:80/g... Guenette " | |
}, | |
"metadata": {}, | |
"execution_count": 6 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "df.count()", | |
"execution_count": 8, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": "pkey 51618\ncreated 51618\nfirst_name 51618\nhref 51618\nid 51618\nlinks 51618\nsurname 51618\ndtype: int64" | |
}, | |
"metadata": {}, | |
"execution_count": 8 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "", | |
"execution_count": null, | |
"outputs": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3", | |
"language": "python" | |
}, | |
"language_info": { | |
"mimetype": "text/x-python", | |
"nbconvert_exporter": "python", | |
"name": "python", | |
"file_extension": ".py", | |
"version": "3.5.1", | |
"pygments_lexer": "ipython3", | |
"codemirror_mode": { | |
"version": 3, | |
"name": "ipython" | |
} | |
}, | |
"gist": { | |
"id": "", | |
"data": { | |
"description": "read_sql_pandas.ipynb", | |
"public": true | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment