Created
May 19, 2015 04:30
-
-
Save ericbusboom/b53cc98d3b24b66c216f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "# San Diego Development OpenDSD Analysis" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "Basic analysis of the data scrpaed from the OpenDSD API. \n\n[Issue 215](http://redmine.civicknowledge.com/issues/215)" | |
}, | |
{ | |
"metadata": { | |
"collapsed": false, | |
"trusted": false | |
}, | |
"cell_type": "code", | |
"source": "import pandas as pd\nimport sqlite3\nimport matplotlib.pylab as plt\nimport numpy as np\nimport os\nfrom lib import download_ambry_db\n%matplotlib inline\n", | |
"execution_count": 1, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"collapsed": false, | |
"trusted": false | |
}, | |
"cell_type": "code", | |
"source": "download_ambry_db(\"http://s3.sandiegodata.org/library/sandiego.gov/opendsd-json-0.0.1/json/projects.db\", \"projects.db\") \ndownload_ambry_db(\"http://s3.sandiegodata.org/library/sandiego.gov/opendsd-json-0.0.1/json/approvals.db\", \"approvals.db\") ", | |
"execution_count": 3, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": "Already downloaded: http://s3.sandiegodata.org/library/sandiego.gov/opendsd-json-0.0.1/json/projects.db\nAlready extracted: projects.db\nDownloading: http://s3.sandiegodata.org/library/sandiego.gov/opendsd-json-0.0.1/json/approvals.db\nExtracting to: approvals.db\n" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"collapsed": false, | |
"trusted": false | |
}, | |
"cell_type": "code", | |
"source": "# COnnect to the downloaded database and get a cursor so we can run queries. \ncon = sqlite3.connect(\"projects.db\")\ncur = con.cursor()", | |
"execution_count": 57, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"collapsed": false, | |
"trusted": false | |
}, | |
"cell_type": "code", | |
"source": "# Get an example work to look for the data structure. \nimport json\nrow = cur.execute('SELECT data FROM json LIMIT 1').fetchone()\nd = json.loads(str(row[0]).decode('zlib'))\nd.keys()", | |
"execution_count": 58, | |
"outputs": [ | |
{ | |
"execution_count": 58, | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": "[u'ApplicationExpiration',\n u'Customers',\n u'ProjectManagerId',\n u'Jobs',\n u'AccountNum',\n u'Title',\n u'Invoices',\n u'ProjectId',\n u'JobOrderNum',\n u'ApplicationExpired',\n u'Header',\n u'DevelopmentId',\n u'AdminHold',\n u'ReviewCycles',\n u'Fees',\n u'Scope',\n u'ProjectManager',\n u'ApplicationDate',\n u'DevelopmentTitle']" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"collapsed": false, | |
"trusted": false | |
}, | |
"cell_type": "code", | |
"source": "d['Jobs']", | |
"execution_count": 61, | |
"outputs": [ | |
{ | |
"execution_count": 61, | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": "[{u'APN': u'438-252-19-00',\n u'ApprovalInfo': [],\n u'Approvals': [{u'ApprovalId': 2529,\n u'CompleteCancelDate': None,\n u'Depiction': u'1',\n u'FirstInspectionDate': None,\n u'IssueDate': None,\n u'IssuedBy': u'',\n u'JobId': 1574,\n u'NetChangeDU': u'',\n u'PermitHolder': u' ',\n u'Scope': u'Greater North Park Tentative Map Waiver to convert four residential units to condominiums at 4714 Oregon Street in the MR-3000 zone of Mid City Communities Plan District in the Greater North Park Community Plan. Council District 3. Notice Cards=1.',\n u'SquareFootage': None,\n u'Status': u'Created',\n u'Type': u'Map Waiver',\n u'Valuation': u''}],\n u'Description': u'4714 OREGON ST ',\n u'JobFeesSubTotal': None,\n u'JobId': 1574,\n u'Latitude': 32.763291,\n u'Longitude': -117.135504,\n u'MapReference': u'1269-D3',\n u'NAD83Easting': None,\n u'NAD83Northing': None,\n u'ProjectId': 1410,\n u'SignOffs': [{u'DisciplineDescription': u'LDR-Engineering Review',\n u'DisciplineId': 109,\n u'SignedDate': u'Not Signed-Off'},\n {u'DisciplineDescription': u'LDR-Map Check',\n u'DisciplineId': 111,\n u'SignedDate': u'Not Signed-Off'}],\n u'SortableStreetAddress': u'OREGON ST 0000004714 ',\n u'StreetAddress': u'4714 OREGON ST '}]" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"collapsed": false, | |
"trusted": false | |
}, | |
"cell_type": "code", | |
"source": "# Now we can iterate over the data structure and do something interesting. \nfor row in cur.execute('SELECT data FROM json LIMIT 10'):\n d = json.loads(str(row[0]).decode('zlib'))\n print d['DevelopmentId'], d['Fees']\n\n \n ", | |
"execution_count": 60, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": "1001 []\n1040 [{u'Category': u'Plan Check Fees', u'InvoiceId': 107465, u'Description': u'Deposit Account', u'ProjectId': 1040, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 3000, u'QuantityPaid': 3000, u'FeeId': 1656, u'Unit': u'Dollars'}]\n1041 [{u'Category': u'Plan Check Fees', u'InvoiceId': 107464, u'Description': u'Deposit Account', u'ProjectId': 1041, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 1700, u'QuantityPaid': 1700, u'FeeId': 1656, u'Unit': u'Dollars'}]\n1096 [{u'Category': u'Plan Check Fees', u'InvoiceId': 138746, u'Description': u'Deposit Account', u'ProjectId': 1096, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 5500, u'QuantityPaid': 5500, u'FeeId': 1656, u'Unit': u'Dollars'}]\n1102 [{u'Category': u'Plan Check Fees', u'InvoiceId': 78488, u'Description': u'Deposit Account', u'ProjectId': 1102, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 4490, u'QuantityPaid': 4490, u'FeeId': 1656, u'Unit': u'Dollars'}]\n1163 [{u'Category': u'Plan Check Fees', u'InvoiceId': 79269, u'Description': u'Deposit Acct (Pre 8-2-03)', u'ProjectId': 1164, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 2500, u'QuantityPaid': 2500, u'FeeId': 1317, u'Unit': u'Each'}]\n1110 []\n1282 [{u'Category': u'Plan Check Fees', u'InvoiceId': 22932, u'Description': u'Deposit Account', u'ProjectId': 1287, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 10661, u'QuantityPaid': 10661, u'FeeId': 1656, u'Unit': u'Dollars'}]\n1389 [{u'Category': u'Plan Check Fees', u'InvoiceId': 216989, u'Description': u'Deposit Account', u'ProjectId': 1395, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 15467, u'QuantityPaid': 15467, u'FeeId': 1656, u'Unit': u'Dollars'}]\n1404 []\n" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"collapsed": false, | |
"trusted": false | |
}, | |
"cell_type": "code", | |
"source": "# create a generator to iterate over a particular type of object\ndef gen_fees(cur):\n\n for row in cur.execute('SELECT data FROM json'):\n d = json.loads(str(row[0]).decode('zlib'))\n if 'Fees' in d and d['Fees']:\n for fee in d['Fees']:\n yield fee\n \n cur.close()\n con.close()", | |
"execution_count": 59, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"collapsed": false, | |
"trusted": false | |
}, | |
"cell_type": "code", | |
"source": " \n# Example of looping over the generator\nfor i, row in enumerate(gen_fees(cur)):\n if i > 10: break\n \n print row\n ", | |
"execution_count": 44, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": "{u'Category': u'Plan Check Fees', u'InvoiceId': 107465, u'Description': u'Deposit Account', u'ProjectId': 1040, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 3000, u'QuantityPaid': 3000, u'FeeId': 1656, u'Unit': u'Dollars'}\n{u'Category': u'Plan Check Fees', u'InvoiceId': 107464, u'Description': u'Deposit Account', u'ProjectId': 1041, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 1700, u'QuantityPaid': 1700, u'FeeId': 1656, u'Unit': u'Dollars'}\n{u'Category': u'Plan Check Fees', u'InvoiceId': 138746, u'Description': u'Deposit Account', u'ProjectId': 1096, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 5500, u'QuantityPaid': 5500, u'FeeId': 1656, u'Unit': u'Dollars'}\n{u'Category': u'Plan Check Fees', u'InvoiceId': 78488, u'Description': u'Deposit Account', u'ProjectId': 1102, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 4490, u'QuantityPaid': 4490, u'FeeId': 1656, u'Unit': u'Dollars'}\n{u'Category': u'Plan Check Fees', u'InvoiceId': 79269, u'Description': u'Deposit Acct (Pre 8-2-03)', u'ProjectId': 1164, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 2500, u'QuantityPaid': 2500, u'FeeId': 1317, u'Unit': u'Each'}\n{u'Category': u'Plan Check Fees', u'InvoiceId': 22932, u'Description': u'Deposit Account', u'ProjectId': 1287, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 10661, u'QuantityPaid': 10661, u'FeeId': 1656, u'Unit': u'Dollars'}\n{u'Category': u'Plan Check Fees', u'InvoiceId': 216989, u'Description': u'Deposit Account', u'ProjectId': 1395, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 15467, u'QuantityPaid': 15467, u'FeeId': 1656, u'Unit': u'Dollars'}\n{u'Category': u'Plan Check Fees', u'InvoiceId': 113664, u'Description': u'Deposit Account', u'ProjectId': 1606, u'InvoiceStatus': u'Requires Invoice', u'QuantityRequired': 10059, u'QuantityPaid': 3499, u'FeeId': 1656, u'Unit': u'Dollars'}\n{u'Category': u'Plan Check Fees', u'InvoiceId': 235014, u'Description': u'Deposit Account Pre 8-2-2003', u'ProjectId': 1704, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 11000, u'QuantityPaid': 11000, u'FeeId': 2955, u'Unit': u'Dollars'}\n{u'Category': u'Plan Check Fees', u'InvoiceId': 222608, u'Description': u'Deposit Account Pre 8-2-2003', u'ProjectId': 1706, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 7000, u'QuantityPaid': 7000, u'FeeId': 2955, u'Unit': u'Dollars'}\n{u'Category': u'Plan Check Fees', u'InvoiceId': 255428, u'Description': u'Deposit Account Pre 8-2-2003', u'ProjectId': 1707, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 22500, u'QuantityPaid': 22500, u'FeeId': 2955, u'Unit': u'Dollars'}\n" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"collapsed": false, | |
"trusted": false | |
}, | |
"cell_type": "code", | |
"source": "# Then, we can create a dataframe from a list comprehension\n# FOr Fees, this is 287K records, so this will be slow. \ndf = pd.DataFrame(gen_fees(cur))\n \n", | |
"execution_count": 47, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"collapsed": false, | |
"trusted": false | |
}, | |
"cell_type": "code", | |
"source": "df.head()", | |
"execution_count": 49, | |
"outputs": [ | |
{ | |
"execution_count": 49, | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": " Category Description FeeId InvoiceId InvoiceStatus \\\n0 Plan Check Fees Deposit Account 1656 107465 Paid on \n1 Plan Check Fees Deposit Account 1656 107464 Paid on \n2 Plan Check Fees Deposit Account 1656 138746 Paid on \n3 Plan Check Fees Deposit Account 1656 78488 Paid on \n4 Plan Check Fees Deposit Acct (Pre 8-2-03) 1317 79269 Paid on \n\n ProjectId QuantityPaid QuantityRequired Unit \n0 1040 3000 3000 Dollars \n1 1041 1700 1700 Dollars \n2 1096 5500 5500 Dollars \n3 1102 4490 4490 Dollars \n4 1164 2500 2500 Each ", | |
"text/html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Category</th>\n <th>Description</th>\n <th>FeeId</th>\n <th>InvoiceId</th>\n <th>InvoiceStatus</th>\n <th>ProjectId</th>\n <th>QuantityPaid</th>\n <th>QuantityRequired</th>\n <th>Unit</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td> Plan Check Fees</td>\n <td> Deposit Account</td>\n <td> 1656</td>\n <td> 107465</td>\n <td> Paid on </td>\n <td> 1040</td>\n <td> 3000</td>\n <td> 3000</td>\n <td> Dollars</td>\n </tr>\n <tr>\n <th>1</th>\n <td> Plan Check Fees</td>\n <td> Deposit Account</td>\n <td> 1656</td>\n <td> 107464</td>\n <td> Paid on </td>\n <td> 1041</td>\n <td> 1700</td>\n <td> 1700</td>\n <td> Dollars</td>\n </tr>\n <tr>\n <th>2</th>\n <td> Plan Check Fees</td>\n <td> Deposit Account</td>\n <td> 1656</td>\n <td> 138746</td>\n <td> Paid on </td>\n <td> 1096</td>\n <td> 5500</td>\n <td> 5500</td>\n <td> Dollars</td>\n </tr>\n <tr>\n <th>3</th>\n <td> Plan Check Fees</td>\n <td> Deposit Account</td>\n <td> 1656</td>\n <td> 78488</td>\n <td> Paid on </td>\n <td> 1102</td>\n <td> 4490</td>\n <td> 4490</td>\n <td> Dollars</td>\n </tr>\n <tr>\n <th>4</th>\n <td> Plan Check Fees</td>\n <td> Deposit Acct (Pre 8-2-03)</td>\n <td> 1317</td>\n <td> 79269</td>\n <td> Paid on </td>\n <td> 1164</td>\n <td> 2500</td>\n <td> 2500</td>\n <td> Each</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"collapsed": false, | |
"trusted": false | |
}, | |
"cell_type": "code", | |
"source": "df.groupby('FeeId').sum().head()", | |
"execution_count": 53, | |
"outputs": [ | |
{ | |
"execution_count": 53, | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": " InvoiceId ProjectId QuantityPaid QuantityRequired\nFeeId \n698 519810 1143739 109 308\n1280 3759 6970 1 1\n1281 6520 16355 2 2\n1317 6341947 1253552 1214860 1464054\n1400 10748236 5025918 444 443", | |
"text/html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>InvoiceId</th>\n <th>ProjectId</th>\n <th>QuantityPaid</th>\n <th>QuantityRequired</th>\n </tr>\n <tr>\n <th>FeeId</th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>698 </th>\n <td> 519810</td>\n <td> 1143739</td>\n <td> 109</td>\n <td> 308</td>\n </tr>\n <tr>\n <th>1280</th>\n <td> 3759</td>\n <td> 6970</td>\n <td> 1</td>\n <td> 1</td>\n </tr>\n <tr>\n <th>1281</th>\n <td> 6520</td>\n <td> 16355</td>\n <td> 2</td>\n <td> 2</td>\n </tr>\n <tr>\n <th>1317</th>\n <td> 6341947</td>\n <td> 1253552</td>\n <td> 1214860</td>\n <td> 1464054</td>\n </tr>\n <tr>\n <th>1400</th>\n <td> 10748236</td>\n <td> 5025918</td>\n <td> 444</td>\n <td> 443</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"collapsed": false, | |
"trusted": false | |
}, | |
"cell_type": "code", | |
"source": "", | |
"execution_count": 55, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"collapsed": false, | |
"trusted": false | |
}, | |
"cell_type": "code", | |
"source": "", | |
"execution_count": null, | |
"outputs": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"name": "python2", | |
"display_name": "Python 2", | |
"language": "python" | |
}, | |
"language_info": { | |
"mimetype": "text/x-python", | |
"nbconvert_exporter": "python", | |
"name": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.6", | |
"file_extension": ".py", | |
"codemirror_mode": { | |
"version": 2, | |
"name": "ipython" | |
} | |
}, | |
"gist_id": "b53cc98d3b24b66c216f" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment