Created
November 10, 2014 23:02
-
-
Save odubno/df935834e63433352f33 to your computer and use it in GitHub Desktop.
Data Science Class Nov 5
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:25f20ca844a744ab43182ecdd4c764e04014850c441d10e665d2f1060769af46" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"\n", | |
"%matplotlib inline" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 199 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"customers = pd.read_csv('https://raw.githubusercontent.com/TeachingDataScience/data-science-course/forstudentviewing/16_databases/data/customers.csv', header=None)\n", | |
"orders = pd.read_csv('https://raw.githubusercontent.com/TeachingDataScience/data-science-course/forstudentviewing/16_databases/data/orders.csv', header=None)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 200 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"List the first five customers in region 2." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"customers.head()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>0</th>\n", | |
" <th>1</th>\n", | |
" <th>2</th>\n", | |
" <th>3</th>\n", | |
" <th>4</th>\n", | |
" <th>5</th>\n", | |
" <th>6</th>\n", | |
" <th>7</th>\n", | |
" <th>8</th>\n", | |
" <th>9</th>\n", | |
" <th>10</th>\n", | |
" <th>11</th>\n", | |
" <th>12</th>\n", | |
" <th>13</th>\n", | |
" <th>14</th>\n", | |
" <th>15</th>\n", | |
" <th>16</th>\n", | |
" <th>17</th>\n", | |
" <th>18</th>\n", | |
" <th>19</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 1</td>\n", | |
" <td> VKUUXF</td>\n", | |
" <td> ITHOMQJNYX</td>\n", | |
" <td> 4608499546 Dell Way</td>\n", | |
" <td>NaN</td>\n", | |
" <td> QSDPAGD</td>\n", | |
" <td> SD</td>\n", | |
" <td> 24101</td>\n", | |
" <td> US</td>\n", | |
" <td> 1</td>\n", | |
" <td> [email protected]</td>\n", | |
" <td> 4608499546</td>\n", | |
" <td> 1</td>\n", | |
" <td> 1979279217775911</td>\n", | |
" <td> 2012/03</td>\n", | |
" <td> user1</td>\n", | |
" <td> password</td>\n", | |
" <td> 55</td>\n", | |
" <td> 100000</td>\n", | |
" <td> M</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 2</td>\n", | |
" <td> HQNMZH</td>\n", | |
" <td> UNUKXHJVXB</td>\n", | |
" <td> 5119315633 Dell Way</td>\n", | |
" <td>NaN</td>\n", | |
" <td> YNCERXJ</td>\n", | |
" <td> AZ</td>\n", | |
" <td> 11802</td>\n", | |
" <td> US</td>\n", | |
" <td> 1</td>\n", | |
" <td> [email protected]</td>\n", | |
" <td> 5119315633</td>\n", | |
" <td> 1</td>\n", | |
" <td> 3144519586581737</td>\n", | |
" <td> 2012/11</td>\n", | |
" <td> user2</td>\n", | |
" <td> password</td>\n", | |
" <td> 80</td>\n", | |
" <td> 40000</td>\n", | |
" <td> M</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 3</td>\n", | |
" <td> JTNRNB</td>\n", | |
" <td> LYYSHTQJRE</td>\n", | |
" <td> 6297761196 Dell Way</td>\n", | |
" <td>NaN</td>\n", | |
" <td> LWVIFXJ</td>\n", | |
" <td> OH</td>\n", | |
" <td> 96082</td>\n", | |
" <td> US</td>\n", | |
" <td> 1</td>\n", | |
" <td> [email protected]</td>\n", | |
" <td> 6297761196</td>\n", | |
" <td> 4</td>\n", | |
" <td> 8728086929768325</td>\n", | |
" <td> 2010/12</td>\n", | |
" <td> user3</td>\n", | |
" <td> password</td>\n", | |
" <td> 47</td>\n", | |
" <td> 100000</td>\n", | |
" <td> M</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 4</td>\n", | |
" <td> XMFYXD</td>\n", | |
" <td> WQLQHUHLFE</td>\n", | |
" <td> 9862764981 Dell Way</td>\n", | |
" <td>NaN</td>\n", | |
" <td> HOKEXCD</td>\n", | |
" <td> MS</td>\n", | |
" <td> 78442</td>\n", | |
" <td> US</td>\n", | |
" <td> 1</td>\n", | |
" <td> [email protected]</td>\n", | |
" <td> 9862764981</td>\n", | |
" <td> 5</td>\n", | |
" <td> 7160005148965866</td>\n", | |
" <td> 2009/09</td>\n", | |
" <td> user4</td>\n", | |
" <td> password</td>\n", | |
" <td> 44</td>\n", | |
" <td> 40000</td>\n", | |
" <td> F</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> 5</td>\n", | |
" <td> PGDTDU</td>\n", | |
" <td> ETBYBNEGUT</td>\n", | |
" <td> 2841895775 Dell Way</td>\n", | |
" <td>NaN</td>\n", | |
" <td> RZQTCDN</td>\n", | |
" <td> AZ</td>\n", | |
" <td> 16291</td>\n", | |
" <td> US</td>\n", | |
" <td> 1</td>\n", | |
" <td> [email protected]</td>\n", | |
" <td> 2841895775</td>\n", | |
" <td> 3</td>\n", | |
" <td> 8377095518168063</td>\n", | |
" <td> 2010/10</td>\n", | |
" <td> user5</td>\n", | |
" <td> password</td>\n", | |
" <td> 21</td>\n", | |
" <td> 20000</td>\n", | |
" <td> M</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 201, | |
"text": [ | |
" 0 1 2 3 4 5 6 7 8 \\\n", | |
"0 1 VKUUXF ITHOMQJNYX 4608499546 Dell Way NaN QSDPAGD SD 24101 US \n", | |
"1 2 HQNMZH UNUKXHJVXB 5119315633 Dell Way NaN YNCERXJ AZ 11802 US \n", | |
"2 3 JTNRNB LYYSHTQJRE 6297761196 Dell Way NaN LWVIFXJ OH 96082 US \n", | |
"3 4 XMFYXD WQLQHUHLFE 9862764981 Dell Way NaN HOKEXCD MS 78442 US \n", | |
"4 5 PGDTDU ETBYBNEGUT 2841895775 Dell Way NaN RZQTCDN AZ 16291 US \n", | |
"\n", | |
" 9 10 11 12 13 14 15 \\\n", | |
"0 1 [email protected] 4608499546 1 1979279217775911 2012/03 user1 \n", | |
"1 1 [email protected] 5119315633 1 3144519586581737 2012/11 user2 \n", | |
"2 1 [email protected] 6297761196 4 8728086929768325 2010/12 user3 \n", | |
"3 1 [email protected] 9862764981 5 7160005148965866 2009/09 user4 \n", | |
"4 1 [email protected] 2841895775 3 8377095518168063 2010/10 user5 \n", | |
"\n", | |
" 16 17 18 19 \n", | |
"0 password 55 100000 M \n", | |
"1 password 80 40000 M \n", | |
"2 password 47 100000 M \n", | |
"3 password 44 40000 F \n", | |
"4 password 21 20000 M " | |
] | |
} | |
], | |
"prompt_number": 201 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Count the number of customers by State" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"customers.columns = ['index','id','id2', 'product','NaN', 'id3', 'state','zipcode', 'country','count', 'email','id5','id6','id7','date','user','password','age','amount','sex']" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 202 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"customers['customer_count'] = 1" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 203 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"sort_customers = customers[['state','customer_count','age','email']]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 204 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"sort_customers_sum = customers[['state','customer_count']]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 205 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"sort_customers_sum = sort_customers_sum.groupby('state').agg([sum])" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 206 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"sort_customers_sum.head()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" <th></th>\n", | |
" <th>customer_count</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th></th>\n", | |
" <th>sum</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>state</th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>AK</th>\n", | |
" <td> 210</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>AL</th>\n", | |
" <td> 195</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>AR</th>\n", | |
" <td> 190</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>AZ</th>\n", | |
" <td> 177</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>CA</th>\n", | |
" <td> 186</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 207, | |
"text": [ | |
" customer_count\n", | |
" sum\n", | |
"state \n", | |
"AK 210\n", | |
"AL 195\n", | |
"AR 190\n", | |
"AZ 177\n", | |
"CA 186" | |
] | |
} | |
], | |
"prompt_number": 207 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Return the number of customers that have a first name that start with 'Z'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"sort_by_name = sort_customers.sort(['email'], ascending=False)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 208 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"sort_by_name.head()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>state</th>\n", | |
" <th>customer_count</th>\n", | |
" <th>age</th>\n", | |
" <th>email</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>17647</th>\n", | |
" <td> NaN</td>\n", | |
" <td> 1</td>\n", | |
" <td> 70</td>\n", | |
" <td> [email protected]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12976</th>\n", | |
" <td> NaN</td>\n", | |
" <td> 1</td>\n", | |
" <td> 36</td>\n", | |
" <td> [email protected]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6641 </th>\n", | |
" <td> ND</td>\n", | |
" <td> 1</td>\n", | |
" <td> 61</td>\n", | |
" <td> [email protected]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6484 </th>\n", | |
" <td> SD</td>\n", | |
" <td> 1</td>\n", | |
" <td> 65</td>\n", | |
" <td> [email protected]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17522</th>\n", | |
" <td> NaN</td>\n", | |
" <td> 1</td>\n", | |
" <td> 45</td>\n", | |
" <td> [email protected]</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 209, | |
"text": [ | |
" state customer_count age email\n", | |
"17647 NaN 1 70 [email protected]\n", | |
"12976 NaN 1 36 [email protected]\n", | |
"6641 ND 1 61 [email protected]\n", | |
"6484 SD 1 65 [email protected]\n", | |
"17522 NaN 1 45 [email protected]" | |
] | |
} | |
], | |
"prompt_number": 209 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"sort_customers.age.describe()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 211, | |
"text": [ | |
"count 20000.000000\n", | |
"mean 53.883150\n", | |
"std 21.012974\n", | |
"min 18.000000\n", | |
"25% 35.000000\n", | |
"50% 54.000000\n", | |
"75% 72.000000\n", | |
"max 90.000000\n", | |
"dtype: float64" | |
] | |
} | |
], | |
"prompt_number": 211 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#SQL" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"a special-purpose programming language designed for managing data held in a relational database management system (RDBMS), or for stream processing in a relational data stream management system (RDSMS).\n", | |
"\n", | |
"Databases are a structured data source optimized for \n", | |
"efficient retrieval and storage." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"###How does a database work?\n", | |
"\n", | |
"Databases are a structured data source optimized for \n", | |
"efficient retrieval and storage." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"multiple tables\n", | |
"tables have keys\n", | |
"\n", | |
"relation databases(SQL) and non relational (NoSQL)\n", | |
"\n", | |
"SQL: a language (Syntax Structure Language) primary language thats build around all of these different databases that have existed in the last 30 years. \n", | |
"\n", | |
"what are some of the main databases that are relational: SQL server (Microsoft); Oracle (large company that owns the largest comonly used databases Oracle; Oracle ows MySQL (origally developed by Sun)); PostgresQL (originally a non relational; people eventually added SQL to it, something that you could iterate through).\n", | |
"\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"###Relational Databases\n", | |
"\n", | |
"lots of tables (schema): hold info about particular things, built around primary keys and each row will be unique\n", | |
"\n", | |
"other tables you could merge will have foreign keys (there will be IDs and you could use them to join them to other IDs)\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"structured: we have to pre-define organization strategy \n", | |
"\n", | |
"retrieval: the ability to read data out \n", | |
"\n", | |
"storage: the ability to write data and save it" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"In a relational database we will see a few other tables where we're not duplicating tables, but what we're looking to output is this blackfile that is a deorganized file " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"When we normalize tables, seperate tables that have different info, we make a \"join\". When we put tables together in SQL we use \"join\". Pandas also has this functionality." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"The more tables you have to put together, the system says that's more and more work. For all the tables to join together it will take longer on a select statement. " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"###Database crashes. what do you do?\n", | |
"\n", | |
"cache. however, if one user pushes a function that another user would push later, it'll get rejected. cache rejects duplicates.\n", | |
"\n", | |
"indexing (id/key): where/filer \n", | |
"\n", | |
"temp tables\n", | |
"\n", | |
"###etl (extract and load) second detabase of preprosessed queries \n", | |
"writing etl is 80% of your job at a small company (creating all these different tables that are very fast and easy for ppl to query)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Count(Distinct(col2)) - list unique items\n", | |
"\n", | |
"SELECT col1, AVF(col2)\n", | |
"\n", | |
"FROM table GROUP BY col1\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"SELECT t1.c1, t1.c2, t2.c2 \n", | |
"\n", | |
"FROM t1 JOIN t2 ON t1.c1 = t2.c2" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"###Good interview question\n", | |
"\n", | |
"If we were to imagine our tables as van diagrams. If we do innerjoin, our data will be the data that overlaps in both tables." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Outerjoin will include the contents of the outer table, contents that do not overlap" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"createdb dellstore\n", | |
"\n", | |
"psql -d dellstore -a -f ~/Downloads/dellstore2-normal-1.0/dellstore2-normal-1.0.sql\n", | |
"\n", | |
"psql -d dellstore " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"use Valentia Studio " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"import psycopg2 in python\n", | |
"\n", | |
"or import psycopg2.extras in python\n", | |
"\n", | |
"use dataset - dataset.readthedocs.org/en/latest/api.html\n", | |
"in addition to docs.sqlalchemy.org " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import psycopg2\n", | |
"import psycopg2.extras" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"try:\n", | |
" conn = psycopg2.connect(\"dbname='dellstore' user='olehdubno' host='localhost'\")\n", | |
"except:\n", | |
" print \"Connection failure; please check connection parameters;\"" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"cur = conn.cursor()\n", | |
"cur.execute('SELECT * from Customers LIMIT 1;')\n", | |
"rows = cur.fetchall()\n", | |
"print rows" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"[(1, 'VKUUXF', 'ITHOMQJNYX', '4608499546 Dell Way', None, 'QSDPAGD', 'SD', 24101, 'US', 1, '[email protected]', '4608499546', 1, '1979279217775911', '2012/03', 'user1', 'password', 55, 100000, 'M')]\n" | |
] | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)\n", | |
"cur.execute('SELECT * from Customers LIMIT 1;')\n", | |
"rows = cur.fetchall()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print rows\n", | |
"print rows[0].keys()\n", | |
"print rows[0]['creditcardexpiration']" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"[[1, 'VKUUXF', 'ITHOMQJNYX', '4608499546 Dell Way', None, 'QSDPAGD', 'SD', 24101, 'US', 1, '[email protected]', '4608499546', 1, '1979279217775911', '2012/03', 'user1', 'password', 55, 100000, 'M']]\n", | |
"['creditcardexpiration', 'city', 'gender', 'password', 'zip', 'firstname', 'lastname', 'address2', 'age', 'creditcard', 'email', 'username', 'phone', 'state', 'creditcardtype', 'income', 'address1', 'region', 'country', 'customerid']\n", | |
"2012/03\n" | |
] | |
} | |
], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"## Twitter database\n", | |
"from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT\n", | |
"conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"create_db = 'CREATE DATABASE twitter'\n", | |
"create_table = '''CREATE TABLE tweet (\n", | |
" id BIGINT,\n", | |
" text VARCHAR(255),\n", | |
" created_at TIMESTAMP\n", | |
" );\n", | |
" '''\n", | |
"cur.execute(create_db)\n", | |
"conn.commit()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"conn = psycopg2.connect(\"dbname='twitter' user='olehdubno' host='localhost'\")\n", | |
"cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 13 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"cur.execute(create_table)\n", | |
"conn.commit()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 14 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from twitter import *\n", | |
"t = Twitter(\n", | |
" auth=OAuth(\n", | |
" token='322803973-DMGKZNX8SXRf3LLrLAwiug2PEMgruNECqbbAbN1d',\n", | |
" token_secret='y4noOkMH4D74QNcaWazFNxRBVRilGc9P79r3OrjtwVcxF',\n", | |
" consumer_key='sT8FcSnqen19WZ9ew6EwzLAUe',\n", | |
" consumer_secret='5uC13nn4MAPcQzFuMgWPeVL1Bvs8wj3Cd1puauCUD4F1zqpR1q')\n", | |
" )\n", | |
"\n", | |
"\n", | |
"for hashtag in ['#justsayyes', '#taylorswift']:\n", | |
" results = t.search.tweets(q=hashtag, count=100, result_type='mixed')\n", | |
" for r in results['statuses']:\n", | |
" try:\n", | |
" row = {\n", | |
" 'id': r['id'],\n", | |
" 'tweet': r['text'],\n", | |
" 'created_at': r['created_at'],\n", | |
" }\n", | |
" cur.execute(\"\"\"INSERT INTO tweet (id, text, created_at) VALUES (%(id)s, %(tweet)s, %(created_at)s)\"\"\", row)\n", | |
" conn.commit()\n", | |
" except UnicodeDecodeError:\n", | |
" conn.commit() # close transaction anyway\n", | |
" pass\n", | |
"\n", | |
"import pandas\n", | |
"df = pandas.read_sql_query('SELECT * FROM tweet;', con=conn)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 16 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df.info()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"Int64Index: 200 entries, 0 to 199\n", | |
"Data columns (total 3 columns):\n", | |
"id 200 non-null int64\n", | |
"text 200 non-null object\n", | |
"created_at 200 non-null datetime64[ns]\n", | |
"dtypes: datetime64[ns](1), int64(1), object(1)" | |
] | |
} | |
], | |
"prompt_number": 18 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment