Created
January 31, 2013 20:36
-
-
Save herrfz/4686207 to your computer and use it in GitHub Desktop.
Coursera Data Analysis -- in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "getting_data" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Getting data -- in Python" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# getting and setting directory\n", | |
"#\n", | |
"# equivalent to R getwd() and setwd() commands\n", | |
"#\n", | |
"# this is the more general Python way\n", | |
"# in IPython we can use system commands directly prefixed by !\n", | |
"import os\n", | |
"\n", | |
"os.getcwd()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 1, | |
"text": [ | |
"'/Users/erriza/Documents/ipython-notebooks'" | |
] | |
} | |
], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"os.chdir('..')\n", | |
"os.getcwd()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 2, | |
"text": [ | |
"'/Users/erriza/Documents'" | |
] | |
} | |
], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"os.chdir('./ipython-notebooks/')\n", | |
"os.getcwd()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 3, | |
"text": [ | |
"'/Users/erriza/Documents/ipython-notebooks'" | |
] | |
} | |
], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# load data into Pandas\n", | |
"import pandas as pd\n", | |
"\n", | |
"fileUrl = 'https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD'\n", | |
"\n", | |
"# we can directly use read_csv to download the file\n", | |
"# this is equivalent to R's combined download.file() and read.table() or read.csv() commands\n", | |
"cameraData = pd.read_csv(fileUrl)\n", | |
"\n", | |
"cameraData.to_csv('./data/cameras.csv', index=False)\n", | |
"\n", | |
"# for simplicity I'll use IPython tricks to list folder contents\n", | |
"!ls ./data\n", | |
"\n", | |
"\n", | |
"# get current date and time\n", | |
"# this is equivalent to R date() command\n", | |
"# note that I use IPython ! prefix to run my system's command\n", | |
"dateDownloaded = !date\n", | |
"print dateDownloaded\n", | |
"\n", | |
"cameraData.head()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"camera.xls camera.xlsx cameras.csv camerasModified.csv\r\n" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"['Thu Jan 31 21:32:24 CET 2013']\n" | |
] | |
}, | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>address</th>\n", | |
" <th>direction</th>\n", | |
" <th>street</th>\n", | |
" <th>crossStreet</th>\n", | |
" <th>intersection</th>\n", | |
" <th>Location 1</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <td><strong>0</strong></td>\n", | |
" <td> S CATON AVE & BENSON AVE</td>\n", | |
" <td> N/B</td>\n", | |
" <td> Caton Ave</td>\n", | |
" <td> Benson Ave</td>\n", | |
" <td> Caton Ave & Benson Ave</td>\n", | |
" <td> (39.2693779962, -76.6688185297)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>1</strong></td>\n", | |
" <td> S CATON AVE & BENSON AVE</td>\n", | |
" <td> S/B</td>\n", | |
" <td> Caton Ave</td>\n", | |
" <td> Benson Ave</td>\n", | |
" <td> Caton Ave & Benson Ave</td>\n", | |
" <td> (39.2693157898, -76.6689698176)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>2</strong></td>\n", | |
" <td> WILKENS AVE & PINE HEIGHTS AVE</td>\n", | |
" <td> E/B</td>\n", | |
" <td> Wilkens Ave</td>\n", | |
" <td> Pine Heights</td>\n", | |
" <td> Wilkens Ave & Pine Heights</td>\n", | |
" <td> (39.2720252302, -76.676960806)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>3</strong></td>\n", | |
" <td> THE ALAMEDA & E 33RD ST</td>\n", | |
" <td> S/B</td>\n", | |
" <td> The Alameda</td>\n", | |
" <td> 33rd St</td>\n", | |
" <td> The Alameda & 33rd St</td>\n", | |
" <td> (39.3285013141, -76.5953545714)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>4</strong></td>\n", | |
" <td> E 33RD ST & THE ALAMEDA</td>\n", | |
" <td> E/B</td>\n", | |
" <td> E 33rd</td>\n", | |
" <td> The Alameda</td>\n", | |
" <td> E 33rd & The Alameda</td>\n", | |
" <td> (39.3283410623, -76.5953594625)</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"output_type": "pyout", | |
"prompt_number": 4, | |
"text": [ | |
" address direction street crossStreet \\\n", | |
"0 S CATON AVE & BENSON AVE N/B Caton Ave Benson Ave \n", | |
"1 S CATON AVE & BENSON AVE S/B Caton Ave Benson Ave \n", | |
"2 WILKENS AVE & PINE HEIGHTS AVE E/B Wilkens Ave Pine Heights \n", | |
"3 THE ALAMEDA & E 33RD ST S/B The Alameda 33rd St \n", | |
"4 E 33RD ST & THE ALAMEDA E/B E 33rd The Alameda \n", | |
"\n", | |
" intersection Location 1 \n", | |
"0 Caton Ave & Benson Ave (39.2693779962, -76.6688185297) \n", | |
"1 Caton Ave & Benson Ave (39.2693157898, -76.6689698176) \n", | |
"2 Wilkens Ave & Pine Heights (39.2720252302, -76.676960806) \n", | |
"3 The Alameda & 33rd St (39.3285013141, -76.5953545714) \n", | |
"4 E 33rd & The Alameda (39.3283410623, -76.5953594625) " | |
] | |
} | |
], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# read Excel file\n", | |
"# this is equivalent to R read.xlsx() and read.xlsx2() commands\n", | |
"#\n", | |
"# we need openpyxl 1.5.8 (don't use the latest version due to a bug) and xlrd packages\n", | |
"# install with: \n", | |
"# sudo pip install openpyxl==1.5.8\n", | |
"# sudo pip install xlrd\n", | |
"#\n", | |
"# unfortunately Pandas ExcelFile() can't download and read at once (in contrast to read_csv())\n", | |
"# so we need to resort to the basic Python way\n", | |
"# also notice I'm using .xls; .xlsx doesn't work in my computer\n", | |
"import urllib2\n", | |
"\n", | |
"# download the file as camera.xls and save it in ./data subfolder\n", | |
"fileUrl = 'https://data.baltimorecity.gov/api/views/dz54-2aru/rows.xls?accessType=DOWNLOAD'\n", | |
"f = urllib2.urlopen(fileUrl)\n", | |
"data = f.read()\n", | |
"with open('./data/camera.xls', 'wb') as w:\n", | |
" w.write(data)\n", | |
"\n", | |
"# load the Excel file as a pandas DataFrame\n", | |
"cameraData = pd.ExcelFile('./data/camera.xls')\n", | |
"cameraData = cameraData.parse('Baltimore Fixed Speed Cameras', index_col=None, na_values=['NA'])\n", | |
"cameraData.head()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>address</th>\n", | |
" <th>direction</th>\n", | |
" <th>street</th>\n", | |
" <th>crossStreet</th>\n", | |
" <th>intersection</th>\n", | |
" <th>Location 1</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <td><strong>0</strong></td>\n", | |
" <td> S CATON AVE & BENSON AVE</td>\n", | |
" <td> N/B</td>\n", | |
" <td> Caton Ave</td>\n", | |
" <td> Benson Ave</td>\n", | |
" <td> Caton Ave & Benson Ave</td>\n", | |
" <td> (39.2693779962, -76.6688185297)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>1</strong></td>\n", | |
" <td> S CATON AVE & BENSON AVE</td>\n", | |
" <td> S/B</td>\n", | |
" <td> Caton Ave</td>\n", | |
" <td> Benson Ave</td>\n", | |
" <td> Caton Ave & Benson Ave</td>\n", | |
" <td> (39.2693157898, -76.6689698176)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>2</strong></td>\n", | |
" <td> WILKENS AVE & PINE HEIGHTS AVE</td>\n", | |
" <td> E/B</td>\n", | |
" <td> Wilkens Ave</td>\n", | |
" <td> Pine Heights</td>\n", | |
" <td> Wilkens Ave & Pine Heights</td>\n", | |
" <td> (39.2720252302, -76.676960806)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>3</strong></td>\n", | |
" <td> THE ALAMEDA & E 33RD ST</td>\n", | |
" <td> S/B</td>\n", | |
" <td> The Alameda</td>\n", | |
" <td> 33rd St</td>\n", | |
" <td> The Alameda & 33rd St</td>\n", | |
" <td> (39.3285013141, -76.5953545714)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>4</strong></td>\n", | |
" <td> E 33RD ST & THE ALAMEDA</td>\n", | |
" <td> E/B</td>\n", | |
" <td> E 33rd</td>\n", | |
" <td> The Alameda</td>\n", | |
" <td> E 33rd & The Alameda</td>\n", | |
" <td> (39.3283410623, -76.5953594625)</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"output_type": "pyout", | |
"prompt_number": 5, | |
"text": [ | |
" address direction street crossStreet \\\n", | |
"0 S CATON AVE & BENSON AVE N/B Caton Ave Benson Ave \n", | |
"1 S CATON AVE & BENSON AVE S/B Caton Ave Benson Ave \n", | |
"2 WILKENS AVE & PINE HEIGHTS AVE E/B Wilkens Ave Pine Heights \n", | |
"3 THE ALAMEDA & E 33RD ST S/B The Alameda 33rd St \n", | |
"4 E 33RD ST & THE ALAMEDA E/B E 33rd The Alameda \n", | |
"\n", | |
" intersection Location 1 \n", | |
"0 Caton Ave & Benson Ave (39.2693779962, -76.6688185297) \n", | |
"1 Caton Ave & Benson Ave (39.2693157898, -76.6689698176) \n", | |
"2 Wilkens Ave & Pine Heights (39.2720252302, -76.676960806) \n", | |
"3 The Alameda & 33rd St (39.3285013141, -76.5953545714) \n", | |
"4 E 33rd & The Alameda (39.3283410623, -76.5953594625) " | |
] | |
} | |
], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# R readLines() for reading a text file is similar to standard Python file access\n", | |
"# I'm not going to detail it here\n", | |
"\n", | |
"# R readLines() to read data from a website is similar to Python with urllib2 package as in xls example above\n", | |
"\n", | |
"# read json\n", | |
"# this is equivalent to R fromJSON() command\n", | |
"import json\n", | |
"\n", | |
"# first we get the json file from the website\n", | |
"fileUrl = 'https://data.baltimorecity.gov/api/views/dz54-2aru/rows.json?accessType=DOWNLOAD'\n", | |
"req = urllib2.Request(fileUrl)\n", | |
"opener = urllib2.build_opener()\n", | |
"f = opener.open(req)\n", | |
"\n", | |
"# then we read it into a data structure\n", | |
"jsonCamera = json.loads(f.read())\n", | |
"\n", | |
"# json is loadad as dictionary\n", | |
"print jsonCamera['meta']['view']['id']\n", | |
"print jsonCamera['meta']['view']['name']\n", | |
"print jsonCamera['meta']['view']['attribution']" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"dz54-2aru\n", | |
"Baltimore Fixed Speed Cameras\n", | |
"Department of Transportation\n" | |
] | |
} | |
], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# writing data\n", | |
"\n", | |
"# first read the csv file\n", | |
"cameraData = pd.read_csv('./data/cameras.csv')\n", | |
"# take a subset of the columns\n", | |
"tmpData = cameraData.ix[:,2:]\n", | |
"# then save it to a different csv file\n", | |
"# this is equivalent to R write.table() command\n", | |
"tmpData.to_csv('./data/camerasModified.csv', sep=',', index=False)\n", | |
"cameraData2 = pd.read_csv('./data/camerasModified.csv')\n", | |
"cameraData2.head()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>street</th>\n", | |
" <th>crossStreet</th>\n", | |
" <th>intersection</th>\n", | |
" <th>Location 1</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <td><strong>0</strong></td>\n", | |
" <td> Caton Ave</td>\n", | |
" <td> Benson Ave</td>\n", | |
" <td> Caton Ave & Benson Ave</td>\n", | |
" <td> (39.2693779962, -76.6688185297)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>1</strong></td>\n", | |
" <td> Caton Ave</td>\n", | |
" <td> Benson Ave</td>\n", | |
" <td> Caton Ave & Benson Ave</td>\n", | |
" <td> (39.2693157898, -76.6689698176)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>2</strong></td>\n", | |
" <td> Wilkens Ave</td>\n", | |
" <td> Pine Heights</td>\n", | |
" <td> Wilkens Ave & Pine Heights</td>\n", | |
" <td> (39.2720252302, -76.676960806)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>3</strong></td>\n", | |
" <td> The Alameda</td>\n", | |
" <td> 33rd St</td>\n", | |
" <td> The Alameda & 33rd St</td>\n", | |
" <td> (39.3285013141, -76.5953545714)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>4</strong></td>\n", | |
" <td> E 33rd</td>\n", | |
" <td> The Alameda</td>\n", | |
" <td> E 33rd & The Alameda</td>\n", | |
" <td> (39.3283410623, -76.5953594625)</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"output_type": "pyout", | |
"prompt_number": 7, | |
"text": [ | |
" street crossStreet intersection \\\n", | |
"0 Caton Ave Benson Ave Caton Ave & Benson Ave \n", | |
"1 Caton Ave Benson Ave Caton Ave & Benson Ave \n", | |
"2 Wilkens Ave Pine Heights Wilkens Ave & Pine Heights \n", | |
"3 The Alameda 33rd St The Alameda & 33rd St \n", | |
"4 E 33rd The Alameda E 33rd & The Alameda \n", | |
"\n", | |
" Location 1 \n", | |
"0 (39.2693779962, -76.6688185297) \n", | |
"1 (39.2693157898, -76.6689698176) \n", | |
"2 (39.2720252302, -76.676960806) \n", | |
"3 (39.3285013141, -76.5953545714) \n", | |
"4 (39.3283410623, -76.5953594625) " | |
] | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# I don't find the equivalent of R's saving and loading workspace commands" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# R's paste() and paste0() commands look like standard Python's string manipulations\n", | |
"print ['./data' + str(i) + '.csv' for i in range(1, 6)]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"['./data1.csv', './data2.csv', './data3.csv', './data4.csv', './data5.csv']\n" | |
] | |
} | |
], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# getting data off webpages\n", | |
"from lxml.html import parse\n", | |
"\n", | |
"url = 'http://scholar.google.com/citations?user=HI-I6C0AAAAJ&hl=en'\n", | |
"\n", | |
"# this is equivalent to the combined R's opening/reading/closing connection and htmlTreeParse() commands\n", | |
"html3 = parse(url).getroot()\n", | |
"\n", | |
"# get the title text using xpath expression\n", | |
"# this is equivalent to R xpathSApply() command\n", | |
"title = html3.xpath('//title')\n", | |
"print [x.text_content() for x in title]\n", | |
"\n", | |
"# get the texts of col-citedby elements using xpath expression\n", | |
"citedby = html3.xpath(\"//td[@id='col-citedby']\")\n", | |
"print [x.text_content() for x in citedby]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"['Jeff Leek - Google Scholar Citations']\n", | |
"['Cited by', '337', '172', '140', '133', '107', '95', '79', '78', '53', '17', '10', '9', '9', '8', '8', '6', '6', '6', '5', '3']\n" | |
] | |
} | |
], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment