Skip to content

Instantly share code, notes, and snippets.

@herrfz
Created January 31, 2013 20:36
Show Gist options
  • Save herrfz/4686207 to your computer and use it in GitHub Desktop.
Save herrfz/4686207 to your computer and use it in GitHub Desktop.
Coursera Data Analysis -- in Python
{
"metadata": {
"name": "getting_data"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Getting data -- in Python"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# getting and setting directory\n",
"#\n",
"# equivalent to R getwd() and setwd() commands\n",
"#\n",
"# this is the more general Python way\n",
"# in IPython we can use system commands directly prefixed by !\n",
"import os\n",
"\n",
"os.getcwd()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 1,
"text": [
"'/Users/erriza/Documents/ipython-notebooks'"
]
}
],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"os.chdir('..')\n",
"os.getcwd()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 2,
"text": [
"'/Users/erriza/Documents'"
]
}
],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"os.chdir('./ipython-notebooks/')\n",
"os.getcwd()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 3,
"text": [
"'/Users/erriza/Documents/ipython-notebooks'"
]
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# load data into Pandas\n",
"import pandas as pd\n",
"\n",
"fileUrl = 'https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD'\n",
"\n",
"# we can directly use read_csv to download the file\n",
"# this is equivalent to R's combined download.file() and read.table() or read.csv() commands\n",
"cameraData = pd.read_csv(fileUrl)\n",
"\n",
"cameraData.to_csv('./data/cameras.csv', index=False)\n",
"\n",
"# for simplicity I'll use IPython tricks to list folder contents\n",
"!ls ./data\n",
"\n",
"\n",
"# get current date and time\n",
"# this is equivalent to R date() command\n",
"# note that I use IPython ! prefix to run my system's command\n",
"dateDownloaded = !date\n",
"print dateDownloaded\n",
"\n",
"cameraData.head()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"camera.xls camera.xlsx cameras.csv camerasModified.csv\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"['Thu Jan 31 21:32:24 CET 2013']\n"
]
},
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>address</th>\n",
" <th>direction</th>\n",
" <th>street</th>\n",
" <th>crossStreet</th>\n",
" <th>intersection</th>\n",
" <th>Location 1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td><strong>0</strong></td>\n",
" <td> S CATON AVE & BENSON AVE</td>\n",
" <td> N/B</td>\n",
" <td> Caton Ave</td>\n",
" <td> Benson Ave</td>\n",
" <td> Caton Ave & Benson Ave</td>\n",
" <td> (39.2693779962, -76.6688185297)</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>1</strong></td>\n",
" <td> S CATON AVE & BENSON AVE</td>\n",
" <td> S/B</td>\n",
" <td> Caton Ave</td>\n",
" <td> Benson Ave</td>\n",
" <td> Caton Ave & Benson Ave</td>\n",
" <td> (39.2693157898, -76.6689698176)</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>2</strong></td>\n",
" <td> WILKENS AVE & PINE HEIGHTS AVE</td>\n",
" <td> E/B</td>\n",
" <td> Wilkens Ave</td>\n",
" <td> Pine Heights</td>\n",
" <td> Wilkens Ave & Pine Heights</td>\n",
" <td> (39.2720252302, -76.676960806)</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>3</strong></td>\n",
" <td> THE ALAMEDA & E 33RD ST</td>\n",
" <td> S/B</td>\n",
" <td> The Alameda</td>\n",
" <td> 33rd St</td>\n",
" <td> The Alameda & 33rd St</td>\n",
" <td> (39.3285013141, -76.5953545714)</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>4</strong></td>\n",
" <td> E 33RD ST & THE ALAMEDA</td>\n",
" <td> E/B</td>\n",
" <td> E 33rd</td>\n",
" <td> The Alameda</td>\n",
" <td> E 33rd & The Alameda</td>\n",
" <td> (39.3283410623, -76.5953594625)</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 4,
"text": [
" address direction street crossStreet \\\n",
"0 S CATON AVE & BENSON AVE N/B Caton Ave Benson Ave \n",
"1 S CATON AVE & BENSON AVE S/B Caton Ave Benson Ave \n",
"2 WILKENS AVE & PINE HEIGHTS AVE E/B Wilkens Ave Pine Heights \n",
"3 THE ALAMEDA & E 33RD ST S/B The Alameda 33rd St \n",
"4 E 33RD ST & THE ALAMEDA E/B E 33rd The Alameda \n",
"\n",
" intersection Location 1 \n",
"0 Caton Ave & Benson Ave (39.2693779962, -76.6688185297) \n",
"1 Caton Ave & Benson Ave (39.2693157898, -76.6689698176) \n",
"2 Wilkens Ave & Pine Heights (39.2720252302, -76.676960806) \n",
"3 The Alameda & 33rd St (39.3285013141, -76.5953545714) \n",
"4 E 33rd & The Alameda (39.3283410623, -76.5953594625) "
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# read Excel file\n",
"# this is equivalent to R read.xlsx() and read.xlsx2() commands\n",
"#\n",
"# we need openpyxl 1.5.8 (don't use the latest version due to a bug) and xlrd packages\n",
"# install with: \n",
"# sudo pip install openpyxl==1.5.8\n",
"# sudo pip install xlrd\n",
"#\n",
"# unfortunately Pandas ExcelFile() can't download and read at once (in contrast to read_csv())\n",
"# so we need to resort to the basic Python way\n",
"# also notice I'm using .xls; .xlsx doesn't work in my computer\n",
"import urllib2\n",
"\n",
"# download the file as camera.xls and save it in ./data subfolder\n",
"fileUrl = 'https://data.baltimorecity.gov/api/views/dz54-2aru/rows.xls?accessType=DOWNLOAD'\n",
"f = urllib2.urlopen(fileUrl)\n",
"data = f.read()\n",
"with open('./data/camera.xls', 'wb') as w:\n",
" w.write(data)\n",
"\n",
"# load the Excel file as a pandas DataFrame\n",
"cameraData = pd.ExcelFile('./data/camera.xls')\n",
"cameraData = cameraData.parse('Baltimore Fixed Speed Cameras', index_col=None, na_values=['NA'])\n",
"cameraData.head()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>address</th>\n",
" <th>direction</th>\n",
" <th>street</th>\n",
" <th>crossStreet</th>\n",
" <th>intersection</th>\n",
" <th>Location 1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td><strong>0</strong></td>\n",
" <td> S CATON AVE & BENSON AVE</td>\n",
" <td> N/B</td>\n",
" <td> Caton Ave</td>\n",
" <td> Benson Ave</td>\n",
" <td> Caton Ave & Benson Ave</td>\n",
" <td> (39.2693779962, -76.6688185297)</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>1</strong></td>\n",
" <td> S CATON AVE & BENSON AVE</td>\n",
" <td> S/B</td>\n",
" <td> Caton Ave</td>\n",
" <td> Benson Ave</td>\n",
" <td> Caton Ave & Benson Ave</td>\n",
" <td> (39.2693157898, -76.6689698176)</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>2</strong></td>\n",
" <td> WILKENS AVE & PINE HEIGHTS AVE</td>\n",
" <td> E/B</td>\n",
" <td> Wilkens Ave</td>\n",
" <td> Pine Heights</td>\n",
" <td> Wilkens Ave & Pine Heights</td>\n",
" <td> (39.2720252302, -76.676960806)</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>3</strong></td>\n",
" <td> THE ALAMEDA & E 33RD ST</td>\n",
" <td> S/B</td>\n",
" <td> The Alameda</td>\n",
" <td> 33rd St</td>\n",
" <td> The Alameda & 33rd St</td>\n",
" <td> (39.3285013141, -76.5953545714)</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>4</strong></td>\n",
" <td> E 33RD ST & THE ALAMEDA</td>\n",
" <td> E/B</td>\n",
" <td> E 33rd</td>\n",
" <td> The Alameda</td>\n",
" <td> E 33rd & The Alameda</td>\n",
" <td> (39.3283410623, -76.5953594625)</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 5,
"text": [
" address direction street crossStreet \\\n",
"0 S CATON AVE & BENSON AVE N/B Caton Ave Benson Ave \n",
"1 S CATON AVE & BENSON AVE S/B Caton Ave Benson Ave \n",
"2 WILKENS AVE & PINE HEIGHTS AVE E/B Wilkens Ave Pine Heights \n",
"3 THE ALAMEDA & E 33RD ST S/B The Alameda 33rd St \n",
"4 E 33RD ST & THE ALAMEDA E/B E 33rd The Alameda \n",
"\n",
" intersection Location 1 \n",
"0 Caton Ave & Benson Ave (39.2693779962, -76.6688185297) \n",
"1 Caton Ave & Benson Ave (39.2693157898, -76.6689698176) \n",
"2 Wilkens Ave & Pine Heights (39.2720252302, -76.676960806) \n",
"3 The Alameda & 33rd St (39.3285013141, -76.5953545714) \n",
"4 E 33rd & The Alameda (39.3283410623, -76.5953594625) "
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# R readLines() for reading a text file is similar to standard Python file access\n",
"# I'm not going to detail it here\n",
"\n",
"# R readLines() to read data from a website is similar to Python with urllib2 package as in xls example above\n",
"\n",
"# read json\n",
"# this is equivalent to R fromJSON() command\n",
"import json\n",
"\n",
"# first we get the json file from the website\n",
"fileUrl = 'https://data.baltimorecity.gov/api/views/dz54-2aru/rows.json?accessType=DOWNLOAD'\n",
"req = urllib2.Request(fileUrl)\n",
"opener = urllib2.build_opener()\n",
"f = opener.open(req)\n",
"\n",
"# then we read it into a data structure\n",
"jsonCamera = json.loads(f.read())\n",
"\n",
"# json is loadad as dictionary\n",
"print jsonCamera['meta']['view']['id']\n",
"print jsonCamera['meta']['view']['name']\n",
"print jsonCamera['meta']['view']['attribution']"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"dz54-2aru\n",
"Baltimore Fixed Speed Cameras\n",
"Department of Transportation\n"
]
}
],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# writing data\n",
"\n",
"# first read the csv file\n",
"cameraData = pd.read_csv('./data/cameras.csv')\n",
"# take a subset of the columns\n",
"tmpData = cameraData.ix[:,2:]\n",
"# then save it to a different csv file\n",
"# this is equivalent to R write.table() command\n",
"tmpData.to_csv('./data/camerasModified.csv', sep=',', index=False)\n",
"cameraData2 = pd.read_csv('./data/camerasModified.csv')\n",
"cameraData2.head()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>street</th>\n",
" <th>crossStreet</th>\n",
" <th>intersection</th>\n",
" <th>Location 1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td><strong>0</strong></td>\n",
" <td> Caton Ave</td>\n",
" <td> Benson Ave</td>\n",
" <td> Caton Ave & Benson Ave</td>\n",
" <td> (39.2693779962, -76.6688185297)</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>1</strong></td>\n",
" <td> Caton Ave</td>\n",
" <td> Benson Ave</td>\n",
" <td> Caton Ave & Benson Ave</td>\n",
" <td> (39.2693157898, -76.6689698176)</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>2</strong></td>\n",
" <td> Wilkens Ave</td>\n",
" <td> Pine Heights</td>\n",
" <td> Wilkens Ave & Pine Heights</td>\n",
" <td> (39.2720252302, -76.676960806)</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>3</strong></td>\n",
" <td> The Alameda</td>\n",
" <td> 33rd St</td>\n",
" <td> The Alameda & 33rd St</td>\n",
" <td> (39.3285013141, -76.5953545714)</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>4</strong></td>\n",
" <td> E 33rd</td>\n",
" <td> The Alameda</td>\n",
" <td> E 33rd & The Alameda</td>\n",
" <td> (39.3283410623, -76.5953594625)</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 7,
"text": [
" street crossStreet intersection \\\n",
"0 Caton Ave Benson Ave Caton Ave & Benson Ave \n",
"1 Caton Ave Benson Ave Caton Ave & Benson Ave \n",
"2 Wilkens Ave Pine Heights Wilkens Ave & Pine Heights \n",
"3 The Alameda 33rd St The Alameda & 33rd St \n",
"4 E 33rd The Alameda E 33rd & The Alameda \n",
"\n",
" Location 1 \n",
"0 (39.2693779962, -76.6688185297) \n",
"1 (39.2693157898, -76.6689698176) \n",
"2 (39.2720252302, -76.676960806) \n",
"3 (39.3285013141, -76.5953545714) \n",
"4 (39.3283410623, -76.5953594625) "
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# I don't find the equivalent of R's saving and loading workspace commands"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# R's paste() and paste0() commands look like standard Python's string manipulations\n",
"print ['./data' + str(i) + '.csv' for i in range(1, 6)]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"['./data1.csv', './data2.csv', './data3.csv', './data4.csv', './data5.csv']\n"
]
}
],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# getting data off webpages\n",
"from lxml.html import parse\n",
"\n",
"url = 'http://scholar.google.com/citations?user=HI-I6C0AAAAJ&hl=en'\n",
"\n",
"# this is equivalent to the combined R's opening/reading/closing connection and htmlTreeParse() commands\n",
"html3 = parse(url).getroot()\n",
"\n",
"# get the title text using xpath expression\n",
"# this is equivalent to R xpathSApply() command\n",
"title = html3.xpath('//title')\n",
"print [x.text_content() for x in title]\n",
"\n",
"# get the texts of col-citedby elements using xpath expression\n",
"citedby = html3.xpath(\"//td[@id='col-citedby']\")\n",
"print [x.text_content() for x in citedby]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"['Jeff Leek - Google Scholar Citations']\n",
"['Cited by', '337', '172', '140', '133', '107', '95', '79', '78', '53', '17', '10', '9', '9', '8', '8', '6', '6', '6', '5', '3']\n"
]
}
],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment