Created
January 31, 2013 21:34
-
-
Save herrfz/4686684 to your computer and use it in GitHub Desktop.
Coursera Data Analysis -- in Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "metadata": { | |
| "name": "data_munging_basics" | |
| }, | |
| "nbformat": 3, | |
| "nbformat_minor": 0, | |
| "worksheets": [ | |
| { | |
| "cells": [ | |
| { | |
| "cell_type": "heading", | |
| "level": 2, | |
| "metadata": {}, | |
| "source": [ | |
| "Data munging basics -- in Python" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "import pandas as pd\n", | |
| "import numpy as np" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 6 | |
| }, | |
| { | |
| "cell_type": "heading", | |
| "level": 3, | |
| "metadata": {}, | |
| "source": [ | |
| "Camera data" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "cameraData = pd.read_csv('./data/cameras.csv')\n", | |
| "cameraData.columns\n", | |
| "\n", | |
| "# somehow the column is named 'Location 1' instead of 'Location.1'" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "pyout", | |
| "prompt_number": 7, | |
| "text": [ | |
| "Index([address, direction, street, crossStreet, intersection, Location 1], dtype=object)" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 7 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# transform column names to lowercase\n", | |
| "# equivalent to R's tolower()\n", | |
| "cameraData.columns = cameraData.columns.map(lambda x: x.lower())\n", | |
| "cameraData.columns" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "pyout", | |
| "prompt_number": 8, | |
| "text": [ | |
| "Index([address, direction, street, crossstreet, intersection, location 1], dtype=object)" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 8 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# string split; split a string into list of strings\n", | |
| "# equivalent to R's strsplit()\n", | |
| "splitNames = cameraData.columns.map(lambda x: x.split(' '))\n", | |
| "splitNames[4]" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "pyout", | |
| "prompt_number": 9, | |
| "text": [ | |
| "['intersection']" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 9 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "splitNames[5]" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "pyout", | |
| "prompt_number": 10, | |
| "text": [ | |
| "['location', '1']" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 10 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "splitNames[5][0]" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "pyout", | |
| "prompt_number": 11, | |
| "text": [ | |
| "'location'" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 11 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# apply function to names\n", | |
| "cameraData.columns.map(lambda x: x.split(' ')[0])" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "pyout", | |
| "prompt_number": 12, | |
| "text": [ | |
| "array([address, direction, street, crossstreet, intersection, location], dtype=object)" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 12 | |
| }, | |
| { | |
| "cell_type": "heading", | |
| "level": 3, | |
| "metadata": {}, | |
| "source": [ | |
| "Peer review experiment data" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "fileUrl1 = 'https://dl.dropbox.com/u/7710864/data/reviews-apr29.csv'\n", | |
| "fileUrl2 = 'https://dl.dropbox.com/u/7710864/data/solutions-apr29.csv'\n", | |
| "\n", | |
| "reviews = pd.read_csv(fileUrl1)\n", | |
| "solutions = pd.read_csv(fileUrl2)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 13 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "reviews.head(2)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "html": [ | |
| "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>id</th>\n", | |
| " <th>solution_id</th>\n", | |
| " <th>reviewer_id</th>\n", | |
| " <th>start</th>\n", | |
| " <th>stop</th>\n", | |
| " <th>time_left</th>\n", | |
| " <th>accept</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <td><strong>0</strong></td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 3</td>\n", | |
| " <td> 27</td>\n", | |
| " <td> 1304095698</td>\n", | |
| " <td> 1304095758</td>\n", | |
| " <td> 1754</td>\n", | |
| " <td> 1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td><strong>1</strong></td>\n", | |
| " <td> 2</td>\n", | |
| " <td> 4</td>\n", | |
| " <td> 22</td>\n", | |
| " <td> 1304095188</td>\n", | |
| " <td> 1304095206</td>\n", | |
| " <td> 2306</td>\n", | |
| " <td> 1</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "output_type": "pyout", | |
| "prompt_number": 14, | |
| "text": [ | |
| " id solution_id reviewer_id start stop time_left accept\n", | |
| "0 1 3 27 1304095698 1304095758 1754 1\n", | |
| "1 2 4 22 1304095188 1304095206 2306 1" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 14 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "solutions.head(2)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "html": [ | |
| "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>id</th>\n", | |
| " <th>problem_id</th>\n", | |
| " <th>subject_id</th>\n", | |
| " <th>start</th>\n", | |
| " <th>stop</th>\n", | |
| " <th>time_left</th>\n", | |
| " <th>answer</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <td><strong>0</strong></td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 156</td>\n", | |
| " <td> 29</td>\n", | |
| " <td> 1304095119</td>\n", | |
| " <td> 1304095169</td>\n", | |
| " <td> 2343</td>\n", | |
| " <td> B</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td><strong>1</strong></td>\n", | |
| " <td> 2</td>\n", | |
| " <td> 269</td>\n", | |
| " <td> 25</td>\n", | |
| " <td> 1304095119</td>\n", | |
| " <td> 1304095183</td>\n", | |
| " <td> 2329</td>\n", | |
| " <td> C</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "output_type": "pyout", | |
| "prompt_number": 15, | |
| "text": [ | |
| " id problem_id subject_id start stop time_left answer\n", | |
| "0 1 156 29 1304095119 1304095169 2343 B\n", | |
| "1 2 269 25 1304095119 1304095183 2329 C" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 15 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "reviews.columns" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "pyout", | |
| "prompt_number": 16, | |
| "text": [ | |
| "Index([id, solution_id, reviewer_id, start, stop, time_left, accept], dtype=object)" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 16 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# remove underscores\n", | |
| "# equivalent to R's sub()\n", | |
| "reviews.columns = reviews.columns.map(lambda x: x.replace('_', ''))\n", | |
| "solutions.columns = solutions.columns.map(lambda x: x.replace('_', ''))" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 17 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# test; every occurence is replaced\n", | |
| "# in contrast to R's sub(), where only one occurence is replaced\n", | |
| "testName = 'this_is_a_test'\n", | |
| "testName.replace('_', '')" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "pyout", | |
| "prompt_number": 18, | |
| "text": [ | |
| "'thisisatest'" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 18 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "reviews['timeleft'][:10]" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "pyout", | |
| "prompt_number": 19, | |
| "text": [ | |
| "0 1754\n", | |
| "1 2306\n", | |
| "2 2192\n", | |
| "3 2089\n", | |
| "4 2043\n", | |
| "5 1999\n", | |
| "6 2130\n", | |
| "7 NaN\n", | |
| "8 1899\n", | |
| "9 2024\n", | |
| "Name: timeleft" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 19 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# cut into ranges\n", | |
| "# this is equivalent to R's cut()\n", | |
| "timeRanges = pd.cut(reviews['timeleft'], range(0, 4000, 600)) # note it needs to go up to 4000 in order to include 3600\n", | |
| "timeRanges[:10]" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "pyout", | |
| "prompt_number": 20, | |
| "text": [ | |
| "Categorical: \n", | |
| "array([(1200, 1800], (1800, 2400], (1800, 2400], (1800, 2400],\n", | |
| " (1800, 2400], (1800, 2400], (1800, 2400], nan, (1800, 2400],\n", | |
| " (1800, 2400]], dtype=object)\n", | |
| "Levels (6): Index([(0, 600], (600, 1200], (1200, 1800], (1800, 2400],\n", | |
| " (2400, 3000], (3000, 3600]], dtype=object)" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 20 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "type(timeRanges)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "pyout", | |
| "prompt_number": 21, | |
| "text": [ | |
| "pandas.core.categorical.Categorical" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 21 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "pd.value_counts(timeRanges)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "pyout", | |
| "prompt_number": 22, | |
| "text": [ | |
| "(600, 1200] 32\n", | |
| "(0, 600] 30\n", | |
| "(1800, 2400] 28\n", | |
| "(1200, 1800] 25" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 22 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# equivalent to R's cut2()\n", | |
| "timeRanges = pd.cut(reviews['timeleft'], 6)\n", | |
| "pd.value_counts(timeRanges) # note that NaN values are excluded; the resulting ranges are thus different from video" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "pyout", | |
| "prompt_number": 23, | |
| "text": [ | |
| "(19.716, 402.667] 22\n", | |
| "(1164, 1544.667] 20\n", | |
| "(783.333, 1164] 19\n", | |
| "(1544.667, 1925.333] 19\n", | |
| "(402.667, 783.333] 19\n", | |
| "(1925.333, 2306] 16" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 23 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# adding an extra variable\n", | |
| "# similarly to R, simply assign to a new column\n", | |
| "reviews['timeRanges'] = timeRanges\n", | |
| "reviews.head(2)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "html": [ | |
| "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>id</th>\n", | |
| " <th>solutionid</th>\n", | |
| " <th>reviewerid</th>\n", | |
| " <th>start</th>\n", | |
| " <th>stop</th>\n", | |
| " <th>timeleft</th>\n", | |
| " <th>accept</th>\n", | |
| " <th>timeRanges</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <td><strong>0</strong></td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 3</td>\n", | |
| " <td> 27</td>\n", | |
| " <td> 1304095698</td>\n", | |
| " <td> 1304095758</td>\n", | |
| " <td> 1754</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> (1544.667, 1925.333]</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td><strong>1</strong></td>\n", | |
| " <td> 2</td>\n", | |
| " <td> 4</td>\n", | |
| " <td> 22</td>\n", | |
| " <td> 1304095188</td>\n", | |
| " <td> 1304095206</td>\n", | |
| " <td> 2306</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> (1925.333, 2306]</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "output_type": "pyout", | |
| "prompt_number": 24, | |
| "text": [ | |
| " id solutionid reviewerid start stop timeleft accept \\\n", | |
| "0 1 3 27 1304095698 1304095758 1754 1 \n", | |
| "1 2 4 22 1304095188 1304095206 2306 1 \n", | |
| "\n", | |
| " timeRanges \n", | |
| "0 (1544.667, 1925.333] \n", | |
| "1 (1925.333, 2306] " | |
| ] | |
| } | |
| ], | |
| "prompt_number": 24 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# merging data\n", | |
| "print reviews.columns\n", | |
| "print solutions.columns" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "Index([id, solutionid, reviewerid, start, stop, timeleft, accept, timeRanges], dtype=object)\n", | |
| "Index([id, problemid, subjectid, start, stop, timeleft, answer], dtype=object)\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 25 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# equivalent to R's merge()\n", | |
| "mergedData2 = pd.merge(reviews, solutions, left_on='solutionid', right_on='id', sort=True)\n", | |
| "mergedData2.ix[:,0:6].head(3)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "html": [ | |
| "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>id_x</th>\n", | |
| " <th>solutionid</th>\n", | |
| " <th>reviewerid</th>\n", | |
| " <th>start_x</th>\n", | |
| " <th>stop_x</th>\n", | |
| " <th>timeleft_x</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <td><strong>0</strong></td>\n", | |
| " <td> 4</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 26</td>\n", | |
| " <td> 1304095267</td>\n", | |
| " <td> 1304095423</td>\n", | |
| " <td> 2089</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td><strong>1</strong></td>\n", | |
| " <td> 6</td>\n", | |
| " <td> 2</td>\n", | |
| " <td> 29</td>\n", | |
| " <td> 1304095471</td>\n", | |
| " <td> 1304095513</td>\n", | |
| " <td> 1999</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td><strong>2</strong></td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 3</td>\n", | |
| " <td> 27</td>\n", | |
| " <td> 1304095698</td>\n", | |
| " <td> 1304095758</td>\n", | |
| " <td> 1754</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "output_type": "pyout", | |
| "prompt_number": 26, | |
| "text": [ | |
| " id_x solutionid reviewerid start_x stop_x timeleft_x\n", | |
| "0 4 1 26 1304095267 1304095423 2089\n", | |
| "1 6 2 29 1304095471 1304095513 1999\n", | |
| "2 1 3 27 1304095698 1304095758 1754" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 26 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "reviews.ix[0,0:6]" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "pyout", | |
| "prompt_number": 27, | |
| "text": [ | |
| "id 1\n", | |
| "solutionid 3\n", | |
| "reviewerid 27\n", | |
| "start 1.304096e+09\n", | |
| "stop 1.304096e+09\n", | |
| "timeleft 1754\n", | |
| "Name: 0" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 27 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# sorting values\n", | |
| "mergedData2['reviewerid'][:10]" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "pyout", | |
| "prompt_number": 28, | |
| "text": [ | |
| "0 26\n", | |
| "1 29\n", | |
| "2 27\n", | |
| "3 22\n", | |
| "4 28\n", | |
| "5 22\n", | |
| "6 29\n", | |
| "7 23\n", | |
| "8 25\n", | |
| "9 29\n", | |
| "Name: reviewerid" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 28 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# equivalent to R's sort()\n", | |
| "mergedData2['reviewerid'].order()[:10]" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "pyout", | |
| "prompt_number": 29, | |
| "text": [ | |
| "3 22\n", | |
| "5 22\n", | |
| "13 22\n", | |
| "21 22\n", | |
| "22 22\n", | |
| "23 22\n", | |
| "26 22\n", | |
| "31 22\n", | |
| "36 22\n", | |
| "38 22\n", | |
| "Name: reviewerid" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 29 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# R's 'order' can be implemented by getting the index of the sorted Series\n", | |
| "mergedData2['reviewerid'][mergedData2['reviewerid'].order().index][:10]" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "pyout", | |
| "prompt_number": 30, | |
| "text": [ | |
| "3 22\n", | |
| "5 22\n", | |
| "13 22\n", | |
| "21 22\n", | |
| "22 22\n", | |
| "23 22\n", | |
| "26 22\n", | |
| "31 22\n", | |
| "36 22\n", | |
| "38 22\n", | |
| "Name: reviewerid" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 30 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# reordering a DataFrame\n", | |
| "mergedData2.ix[:,0:6].head(3)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "html": [ | |
| "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>id_x</th>\n", | |
| " <th>solutionid</th>\n", | |
| " <th>reviewerid</th>\n", | |
| " <th>start_x</th>\n", | |
| " <th>stop_x</th>\n", | |
| " <th>timeleft_x</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <td><strong>0</strong></td>\n", | |
| " <td> 4</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 26</td>\n", | |
| " <td> 1304095267</td>\n", | |
| " <td> 1304095423</td>\n", | |
| " <td> 2089</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td><strong>1</strong></td>\n", | |
| " <td> 6</td>\n", | |
| " <td> 2</td>\n", | |
| " <td> 29</td>\n", | |
| " <td> 1304095471</td>\n", | |
| " <td> 1304095513</td>\n", | |
| " <td> 1999</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td><strong>2</strong></td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 3</td>\n", | |
| " <td> 27</td>\n", | |
| " <td> 1304095698</td>\n", | |
| " <td> 1304095758</td>\n", | |
| " <td> 1754</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "output_type": "pyout", | |
| "prompt_number": 31, | |
| "text": [ | |
| " id_x solutionid reviewerid start_x stop_x timeleft_x\n", | |
| "0 4 1 26 1304095267 1304095423 2089\n", | |
| "1 6 2 29 1304095471 1304095513 1999\n", | |
| "2 1 3 27 1304095698 1304095758 1754" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 31 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# sort by reviewerid\n", | |
| "sortedData = mergedData2.ix[mergedData2['reviewerid'].order().index]\n", | |
| "sortedData.ix[:,0:6].head(3)\n", | |
| "\n", | |
| "# however, it's also possible to do: mergedData2.sort(['reviewerid'])" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "html": [ | |
| "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>id_x</th>\n", | |
| " <th>solutionid</th>\n", | |
| " <th>reviewerid</th>\n", | |
| " <th>start_x</th>\n", | |
| " <th>stop_x</th>\n", | |
| " <th>timeleft_x</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <td><strong>3 </strong></td>\n", | |
| " <td> 2</td>\n", | |
| " <td> 4</td>\n", | |
| " <td> 22</td>\n", | |
| " <td> 1304095188</td>\n", | |
| " <td> 1304095206</td>\n", | |
| " <td> 2306</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td><strong>5 </strong></td>\n", | |
| " <td> 16</td>\n", | |
| " <td> 6</td>\n", | |
| " <td> 22</td>\n", | |
| " <td> 1304095303</td>\n", | |
| " <td> 1304095471</td>\n", | |
| " <td> 2041</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td><strong>13</strong></td>\n", | |
| " <td> 12</td>\n", | |
| " <td> 14</td>\n", | |
| " <td> 22</td>\n", | |
| " <td> 1304095280</td>\n", | |
| " <td> 1304095301</td>\n", | |
| " <td> 2211</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "output_type": "pyout", | |
| "prompt_number": 32, | |
| "text": [ | |
| " id_x solutionid reviewerid start_x stop_x timeleft_x\n", | |
| "3 2 4 22 1304095188 1304095206 2306\n", | |
| "5 16 6 22 1304095303 1304095471 2041\n", | |
| "13 12 14 22 1304095280 1304095301 2211" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 32 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# reordering by multiple columns\n", | |
| "sortedData = mergedData2.sort(['reviewerid', 'id_x'])\n", | |
| "sortedData.ix[:,0:6].head(3)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "html": [ | |
| "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>id_x</th>\n", | |
| " <th>solutionid</th>\n", | |
| " <th>reviewerid</th>\n", | |
| " <th>start_x</th>\n", | |
| " <th>stop_x</th>\n", | |
| " <th>timeleft_x</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <td><strong>3 </strong></td>\n", | |
| " <td> 2</td>\n", | |
| " <td> 4</td>\n", | |
| " <td> 22</td>\n", | |
| " <td> 1304095188</td>\n", | |
| " <td> 1304095206</td>\n", | |
| " <td> 2306</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td><strong>13</strong></td>\n", | |
| " <td> 12</td>\n", | |
| " <td> 14</td>\n", | |
| " <td> 22</td>\n", | |
| " <td> 1304095280</td>\n", | |
| " <td> 1304095301</td>\n", | |
| " <td> 2211</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td><strong>5 </strong></td>\n", | |
| " <td> 16</td>\n", | |
| " <td> 6</td>\n", | |
| " <td> 22</td>\n", | |
| " <td> 1304095303</td>\n", | |
| " <td> 1304095471</td>\n", | |
| " <td> 2041</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "output_type": "pyout", | |
| "prompt_number": 33, | |
| "text": [ | |
| " id_x solutionid reviewerid start_x stop_x timeleft_x\n", | |
| "3 2 4 22 1304095188 1304095206 2306\n", | |
| "13 12 14 22 1304095280 1304095301 2211\n", | |
| "5 16 6 22 1304095303 1304095471 2041" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 33 | |
| }, | |
| { | |
| "cell_type": "heading", | |
| "level": 3, | |
| "metadata": {}, | |
| "source": [ | |
| "reshaping data - example" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "misShaped = pd.DataFrame({'treatmentA' : [NaN, 1, 2], 'treatmentB' : [5, 4, 3]})\n", | |
| "misShaped['people'] = ['John', 'Jane', 'Mary']\n", | |
| "misShaped" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "html": [ | |
| "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>treatmentA</th>\n", | |
| " <th>treatmentB</th>\n", | |
| " <th>people</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <td><strong>0</strong></td>\n", | |
| " <td>NaN</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> John</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td><strong>1</strong></td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 4</td>\n", | |
| " <td> Jane</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td><strong>2</strong></td>\n", | |
| " <td> 2</td>\n", | |
| " <td> 3</td>\n", | |
| " <td> Mary</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "output_type": "pyout", | |
| "prompt_number": 34, | |
| "text": [ | |
| " treatmentA treatmentB people\n", | |
| "0 NaN 5 John\n", | |
| "1 1 4 Jane\n", | |
| "2 2 3 Mary" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 34 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# equivalent to R's melt()\n", | |
| "pd.melt(misShaped, id_vars='people')" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "html": [ | |
| "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>people</th>\n", | |
| " <th>variable</th>\n", | |
| " <th>value</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <td><strong>0</strong></td>\n", | |
| " <td> John</td>\n", | |
| " <td> treatmentA</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td><strong>1</strong></td>\n", | |
| " <td> Jane</td>\n", | |
| " <td> treatmentA</td>\n", | |
| " <td> 1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td><strong>2</strong></td>\n", | |
| " <td> Mary</td>\n", | |
| " <td> treatmentA</td>\n", | |
| " <td> 2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td><strong>3</strong></td>\n", | |
| " <td> John</td>\n", | |
| " <td> treatmentB</td>\n", | |
| " <td> 5</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td><strong>4</strong></td>\n", | |
| " <td> Jane</td>\n", | |
| " <td> treatmentB</td>\n", | |
| " <td> 4</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td><strong>5</strong></td>\n", | |
| " <td> Mary</td>\n", | |
| " <td> treatmentB</td>\n", | |
| " <td> 3</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "output_type": "pyout", | |
| "prompt_number": 35, | |
| "text": [ | |
| " people variable value\n", | |
| "0 John treatmentA NaN\n", | |
| "1 Jane treatmentA 1\n", | |
| "2 Mary treatmentA 2\n", | |
| "3 John treatmentB 5\n", | |
| "4 Jane treatmentB 4\n", | |
| "5 Mary treatmentB 3" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 35 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [] | |
| } | |
| ], | |
| "metadata": {} | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment