Created
October 17, 2012 08:55
-
-
Save wesm/3904544 to your computer and use it in GitHub Desktop.
Foo
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "metadata": { | |
| "name": "Basics" | |
| }, | |
| "nbformat": 2, | |
| "worksheets": [ | |
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "collapsed": true, | |
| "input": [ | |
| "from pandas import *", | |
| "import pandas", | |
| "import numpy as np", | |
| "", | |
| "def side_by_side(*objs, **kwds):", | |
| " from pandas.core.common import adjoin", | |
| " space = kwds.get('space', 4)", | |
| " reprs = [repr(obj).split('\\n') for obj in objs]", | |
| " print adjoin(space, *reprs)", | |
| "", | |
| "plt.rc('figure', figsize=(10, 6))", | |
| "pandas.set_printoptions(notebook_repr_html=False)" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 1 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "Series", | |
| "======" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "labels = ['a', 'b', 'c', 'd', 'e']", | |
| "s = Series(randn(5), index=labels)", | |
| "s" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 2 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "'b' in s" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 3 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "s['b']" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 4 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "s" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 5 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "mapping = s.to_dict()", | |
| "mapping" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 6 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "s = Series(mapping)", | |
| "s" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 7 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "s[:3]" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 8 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "s.index" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 9 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "DataFrame: 2D collection of Series", | |
| "==================================" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "df = DataFrame({'a': np.random.randn(6),", | |
| " 'b': ['foo', 'bar'] * 3,", | |
| " 'c': np.random.randn(6)})", | |
| "df" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 10 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "df.index" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 11 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "df.columns" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 12 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "df = DataFrame({'a': np.random.randn(6),", | |
| " 'b': ['foo', 'bar'] * 3,", | |
| " 'c': np.random.randn(6)},", | |
| " index=DateRange('1/1/2000', periods=6))", | |
| "df" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 13 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "df = DataFrame({'a': np.random.randn(6),", | |
| " 'b': ['foo', 'bar'] * 3,", | |
| " 'c': np.random.randn(6)},", | |
| " columns=['a', 'b', 'c', 'd'])", | |
| "df" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 14 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "Creation from nested dicts", | |
| "--------------------------", | |
| "", | |
| "Nested dicts like these arise naturally in Python code." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "data = {}", | |
| "for col in ['foo', 'bar', 'baz']:", | |
| " for row in ['a', 'b', 'c', 'd']:", | |
| " data.setdefault(col, {})[row] = randn()", | |
| "data" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 15 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "DataFrame(data)" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 16 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "Data alignment", | |
| "==============" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": true, | |
| "input": [ | |
| "close_px = read_csv('stock_data.csv', index_col=0, parse_dates=True)" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 17 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "close_px" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 18 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "s1 = close_px['AAPL'][-20:]", | |
| "s2 = close_px['AAPL'][-25:-10]", | |
| "side_by_side(s1, s2)" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 19 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "s1 + s2" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 20 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "df = close_px.ix[-10:, :3]", | |
| "df" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 21 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "side_by_side(s1.reindex(s2.index), s2)" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 22 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "b, c = s1.align(s2, join='inner')", | |
| "side_by_side(b, c)" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 23 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "b, c = s1.align(s2, join='outer')", | |
| "side_by_side(b, c)" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 24 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "b, c = s1.align(s2, join='right')", | |
| "side_by_side(b, c)" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 25 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "df = close_px.ix[-10:, ['AAPL', 'IBM', 'MSFT']]", | |
| "df" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 26 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "df2 = df.ix[::2, ['IBM', 'MSFT']]", | |
| "side_by_side(df, df2)" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 27 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "df + df2" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 28 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "b, c = df.align(df2, join='inner')", | |
| "side_by_side(b, c) " | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 29 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "Transposing: no copy if all columns are the same type", | |
| "-----------------------------------------------------" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "df[:5].T" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 30 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "Columns can be any type", | |
| "-----------------------" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "n = 10", | |
| "foo = DataFrame(index=range(n))", | |
| "foo['floats'] = np.random.randn(n)", | |
| "foo['ints'] = np.arange(n)", | |
| "foo['strings'] = ['foo', 'bar'] * (n / 2)", | |
| "foo['bools'] = foo['floats'] > 0", | |
| "foo['objects'] = DateRange('1/1/2000', periods=n)", | |
| "foo" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 31 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "foo.dtypes" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 32 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
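| "N.B. transposing is not round-trippable in this case: DataFrame is a column-oriented data structure, so transposing columns of mixed types does not preserve the original column dtypes (compare foo.dtypes with foo.T.T.dtypes)" | |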
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "foo.T.T" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 33 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "foo.T.T.dtypes" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 34 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "Function application", | |
| "====================", | |
| "", | |
| "You can apply arbitrary functions to the rows or columns of a DataFrame" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "df.apply(np.mean)" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 35 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "df.apply(np.mean, axis=1)" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 36 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "You can get as fancy as you want" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "close_px" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 37 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "def peak_date(series):", | |
| " return series.index[series.argmax()]", | |
| "close_px.apply(peak_date)" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 38 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "df.apply(lambda x: x.max() - x.min()) # np.ptp" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 39 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "np.log(close_px)" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 40 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "Plotting", | |
| "========", | |
| "", | |
| "Series and DataFrame provide basic plotting integration with matplotlib." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "close_px[['AAPL', 'IBM', 'MSFT', 'XOM']].plot()" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 41 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "rets.ix[-1]" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 42 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "close_px.ix[-1].plot(kind='bar')", | |
| "title('Prices on %s' % close_px.index[-1])", | |
| "axhline(0)" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 43 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "Hierarchical indexing", | |
| "---------------------" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],", | |
| " ['one', 'two', 'three']],", | |
| " labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],", | |
| " [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]])", | |
| "hdf = DataFrame(np.random.randn(10, 3), index=index,", | |
| " columns=['A', 'B', 'C'])", | |
| "hdf" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 44 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "hdf.ix['foo']" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 45 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "hdf.ix['foo'] = 0", | |
| "hdf" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 46 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "hdf.ix['foo', 'three']" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 47 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "Stacking and unstacking", | |
| "-----------------------" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "tuples = zip(*[['bar', 'bar', 'baz', 'baz',", | |
| " 'foo', 'foo', 'qux', 'qux'],", | |
| " ['one', 'two', 'one', 'two',", | |
| " 'one', 'two', 'one', 'two']])", | |
| "index = MultiIndex.from_tuples(tuples)", | |
| "columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'),", | |
| " ('B', 'cat'), ('A', 'dog')])", | |
| "df = DataFrame(randn(8, 4), index=index, columns=columns)", | |
| "df" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 48 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "df2 = df.ix[[0, 1, 2, 4, 5, 7]]", | |
| "df2" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 49 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "df.unstack()['B']" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 50 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "GroupBy", | |
| "=======" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',", | |
| " 'foo', 'bar', 'foo', 'foo'],", | |
| " 'B' : ['one', 'one', 'two', 'three',", | |
| " 'two', 'two', 'one', 'three'],", | |
| " 'C' : np.random.randn(8),", | |
| " 'D' : np.random.randn(8)})", | |
| "df" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 51 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "for key, group in df.groupby('A'):", | |
| " print key", | |
| " print group" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 52 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "df.groupby('A')['C'].describe().T" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 53 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "df.groupby('A').mean()" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 54 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "for key, group in df.groupby('A'):", | |
| " print key", | |
| " print group" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 55 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "df.groupby(['A', 'B']).mean()" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 56 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "df.groupby(['A', 'B'], as_index=False).mean()" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 57 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "GroupBy example: linear regression by group", | |
| "-------------------------------------------" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "import scikits.statsmodels.api as sm", | |
| "rets = close_px / close_px.shift(1) - 1", | |
| "", | |
| "def get_beta(rets):", | |
| " rets = rets.dropna()", | |
| " rets['intercept'] = 1.", | |
| " model = sm.OLS(rets['MSFT'], rets.ix[:, ['AAPL', 'intercept']]).fit()", | |
| " return model.params", | |
| "", | |
| "get_beta(rets)" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 58 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "grouped = rets.groupby([lambda x: x.year, lambda x: x.month])", | |
| "beta_by_ym = grouped.apply(get_beta)", | |
| "beta_by_ym" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 59 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "beta_by_ym.unstack(0)['AAPL']" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 60 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "GroupBy with hierarchical indexing", | |
| "----------------------------------" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "tuples = zip(*[['bar', 'bar', 'baz', 'baz',", | |
| " 'foo', 'foo', 'qux', 'qux'],", | |
| " ['one', 'two', 'one', 'two',", | |
| " 'one', 'two', 'one', 'two']])", | |
| "index = MultiIndex.from_tuples(tuples)", | |
| "columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'),", | |
| " ('B', 'cat'), ('A', 'dog')])", | |
| "df = DataFrame(randn(8, 4), index=index, columns=columns)", | |
| "df" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 61 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "df.groupby(level=0, axis=0).mean()" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 62 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "df.stack()" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 63 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "df.stack().mean(1).unstack()" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 64 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# could also have done", | |
| "df.groupby(level=1, axis=1).mean()" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 65 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": true, | |
| "input": [], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": " " | |
| } | |
| ] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment