Created
October 17, 2012 08:55
-
-
Save wesm/3904544 to your computer and use it in GitHub Desktop.
Foo
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "Basics" | |
}, | |
"nbformat": 2, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": true, | |
"input": [ | |
"from pandas import *", | |
"import pandas", | |
"import numpy as np", | |
"import matplotlib.pyplot as plt", | |
"from numpy.random import randn", | |
"", | |
"def side_by_side(*objs, **kwds):", | |
"    '''Print the reprs of ``objs`` together via pandas' ``adjoin`` helper.", | |
"", | |
"    Each object's repr is split into a list of lines; the lists are handed", | |
"    to ``adjoin``, which is expected to lay them out in adjacent columns.", | |
"", | |
"    Keyword arguments:", | |
"    space -- passed to ``adjoin`` as its first argument (default 4)", | |
"    '''", | |
"    from pandas.core.common import adjoin", | |
"    space = kwds.get('space', 4)", | |
"    reprs = [repr(obj).split('\\n') for obj in objs]", | |
"    # Parenthesised single-argument print behaves the same on Python 2 and 3", | |
"    print(adjoin(space, *reprs))", | |
"", | |
"plt.rc('figure', figsize=(10, 6))", | |
"pandas.set_printoptions(notebook_repr_html=False)" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"Series", | |
"======" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"labels = ['a', 'b', 'c', 'd', 'e']", | |
"s = Series(randn(5), index=labels)", | |
"s" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"'b' in s" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"s['b']" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"s" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"mapping = s.to_dict()", | |
"mapping" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"s = Series(mapping)", | |
"s" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"s[:3]" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"s.index" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"DataFrame: 2D collection of Series", | |
"==================================" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df = DataFrame({'a': np.random.randn(6),", | |
" 'b': ['foo', 'bar'] * 3,", | |
" 'c': np.random.randn(6)})", | |
"df" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df.index" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df.columns" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 12 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df = DataFrame({'a': np.random.randn(6),", | |
" 'b': ['foo', 'bar'] * 3,", | |
" 'c': np.random.randn(6)},", | |
" index=DateRange('1/1/2000', periods=6))", | |
"df" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 13 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df = DataFrame({'a': np.random.randn(6),", | |
" 'b': ['foo', 'bar'] * 3,", | |
" 'c': np.random.randn(6)},", | |
" columns=['a', 'b', 'c', 'd'])", | |
"df" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 14 | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"Creation from nested dicts", | |
"--------------------------", | |
"", | |
"Nested dicts of the form `{column: {row: value}}` arise naturally in Python code." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Build {column: {row: value}} with one random draw per (column, row) pair", | |
"data = {}", | |
"for col in ['foo', 'bar', 'baz']:", | |
"    for row in ['a', 'b', 'c', 'd']:", | |
"        data.setdefault(col, {})[row] = np.random.randn()", | |
"data" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 15 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"DataFrame(data)" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 16 | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"Data alignment", | |
"==============" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": true, | |
"input": [ | |
"close_px = read_csv('stock_data.csv', index_col=0, parse_dates=True)" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 17 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"close_px" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 18 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"s1 = close_px['AAPL'][-20:]", | |
"s2 = close_px['AAPL'][-25:-10]", | |
"side_by_side(s1, s2)" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 19 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"s1 + s2" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 20 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df = close_px.ix[-10:, :3]", | |
"df" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 21 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"side_by_side(s1.reindex(s2.index), s2)" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 22 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"b, c = s1.align(s2, join='inner')", | |
"side_by_side(b, c)" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 23 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"b, c = s1.align(s2, join='outer')", | |
"side_by_side(b, c)" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 24 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"b, c = s1.align(s2, join='right')", | |
"side_by_side(b, c)" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 25 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df = close_px.ix[-10:, ['AAPL', 'IBM', 'MSFT']]", | |
"df" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 26 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df2 = df.ix[::2, ['IBM', 'MSFT']]", | |
"side_by_side(df, df2)" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 27 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df + df2" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 28 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"b, c = df.align(df2, join='inner')", | |
"side_by_side(b, c) " | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 29 | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"Transposing: no copy if all columns are same type", | |
"-------------------------------------------------" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df[:5].T" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 30 | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"Columns can be any type", | |
"-----------------------" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# One column per value kind so the following cells can inspect the dtypes", | |
"n = 10", | |
"foo = DataFrame(index=range(n))", | |
"foo['floats'] = np.random.randn(n)", | |
"foo['ints'] = np.arange(n)", | |
"# Floor division keeps the repeat count an int on Python 3 as well as Python 2", | |
"foo['strings'] = ['foo', 'bar'] * (n // 2)", | |
"foo['bools'] = foo['floats'] > 0", | |
"foo['objects'] = DateRange('1/1/2000', periods=n)", | |
"foo" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 31 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"foo.dtypes" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 32 | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"N.B. Transposing does not round-trip in this case: DataFrame is a column-oriented data structure, so the per-column types of a mixed-type frame are not restored by transposing twice." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"foo.T.T" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 33 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"foo.T.T.dtypes" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 34 | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"Function application", | |
"====================", | |
"", | |
"You can apply arbitrary functions to the rows or columns of a DataFrame" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df.apply(np.mean)" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 35 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df.apply(np.mean, axis=1)" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 36 | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"You can get as fancy as you want" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"close_px" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 37 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def peak_date(series):", | |
"    '''Return the index label at which ``series`` attains its maximum.'''", | |
"    # idxmax yields the label directly, so the result does not depend on", | |
"    # whether argmax returns a position or a label in the installed pandas", | |
"    return series.idxmax()", | |
"close_px.apply(peak_date)" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 38 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df.apply(lambda x: x.max() - x.min()) # np.ptp" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 39 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"np.log(close_px)" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 40 | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"Plotting", | |
"========", | |
"", | |
"Series and DataFrame offer some basic plotting integration with matplotlib." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"close_px[['AAPL', 'IBM', 'MSFT', 'XOM']].plot()" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 41 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Row-over-row simple returns; computed here because this cell reads `rets`", | |
"# before the regression section further down assigns it", | |
"rets = close_px / close_px.shift(1) - 1", | |
"rets.ix[-1]" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 42 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Bar chart of the last row of close_px, titled with that row's index label", | |
"close_px.ix[-1].plot(kind='bar')", | |
"# Go through the plt namespace instead of bare names injected by pylab mode", | |
"plt.title('Prices on %s' % close_px.index[-1])", | |
"plt.axhline(0)" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 43 | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"Hierarchical indexing", | |
"---------------------" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],", | |
" ['one', 'two', 'three']],", | |
" labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],", | |
" [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]])", | |
"hdf = DataFrame(np.random.randn(10, 3), index=index,", | |
" columns=['A', 'B', 'C'])", | |
"hdf" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 44 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"hdf.ix['foo']" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 45 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"hdf.ix['foo'] = 0", | |
"hdf" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 46 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"hdf.ix['foo', 'three']" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 47 | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"Stacking and unstacking", | |
"-----------------------" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"tuples = zip(*[['bar', 'bar', 'baz', 'baz',", | |
" 'foo', 'foo', 'qux', 'qux'],", | |
" ['one', 'two', 'one', 'two',", | |
" 'one', 'two', 'one', 'two']])", | |
"index = MultiIndex.from_tuples(tuples)", | |
"columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'),", | |
" ('B', 'cat'), ('A', 'dog')])", | |
"df = DataFrame(randn(8, 4), index=index, columns=columns)", | |
"df" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 48 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df2 = df.ix[[0, 1, 2, 4, 5, 7]]", | |
"df2" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 49 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df.unstack()['B']" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 50 | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"GroupBy", | |
"=======" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',", | |
" 'foo', 'bar', 'foo', 'foo'],", | |
" 'B' : ['one', 'one', 'two', 'three',", | |
" 'two', 'two', 'one', 'three'],", | |
" 'C' : np.random.randn(8),", | |
" 'D' : np.random.randn(8)})", | |
"df" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 51 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Iterating a GroupBy yields (key, sub-frame) pairs", | |
"for key, group in df.groupby('A'):", | |
"    # Parenthesised single-argument print behaves the same on Python 2 and 3", | |
"    print(key)", | |
"    print(group)" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 52 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df.groupby('A')['C'].describe().T" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 53 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df.groupby('A').mean()" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 54 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Iterating a GroupBy yields (key, sub-frame) pairs", | |
"for key, group in df.groupby('A'):", | |
"    # Parenthesised single-argument print behaves the same on Python 2 and 3", | |
"    print(key)", | |
"    print(group)" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 55 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df.groupby(['A', 'B']).mean()" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 56 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df.groupby(['A', 'B'], as_index=False).mean()" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 57 | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"GroupBy example: linear regression by group", | |
"-------------------------------------------" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import scikits.statsmodels.api as sm", | |
"rets = close_px / close_px.shift(1) - 1", | |
"", | |
"def get_beta(rets):", | |
"    '''Fit OLS of the 'MSFT' column on 'AAPL' plus a constant term.", | |
"", | |
"    Rows containing missing values are dropped first, and a column of ones", | |
"    named 'intercept' is added to the cleaned copy to act as the constant.", | |
"    Returns the ``params`` attribute of the fitted model.", | |
"    '''", | |
"    clean = rets.dropna()", | |
"    clean['intercept'] = 1.", | |
"    regressors = clean.ix[:, ['AAPL', 'intercept']]", | |
"    return sm.OLS(clean['MSFT'], regressors).fit().params", | |
"", | |
"get_beta(rets)" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 58 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"grouped = rets.groupby([lambda x: x.year, lambda x: x.month])", | |
"beta_by_ym = grouped.apply(get_beta)", | |
"beta_by_ym" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 59 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"beta_by_ym.unstack(0)['AAPL']" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 60 | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"GroupBy with hierarchical indexing", | |
"----------------------------------" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"tuples = zip(*[['bar', 'bar', 'baz', 'baz',", | |
" 'foo', 'foo', 'qux', 'qux'],", | |
" ['one', 'two', 'one', 'two',", | |
" 'one', 'two', 'one', 'two']])", | |
"index = MultiIndex.from_tuples(tuples)", | |
"columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'),", | |
" ('B', 'cat'), ('A', 'dog')])", | |
"df = DataFrame(randn(8, 4), index=index, columns=columns)", | |
"df" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 61 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df.groupby(level=0, axis=0).mean()" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 62 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df.stack()" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 63 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df.stack().mean(1).unstack()" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 64 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# could also have done", | |
"df.groupby(level=1, axis=1).mean()" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 65 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": true, | |
"input": [], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": " " | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment