Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save fabsta/d256ba38572629e30dbcb25e09d94ce0 to your computer and use it in GitHub Desktop.
Save fabsta/d256ba38572629e30dbcb25e09d94ce0 to your computer and use it in GitHub Desktop.

[TOC]

Selecting

Cells

Y = data['TV'] # column
Y = data.TV
df.ix['Arizona', 2]  # Select the third cell in the row named Arizona
df.ix[2, 'deaths']  # Select the third cell down in the column named deaths
df[:2] # first two rows
df.ix['Maricopa'] # view a row
df.ix[:, 'coverage'] # view a column
df.ix['Yuma', 'coverage'] # view the value based on a row and column

Rows

by index

df.iloc[:2] # rows by row number
df.iloc[1:2] # Select the second and third row
df.iloc[2:] # Select every row after the third row
df.iloc[3:6,0:3] #  

by label

df.loc[:'Arizona'] # all rows by index label
df.ix[['Arizona', 'Texas']]   # .ix is the combination of both .loc and .iloc. Integers are first considered labels,if not found, falls back on pos indexing

conditional

df.query('A > C')
df.query('A > 0') 
df.query('A > 0 & A < 1')
df.query('A > B | A > C')
df[df['coverage'] > 50] # all rows where coverage is more than 50
df[(df['deaths'] > 500) | (df['deaths'] < 50)]
df[(df['score'] > 1) & (df['score'] < 5)]
df[~(df['regiment'] == 'Dragoons')] # Select all the regiments not named "Dragoons"

df[df['age'].notnull() & df['sex'].notnull()]  # ignore the missing data points

is in

df[df.name.isin(value_list)]   # value_list = ['Tina', 'Molly', 'Jason']
df[~df.name.isin(value_list)]

partial matching

df2[df2.E.str.contains("tw|ou")]

regex

df['raw'].str.contains('....-..-..', regex=True)  # regex

where cells are arrays

df[df['country'].map(lambda country: 'Syria' in country)]

random

df.take(np.random.permutation(len(df))[:2])

## Cell ranges

```python
df['x3'][:1]  # take column 'x3', 

Columns

df.iloc[:,:2] # Select the first 2 columns
feature_cols = ['TV','Radio','Newspaper']
x = data[feature_cols]
data[['TV','Radio','Newspaper']]

by column labels

df.loc[:,['A','B']]  # syntax is: df.loc[rows_index, cols_index]

conditional

df.filter(like='data')
df['preTestScore'].where(df['postTestScore'] > 50) # Find where a value exists in a column

Sorting

df.sort_values(by='reports', ascending=0)  # Sort the dataframe's rows by reports, in descending order
df.sort_values(by=['coverage', 'reports'])  # Sort the dataframe's rows by coverage and then by reports, in ascending order

Exploring

general

data.head()
data.tail()
data.tail().transpose()

dimensions

data.shape()

List unique values in the df['name'] column

df.name.unique()

Stats

Descriptive statistics by group

data['preTestScore'].groupby(df['company']).describe()
data['preTestScore'].describe()

Count the number of non-NA values

data['preTestScore'].count()
data['preTestScore'].min()

Value counts

data.group.value_counts()

Correlation Matrix Of Values

df.corr()

pd.rolling_mean(df, 2)  # Calculate the moving average. That is, take
                        # the first two values, average them, 
                        # then drop the first and add the third, etc.

Mean preTestScores grouped by regiment and company

#regiment    company
#Dragoons    1st         3.5
#            2nd        27.5
#Nighthawks  1st        14.0
#            2nd        16.5
#Scouts      1st         2.5
#            2nd         2.5
#dtype: float64
data['preTestScore'].groupby([df['regiment'], data['company']]).mean()

Mean preTestScores grouped by regiment and company without heirarchical indexing

#company	1st	2nd
#regiment		
#Dragoons	3.5	27.5
#Nighthawks	14.0	16.5
#Scouts	2.5	2.5
data['preTestScore'].groupby([data['regiment'], data['company']]).mean().unstack()

Group the entire dataframe by regiment and company

#preTestScore	postTestScore
#regiment	company		
#Dragoons	1st	3.5	47.5
#2nd	27.5	75.5
#Nighthawks	1st	14.0	59.5
#2nd	16.5	59.5
#Scouts	1st	2.5	66.0
#2nd	2.5	66.0
df.groupby(['regiment', 'company']).mean()

Count the number of times each number of deaths occurs in each regiment

 Input
	guardCorps	corps1	corps2	corps3	corps4	corps5	corps6	corps7	corps8	corps9	corps10	corps11	corps14	corps15
 1875	0	0	0	0	0	0	0	1	1	0	0	0	1	0
 1876	2	0	0	0	1	0	0	0	0	0	0	0	1	1
 1877	2	0	0	0	0	0	1	1	0	0	1	0	2	0
result = horsekick.apply(pd.value_counts).fillna(0); result

Create a crosstab table by company and regiment

 regiment	company	experience	name	preTestScore	postTestScore
 0	Nighthawks	infantry	veteran	Miller	4	25
 1	Nighthawks	infantry	rookie	Jacobson	24	94
 2	Nighthawks	cavalry	veteran	Ali	31	57
 -->
 company	cavalry	infantry	All
 regiment			
 Dragoons	2	2	4
 Nighthawks	2	2	4

Counting the number of observations by regiment and category

pd.crosstab(df.regiment, df.company, margins=True)

?

df['preTestScore'].idxmax() # row with max value
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment