Created
July 21, 2016 07:57
-
-
Save fabsta/c7f7d8f97d558d6ab54cc18b9e88de2d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### single | |
Y = data['TV'] # column | |
Y = data.TV | |
df[:2] # first two rows | |
df.ix['Maricopa'] # view a row | |
df.ix[:, 'coverage'] # view a column | |
df.ix['Yuma', 'coverage'] # view the value based on a row and column | |
## select rows | |
# by index | |
df.iloc[:2] # rows by row number | |
df.iloc[1:2] # Select the second and third row | |
df.iloc[2:] # Select every row after the third row | |
df.iloc[3:6,0:3] # | |
# by label | |
df.loc[:'Arizona'] # all rows by index label | |
df.ix[['Arizona', 'Texas']] # .ix is the combination of both .loc and .iloc. Integers are first considered labels,if not found, falls back on pos indexing | |
# conditional | |
df.query('A > C') | |
df.query('A > 0') | |
df.query('A > 0 & A < 1') | |
df.query('A > B | A > C') | |
df[df['coverage'] > 50] # all rows where coverage is more than 50 | |
df[(df['deaths'] > 500) | (df['deaths'] < 50)] | |
df[(df['score'] > 1) & (df['score'] < 5)] | |
df[~(df['regiment'] == 'Dragoons')] # Select all the regiments not named "Dragoons" | |
df[df['age'].notnull() & df['sex'].notnull()] # ignore the missing data points | |
# is in | |
df[df.name.isin(value_list)] # value_list = ['Tina', 'Molly', 'Jason'] | |
df[~df.name.isin(value_list)] | |
# partial matching | |
df2[df2.E.str.contains("tw|ou")] | |
# regex | |
df['raw'].str.contains('....-..-..', regex=True) # regex | |
# where cells are arrays | |
df[df['country'].map(lambda country: 'Syria' in country)] | |
# random | |
df.take(np.random.permutation(len(df))[:2]) | |
# select cells | |
df.ix['Arizona', 2] # Select the third cell in the row named Arizona | |
df.ix[2, 'deaths'] # Select the third cell down in the column named deaths | |
# select cell ranges | |
df['x3'][:1] # take column 'x3', | |
# select columns | |
df.iloc[:,:2] # Select the first 2 columns | |
feature_cols = ['TV','Radio','Newspaper'] | |
x = data[feature_cols] | |
data[['TV','Radio','Newspaper']] | |
# column labels | |
df.loc[:,['A','B']] # syntax is: df.loc[rows_index, cols_index] | |
# conditional | |
df.filter(like='data') | |
df['preTestScore'].where(df['postTestScore'] > 50) # Find where a value exists in a column | |
### sorting | |
df.sort_values(by='reports', ascending=0) # Sort the dataframe's rows by reports, in descending order | |
df.sort_values(by=['coverage', 'reports']) # Sort the dataframe's rows by coverage and then by reports, in ascending order | |
# Exploring | |
### general | |
data.head() | |
data.tail() | |
data.tail().transpose() | |
# dimensions | |
data.shape() | |
#List unique values in the df['name'] column | |
df.name.unique() | |
#### | |
# Stats | |
#### | |
# Descriptive statistics by group | |
data['preTestScore'].groupby(df['company']).describe() | |
data['preTestScore'].describe() | |
# Count the number of non-NA values | |
data['preTestScore'].count() | |
data['preTestScore'].min() | |
# value counts | |
data.group.value_counts() | |
# Correlation Matrix Of Values | |
df.corr() | |
pd.rolling_mean(df, 2) # Calculate the moving average. That is, take | |
# the first two values, average them, | |
# then drop the first and add the third, etc. | |
##### | |
# Mean preTestScores grouped by regiment and company | |
##### | |
#regiment company | |
#Dragoons 1st 3.5 | |
# 2nd 27.5 | |
#Nighthawks 1st 14.0 | |
# 2nd 16.5 | |
#Scouts 1st 2.5 | |
# 2nd 2.5 | |
#dtype: float64 | |
data['preTestScore'].groupby([df['regiment'], data['company']]).mean() | |
#### | |
# Mean preTestScores grouped by regiment and company without heirarchical indexing | |
#### | |
#company 1st 2nd | |
#regiment | |
#Dragoons 3.5 27.5 | |
#Nighthawks 14.0 16.5 | |
#Scouts 2.5 2.5 | |
data['preTestScore'].groupby([data['regiment'], data['company']]).mean().unstack() | |
##### | |
# Group the entire dataframe by regiment and company | |
##### | |
#preTestScore postTestScore | |
#regiment company | |
#Dragoons 1st 3.5 47.5 | |
#2nd 27.5 75.5 | |
#Nighthawks 1st 14.0 59.5 | |
#2nd 16.5 59.5 | |
#Scouts 1st 2.5 66.0 | |
#2nd 2.5 66.0 | |
df.groupby(['regiment', 'company']).mean() | |
# Count the number of times each number of deaths occurs in each regiment | |
# Input | |
# guardCorps corps1 corps2 corps3 corps4 corps5 corps6 corps7 corps8 corps9 corps10 corps11 corps14 corps15 | |
# 1875 0 0 0 0 0 0 0 1 1 0 0 0 1 0 | |
# 1876 2 0 0 0 1 0 0 0 0 0 0 0 1 1 | |
# 1877 2 0 0 0 0 0 1 1 0 0 1 0 2 0 | |
result = horsekick.apply(pd.value_counts).fillna(0); result | |
# Create a crosstab table by company and regiment | |
# regiment company experience name preTestScore postTestScore | |
# 0 Nighthawks infantry veteran Miller 4 25 | |
# 1 Nighthawks infantry rookie Jacobson 24 94 | |
# 2 Nighthawks cavalry veteran Ali 31 57 | |
# --> | |
# company cavalry infantry All | |
# regiment | |
# Dragoons 2 2 4 | |
# Nighthawks 2 2 4 | |
# Counting the number of observations by regiment and category | |
pd.crosstab(df.regiment, df.company, margins=True) | |
df['preTestScore'].idxmax() # row with max value | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment