[TOC]

Apply function to column

df['TitleLength'] = df.Title.apply(len)

df.columns = df.columns.str.lower() # lowercase all column names

Apply A Function (Rolling Mean) To The DataFrame, By Group

Group df by df.Platoon, then apply a rolling-mean lambda to df.Casualties

df.groupby('Platoon')['Casualties'].apply(lambda x:x.rolling(center=False,window=2).mean())
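
A quick sketch of what this produces, with a made-up Platoon/Casualties frame:

import pandas as pd

df = pd.DataFrame({'Platoon': ['A', 'A', 'A', 'B', 'B', 'B'],
                   'Casualties': [1, 4, 5, 7, 5, 5]})
# Within each platoon the window-2 rolling mean starts with NaN,
# because the first row has no preceding value to average with.
df.groupby('Platoon')['Casualties'].apply(lambda x: x.rolling(window=2).mean())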

Apply to all cells (drop string columns first)

Drop the string variable so that applymap() can run

df = df.drop('name', axis=1)

Return the square root of every cell in the dataframe

df.applymap(np.sqrt)

Slicing

Select column A where column B equals a given value

pd_train.loc[pd_train['AnimalType'] == 'Dog',feature]

Select subset of columns

df[df.columns[1:]]  # all columns except the first


# note the leading spaces in the category labels
sliced = data[data.workclass.isin([' Local-gov', ' State-gov'])
              & (data.education_num > 1)][['age', 'workclass']]
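
The same slice can also be written with DataFrame.query, which reads closer to SQL (a sketch against the same hypothetical data):

sliced = data.query(
    "workclass in [' Local-gov', ' State-gov'] and education_num > 1"
)[['age', 'workclass']]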


The equivalent subset in R:

cities <- c("Adelaide", "Brisbane", "Canberra", "Darwin")
dss    <- subset(ds, location %in% cities)

Replacing values

df.replace(-999, np.nan) # replace every -999 with NaN

Joining / Merging

df_new = pd.concat([df_a, df_b]) # Join along rows
pd.concat([df_a, df_b], axis=1) # Join along columns
pd.merge(df_new, df_n, on='subject_id') # merge two dataframes on the subject_id column
pd.merge(df_a, df_b, on='subject_id', how='outer') # outer join
pd.merge(df_a, df_b, on='subject_id', how='inner') # inner join
pd.merge(df_a, df_b, on='subject_id', how='right') # right join
pd.merge(df_a, df_b, on='subject_id', how='left')  # left join
pd.merge(df_a, df_b, on='subject_id', how='left', suffixes=('_left', '_right')) # add suffixes
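
A self-contained sketch with made-up frames makes the join types concrete (subject_id 2 appears only in df_a, 3 only in df_b):

import pandas as pd

df_a = pd.DataFrame({'subject_id': [1, 2], 'first_name': ['Alex', 'Amy']})
df_b = pd.DataFrame({'subject_id': [1, 3], 'test_score': [51, 62]})

pd.merge(df_a, df_b, on='subject_id', how='inner') # 1 row: only subject_id 1 is in both
pd.merge(df_a, df_b, on='subject_id', how='outer') # 3 rows: unmatched cells become NaN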

Renaming

data.index = data.index.map(str.upper)
data.rename(index=str.title, columns=str.upper, inplace=True)  # str.title capitalizes only the first letter of each word

Create new column

with a dict of values

df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)
city_to_state = { 'San Francisco' : 'California', 
                  'Baltimore' : 'Maryland'}
df['state'] = df['city'].map(city_to_state)

2 columns -> 1

def pre_post_difference(pre, post):
    return post - pre
df['score_change'] = pre_post_difference(df['preTestScore'], df['postTestScore'])

1 column -> 2

def score_multiplier_2x_and_3x(x):
    return x*2, x*3
df['post_score_x2'], df['post_score_x3'] = zip(*df['postTestScore'].map(score_multiplier_2x_and_3x))
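
An alternative to the zip trick: have the mapped function return a pd.Series, then pick the pieces off the resulting frame (a sketch reusing the function above):

expanded = df['postTestScore'].apply(lambda x: pd.Series(score_multiplier_2x_and_3x(x)))
df['post_score_x2'] = expanded[0]
df['post_score_x3'] = expanded[1]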

Arithmetic

data['avgWait'] = data.moSinceFirst / data.numDonations # element-wise division

New column with if/else on other

Create a new column called df.elderly whose value is 'yes' when df.age is 50 or above, and 'no' otherwise

df['elderly'] = np.where(df['age']>=50, 'yes', 'no')
df['color'] = np.where(df['Set']=='Z', 'green', 'red')
df['Age'] = df['Age'].map(lambda x: age_mean if x == -1 else x) # replace -1 sentinels with a precomputed age_mean

df['coverageRanked'] = df['coverage'].rank(ascending=True) # rank coverage values in ascending order

Multiple conditions

train_file['DayGroup'] = 'weekday'
train_file.loc[(train_file['WeekDay'] == 5) | (train_file['WeekDay'] == 6), 'DayGroup'] = 'weekend' # .loc avoids chained-indexing assignment
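
When there are more than two buckets, np.select scales better than stacked masks (a sketch; the Friday bucket is invented):

import numpy as np

conditions = [train_file['WeekDay'].isin([5, 6]),
              train_file['WeekDay'] == 4]
choices = ['weekend', 'friday']
train_file['DayGroup'] = np.select(conditions, choices, default='weekday')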

Convert A String Categorical Variable To A Numeric Variable

def score_to_numeric(x):
    if x=='strong':
        return 3
    if x=='normal':
        return 2
    if x=='weak':
        return 1
        
df['score_num'] = df['score'].apply(score_to_numeric)
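
The same conversion is often written as a dict passed to map; unknown scores come back as NaN instead of None:

df['score_num'] = df['score'].map({'strong': 3, 'normal': 2, 'weak': 1})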

Regex

df['female'] = df['raw'].str.extract(r'(\d)', expand=True)  # extract the first single digit
df['date'] = df['raw'].str.extract(r'(....-..-..)', expand=True) # extract dates like 2014-12-23
df['score'] = df['raw'].str.extract(r'(\d\d\d\d\.\d)', expand=True) # extract numbers like 3242.0
df['state'] = df['raw'].str.extract(r'([A-Z]\w{0,})', expand=True) # extract the first capitalized word
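
A quick check on an invented raw column shows what str.extract returns (a one-column DataFrame when expand=True):

import pandas as pd

raw = pd.Series(['Arizona 1 2014-12-23    3242.0',
                 'Iowa 1 2010-02-23       3453.7'])
raw.str.extract(r'(\d\d\d\d-\d\d-\d\d)', expand=True) # column 0 holds the dates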

Special

Split the geo variable into separate lat and lon variables

lat = []
lon = []
# For each row in the geo variable,
for row in df['geo']:
    try:
        lat.append(row.split(',')[0])
        lon.append(row.split(',')[1])
    except (AttributeError, IndexError): # missing or malformed value
        lat.append(np.nan)
        lon.append(np.nan)
# Create two new columns from lat and lon
df['latitude'] = lat
df['longitude'] = lon
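
A vectorized alternative, assuming each non-missing geo value is a single 'lat,lon' string: str.split with expand=True handles the whole column at once and keeps NaN for missing rows.

parts = df['geo'].str.split(',', expand=True)
df['latitude'] = parts[0]
df['longitude'] = parts[1]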

Converting columns

Dataframe to array

outcome_dog = pd_train.loc[pd_train['AnimalType'] == 'Dog','OutcomeType']
outcome_dog = np.array(outcome_dog)

Numeric -> Category

Convert the countrycode variable into three binary variables

patsy.dmatrix('C(countrycode)-1', df, return_type='dataframe')
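
pandas can produce the same indicator columns without patsy (one column per distinct country code):

pd.get_dummies(df['countrycode'], prefix='country')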

String -> Date

df['date'] = pd.to_datetime(df['date'])

Column names

In R:

names(train) <- c('last_don','no_don','total_vol_don','mon_since_first_don','don_2007')

In pandas:

df.rename(columns={'Leader': 'Commander'}, inplace=True)

Dummy variables

Get dummy variables for Embarked

df = pd.concat([df, pd.get_dummies(df['Embarked'], prefix='Embarked')], axis=1)

Binning data

Define the bin ranges

bins = [0, 25, 50, 75, 100]

Create names for the four groups

group_names = ['Low', 'Okay', 'Good', 'Great']

Cut postTestScore

categories = pd.cut(df['postTestScore'], bins, labels=group_names)
df['categories'] = pd.cut(df['postTestScore'], bins, labels=group_names)
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins) 
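
pd.cut returns a Categorical; value_counts is the quickest way to see how the ages fell into the bins:

pd.value_counts(cats) # e.g. (18, 25] -> 5, (25, 35] -> 3, (35, 60] -> 3, (60, 100] -> 1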

Time series

Set the time as the index

df.index = df['date']
del df['date']
# so that we can do
df['2014']
df['2014-05']
from datetime import datetime
df[datetime(2014, 5, 3):] # on or after the 3rd of May
df['5/3/2014':'5/4/2014'] # between the 3rd and the 4th of May

Truncate the dataframe

df.truncate(before='1/2/2014', after='1/3/2014') #time series

Count the number of observations per timestamp

df.groupby(level=0).count()

df.resample('D').mean() # mean value per day
df.resample('D').sum() # daily sum
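
A self-contained sketch: build an hourly series, then downsample to daily means.

import numpy as np
import pandas as pd

idx = pd.date_range('2014-05-01', periods=48, freq='H')
ts = pd.Series(np.arange(48), index=idx)
ts.resample('D').mean() # one mean per calendar day: 11.5 and 35.5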