[TOC]

Apply function to column

df['TitleLength'] = df.Title.apply(len)

df.columns = df.columns.str.lower() # lowercase all column names

Apply A Function (Rolling Mean) To The DataFrame, By Group

Group df by df.Platoon, then apply a rolling-mean lambda to df.Casualties

df.groupby('Platoon')['Casualties'].apply(lambda x:x.rolling(center=False,window=2).mean())
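
A quick sketch of what this produces, with a made-up Platoon/Casualties frame:

import pandas as pd

df = pd.DataFrame({'Platoon': ['A', 'A', 'A', 'B', 'B', 'B'],
                   'Casualties': [1, 4, 5, 7, 5, 5]})
# Within each platoon the window-2 rolling mean starts with NaN,
# because the first row has no preceding value to average with.
df.groupby('Platoon')['Casualties'].apply(lambda x: x.rolling(window=2).mean())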

Apply to all cells (drop string columns first)

Drop the string variable so that applymap() can run

df = df.drop('name', axis=1)

Return the square root of every cell in the dataframe

df.applymap(np.sqrt)

Slicing

Select column A where column B equals a given value

pd_train.loc[pd_train['AnimalType'] == 'Dog',feature]

Select subset of columns

df[df.columns[1:]]  # all columns except the first


# note the leading spaces in the category labels
sliced = data[data.workclass.isin([' Local-gov', ' State-gov'])
              & (data.education_num > 1)][['age', 'workclass']]
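
The same slice can also be written with DataFrame.query, which reads closer to SQL (a sketch against the same hypothetical data):

sliced = data.query(
    "workclass in [' Local-gov', ' State-gov'] and education_num > 1"
)[['age', 'workclass']]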


The equivalent subset in R:

cities <- c("Adelaide", "Brisbane", "Canberra", "Darwin")
dss    <- subset(ds, location %in% cities)

Replacing values

df.replace(-999, np.nan) # replace every -999 with NaN

Joining / Merging

df_new = pd.concat([df_a, df_b]) # Join along rows
pd.concat([df_a, df_b], axis=1) # Join along columns
pd.merge(df_new, df_n, on='subject_id') # merge two dataframes on the subject_id column
pd.merge(df_a, df_b, on='subject_id', how='outer') # outer join
pd.merge(df_a, df_b, on='subject_id', how='inner') # inner join
pd.merge(df_a, df_b, on='subject_id', how='right') # right join
pd.merge(df_a, df_b, on='subject_id', how='left')  # left join
pd.merge(df_a, df_b, on='subject_id', how='left', suffixes=('_left', '_right')) # add suffixes
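
A self-contained sketch with made-up frames makes the join types concrete (subject_id 2 appears only in df_a, 3 only in df_b):

import pandas as pd

df_a = pd.DataFrame({'subject_id': [1, 2], 'first_name': ['Alex', 'Amy']})
df_b = pd.DataFrame({'subject_id': [1, 3], 'test_score': [51, 62]})

pd.merge(df_a, df_b, on='subject_id', how='inner') # 1 row: only subject_id 1 is in both
pd.merge(df_a, df_b, on='subject_id', how='outer') # 3 rows: unmatched cells become NaN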

Renaming

data.index = data.index.map(str.upper)
data.rename(index=str.title, columns=str.upper, inplace=True)  # str.title capitalizes only the first letter of each word

Create new column

with a dict of values

df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)
city_to_state = { 'San Francisco' : 'California', 
                  'Baltimore' : 'Maryland'}
df['state'] = df['city'].map(city_to_state)

2 columns -> 1

def pre_post_difference(pre, post):
    return post - pre
df['score_change'] = pre_post_difference(df['preTestScore'], df['postTestScore'])

1 column -> 2

def score_multiplier_2x_and_3x(x):
    return x*2, x*3
df['post_score_x2'], df['post_score_x3'] = zip(*df['postTestScore'].map(score_multiplier_2x_and_3x))
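
An alternative to the zip trick: have the mapped function return a pd.Series, then pick the pieces off the resulting frame (a sketch reusing the function above):

expanded = df['postTestScore'].apply(lambda x: pd.Series(score_multiplier_2x_and_3x(x)))
df['post_score_x2'] = expanded[0]
df['post_score_x3'] = expanded[1]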

Arithmetic

data['avgWait'] = data.moSinceFirst / data.numDonations # element-wise division

New column with if/else on other

Create a new column called df.elderly whose value is 'yes' when df.age is 50 or above, and 'no' otherwise

df['elderly'] = np.where(df['age']>=50, 'yes', 'no')
df['color'] = np.where(df['Set']=='Z', 'green', 'red')
df['Age'] = df['Age'].map(lambda x: age_mean if x == -1 else x) # replace -1 sentinels with a precomputed age_mean

df['coverageRanked'] = df['coverage'].rank(ascending=True) # rank coverage values in ascending order

Multiple conditions

train_file['DayGroup'] = 'weekday'
train_file.loc[(train_file['WeekDay'] == 5) | (train_file['WeekDay'] == 6), 'DayGroup'] = 'weekend' # .loc avoids chained-indexing assignment
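
When there are more than two buckets, np.select scales better than stacked masks (a sketch; the Friday bucket is invented):

import numpy as np

conditions = [train_file['WeekDay'].isin([5, 6]),
              train_file['WeekDay'] == 4]
choices = ['weekend', 'friday']
train_file['DayGroup'] = np.select(conditions, choices, default='weekday')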

Convert A String Categorical Variable To A Numeric Variable

def score_to_numeric(x):
    if x=='strong':
        return 3
    if x=='normal':
        return 2
    if x=='weak':
        return 1
        
df['score_num'] = df['score'].apply(score_to_numeric)
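
The same conversion is often written as a dict passed to map; unknown scores come back as NaN instead of None:

df['score_num'] = df['score'].map({'strong': 3, 'normal': 2, 'weak': 1})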

Regex

df['female'] = df['raw'].str.extract(r'(\d)', expand=True)  # extract the first single digit
df['date'] = df['raw'].str.extract(r'(....-..-..)', expand=True) # extract dates like 2014-12-23
df['score'] = df['raw'].str.extract(r'(\d\d\d\d\.\d)', expand=True) # extract numbers like 3242.0
df['state'] = df['raw'].str.extract(r'([A-Z]\w{0,})', expand=True) # extract the first capitalized word
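
A quick check on an invented raw column shows what str.extract returns (a one-column DataFrame when expand=True):

import pandas as pd

raw = pd.Series(['Arizona 1 2014-12-23    3242.0',
                 'Iowa 1 2010-02-23       3453.7'])
raw.str.extract(r'(\d\d\d\d-\d\d-\d\d)', expand=True) # column 0 holds the dates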

Special

Split the geo variable into separate lat and lon variables

lat = []
lon = []
# For each row in the geo variable,
for row in df['geo']:
    try:
        lat.append(row.split(',')[0])
        lon.append(row.split(',')[1])
    except (AttributeError, IndexError): # missing or malformed value
        lat.append(np.nan)
        lon.append(np.nan)
# Create two new columns from lat and lon
df['latitude'] = lat
df['longitude'] = lon
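
A vectorized alternative, assuming each non-missing geo value is a single 'lat,lon' string: str.split with expand=True handles the whole column at once and keeps NaN for missing rows.

parts = df['geo'].str.split(',', expand=True)
df['latitude'] = parts[0]
df['longitude'] = parts[1]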

Converting columns

Dataframe to array

outcome_dog = pd_train.loc[pd_train['AnimalType'] == 'Dog','OutcomeType']
outcome_dog = np.array(outcome_dog)

Numeric -> Category

Convert the countrycode variable into three binary variables

patsy.dmatrix('C(countrycode)-1', df, return_type='dataframe')
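
pandas can produce the same indicator columns without patsy (one column per distinct country code):

pd.get_dummies(df['countrycode'], prefix='country')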

String -> Date

df['date'] = pd.to_datetime(df['date'])

Column names

In R:

names(train) <- c('last_don','no_don','total_vol_don','mon_since_first_don','don_2007')

In pandas:

df.rename(columns={'Leader': 'Commander'}, inplace=True)

Dummy variables

Get dummy variables for Embarked

df = pd.concat([df, pd.get_dummies(df['Embarked'], prefix='Embarked')], axis=1)

Binning data

Define the bin ranges

bins = [0, 25, 50, 75, 100]

Create names for the four groups

group_names = ['Low', 'Okay', 'Good', 'Great']

Cut postTestScore

categories = pd.cut(df['postTestScore'], bins, labels=group_names)
df['categories'] = pd.cut(df['postTestScore'], bins, labels=group_names)
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins) 
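
pd.cut returns a Categorical; value_counts is the quickest way to see how the ages fell into the bins:

pd.value_counts(cats) # e.g. (18, 25] -> 5, (25, 35] -> 3, (35, 60] -> 3, (60, 100] -> 1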

Time series

Set the time as the index

df.index = df['date']
del df['date']
# so that we can do
df['2014']
df['2014-05']
from datetime import datetime
df[datetime(2014, 5, 3):] # on or after the 3rd of May
df['5/3/2014':'5/4/2014'] # between the 3rd and the 4th of May

Truncate the dataframe

df.truncate(before='1/2/2014', after='1/3/2014') #time series

Count the number of observations per timestamp

df.groupby(level=0).count()

df.resample('D').mean() # mean value per day
df.resample('D').sum() # daily sum
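
A self-contained sketch: build an hourly series, then downsample to daily means.

import numpy as np
import pandas as pd

idx = pd.date_range('2014-05-01', periods=48, freq='H')
ts = pd.Series(np.arange(48), index=idx)
ts.resample('D').mean() # one mean per calendar day: 11.5 and 35.5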