[TOC]
# New column: character length of each value in the Title column.
df['TitleLength'] = df.Title.apply(len)
# Lower-case every column name (pandas consumes the map iterator).
df.columns = map(str.lower, df.columns) # lowercase function to column
# Per-Platoon rolling mean of Casualties over a 2-row trailing window.
df.groupby('Platoon')['Casualties'].apply(lambda x:x.rolling(center=False,window=2).mean())
# Drop the 'name' column (axis=1 means columns, not rows).
df = df.drop('name', axis=1)
# Element-wise square root of every cell in the frame.
df.applymap(np.sqrt)
# Rows where AnimalType == 'Dog', restricted to the column(s) named by `feature`.
pd_train.loc[pd_train['AnimalType'] == 'Dog',feature]
# Every column except the first one.
df[df.columns[1:]] # row,column
# Rows whose workclass is Local-gov or State-gov (note: the raw data carries a
# leading space in these labels) AND education_num > 1; keep age and workclass.
sliced = data[data.workclass.isin([' Local-gov', ' State-gov']) \
& (data.education_num > 1)][['age', 'workclass']]
# Keep only observations whose location is one of these four cities.
cities <- c("Adelaide", "Brisbane", "Canberra", "Darwin")
dss <- subset(ds, location %in% cities)
# Replace the -999 sentinel with NaN everywhere.
df.replace(-999, np.nan) # replacing values
# Stack two frames vertically (rows appended; indexes are kept as-is).
df_new = pd.concat([df_a, df_b]) # Join along rows
# Place two frames side by side, aligning on the index.
pd.concat([df_a, df_b], axis=1) # Join along columns
pd.merge(df_new, df_n, on='subject_id') # two dataframes along the subject_id value
pd.merge(df_a, df_b, on='subject_id', how='outer') # outer join
pd.merge(df_a, df_b, on='subject_id', how='inner') # inner join
pd.merge(df_a, df_b, on='subject_id', how='right') # right join
pd.merge(df_a, df_b, on='subject_id', how='left') # left join
# suffixes disambiguate columns that exist in both frames.
pd.merge(df_a, df_b, on='subject_id', how='left', suffixes=('_left', '_right')) # add suffixes
# Upper-case every index label.
data.index = data.index.map(str.upper)
data.rename(index=str.title, columns=str.upper, inplace=True) # str.title means to make the 1st letter capitalized only
# Recode Sex to a 0/1 integer column (female -> 0, male -> 1).
df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)
# Look each city up in a dict to derive its state; unmatched cities become NaN.
city_to_state = { 'San Francisco' : 'California',
'Baltimore' : 'Maryland'}
df['state'] = df['city'].map(city_to_state)
def pre_post_difference(pre, post):
    """Return the change from the pre-test score to the post-test score.

    Subtraction is element-wise when given pandas Series, so this also
    works column-wise (as used below with preTestScore/postTestScore).
    """
    change = post - pre
    return change
df['score_change'] = pre_post_difference(df['preTestScore'], df['postTestScore'])
def score_multipler_2x_and_3x(x):
    """Return the pair (x * 2, x * 3) for a single score value."""
    doubled = x * 2
    tripled = x * 3
    return doubled, tripled
# map() yields (x2, x3) tuples; zip(*...) transposes them into two sequences
# that unpack into two new columns at once.
df['post_score_x2'], df['post_score_x3'] = zip(*df['postTestScore'].map(score_multipler_2x_and_3x))
# Average wait: months since first donation divided by number of donations.
data['avgWait'] = pd.Series(data.moSinceFirst / data.numDonations, index=data.index)
# Vectorized if/else: 'yes' where age >= 50, otherwise 'no'.
df['elderly'] = np.where(df['age']>=50, 'yes', 'no')
df['color'] = np.where(df['Set']=='Z', 'green', 'red')
# Replace the -1 sentinel with the precomputed mean age; keep other values.
df['Age'] = df['Age'].map(lambda x: age_mean if x == -1 else x)
df['coverageRanked'] = df['coverage'].rank(ascending=1) # rank of the value of coverage in ascending order
# Default every row to 'weekday', then mark Saturday/Sunday (WeekDay 5 or 6)
# rows as 'weekend'.
train_file['DayGroup'] = 'weekday'
# Use a single .loc assignment: the original chained indexing
# (train_file['DayGroup'][mask] = ...) raises SettingWithCopyWarning and may
# silently write to a temporary copy instead of the frame.
train_file.loc[(train_file['WeekDay'] == 5) | (train_file['WeekDay'] == 6), 'DayGroup'] = 'weekend'
def score_to_numeric(x):
    """Map a textual score label to its numeric rank.

    'strong' -> 3, 'normal' -> 2, 'weak' -> 1; any other value yields
    None (same as the original fall-through behavior).
    """
    ranks = {'strong': 3, 'normal': 2, 'weak': 1}
    return ranks.get(x)
# Convert the textual score labels into their numeric ranks.
df['score_num'] = df['score'].apply(score_to_numeric)
# Regex patterns are raw strings: '\d' inside a plain string is an invalid
# escape sequence and emits a SyntaxWarning in modern Python.
df['female'] = df['raw'].str.extract(r'(\d)', expand=True)  # Extract the column of single digits
df['date'] = df['raw'].str.extract(r'(....-..-..)', expand=True)  # Extract the column of dates
df['score'] = df['raw'].str.extract(r'(\d\d\d\d\.\d)', expand=True)  # Extract the column of thousands
df['state'] = df['raw'].str.extract(r'([A-Z]\w{0,})', expand=True)  # Extract the column of words
lat = []
lon = []
# For each row in the 'geo' column, split a "lat,lon" string into its parts.
for row in df['geo']:
    try:
        parts = row.split(',')
        latitude, longitude = parts[0], parts[1]
    except (AttributeError, IndexError):
        # AttributeError: row is not a string (e.g. NaN); IndexError: no comma.
        # Compute both values before appending so the two lists stay the same
        # length — the original appended lat first and could leave the lists
        # misaligned when the split failed after one append. Catch specific
        # exceptions instead of a bare except (which would even swallow
        # KeyboardInterrupt). np.nan replaces np.NaN (removed in numpy 2.0).
        latitude, longitude = np.nan, np.nan
    lat.append(latitude)
    lon.append(longitude)
# Create two new columns from lat and lon
df['latitude'] = lat
df['longitude'] = lon
# OutcomeType values for dogs only, then converted to a plain numpy array.
outcome_dog = pd_train.loc[pd_train['AnimalType'] == 'Dog','OutcomeType']
outcome_dog = np.array(outcome_dog)
# One-hot encode countrycode via patsy; the '-1' drops the intercept column.
patsy.dmatrix('C(countrycode)-1', df, return_type='dataframe')
# Parse the date column strings into datetime64 values.
df['date'] = pd.to_datetime(df['date'])
names(train) <- c('last_don','no_don','total_vol_don','mon_since_first_don','don_2007')
# Rename a single column in place.
df.rename(columns={'Leader': 'Commander'}, inplace=True)
# One-hot encode Embarked and append the indicator columns (Embarked_*).
df = pd.concat([df, pd.get_dummies(df['Embarked'], prefix='Embarked')], axis=1)
# Bin postTestScore into four labelled intervals; bins are right-closed by
# default, i.e. (0, 25], (25, 50], (50, 75], (75, 100].
bins = [0, 25, 50, 75, 100]
group_names = ['Low', 'Okay', 'Good', 'Great']
categories = pd.cut(df['postTestScore'], bins, labels=group_names)
df['categories'] = pd.cut(df['postTestScore'], bins, labels=group_names)
# pd.cut also accepts a plain list; unlabelled bins show interval notation.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
# Use the date column as the index so rows can be selected by date.
df.index = df['date']
del df['date']
#so that we can do partial-string date selection:
# Scalar string getitem (df['2014']) is a COLUMN lookup in modern pandas — the
# row-selection fallback was deprecated and removed — so use .loc for rows.
df.loc['2014']      # all rows from 2014
df.loc['2014-05']   # all rows from May 2014
df[datetime(2014, 5, 3):] # after 3rd of may
df['5/3/2014':'5/4/2014'] # btw 3rd and 4th (slice syntax still selects rows)
df.truncate(before='1/2/2014', after='1/3/2014') #time series
# Count of non-null values per index label (level 0 of the index).
df.groupby(level=0).count()
# The `how=` argument of resample() was deprecated in pandas 0.18 and later
# removed; resample() now returns a Resampler on which the aggregation is
# called as a method.
df.resample('D').mean() # get mean value per day
df.resample('D').sum() # sum per day