Skip to content

Instantly share code, notes, and snippets.

@ctokheim
Last active September 14, 2016 13:28
Show Gist options
  • Save ctokheim/4675579 to your computer and use it in GitHub Desktop.
Python: tricks
import pandas as pd
import numpy as np
# Read a DataFrame from a tab-delimited file.
df = pd.read_csv('filename.txt', sep='\t')

# Force printing of the complete DataFrame as one string.
print(df.to_string())

# Apply a function to every element of a column. Series.apply has no
# ``axis`` parameter (only DataFrame.apply does); unknown keyword arguments
# are forwarded to the function, so passing axis=0 to a one-argument lambda
# raises a TypeError.
df['a'] = df['a'].apply(lambda x: x + 1)

# Change the dtype of a column. astype returns a new Series instead of
# converting in place, so the result must be assigned back to take effect.
df['a'] = df['a'].astype(int)
# Building pandas DataFrames and Series.
# Reference: http://pandas.pydata.org/pandas-docs/dev/dsintro.html

# A dict of Series becomes a DataFrame with one column per dict key.
df = pd.DataFrame(
    {
        'a': pd.Series(range(10)),
        'b': pd.Series(range(10, 20)),
    }
)

# Feeding an existing frame back through the constructor with ``index=``
# selects a subset of the row labels.
df = pd.DataFrame(df, index=range(3))

# In the same way ``columns=`` selects the columns and fixes their order ...
df = pd.DataFrame(df, columns=['b', 'a'])

# ... although indexing with a list of names is the better way to do it.
df = df[['b', 'a']]

# A Series with custom string labels as its index.
s = pd.Series(range(5), index=list('abcde'))
# Slicing DataFrames. The old ``.ix`` indexer mixed label and position
# lookups and was removed in pandas 1.0; use ``.loc`` for labels and
# ``.iloc`` for integer positions instead.

# Label-based selection: both end points of a label slice are included.
df.loc[:, 'a':'c']  # columns labelled 'a' through 'c'
df.loc[10]  # the row whose index label is 10

# Position-based selection: counted from 0, stop position excluded.
df.iloc[:, 1:3]  # columns at positions 1 and 2
df.iloc[10]  # the row at position 10
# Handling missing data.
# Reference: http://pandas.pydata.org/pandas-docs/stable/missing_data.html
df.a.isnull()  # boolean Series, True wherever a missing value (NaN, etc.) occurs
pd.isnull(df.a)  # function version of the same check
df.a.notnull()  # opposite of isnull
pd.notnull(df.a)  # function version
df.fillna(0)  # returns a copy with NaN replaced by 0; assign the result to keep it
df = df.dropna()  # drop rows with any NA's
df = df.dropna(subset=['A', 'B'])  # drop rows if NA in specified cols

# Replace the nulls of one column with a chosen fill value. A boolean mask
# with a single .loc assignment writes into df itself; computing integer
# positions with np.nonzero and then assigning through df.a[...] looked the
# rows up by LABEL, which hits the wrong rows whenever the index is not the
# default 0..n-1 range.
df.loc[df.a.isnull(), 'a'] = 3

df.a.value_counts()  # get counts for unique labels
# Grouping rows by the values found in column 'a'.
groupby_obj = df.groupby('a')  # the GroupBy object itself

# Mean of every remaining column within each group.
my_means = groupby_obj.mean()

# Mean of the single column 'b' within each group.
single_means = groupby_obj['b'].mean()

# Aggregate each group with an arbitrary function.
novel_agg = groupby_obj.agg(np.sum)

# Dictionary that maps each group to the index labels of its rows.
groupToIndxs = groupby_obj.groups
# Create pivot tables like in Excel. The ``rows=`` and ``cols=`` keywords
# of early pandas releases were renamed to ``index=`` and ``columns=`` and
# are no longer accepted.
table = pd.pivot_table(
    df,               # the df to be pivoted
    values='col1',    # column whose values fill the table
    index='col2',     # column whose values become the row labels
    columns='col3',   # column whose values become the column labels
    aggfunc=np.mean,  # aggregating function applied to each cell
)
table.plot(kind='bar')  # bar plot of pivot table
# Random tricks.
df.T  # transpose the dataframe
df.index  # index of dataframe
df.index.to_series()  # returns df index as a series
np.asarray(df)  # convert to numpy array

# Iterate through the data frame rows; iterrows yields (index, row) pairs.
# print is called as a function so the snippet also runs on Python 3.
for i, row in df.iterrows():
    print(i, row)  # row is a pandas Series object
# Row selection with a compound condition on the index. Each comparison
# has to sit in its own parentheses because the "&" (and) operator binds
# more tightly than "<" and ">".
df = df.loc[(df.index > 5) & (df.index < 10)]
# If you are dealing with heterogeneous indexes, where you expect to have
# missing values, create the frame on the UNION of the indexes first and
# then add the series as columns. Otherwise you would clip out the indices
# that only occur in the second series you try to add.
# Index.union gives a reproducible row order; collecting the labels in a
# Python set left the row order arbitrary.
all_ixs = myseries1.index.union(myseries2.index)
df = pd.DataFrame(index=all_ixs)
df['1'] = myseries1
df['2'] = myseries2
# Merging two DataFrames.
# The "how" parameter takes SQL-like options:
#   how = {'left', 'right', 'inner', 'outer'}
# which correspond to "INNER JOIN", "OUTER JOIN", etc. in SQL.

# 1. Merge based on the index of both frames.
merged_df = pd.merge(
    left=left_df,
    right=right_df,
    how='inner',
    left_index=True,
    right_index=True,
)

# 2. Merge based on a separate column in each frame; the suffixes are
#    appended to column names that occur in both frames.
merged_df = pd.merge(
    left=left_df,
    right=right_df,
    how='inner',
    left_on='left_col',
    right_on='right_col',
    suffixes=(' left', ' right'),
)
# Drop every row whose 'name' occurs more than once (no representative of
# a duplicated name is kept).
# groupby(...).apply(lambda x: len(x) == 1) yields one boolean per GROUP,
# indexed by name, which cannot be used as a row mask for df.
# transform('size') instead broadcasts each group's size back onto the
# original rows, giving a mask aligned with df's own index.
mygrp = df.groupby('name')
df = df[mygrp['name'].transform('size') == 1]
# Get the 'A...Z' string.
import string
alphabet = string.ascii_uppercase

# Check if the user has java installed by trying to launch it with all of
# its output discarded. The snippet needs os and subprocess, so they are
# imported here. Launching a missing executable raises OSError;
# subprocess.call never raises CalledProcessError (it simply returns the
# exit status), so only OSError is caught.
import os
import subprocess
try:
    with open(os.devnull, 'wb') as f:
        subprocess.call(['java'], stdout=f, stderr=f)
except OSError:
    print('Java is not installed or is not in PATH')
# Specify a debug point: execution stops here and the interactive pdb
# prompt opens.
import pdb
pdb.set_trace()
# Set legend text: keep the existing legend handles of ``ax`` but show
# custom labels for them.
import matplotlib.pyplot as plt
handles, labels = ax.get_legend_handles_labels()
# Draw the legend on the same axes the handles were taken from;
# plt.legend() targets the *current* axes, which need not be ``ax``.
ax.legend(handles, my_list_of_labels)
# Change the default font family for all text through matplotlib's rcParams.
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
# Font file to take the family name from.
path = '/usr/share/fonts/truetype/msttcorefonts/Comic_Sans_MS.ttf'
prop = font_manager.FontProperties(fname=path)
# Use the name reported for that font file as the global 'font.family'.
# This is done before the figure and its title are created below.
# NOTE(review): presumably the font must also be known to matplotlib's font
# manager for the name lookup to succeed - confirm.
mpl.rcParams['font.family'] = prop.get_name()
fig, ax = plt.subplots()
# No font is passed to set_title here; only the text size is given.
ax.set_title('Text in a cool font', size=40)
plt.show()
# Use a font for a single text element by specifying the path to the font
# file, without changing rcParams.
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
# Font file to load.
path = '/usr/share/fonts/truetype/msttcorefonts/Comic_Sans_MS.ttf'
prop = font_manager.FontProperties(fname=path)
fig, ax = plt.subplots()
# The FontProperties object is handed directly to the title.
ax.set_title('Text in a cool font', fontproperties=prop, size=40)
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment