ctokheim · September 14, 2016 13:28
diff --git a/pandas_tricks.py b/pandas_tricks.py
 import pandas as pd
 import numpy as np

 # read df from a tab-delimited file
 df = pd.read_csv('filename.txt', sep='\t')

 # force printing of the entire dataframe (no row/column truncation);
 # the call form of print works on both Python 2 and Python 3
 print(df.to_string())

 # apply a function to every element of a column
 # (Series.apply has no axis argument: extra keywords are forwarded to the
 # function itself; axis=0/1 only exists on DataFrame.apply)
 df['a'] = df['a'].apply(lambda x: x + 1)

 # change dtype of column
 # astype returns a new Series, so assign it back to keep the change
 df['a'] = df['a'].astype(int)

 # creating pandas data frames
 # check out http://pandas.pydata.org/pandas-docs/dev/dsintro.html
 # build a frame from a mapping of column name -> Series
 df = pd.DataFrame(dict(a=pd.Series(range(10)),
                        b=pd.Series(range(10, 20))))
 df = pd.DataFrame(df, index=range(3))  # keep only the rows labelled 0, 1 and 2
 df = pd.DataFrame(df, columns=['b', 'a'])  # pick/reorder columns via the constructor
 df = df[['b', 'a']]  # simpler way to pick/reorder columns
 s = pd.Series(range(5), index=list('abcde'))  # series with custom index labels

 # slicing of data frames using LABELS (.loc) or POSITIONS (.iloc)
 # (.ix mixed both meanings and was removed in pandas 1.0)
 df.loc[:, 'a':'c']  # get columns 'a' through 'c' (label slices include both ends)
 df.iloc[:, 1:3]  # get columns at positions 1 and 2 (end position is excluded)
 df.loc[10]  # get the row whose index label is 10

 # slicing of data frames using the ROW POSITION
 df.iloc[10]  # get row 10 (the 11th row, whatever its label)

 # handling missing data
 # check out http://pandas.pydata.org/pandas-docs/stable/missing_data.html
 df.a.isnull()  # True wherever the value is missing (NaN, None, NaT)
 pd.isnull(df.a)  # function version of isnull
 df.a.notnull()  # opposite of isnull
 pd.notnull(df.a)  # function version of notnull
 df.fillna(0)  # returns a copy with NaN replaced by 0; df itself is unchanged
 df = df.dropna()   # drop rows with any NA's
 df = df.dropna(subset=['A', 'B'])  # drop rows if NA in specified cols

 # replace nulls with novel fill
 # np.nonzero yields row POSITIONS, so assign through .iloc in one step;
 # df.a[positions] = 3 would treat them as index labels and is a chained
 # assignment that may write to a temporary copy
 null_indxs = np.nonzero(df.a.isnull().values)[0]
 df.iloc[null_indxs, df.columns.get_loc('a')] = 3
 # label-based equivalent: df.loc[df.a.isnull(), 'a'] = 3

 df.a.value_counts()  # get counts for unique labels

 # groupby
 groupby_obj = df.groupby('a')  # one GroupBy object, reused for every aggregation below
 my_means = groupby_obj.mean()  # per-group mean using the default aggregation
 single_means = groupby_obj['b'].mean()  # per-group mean of column 'b' only
 novel_agg = groupby_obj.agg(np.sum)  # aggregate with an arbitrary function
 groupToIndxs = groupby_obj.groups  # dict mapping each group key to its index labels

 # Create pivot tables like in excel
 # (pivot_table takes index=/columns=; the old rows=/cols= keywords are gone)
 table = pd.pivot_table(df,  # the df to be pivoted
                        values='col1',  # column whose values are aggregated
                        index='col2',  # column whose values become the rows
                        columns='col3',  # column whose values become the columns
                        aggfunc=np.mean)  # aggregating function
 table.plot(kind='bar')  # bar plot of pivot table

 # random tricks
 df.T  # transpose the dataframe
 df.index  # index of dataframe
 df.index.to_series()  # returns df index as a series
 np.asarray(df)  # convert to numpy array

 # iterate through data frame rows
 # (print is called as a function; on Python 2 add
 # `from __future__ import print_function` to get the same output)
 for i, row in df.iterrows():
     print(i, row)  # i is the index label, row is a pandas Series

 # complicated selection using index
 # the parentheses are necessary because "&" binds tighter than
 # the comparison operators
 df = df[(df.index > 5) & (df.index < 10)]

 # When the series have heterogeneous indexes (each may be missing labels
 # the other has), start from an empty frame reindexed to the union of all
 # labels and then attach the series as columns. Otherwise the labels that
 # only occur in the second series would be clipped out.
 all_ixs = set(myseries1.index) | set(myseries2.index)
 df = pd.DataFrame().reindex(list(all_ixs))
 df['1'] = myseries1
 df['2'] = myseries2

 # merging two DataFrames together!!!
 # "how" picks the SQL-style join: 'left', 'right', 'inner' or 'outer'
 # (they correspond to "INNER JOIN", "OUTER JOIN", etc in SQL)
 # 1. join on the index of both frames
 merged_df = pd.merge(left_df, right_df,
                      left_index=True, right_index=True,
                      how='inner')
 # 2. join on a separate column from each frame
 merged_df = pd.merge(left_df, right_df,
                      left_on='left_col', right_on='right_col',
                      how='inner', suffixes=(' left', ' right'))

 # keep only the rows whose 'name' occurs exactly once (every copy of a
 # duplicated name is dropped). transform broadcasts each group's size back
 # onto its rows, so the boolean mask lines up with df's own index; a mask
 # built with groupby.apply is indexed by group key and cannot select rows.
 mygrp = df.groupby('name')
 df = df[mygrp['name'].transform('size') == 1]
 # near-equivalent one-liner: df.drop_duplicates(subset='name', keep=False)
diff --git a/python_tricks.py b/python_tricks.py
 # get 'A...Z' string
 import string
 alphabet = string.ascii_uppercase

 # check if the user has java installed
 import os
 import subprocess
 try:
     with open(os.devnull, 'wb') as f:
         # discard java's own output; only whether it can be launched matters
         subprocess.call(['java', '-version'], stdout=f, stderr=f)
 except OSError:
     # raised when the executable cannot be found or started
     # (subprocess.call itself never raises CalledProcessError)
     print('Java is not installed or is not in PATH')

 # specify a debug point using pdb
 import pdb; pdb.set_trace()
diff --git a/z_matplotlib_tricks.py b/z_matplotlib_tricks.py
 # set legend text
 # (uses an Axes `ax` and a list `my_list_of_labels` that must already exist)
 import matplotlib.pyplot as plt
 # fetch the legend handles/labels currently registered on the axes
 handles, labels = ax.get_legend_handles_labels()
 # keep the handles but show the custom label text instead
 plt.legend(handles, my_list_of_labels)

 # change font family
 # (sets the global default, so it must run before the text is created)
 import matplotlib as mpl
 import matplotlib.pyplot as plt
 import matplotlib.font_manager as font_manager

 # NOTE: this path is system specific -- point it at a .ttf that exists locally
 path = '/usr/share/fonts/truetype/msttcorefonts/Comic_Sans_MS.ttf'
 prop = font_manager.FontProperties(fname=path)
 # use the family name read from the font file as the default font family
 mpl.rcParams['font.family'] = prop.get_name()
 fig, ax = plt.subplots()
 ax.set_title('Text in a cool font', size=40)
 plt.show()

 # specify path to font
 # (applies the font to a single text element, leaving rcParams untouched)
 import matplotlib.pyplot as plt
 import matplotlib.font_manager as font_manager

 # NOTE: this path is system specific -- point it at a .ttf that exists locally
 path = '/usr/share/fonts/truetype/msttcorefonts/Comic_Sans_MS.ttf'
 prop = font_manager.FontProperties(fname=path)
 fig, ax = plt.subplots()
 # pass the FontProperties object directly to the title
 ax.set_title('Text in a cool font', fontproperties=prop, size=40)
 plt.show()
	import pandas as pd
	import numpy as np

	# read df from a tab-delimited file
	df = pd.read_csv('filename.txt', sep='\t')

	# force printing of the entire dataframe (no row/column truncation);
	# the call form of print works on both Python 2 and Python 3
	print(df.to_string())

	# apply a function to every element of a column
	# (Series.apply has no axis argument: extra keywords are forwarded to the
	# function itself; axis=0/1 only exists on DataFrame.apply)
	df['a'] = df['a'].apply(lambda x: x + 1)

	# change dtype of column
	# astype returns a new Series, so assign it back to keep the change
	df['a'] = df['a'].astype(int)

	# creating pandas data frames
	# check out http://pandas.pydata.org/pandas-docs/dev/dsintro.html
	# build a frame from a mapping of column name -> Series
	df = pd.DataFrame(dict(a=pd.Series(range(10)),
	                       b=pd.Series(range(10, 20))))
	df = pd.DataFrame(df, index=range(3))  # keep only the rows labelled 0, 1 and 2
	df = pd.DataFrame(df, columns=['b', 'a'])  # pick/reorder columns via the constructor
	df = df[['b', 'a']]  # simpler way to pick/reorder columns
	s = pd.Series(range(5), index=list('abcde'))  # series with custom index labels

	# slicing of data frames using LABELS (.loc) or POSITIONS (.iloc)
	# (.ix mixed both meanings and was removed in pandas 1.0)
	df.loc[:, 'a':'c']  # get columns 'a' through 'c' (label slices include both ends)
	df.iloc[:, 1:3]  # get columns at positions 1 and 2 (end position is excluded)
	df.loc[10]  # get the row whose index label is 10

	# slicing of data frames using the ROW POSITION
	df.iloc[10]  # get row 10 (the 11th row, whatever its label)

	# handling missing data
	# check out http://pandas.pydata.org/pandas-docs/stable/missing_data.html
	df.a.isnull()  # True wherever the value is missing (NaN, None, NaT)
	pd.isnull(df.a)  # function version of isnull
	df.a.notnull()  # opposite of isnull
	pd.notnull(df.a)  # function version of notnull
	df.fillna(0)  # returns a copy with NaN replaced by 0; df itself is unchanged
	df = df.dropna()  # drop rows with any NA's
	df = df.dropna(subset=['A', 'B'])  # drop rows if NA in specified cols

	# replace nulls with novel fill
	# np.nonzero yields row POSITIONS, so assign through .iloc in one step;
	# df.a[positions] = 3 would treat them as index labels and is a chained
	# assignment that may write to a temporary copy
	null_indxs = np.nonzero(df.a.isnull().values)[0]
	df.iloc[null_indxs, df.columns.get_loc('a')] = 3
	# label-based equivalent: df.loc[df.a.isnull(), 'a'] = 3

	df.a.value_counts()  # get counts for unique labels

	# groupby
	groupby_obj = df.groupby('a')  # one GroupBy object, reused for every aggregation below
	my_means = groupby_obj.mean()  # per-group mean using the default aggregation
	single_means = groupby_obj['b'].mean()  # per-group mean of column 'b' only
	novel_agg = groupby_obj.agg(np.sum)  # aggregate with an arbitrary function
	groupToIndxs = groupby_obj.groups  # dict mapping each group key to its index labels

	# Create pivot tables like in excel
	# (pivot_table takes index=/columns=; the old rows=/cols= keywords are gone)
	table = pd.pivot_table(df,  # the df to be pivoted
	                       values='col1',  # column whose values are aggregated
	                       index='col2',  # column whose values become the rows
	                       columns='col3',  # column whose values become the columns
	                       aggfunc=np.mean)  # aggregating function
	table.plot(kind='bar')  # bar plot of pivot table

	# random tricks
	df.T  # transpose the dataframe
	df.index  # index of dataframe
	df.index.to_series()  # returns df index as a series
	np.asarray(df)  # convert to numpy array

	# iterate through data frame rows
	# (the loop body must be indented under the for statement; print is
	# called as a function -- on Python 2 add
	# `from __future__ import print_function` to get the same output)
	for i, row in df.iterrows():
	    print(i, row)  # i is the index label, row is a pandas Series

	# complicated selection using index
	# the parentheses are necessary because "&" binds tighter than
	# the comparison operators
	df = df[(df.index > 5) & (df.index < 10)]

	# When the series have heterogeneous indexes (each may be missing labels
	# the other has), start from an empty frame reindexed to the union of all
	# labels and then attach the series as columns. Otherwise the labels that
	# only occur in the second series would be clipped out.
	df = pd.DataFrame()
	# set union uses a plain "|" (an escaped "\|" is a syntax error)
	all_ixs = set(myseries1.index) | set(myseries2.index)
	df = df.reindex(list(all_ixs))
	df['1'] = myseries1
	df['2'] = myseries2

	# merging two DataFrames together!!!
	# "how" picks the SQL-style join: 'left', 'right', 'inner' or 'outer'
	# (they correspond to "INNER JOIN", "OUTER JOIN", etc in SQL)
	# 1. join on the index of both frames
	merged_df = pd.merge(left_df, right_df,
	                     left_index=True, right_index=True,
	                     how='inner')
	# 2. join on a separate column from each frame
	merged_df = pd.merge(left_df, right_df,
	                     left_on='left_col', right_on='right_col',
	                     how='inner', suffixes=(' left', ' right'))

	# keep only the rows whose 'name' occurs exactly once (every copy of a
	# duplicated name is dropped). transform broadcasts each group's size back
	# onto its rows, so the boolean mask lines up with df's own index; a mask
	# built with groupby.apply is indexed by group key and cannot select rows.
	mygrp = df.groupby('name')
	df = df[mygrp['name'].transform('size') == 1]
	# near-equivalent one-liner: df.drop_duplicates(subset='name', keep=False)
	# get 'A...Z' string
	import string
	alphabet = string.ascii_uppercase

	# check if the user has java installed
	import os
	import subprocess
	try:
	    with open(os.devnull, 'wb') as f:
	        # discard java's own output; only whether it can be launched matters
	        subprocess.call(['java', '-version'], stdout=f, stderr=f)
	except OSError:
	    # raised when the executable cannot be found or started
	    # (subprocess.call itself never raises CalledProcessError)
	    print('Java is not installed or is not in PATH')

	# specify a debug point using pdb
	import pdb; pdb.set_trace()
	# set legend text
	# (uses an Axes `ax` and a list `my_list_of_labels` that must already exist)
	import matplotlib.pyplot as plt
	# fetch the legend handles/labels currently registered on the axes
	handles, labels = ax.get_legend_handles_labels()
	# keep the handles but show the custom label text instead
	plt.legend(handles, my_list_of_labels)

	# change font family
	# (sets the global default, so it must run before the text is created)
	import matplotlib as mpl
	import matplotlib.pyplot as plt
	import matplotlib.font_manager as font_manager

	# NOTE: this path is system specific -- point it at a .ttf that exists locally
	path = '/usr/share/fonts/truetype/msttcorefonts/Comic_Sans_MS.ttf'
	prop = font_manager.FontProperties(fname=path)
	# use the family name read from the font file as the default font family
	mpl.rcParams['font.family'] = prop.get_name()
	fig, ax = plt.subplots()
	ax.set_title('Text in a cool font', size=40)
	plt.show()

	# specify path to font
	# (applies the font to a single text element, leaving rcParams untouched)
	import matplotlib.pyplot as plt
	import matplotlib.font_manager as font_manager

	# NOTE: this path is system specific -- point it at a .ttf that exists locally
	path = '/usr/share/fonts/truetype/msttcorefonts/Comic_Sans_MS.ttf'
	prop = font_manager.FontProperties(fname=path)
	fig, ax = plt.subplots()
	# pass the FontProperties object directly to the title
	ax.set_title('Text in a cool font', fontproperties=prop, size=40)
	plt.show()