Last active
September 14, 2016 13:28
-
-
Save ctokheim/4675579 to your computer and use it in GitHub Desktop.
Python: tricks
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd
import numpy as np

# read df from a tab-delimited file
df = pd.read_csv('filename.txt', sep='\t')

# force printing out of the dataframe as one string
print(df.to_string())

# apply a function to each element of a column
# (Series.apply works element-wise and takes no `axis` argument)
df.a = df.a.apply(lambda x: x + 1)

# change dtype of column: astype returns a new Series, so assign it back
df.a = df.a.astype(int)  # change dtype to int
# creating pandas data frames
# check out http://pandas.pydata.org/pandas-docs/dev/dsintro.html
# Creating df with dicts (each key becomes a column)
df = pd.DataFrame({'a': pd.Series(range(10)),
                   'b': pd.Series(range(10, 20))})
df = pd.DataFrame(df, index=range(3))  # select a subset of indexes (rows)
df = pd.DataFrame(df, columns=['b', 'a'])  # select/order columns
df = df[['b', 'a']]  # better way to select/order columns
s = pd.Series(range(5), index=['a', 'b', 'c', 'd', 'e'])  # custom indexes for series
# slicing of data frames using LABELS (index values / column names)
df.loc[:, 'a':'c']  # get columns 'a' through 'c' (both ends included)
df.loc[10]  # get the row whose index label is 10

# slicing of data frames using integer POSITION
df.iloc[:, 1:3]  # get columns at positions 1 and 2 (end is exclusive)
df.iloc[10]  # get row 10
# handling missing data
# check out http://pandas.pydata.org/pandas-docs/stable/missing_data.html
df.a.isnull()  # returns True for each spot where NaN, etc. occur
pd.isnull(df.a)  # function version of isnull
df.a.notnull()  # opposite of isnull
pd.notnull(df.a)  # function version of notnull
df.fillna(0)  # returns a new frame with NaN values filled in as 0
df = df.dropna()  # drop rows with any NA's
df = df.dropna(subset=['a', 'b'])  # drop rows if NA in specified cols

# replace nulls in a single column with a novel fill value;
# a boolean mask through .loc assigns on df itself rather than on a
# temporary produced by chained indexing
df.loc[df.a.isnull(), 'a'] = 3

df.a.value_counts()  # get counts for unique labels
# groupby
groupby_obj = df.groupby('a')  # groupby object
my_means = df.groupby('a').mean()  # aggregate using default mean
single_means = df.groupby('a')['b'].mean()  # aggregate on single column
novel_agg = df.groupby('a').aggregate(np.sum)  # aggregate with a function
groupToIndxs = df.groupby('a').groups  # returns a dictionary that maps groups to indexes
# Create pivot tables like in excel
# (the row/column keys are passed as `index`/`columns`)
table = pd.pivot_table(df,  # the df to be pivoted
                       values='col1',  # values for pivot table
                       index='col2',  # row
                       columns='col3',  # col
                       aggfunc=np.mean)  # aggregating function
table.plot(kind='bar')  # bar plot of pivot table
# random tricks
df.T  # transpose the dataframe
df.index  # index of dataframe
df.index.to_series()  # returns df index as a series
np.asarray(df)  # convert to numpy array

# iterate through data frame rows
for i, row in df.iterrows():
    print(i, row)  # i is the index label, row is a pandas Series object
# complicated selection using index
# the parentheses are necessary when using the "&" (and) operator,
# because "&" binds more tightly than the comparison operators
df = df[(df.index > 5) & (df.index < 10)]
# if you are dealing with heterogeneous indexes
# where you expect to have missing values then
# the best way to initialize your df is by the default constructor
# and then use the reindex method. Otherwise you would clip out
# indices in the second series you try to add
df = pd.DataFrame()
# Index.union gives every label from both series in a reproducible order,
# unlike converting a set of labels to a list
all_ixs = myseries1.index.union(myseries2.index)
df = df.reindex(all_ixs)
df['1'] = myseries1
df['2'] = myseries2
# merging two DataFrames together!!!
# Merging uses SQL-like options with the "how" parameter
# how = {'left', 'right', 'inner', 'outer'}
# the how corresponds to "INNER JOIN", "OUTER JOIN", etc in SQL

# 1. Merge based on DF index
merged_df = pd.merge(left_df, right_df, how='inner',
                     left_index=True, right_index=True)

# 2. Merge based on a separate column
merged_df = pd.merge(left_df, right_df, how='inner',
                     left_on='left_col', right_on='right_col',
                     suffixes=(' left', ' right'))
# drop all rows that represent a duplicate, i.e. keep only the rows whose
# 'name' value occurs exactly once. keep=False discards every member of a
# duplicated group instead of retaining the first or last occurrence.
df = df.drop_duplicates(subset='name', keep=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# get 'A...Z' string
import string
alphabet = string.ascii_uppercase
# check if the user has java installed
import os
import subprocess

try:
    # send java's own output to the null device so only our message shows
    with open(os.devnull, 'wb') as f:
        subprocess.call(['java', '-version'], stdout=f, stderr=f)
except OSError:
    # subprocess.call raises OSError when the executable cannot be started;
    # it never raises CalledProcessError (that comes from check_call/check_output)
    print('Java is not installed or is not in PATH')
# specify a debug point using pdb: execution pauses here and drops
# into the interactive debugger
import pdb; pdb.set_trace()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# set legend text
# (`ax` is an existing Axes and `my_list_of_labels` the replacement labels)
import matplotlib.pyplot as plt
handles, labels = ax.get_legend_handles_labels()
plt.legend(handles, my_list_of_labels)
# change font family for all text via rcParams
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
path = '/usr/share/fonts/truetype/msttcorefonts/Comic_Sans_MS.ttf'
prop = font_manager.FontProperties(fname=path)
# NOTE(review): setting the family by name presumably only takes effect if
# matplotlib's font manager already knows this font - confirm
mpl.rcParams['font.family'] = prop.get_name()
fig, ax = plt.subplots()
ax.set_title('Text in a cool font', size=40)
plt.show()
# specify path to font: pass the FontProperties object directly to the
# text call instead of changing the global rcParams
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
path = '/usr/share/fonts/truetype/msttcorefonts/Comic_Sans_MS.ttf'
prop = font_manager.FontProperties(fname=path)
fig, ax = plt.subplots()
ax.set_title('Text in a cool font', fontproperties=prop, size=40)
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment