Skip to content

Instantly share code, notes, and snippets.

@shaybensasson
Last active June 14, 2022 17:54
Show Gist options
  • Select an option

  • Save shaybensasson/6b11b3de18840bb73432c98fe1ec585f to your computer and use it in GitHub Desktop.

Select an option

Save shaybensasson/6b11b3de18840bb73432c98fe1ec585f to your computer and use it in GitHub Desktop.
My Jupyter nbextension "Snippets Menu" JavaScript configuration (a JS object literal, not strict JSON)
var MY_SNIPPETS = {
'name': 'Snippets',
'sub-menu': [
{
'name': 'config file',
'snippet': ['!cat ~/.local/share/jupyter/nbextensions/snippets_menu/my_snippets.js']
},
'---',
{
'name': 'Header',
'sub-menu': [
{
'name': 'numpy|pandas|matplotlib|seaborn',
"snippet": [
"%matplotlib inline",
"%config InlineBackend.figure_format = 'retina'",
"",
"import matplotlib",
"import matplotlib.pyplot as plt",
"import seaborn as sns",
"from pylab import rcParams",
"",
"sns.set(style='whitegrid', palette='muted', font_scale=1.33)",
"# plt.style.use('ggplot')",
"",
"HAPPY_COLORS_PALETTE = ['#01BEFE', '#FFDD00', '#FF7D00', '#FF006D', '#ADFF02', '#8F00FF']",
"",
"sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))",
"",
"rcParams['figure.figsize'] = 10, 8",
"",
"import pandas as pd",
"import numpy as np",
"",
"# x = np.arange(100)",
"y=np.random.randint(0, 5+1, size=(100))",
"#sns.scatterplot(x,y)",
"#sns.regplot(x,y)",
"",
"ax = plt.figure(figsize=(10,5)).gca() #w,h",
"sns.countplot(y, ax=ax)",
"ax.set_xticklabels(labels=HAPPY_COLORS_PALETTE)",
"ax.yaxis.set_major_locator(plt.MaxNLocator(10)) #see https://matplotlib.org/3.1.1/gallery/ticks_and_spines/tick-locators.html"
]
},
{
'name': 'Plot.ly 4.5',
'snippet': ['#online mode (just comment for offline mode)',
'# import chart_studio',
"# chart_studio.tools.set_credentials_file(username='bensshay', api_key='YuUeRFOAsKffHg3NpLbJ')",
'',
'from plotly.offline import iplot',
'import plotly.graph_objects as go',
'',
'import plotly.io as pio',
"pio.templates.default = 'none' #set theme",
'',
'# Cufflinks wrapper on plotly',
'import cufflinks',
'',
'cufflinks.go_offline()',
'',
'# Set global theme',
"cufflinks.set_config_file(world_readable=True, theme='pearl')",
'',
'#After importing cufflinks, plotly plots can be made using df.iplot() and then specifying parameters. ',
'# This is a great replacement for matplotlib!',
'',
'',
'#quick demo',
'fig = go.Figure(go.Scatter(x=[1, 2, 3, 4], y=[4, 3, 2, 1]))',
"fig.update_layout(title_text='hello world')",
'',
'iplot(fig)',
''
]
},
{
'name': 'versions of everything',
'snippet': ['%reload_ext watermark',
'import warnings',
'',
'import os',
"print('VirtualEnv: {}'.format(os.getenv('VIRTUAL_ENV').split('/')[-1]))",
"print('')",
'with warnings.catch_warnings():',
" warnings.simplefilter('ignore')",
' %watermark -v --packages numpy,scipy,sklearn,pandas,matplotlib,seaborn,tqdm,keras,tensorflow',
"print('')",
'!cat /usr/local/cuda/version.txt',
'',
'#import pandas as pd',
'#pd.show_versions()'
]
},
{
'name': 'ignore warnings',
'snippet': ["import warnings; warnings.simplefilter('ignore')"]
},
{
'name': 'reload an existing module',
'snippet': ['#import module',
'import importlib',
'importlib.reload(module)'
]
},
{
'name': 'add packages to python path',
'snippet': ['import sys, os',
"paths = ['~/Homer/', '~/Homer/lib/hyperopt/', '~/Homer/lib/PDPbox/', '~/Homer/lib/PyCEbox/', '~/Homer/lib/ALEPython/']",
'sys.path.extend([os.path.expanduser(p) for p in paths])'
]
},
{
"name": "Float formatting",
"snippet": [
"np.set_printoptions(formatter={'float_kind': '{:3f}'.format})",
"%precision 3 #ipython float formatter",
"pd.options.display.float_format='{:.3f}'.format"
]
},
'---',
{
'name': 'OLD Plot.ly<4.5',
'snippet': ['import plotly ',
'#online mode',
"plotly.tools.set_credentials_file(username='bensshay', api_key='YuUeRFOAsKffHg3NpLbJ')",
'',
'#offline mode',
'#from plotly.offline import init_notebook_mode, iplot',
'#Always run this the command before at the start of notebook',
'#init_notebook_mode(connected=False)',
'',
'# plotly standard imports',
'import plotly.graph_objs as go',
'import plotly.plotly as py',
'import plotly.figure_factory as ff',
'',
'# Cufflinks wrapper on plotly',
'import cufflinks',
'',
'from plotly.offline import iplot',
'cufflinks.go_offline()',
'',
'# Set global theme',
"cufflinks.set_config_file(world_readable=True, theme='pearl')",
'',
'#After importing cufflinks, plotly plots can be made using df.iplot() and then specifying parameters. ',
'# This is a great replacement for matplotlib!'
]
}
]
},
{
'name': 'Thesis',
'sub-menu': [
{
'name': 'Boilerplate',
'sub-menu': [
{
"name": "1. dirs",
"snippet": [
"import pandas as pd",
"import numpy as np",
"import pickle",
"",
"import sys, os",
"",
"HOMER_DIR = os.path.expanduser('~/Homer')",
"sys.path.extend([HOMER_DIR])",
"",
"from homer import options",
"RANDOM_SEED = options.RANDOM_SEED",
"",
"STYLES_DIR = os.path.join(HOMER_DIR, 'styles')",
"HOME_DIR = os.path.join(HOMER_DIR, 'Intelligence/v2') #project home_dir",
"DATA_DIR = os.path.join(HOME_DIR, 'data')",
"ENSEMBLES_DIR = os.path.join(HOME_DIR, 'out/ensembles')",
"TOP_FEATS_DIR = os.path.join(ENSEMBLES_DIR, 'top_feats')",
"",
"_ANALYSIS_DIR = os.path.join(HOME_DIR, 'out/analysis')",
"FIGURES_DIR = os.path.join(_ANALYSIS_DIR, 'figures')",
"INTERMEDIATE_DIR = os.path.join(_ANALYSIS_DIR, 'intermediate')"
]
},
{
"name": "2. styles",
"snippet": [
"# IMPORTANT: It is essential that the use.style will be on difference cell than the %matplotlib magic",
"import matplotlib.pyplot as plt",
"import seaborn as sns",
"",
"BASELINE_STYLE = os.path.join(STYLES_DIR, 'baseline.mplstyle')",
"THESIS_STYLE = os.path.join(STYLES_DIR, 'thesis.mplstyle')",
"THESIS_CB_STYLE = os.path.join(STYLES_DIR, 'thesis.colorblind.mplstyle')",
"THESIS_SHAP_STYLE = os.path.join(STYLES_DIR, 'thesis.shap.mplstyle')",
"# plt.style.use([BASELINE_STYLE])",
"plt.style.use([BASELINE_STYLE, THESIS_STYLE])",
"",
"#can be used inside a context manger:",
"#with plt.style.context([BASELINE_STYLE, THESIS_STYLE,THESIS_SHAP_STYLE]):",
"#with plt.rc_context({'axes.grid': False}):",
""
]
},
]
},
]
},
{
'name': 'Jupyter',
'sub-menu': [
{
'name': 'Magics',
'sub-menu': [
{
"name": "autoreload",
"snippet": [
"%load_ext autoreload",
"",
"%autoreload 2 # reloads all modules every time this cell is executed"
]
},
{
"name": "timeit (run multiple times)",
"snippet": [
"%%timeit -r2 -n3 # 2 runs x 3 iterations/loops",
"import time",
"time.sleep(1)"
]
}],
},
'---',
{
'name': 'print all pathes',
'snippet': ['!jupyter --path']
},
{
'name': 'auto save when executed',
'snippet': ['from IPython.display import Javascript',
'',
"script = ''",
'if (AUTO_SAVE_WHEN_COMPLETE):',
" script = '''",
' require(["base/js/namespace"],function(Jupyter) {',
' Jupyter.notebook.save_checkpoint();',
' });',
" '''",
'Javascript(script)'
]
},
{
'name': 'time notebook',
'snippet': ['#Start block',
'import time',
'start_time = time.time()',
'',
'#End block',
'import datetime',
'duration = str(datetime.timedelta(seconds=time.time()-start_time))',
"print(f'The whole notebook took: {duration}')"
]
}]
},
{
'name': 'Plotting',
'sub-menu': [
{
'name': 'Matplotlib',
'sub-menu': [
{
'name': 'subplotting (plt.subplots)',
'snippet': ['from tqdm.auto import tqdm, trange',
'',
'N = 6',
'NCOLS = min(5,N)',
'NROWS = int(np.ceil(N/NCOLS))',
'# print(N, NROWS, NCOLS)',
'f, axes = plt.subplots(nrows=NROWS, ncols=NCOLS, figsize=(24,3*NROWS), squeeze=False) #w,h',
'',
'for i in trange(N):',
' ax = axes[int(i/NCOLS),i%NCOLS] ',
' x = np.random.randint(10, size=(10,))',
' ax.scatter(x,x)',
" ax.set_title('#%s' % i)",
'',
'#delete leftovers',
'for i in range(N, N + NROWS*NCOLS-N):',
' ax: plt.Axes = axes[int(i/NCOLS),i%NCOLS]',
' f.delaxes(ax)',
' ',
'plt.tight_layout(w_pad=2.5, h_pad=2) #pads are specified in fraction of fontsize'
]
},
{
'name': 'subplotting (matlab style)',
'snippet': ['from tqdm.auto import tqdm, trange',
'',
'N = 6',
'NCOLS = min(5,N)',
'NROWS = int(np.ceil(N/NCOLS))',
'# print(N, NROWS, NCOLS)',
'',
'# Matlab style',
'plt.subplots(figsize=(24,3*NROWS))',
'# plt.subplots_adjust(wspace=0.2,hspace=0.5)',
'for i in trange(N):',
' ax = plt.subplot(NROWS,NCOLS,i+1)',
' ',
' x = np.random.randint(10, size=(10,))',
' ax.scatter(x,x)',
" ax.set_title('#%s' % i)",
' ',
'plt.tight_layout(w_pad=2.5, h_pad=2) #pads are specified in fraction of fontsize'
]
},
{
'name': 'set plot font_size',
'snippet': ["ax = plt.subplot(111, xlabel='x', ylabel='y', title='title')",
"ax.scatter([1,2,3], [1,0,3], label='123')",
'ax.legend()',
'for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +',
' ax.get_xticklabels() + ax.get_yticklabels() + ',
' ax.legend().get_texts()):',
' item.set_fontsize(14)',
'# more here: https://stackoverflow.com/questions/3899980/how-to-change-the-font-size-on-a-matplotlib-plot'
]
},
'---',
{
"name": "3d scatter",
"snippet": [
"%matplotlib notebook",
"# interactive plots",
"",
"from mpl_toolkits import mplot3d",
"plt.rcParams['figure.figsize'] = 15, 8",
"",
"import numpy as np",
"",
"def f(x, y):",
" return np.sin(np.sqrt(x ** 2 + y ** 2))",
"",
"N = 5000",
"theta = 2 * np.pi * np.random.random(N)",
"r = 6 * np.random.random(N)",
"x = np.ravel(r * np.sin(theta))",
"y = np.ravel(r * np.cos(theta))",
"z = f(x, y)",
"",
"ax = plt.axes(projection='3d')",
"ax.scatter(x, y, z, c=z, alpha=.4, s=50, cmap='viridis');",
"",
"# TIP: adding legend: https://stackoverflow.com/a/20505720/1640414",
"ax.view_init(elev=45, azim=45) #pan using left mouse button, zoom using right mouse button"
]
},
{
'name': '3d (static) plots examples &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;',
'external-link': 'https://www.kaggle.com/saurav9786/interactive-3-d-plots-for-data-visualization',
},
],
},
{
'name': 'Seaborn',
'sub-menu': [
{
"name": "Distinct seaborn color pallete",
"snippet": [
"%matplotlib inline",
"%config InlineBackend.figure_format = 'retina'",
"",
"import matplotlib.pyplot as plt",
"import seaborn as sns",
"import numpy as np",
"",
"#see https://mokole.com/palette.html",
"DISTINCT_COLORS_PALLETE = ['#808080','#556b2f','#7f0000','#483d8b','#008000','#008b8b','#000080','#d2691e','#daa520','#8fbc8f','#800080','#b03060','#ff4500','#ffff00','#00ff00','#00ff7f','#dc143c','#00ffff','#00bfff','#0000ff','#a020f0','#adff2f','#1e90ff','#90ee90','#add8e6','#ff1493','#7b68ee','#ee82ee','#ffdead','#ffc0cb'][::-1]",
"sns.set(style='whitegrid', font_scale=1.33)",
"sns.set_palette(DISTINCT_COLORS_PALLETE)",
"",
"# sns.palplot(sns.color_palette(DISTINCT_COLORS_PALLETE)) #render the pallete",
"",
"N_COLORS = min(30, len(DISTINCT_COLORS_PALLETE))",
"y=np.random.randint(0, N_COLORS+1, size=(100))",
"ax = plt.figure(figsize=(20,10)).gca() #w,h",
"sns.countplot(y, ax=ax);"
]
},
'---',
{
'name': 'histogram/kde',
'snippet': ['data = np.random.randn(100)',
"#plt.hist(data, density=True, bins='auto',",
'# alpha=0.7, rwidth=0.95);',
'',
"sns.kdeplot(data, color = 'red', linewidth = 2, shade = True);"
]
},
{
"name": "countplot (series value_counts())",
"snippet": [
"ax = sns.countplot(df.target)",
"ax.set_xticklabels(class_names);"
]
},
{
'name': 'scatter/regplot',
'snippet': ['x = np.arange(100)',
'y=np.random.randint(0, 100, size=(100))',
'#sns.scatterplot(x,y)',
'sns.regplot(x,y)'
]
},
{
"name": "BoxEn plot: better than box plot",
"snippet": [
"# https://towardsdatascience.com/5-lesser-known-seaborn-plots-most-people-dont-know-82e5a54baea8",
"tips = sns.load_dataset('tips')",
"#sns.boxplot(x='day', y='total_bill', data=tips) ",
"sns.boxenplot(x='day', y='total_bill', data=tips) "
]
},
{
'name': 'correlation matrix',
'snippet': [
'#https://seaborn.pydata.org/examples/many_pairwise_correlations.html',
'#https://blog.algorexhealth.com/2017/09/10-heatmaps-10-python-libraries/',
'plt.figure(figsize=(18,18)) # (w,h)',
'',
'corr = df1.corr()',
'mask = np.zeros_like(corr, dtype=np.bool)',
'mask[np.triu_indices_from(mask)] = True',
'',
"#p=sns.heatmap(corr, annot=True,cmap ='RdYlGn', mask=mask)",
'',
'# Generate a custom diverging colormap',
'cmap = sns.diverging_palette(220, 10, as_cmap=True)',
'',
'p=sns.heatmap(corr, annot=True, cmap=cmap, mask=mask, center=0,',
' square=True, linewidths=.5, cbar_kws={"shrink": .5}, fmt=".2f")'
]
},
{
"name": "heatmap (better than matshow)",
"snippet": ['#https://chrisalbon.com/python/data_visualization/seaborn_color_palettes/',
"ax = plt.figure(figsize=(10, 10)).gca()",
"mat = np.random.random((10, 10)) * 2 - 1",
"p = sns.heatmap(mat,",
" cmap=sns.diverging_palette(220, 10, as_cmap=True),",
" annot=True, linewidths=.5, ",
" cbar_kws={'shrink': .5},",
" center=0,",
" square=True,",
" vmin=-1, vmax=1",
" )",
"labels = [chr(i) for i in ord('a') + np.arange(10)]",
"ax.set_xticklabels(labels, rotation=45)",
"ax.set_yticklabels(labels, rotation=45)"
]
},
{
"name": "Clustered Heatmap/Corr mat",
"snippet": [
"#https://towardsdatascience.com/5-lesser-known-seaborn-plots-most-people-dont-know-82e5a54baea8",
"# load boston housing ...",
"",
"corr = df.iloc[:, :-1].corr() #features only",
"#https://seaborn.pydata.org/examples/many_pairwise_correlations.html",
"#https://blog.algorexhealth.com/2017/09/10-heatmaps-10-python-libraries/",
"plt.figure(figsize=(18,18)) # (w,h)",
"",
"mask = np.zeros_like(corr, dtype=np.bool)",
"mask[np.triu_indices_from(mask)] = True",
"",
"#p=sns.heatmap(corr, annot=True,cmap ='RdYlGn', mask=mask)",
"",
"# Generate a custom diverging colormap",
"cmap = sns.diverging_palette(220, 10, as_cmap=True)",
"",
"#p=sns.heatmap(corr, annot=True, cmap=cmap, mask=mask, center=0,",
"# square=True, linewidths=.5, cbar_kws={'shrink': .5}, fmt='.2f')",
"",
"sns.clustermap(corr, ",
" figsize=(18,18), annot=True,",
" cmap=cmap, center=0, square=True, linewidths=.5, fmt='.2f') #, #2d array-like rectangular data",
" #metricstr, #distance metric to use for data (default euclidean)",
" #z_scoreint, #whether to calculate z-scores or not",
" #standard_scaleint) #whether to standardize data or not "
]
},
{
"name": "Ridge plots",
"snippet": [
"sns.set(style='white', rc={'axes.facecolor': (0, 0, 0, 0)})",
"",
"# Create the data",
"rs = np.random.RandomState(1979)",
"x = rs.randn(500)",
"g = np.tile(list('ABCDEFGHIJ'), 50)",
"df = pd.DataFrame(dict(x=x, g=g))",
"m = df.g.map(ord)",
"df['x'] += m",
"",
"# Initialize the FacetGrid object",
"# pal = sns.c(10, rot=-.25, light=.7)",
"g = sns.FacetGrid(df, row='g', hue='g', aspect=15, height=.5, palette='coolwarm')",
"",
"# Draw the densities in a few steps",
"g.map(sns.kdeplot, 'x', clip_on=False, shade=True, alpha=1, lw=1.5, bw=.2)",
"g.map(sns.kdeplot, 'x', clip_on=False, color='w', lw=2, bw=.2)",
"g.map(plt.axhline, y=0, lw=2, clip_on=False)",
"",
"",
"# Define and use a simple function to label the plot in axes coordinates",
"def label(x, color, label):",
" ax = plt.gca()",
" ax.text(0, .2, label, fontweight='bold', color='k',",
" ha='left', va='center', transform=ax.transAxes)",
"",
"",
"g.map(label, 'x')",
"",
"# Set the subplots to overlap",
"g.fig.subplots_adjust(hspace=-.25)",
"",
"# Remove axes details that don't play well with overlap",
"g.set_titles('')",
"g.set(yticks=[])",
"g.despine(bottom=True, left=True)"
]
},
],
},
{
'name': 'Plot.ly',
'sub-menu': [
{
"name": "Timeseries line chart (x is Date)",
"snippet": [
"# Using graph_objects",
"import plotly.graph_objects as go",
"",
"import pandas as pd",
"df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/finance-charts-apple.csv')",
"",
"fig = go.Figure([go.Scatter(x=df['Date'], y=df['AAPL.High'])])",
"fig.show()"
]
},
{
"name": "Default discrete color pallete",
"snippet": [
"import plotly.graph_objects as go",
"import numpy as np",
"",
"fig = go.Figure()",
"",
"def hex2rgba(h, alpha=.7):",
" h = h.lstrip('#')",
" rgb = ','.join([str(int(h[i:i+2], 16)) for i in (0, 2, 4)])",
" return f'rgba({rgb},{alpha})'",
" ",
"#default_plotly colormap, adding opacity",
"colors_hex = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']",
"colors = [hex2rgba(c) for c in colors_hex]",
"N = len(colors)",
"",
"fig.add_trace(go.Heatmap(",
" z=[np.arange(N).tolist()],",
" colorscale=[",
" [0, colors[0]],",
" [0.1, colors[0]],",
"",
" [0.1, colors[1]],",
" [0.2, colors[1]],",
"",
" [0.2, colors[2]],",
" [0.3, colors[2]],",
"",
" [0.3, colors[3]],",
" [0.4, colors[3]],",
"",
" [0.4, colors[4]],",
" [0.5, colors[4]],",
"",
" [0.5, colors[5]],",
" [0.6, colors[5]],",
"",
" [0.6, colors[6]],",
" [0.7, colors[6]],",
"",
" [0.7, colors[7]],",
" [0.8, colors[7]],",
"",
" [0.8, colors[8]],",
" [0.9, colors[8]],",
"",
" [0.9, colors[9]],",
" [1.0, colors[9]],",
" ],",
" colorbar=dict(",
" tick0=0,",
" dtick=1",
" )",
"))",
"",
"fig.show()"
]
},
'---',
{
'name': 'confusion matrix',
'snippet': ['import numpy as np',
'from sklearn.metrics import confusion_matrix',
'import plotly.figure_factory as ff',
'',
"NEG_CLASS, POS_CLASS = 'Neg', 'Pos'",
'',
'y_true = np.random.randint(0, high=1+1, size=(100,))',
'y_pred = np.random.randint(0, high=1+1, size=(100,))',
'# y_pred = y_true',
'cm = confusion_matrix(y_true, y_pred)',
'',
'cm_ = cm.ravel()',
"norm_cm_ = (cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]).ravel()",
"z_text = [[f'TN: {cm_[0]} ({norm_cm_[0]:.2f})', f'FP: {cm_[1]} ({norm_cm_[1]:.2f})'],",
" [f'FN: {cm_[2]} ({norm_cm_[2]:.2f})', f'TP: {cm_[3]} ({norm_cm_[3]:.2f})']]",
'',
'#use different colors for pos/neg',
'cm_masked = cm * np.array([[1,-1],[-1,1]])',
'fig = ff.create_annotated_heatmap(',
' x=[NEG_CLASS, POS_CLASS],',
' y=[NEG_CLASS, POS_CLASS],',
' z=cm_masked,',
' annotation_text=z_text, ',
' reversescale=False,',
' showscale=True,',
" colorscale='RdBu',",
' zmid=0,',
' xgap=2,ygap=2,',
')',
'',
'fig.update_layout(dict(',
" title='Confusion Matrix',",
' xaxis=go.layout.XAxis(',
" title='Predicted label',",
" side='bottom',",
' ),',
' yaxis=go.layout.YAxis(',
" title='True label',",
" autorange='reversed',",
' )))',
"fig['data'][0]['colorbar']['showticklabels'] = False #no tick labels",
'',
'#adjust annot fonts',
'mx = np.max(cm_)',
'med = mx/2',
'for i in range(len(fig.layout.annotations)):',
' fig.layout.annotations[i].font.size = 16',
" fig.layout.annotations[i].font.color = 'white' if (mx-cm_[i]) < med else 'black'",
'',
'iplot(fig, show_link=True)'
]
}]
},
{
'name': 'Altair',
'sub-menu': [
{
"name": "Scatter",
"snippet": [
"import pandas as pd",
"import altair as alt",
"",
"data = pd.DataFrame({'country_id': [1, 2, 3, 4, 5, 6],",
" 'population': [1, 100, 200, 300, 400, 500],",
" 'income': [50, 50, 200, 300, 300, 450]})",
"",
"# data",
"",
"alt.Chart(data).mark_circle(size=200).encode(",
" x='population:Q',",
" y='income:Q',",
" color='country_id:N',",
" tooltip=['country_id', 'population', 'income'])"
]
}]
}]
},
{
'name': 'Datasets',
'sub-menu': [
{
'name': 'Regression',
'sub-menu': [
{
"name": "Boston Housing",
"snippet": [
"import pandas as pd",
"import sklearn.datasets",
"def boston():",
" #from shap: Return the boston housing data in a nice package.",
"",
" d = sklearn.datasets.load_boston()",
" df = pd.DataFrame(data=d.data, columns=d.feature_names) # pylint: disable=E1101",
" return df, d.target # pylint: disable=E1101",
"",
"df, target = boston()",
"df['target'] = target",
"df.head()"
]
},
]
},
{
'name': 'Binary Classification',
'sub-menu': [
{
"name": "Adult census",
"snippet": [
"import numpy as np",
"import pandas as pd",
"import sklearn.datasets",
"def adult(display=False):",
" # from shap: Return the Adult census data in a nice package.",
" dtypes = [",
" ('Age', 'float32'), ('Workclass', 'category'), ('fnlwgt', 'float32'),",
" ('Education', 'category'), ('Education-Num', 'float32'), ('Marital Status', 'category'),",
" ('Occupation', 'category'), ('Relationship', 'category'), ('Race', 'category'),",
" ('Sex', 'category'), ('Capital Gain', 'float32'), ('Capital Loss', 'float32'),",
" ('Hours per week', 'float32'), ('Country', 'category'), ('Target', 'category')",
" ]",
" raw_data = pd.read_csv(",
" '/datasets/adult/adult.data',",
" names=[d[0] for d in dtypes],",
" na_values='?',",
" dtype=dict(dtypes)",
" )",
" data = raw_data.drop(['Education'], axis=1) # redundant with Education-Num",
" filt_dtypes = list(filter(lambda x: not (x[0] in ['Target', 'Education']), dtypes))",
" data['Target'] = data['Target'] == ' >50K'",
" rcode = {",
" 'Not-in-family': 0,",
" 'Unmarried': 1,",
" 'Other-relative': 2,",
" 'Own-child': 3,",
" 'Husband': 4,",
" 'Wife': 5",
" }",
" for k, dtype in filt_dtypes:",
" if dtype == 'category':",
" if k == 'Relationship':",
" data[k] = np.array([rcode[v.strip()] for v in data[k]])",
" else:",
" data[k] = data[k].cat.codes",
"",
" if display:",
" return raw_data.drop(['Education', 'Target', 'fnlwgt'], axis=1), data['Target'].values",
" else:",
" return data.drop(['Target', 'fnlwgt'], axis=1), data['Target'].values",
"",
"df, target = adult()",
"df['target'] = target",
"df.head()"
]
},
{
"name": "Pima diabetes",
"snippet": [
"import numpy as np",
"import pandas as pd",
"import sklearn.datasets",
"def pima():",
" # Returns the Pima diabetes data in a nice package.",
" ",
" raw_data = pd.read_csv(",
" '/datasets/pima/diabetes.csv'",
" )",
" return raw_data.drop(['Outcome'], axis=1), raw_data['Outcome'].values",
"",
"df, target = pima()",
"df['target'] = target",
"df.head()"
]
},
{
"name": "Titanic",
"snippet": [
"import numpy as np",
"import pandas as pd",
"import sklearn.datasets",
"def titanic():",
" # Returns the Titanic data in a nice package.",
" # https://www.kaggle.com/c/titanic/data",
" ",
" raw_data = pd.read_csv(",
" '/datasets/titanic/titanic.csv'",
" )",
" return raw_data.drop(['Survived'], axis=1), raw_data['Survived'].values",
"",
"df, target = titanic()",
"df['target'] = target",
"#df = df[['target', 'Pclass', 'Sex', 'Age', 'Ticket', 'Fare', 'Embarked']]",
"df.head()"
]
},
]
},
{
'name': 'Multi-Class',
'sub-menu': [
{
"name": "Iris",
"snippet": [
"import pandas as pd",
"import sklearn.datasets",
"def iris(display=True):",
" #from shap: Return the classic iris data in a nice package.",
" # display: targets are str, otherwise int",
"",
" d = sklearn.datasets.load_iris()",
" df = pd.DataFrame(data=d.data, columns=d.feature_names) # pylint: disable=E1101",
" if display:",
" return df, [d.target_names[v] for v in d.target] # pylint: disable=E1101",
" else:",
" return df, d.target # pylint: disable=E1101",
"",
"df, target = iris()",
"df['target'] = target",
"df.head()"
]
},
]
},
]
},
'---',
{
'name': 'Bash',
'sub-menu': [
{
'name': 'nice tree of recursive dir listing',
'snippet': ['!tree -d /datasets/dogscats/']
}]
},
{
'name': 'Debugger',
'sub-menu': [
{
'name': 'set a breakpoint/set_trace()',
'snippet': ['#http://wangchuan.github.io/coding/2017/07/12/ipdb-cheat-sheet.html',
'',
'from IPython.core.debugger import set_trace',
'def my_function(x):',
' answer = 42',
' #set_trace() # <-- uncomment!',
' #Python 3.7 has `breakpoint()` built-in!',
' #type `exit` to quit the debugger',
' answer += x',
' return answer',
'',
'my_function(12)'
]
}]
},
{
'name': 'Testing',
'sub-menu': [
{
'name': 'unittest (great assert, no class)',
'snippet': ['import unittest',
"T = unittest.TestCase('__init__')",
'#T.assertEqual((1,2), (2,1))'
]
}]
},
{
'name': 'Formatting & Printing',
'sub-menu': [
{
'name': 'Formatting strings examples (python 3.5)',
"snippet": [
"#https://pyformat.info/",
"#old",
"'s=%s, i=%d' % ('str', 15)",
"",
"#new",
"'{} {}'.format('one', 'two')",
"'{1} {0}'.format('one', 'two')",
"",
"'{:d} {:.2f}'.format(15, 3.1415)",
"",
"#newest",
"data = {'first': 'Hodor', 'last': 'Hodor!'}",
"'{first} {last}'.format(**data)",
"",
"from datetime import datetime",
"'{:%d-%m-%Y %H:%M:%S}'.format(datetime(2001, 2, 3, 16, 5))",
""
]
},
{
'name': 'pprint_color()',
"snippet": [
"from pprint import pformat, pprint",
"",
"from pygments import highlight",
"# from pygments.formatters.terminal import TerminalFormatter # dark theme",
"from pygments.formatters.terminal256 import Terminal256Formatter #light theme",
"from pygments.lexers.python import PythonLexer",
"",
"",
"def pprint_color(obj, *args, **kwargs):",
"# print(highlight(pformat(obj), PythonLexer(), TerminalFormatter()))",
" print(highlight(pformat(obj, *args, **kwargs), PythonLexer(), Terminal256Formatter()))"
]
},
{
'name': 'Print progress in the same line',
'snippet': ['num_episodes = 50000',
'for i in range(1, num_episodes + 1):',
" # Print out which episode we're on, useful for debugging.",
' if i % 100 == 0:',
" print('\rEpisode {}/{}.'.format(i, num_episodes), end='')",
' sys.stdout.flush()'
]
},
{
'name': 'Render JSON (great for hierchical dicts)',
'external-link': 'https://mypy.readthedocs.io/en/stable/cheat_sheet_py3.html',
}]
},
{
'name': 'Iter',
'sub-menu': [
{
'name': 'zip and unzip into lists',
'snippet': ['sub1a = [1, 3, 8]; sub2a = [2, 4, 9]',
'l1 = list(zip(sub1a, sub2a)) #[(1, 2), (3, 4), (8, 9)]',
'sub1b, sub2b = list(zip(*l1)) #(1, 3, 8), (2, 4, 9)',
'print(list(sub1b)) #[1, 3, 8]',
'print(list(sub2b)) #[2, 4, 9]'
]
}]
},
{
'name': 'TQDM',
'sub-menu': [
{
'name': 'TQDM for notebook',
'snippet': ['from tqdm.auto import tqdm, trange']
},
{
"name": "TQDM with description",
"snippet": [
"import time",
"from tqdm.auto import tqdm",
"",
"series_list = [str(i) for i in range(100)]",
"with tqdm(total=len(series_list)) as t:",
" for series in series_list:",
" t.set_description(f'Series: `{series}`')",
" t.update()",
"",
" time.sleep(0.1)"
]
},
]
},
{
'name': 'Dictionaries',
'sub-menu': [
{
'name': 'flattening dict',
'snippet': ["def flatten_dict(dd, separator='_', prefix=''):",
' """',
' Flattens a dict, adding separator (and prefix or `level0`) between levels',
' """',
' return {',
' prefix + separator + k if prefix else k: v',
' for kk, vv in dd.items()',
' for k, v in flatten_dict(vv, separator, kk).items()',
' } if isinstance(dd, dict) else {prefix: dd}',
'',
"d = {'a': 1, 'b': {'c':2, 'd':3}}",
"flatten_dict(d, '.')"
]
}]
},
{
'name': 'Parsing',
'sub-menu': [
{
'name': 'from string to dict',
'snippet': ['import ast',
'',
'# Convert from a string to a dictionary',
'ast.literal_eval("{\'a\': 1, \'b\': 2}")'
]
}]
},
{
'name': 'RegEx',
'sub-menu': [
{
"name": "pattern exists?",
"snippet": [
"import re",
"p = re.compile(r'^[A]{0,1}F[p|P]{0,1}[\\d|z]$')",
"assert p.match('AF1') is not None",
"assert p.match('AF1m') is None"
]
},
{
'name': 'search with groups',
'snippet': ['import re',
'',
"regex = r'\\$\\$ (\\(\\w{1,3}\\))$'",
"test_str = '$$\\int S_{xx}(\\omega)d\\omega^{-1/2}$$ (Fp1)'",
'',
'm = re.search(regex, test_str)',
'',
"assert m is not None, 'Could not find regex on `%s`' % test_str",
'',
"print(f'Match {matchNum} was found at {m.start()}-{m.end()}: {m.group()}')",
'',
'for groupNum in range(0, len(m.groups())):',
' groupNum = groupNum + 1',
'',
" print('Group {groupNum} found at {start}-{end}: {group}'.format(groupNum = groupNum, start = m.start(groupNum), end = m.end(groupNum), group = m.group(groupNum)))",
'',
'g = m.groups()',
'print(g)'
]
},
'---',
{
'name': 'regex101.com',
'external-link': 'https://regex101.com/',
}
]
},
{
'name': 'IO',
'sub-menu': [
{
'name': 'Pickling',
'sub-menu': [
{
'name': 'dump and load',
'snippet': ['import pickle',
'',
'a = (df_movies, missing)',
'',
"with open('tmp.pickle', 'wb') as handle:",
' pickle.dump(a, handle)',
'',
"with open('tmp.pickle', 'rb') as handle:",
' pickle.load(handle)'
]
}]
},
{
"name": "Numpy savez",
"snippet": [
"np.savez('out/shap_interaction_values.testset.npz', shap_interaction_values=shap_interaction_values)",
"shap_interaction_values = np.load('out/shap_interaction_values.testset.npz')['shap_interaction_values']"
]
},
'---',
{
"name": "Read file contents to string",
"snippet": [
"with open('data.txt', 'r') as file:",
" data = file.read()"
]
},
{
"name": "Using pathlib to create paths",
"snippet": [
"import pathlib",
"",
"import homer",
"",
"PACKAGE_ROOT = pathlib.Path(homer.__file__).resolve().parent #resolve() normalizes the path (sym links extraction, path correction)",
"TRAINED_MODEL_DIR = PACKAGE_ROOT / 'trained_models'",
"DATASET_DIR = PACKAGE_ROOT / 'datasets'"
]
},
]
},
{
'name': 'Timing',
'sub-menu': [
{
'name': 'with contextmanager',
'snippet': ['import time',
'from contextlib import contextmanager',
'',
'@contextmanager',
'def timer(title):',
' t0 = time.time()',
' yield',
" print('{} - done in {:.0f}s'.format(title, time.time() - t0))",
'',
"with timer('ABC'):",
' time.sleep(2)'
]
}]
},
{
'name': 'Download',
'sub-menu': [
{
'name': 'Download and extract zip with tqdm',
'snippet': ['from urllib.request import urlretrieve',
'from os.path import isfile, isdir',
'from tqdm import tqdm',
'import zipfile',
'',
"dataset_folder_path = 'data'",
"dataset_filename = 'text8.zip'",
"dataset_name = 'Text8 Dataset'",
'',
'class DLProgress(tqdm):',
' last_block = 0',
'',
' def hook(self, block_num=1, block_size=1, total_size=None):',
' self.total = total_size',
' self.update((block_num - self.last_block) * block_size)',
' self.last_block = block_num',
'',
'if not isfile(dataset_filename):',
" with DLProgress(unit='B', unit_scale=True, miniters=1, desc=dataset_name) as pbar:",
' urlretrieve(',
" 'http://mattmahoney.net/dc/text8.zip',",
' dataset_filename,',
' pbar.hook)',
'',
'if not isdir(dataset_folder_path):',
' with zipfile.ZipFile(dataset_filename) as zip_ref:',
' zip_ref.extractall(dataset_folder_path)',
'',
"with open('data/text8') as f:",
' text = f.read()'
]
},
{
'name': 'download url to file',
'snippet': ['import urllib.request',
"urllib.request.urlretrieve('about:blank', 'file.txt')"
]
}]
},
'---',
{
'name': 'EDA',
'sub-menu': [
{
"name": "Predictive Power Score heatmap (seaborn)",
"snippet": [
"#https://towardsdatascience.com/rip-correlation-introducing-the-predictive-power-score-3d90808b9598",
"import ppscore as pps",
"df_pps = pps.matrix_tqdm(df)",
"",
"#https://blog.algorexhealth.com/2017/09/10-heatmaps-10-python-libraries/",
"ax = plt.figure(figsize=(18,18)).gca() # (w,h)",
"",
"mask = np.zeros_like(df_pps.values, dtype=np.bool)",
"mask[np.diag_indices_from(mask)] = True",
"",
"# Generate a custom colormap",
"cmap = sns.color_palette('Blues')",
"# cmap = sns.color_palette('YlOrRd')",
"",
"p=sns.heatmap(df_pps, annot=True, cmap=cmap, mask=mask,",
" square=True, linewidths=.5, cbar_kws={'shrink': .5}, fmt='.2f',",
" ax=ax)",
"",
"labels=df_pps.columns",
"ax.xaxis.tick_top(); ax.tick_params(direction='out', width=1, colors='k', top=True, left=True)",
"",
"ax.set_xticklabels(labels, rotation=90);",
"ax.set_yticklabels(labels, rotation=0);",
"",
"ax.set_ylabel('Predictee')",
"ax.set_xlabel('Predictor');",
"",
"# The `target` row of the matrix tells you that the best univariate predictor of the it",
"# on regression, MAE=0 yield 1.0 score and regressor that always predicts the median yields 0.0 score.",
"# on classification, F1=1 yield 1.0 score and classifier that always predicts the most freq class yields 0.0 score."
]
},
{
"name": "Predictive Power Score heatmap (plotly)",
"snippet": [
"#https://towardsdatascience.com/rip-correlation-introducing-the-predictive-power-score-3d90808b9598",
"fig = df_pps.T.iplot(kind='heatmap', colorscale='Blues', asFigure=True)",
"",
"FONT_SIZE = 10",
"",
"#NOTE: annotations are too heavy for a matrix with ~100 features",
"annotations = []",
"for n, row in enumerate(df_pps.itertuples()):",
" ix = row[0] #predicted",
" for m, val in enumerate(row[1:]): # but index",
" annotations.append(",
" go.layout.Annotation(text=f'{val:.2f}',",
" x=ix,",
" y=df_pps.columns[m],",
" xref='x1',",
" yref='y1',",
" showarrow=False,",
" font=dict(size=FONT_SIZE, color='black' if val<.8 else 'white')))",
"",
"fig.update_layout(autosize=False,",
" width=500,",
" height=500,",
" paper_bgcolor='rgba(0,0,0,0)',",
" plot_bgcolor='rgba(0,0,0,0)',",
" xaxis={",
" 'title': {'text': '<b>Predictor</b>'},",
" 'side': 'top',",
" 'tickfont': {'size': FONT_SIZE}",
" },",
" yaxis={",
" 'title': {'text': '<b>Predictee</b>'},",
" 'autorange': 'reversed',",
" 'tickfont': {'size': FONT_SIZE}",
" },",
" annotations=annotations)",
"fig.show()",
"# The `target` row of the matrix tells you the best univariate predictor of the target.",
"# on regression, MAE=0 yields 1.0 score and a regressor that always predicts the median yields 0.0 score.",
"# on classification, F1=1 yields 1.0 score and a classifier that always predicts the most freq class yields 0.0 score."
]
},
{
"name": "Predictive Power Score target hmap (plotly)",
"snippet": [
"#https://towardsdatascience.com/rip-correlation-introducing-the-predictive-power-score-3d90808b9598",
"target = df_all.columns[-1]",
"feats =df_all.columns[:-1]",
"d = {}",
"for f in feats:",
" res = pps.score(df_all, f, target)",
" d[f] = res['ppscore']",
"",
"df_pps_target = pd.DataFrame([d], index=['target']).T",
"FONT_SIZE = 10",
"fig = df_pps_target.iplot(kind='heatmap', colorscale='Blues', asFigure=True)",
"",
"#NOTE: annotations are too heavy for a matrix with ~100 features",
"annotations = []",
"for n, row in enumerate(df_pps_target.itertuples()):",
" ix = row[0] #predicted",
" for m, val in enumerate(row[1:]): # but index",
" annotations.append(",
" go.layout.Annotation(text='{:.2f}'.format(val).lstrip('0'),",
" x=ix,",
" y=df_pps_target.columns[m],",
" xref='x1',",
" yref='y1',",
" showarrow=False,",
" font=dict(size=FONT_SIZE, color='black' if val<(.8*df_pps_target.values.max()) else 'white')))",
"",
"fig.update_layout(autosize=False,",
" width=1000,",
" height=300,",
" paper_bgcolor='rgba(0,0,0,0)',",
" plot_bgcolor='rgba(0,0,0,0)',",
" xaxis={",
" 'title': {'text': '<b>Predictor</b>'},",
" 'side': 'top',",
" 'tickfont': {'size': FONT_SIZE}",
" },",
" yaxis={",
" 'title': {'text': '<b>Predictee</b>'},",
" 'autorange': 'reversed',",
" 'tickfont': {'size': FONT_SIZE}",
" },",
" annotations=annotations)",
"fig.show()",
"# The `target` row of the matrix tells you the best univariate predictor of the target.",
"# on regression, MAE=0 yields 1.0 score and a regressor that always predicts the median yields 0.0 score.",
"# on classification, F1=1 yields 1.0 score and a classifier that always predicts the most freq class yields 0.0 score."
]
},
]
},
{
'name': 'Numpy',
'sub-menu': [
{
'name': 'reshaping',
'snippet': ['import numpy as np',
'arr = np.random.randn(100) # (100,)',
'col_vec = arr[:, np.newaxis] # (100,1)',
'row_vec = arr[np.newaxis, :] # (1, 100)',
'arr2 = col_vec.ravel() # (100,)',
'print(arr.shape, col_vec.shape, row_vec.shape, arr2.shape)'
]
},
{
"name": "Numpy unique (like Series.value_counts())",
"snippet": [
"y = np.random.randint(2, size=(100,))",
"{v:c for (v,c) in np.unique(y, return_counts=True)}"
]
}]
},
{
'name': 'Pandas',
'sub-menu': [
{
'name': 'fast DataFrame creation',
'snippet': ['import pandas as pd',
'import numpy as np',
"pd.DataFrame(np.random.rand(4,8), columns=list('abcdefgh'))"
]
},
{
'name': 'display options and formatting',
'snippet': ['pd.options.display.max_rows=50',
'pd.options.display.max_columns=200',
'pd.options.display.max_colwidth=60 #no ...',
"pd.options.display.float_format='{:.2f}'.format",
"#pd.reset_option('all') #reset to default",
"#pd.describe_option('rows') #describe all options that contains 'rows' in their name",
'',
'(df.head(10).style.format({',
" 'Age': '{:.1f}',",
" 'Date': '{:%m/%d/%y}'",
'}))'
]
},
{
'name': 'profile report',
'snippet': ['import pandas_profiling',
'pandas_profiling.ProfileReport(df)'
]
},
'---',
{
'name': 'simple filter',
'snippet': ["df = pd.read_csv('http://bit.ly/drinksbycountry')",
"df[(df.continent == 'Europe') & (df.beer_servings > 200)]"
]
},
{
"name": "assert no NaNs",
"snippet": [
"_ = df.dropna(axis=0, subset=df.columns, how='any', inplace=False)",
"assert _.shape == df.shape, '`df` has nans'"
]
},
{
"name": "binnify and return indexes of bins",
"snippet": [
"import pandas as pd",
"import numpy as np",
"np.random.seed(1)",
"df = pd.DataFrame(np.random.randint(0, 9+1, size=(10,1)), columns=list('a'))",
"",
"bins = np.arange(0,9+1,2)",
"df['a_bin'] = np.digitize(df['a'], bins=bins)",
"df"
]
},
{
'name': 'pivot_table',
'snippet': ["df = pd.read_csv('http://bit.ly/kaggletrain') #titanic",
"tbl = df.pivot_table(index='Sex', columns='Pclass', values='Survived', aggfunc='count')",
'#add margins=True, for summation',
"tbl.iplot(kind='bar', barmode='stack')",
'tbl.head()'
]
},
{
"name": "clip() values by lower/upper",
"snippet": [
"import pandas as pd",
"import numpy as np",
"df1 = pd.DataFrame(np.random.randint(2, size=(4,4))*2-1 * np.random.rand(4,4), columns=list('abcd')) #[-1,1]",
"df2 = df1.clip(lower=-0.5,upper=0.5)",
"",
"print(df1.head(1))",
"print(df2.head(1))"
]
},
{
"name": "transform() values using lambda",
"snippet": [
"import pandas as pd",
"import numpy as np",
"df = pd.DataFrame(np.random.rand(4,4), columns=list('abcd')) #[0,1]",
"df = df.transform(lambda x: np.log(x))",
"# df = df.transform([np.sqrt, np.exp])",
"",
"df.head()"
]
},
{
"name": "groupby with named aggregation",
"snippet": [
"import numpy as np",
"import pandas as pd",
"df = pd.read_csv('/datasets/direct_marketing/DirectMarketing.csv')",
"",
"# df[['Age','Salary','AmountSpent']].groupby('Age').agg({'Salary':'mean', 'AmountSpent':'sum'}).round(2)",
"",
"df[['Age','Salary','AmountSpent']].groupby('Age').agg(",
" avgSalary = ('Salary','mean'), #redundant pd.NamedAgg",
" totalSpent = ('AmountSpent','sum'), #this can be a lambda x: also, e.g. np.sum",
" count = ('Age','count') #here is a counter",
")"
]
},
{
"name": "groupby agg (apply() vs transform())",
"snippet": [
"import numpy as np",
"import pandas as pd",
"",
"df = pd.DataFrame({",
" 'restaurant_id': [101,102,103,104,105,106,107],",
" 'address': ['A','B','C','D', 'E', 'F', 'G'],",
" 'city': ['London','London','London','Oxford','Oxford', 'Durham', 'Durham'],",
" 'sales': [10,500,48,12,21,22,14]",
"})",
"",
"g = df.groupby('city')['sales']",
"df_t = g.transform(np.sum) #sum sales by city",
"df_a = g.apply(np.sum)",
"",
"print(df_a) #operates on multiple series",
"print(df_t) #operates on a single series",
"",
"# here is something we can achieve efficiently with transform()",
"(df['sales']/df_t).apply(lambda x: format(x, '.2%')) #% sales per city"
]
},
]
},
{
'name': 'Pre-Processing',
'sub-menu': [
{
"name": "zscore",
"snippet": [
"from scipy.stats import zscore",
"z_data = df_all[df_all.columns[:-1]].apply(zscore)",
"z_data = z_data[(np.abs(z_data) < 4).all(axis=1)] #remove rows with outliers",
"sns.boxplot(data=z_data) #features only"
]
},
{
'name': 'scale() to {0,1} function',
'snippet': ['def scale(x, raw_range=(None, 255), feature_range=(-1, 1)):',
' # scale to (0, 1) ',
' source_range = np.zeros(2)',
' source_range[1] = x.max() if raw_range[1] is None else raw_range[1]',
' source_range[0] = x.min() if raw_range[0] is None else raw_range[0]',
' ',
' x = ((x - source_range[0])/(source_range[1] - source_range[0]))',
' ',
' # scale to feature_range ',
' min, max = feature_range',
' x = x * (max - min) + min',
' return x'
]
},
]
},
{
'name': 'ML',
'sub-menu': [
{
'name': 'Supervised',
'sub-menu': [
{
'name': 'Random Forest',
'sub-menu': [
{
'name': 'RandomForestRegressor',
'snippet': ['from sklearn.ensemble import RandomForestRegressor',
'rf = RandomForestRegressor()',
'rf.fit(X_train, y_train)',
'predictions = rf.predict(X_valid)',
'rmse = np.sqrt(np.mean(np.square(predictions - y_valid)))',
'print(rmse)'
]
}]
},
{
'name': 'SVM',
'sub-menu': [
{
'name': 'svm for binary classification',
'snippet': ['from sklearn import svm',
"# clf = svm.SVC(C=1.0, kernel='rbf', gamma=.7)",
"clf = svm.SVC(kernel='linear', probability=True)",
'',
'clf.fit(X_train, y_train)',
'prob = clf.predict_proba(X_train)[:,1]',
'acc = clf.score(X_train, y_train)',
'print(acc) #WARN: be careful when using imbalanced classes'
]
},
{
'name': 'svm with `rbf` kernel for classification',
'snippet': ['import numpy as np; np.random.seed(90210)',
'from numpy.random import permutation',
'from sklearn import svm, datasets',
'',
'iris = datasets.load_iris()',
'per = permutation(iris.target.size)',
'iris.data = iris.data[per]',
'iris.target = iris.target[per]',
'',
"clf = svm.SVC(C=1.0, kernel='rbf', gamma=.7)",
'clf.fit(iris.data[:90], iris.target[:90])',
'',
'acc = clf.score(iris.data[90:], iris.target[90:])',
'print(acc)'
]
}]
},
{
'name': 'LightGBM',
'sub-menu': [
{
'name': 'train-test classification',
'snippet': ['import lightgbm as lgb',
'from sklearn.metrics import roc_auc_score',
'',
'# Model with default hyperparameters',
"model = lgb.LGBMClassifier(objective = 'binary', random_state=RANDOM_SEED)",
'',
'model.fit(X, y)',
'',
'predictions = model.predict_proba(X_test)[:, 1]',
'auc = roc_auc_score(y_test, predictions)',
'',
"print('The baseline score on the test set is {:.4f}.'.format(auc))"
]
},
{
'name': 'train-cv classification',
'snippet': ['import lightgbm as lgb',
'',
'# Create a lgb dataset',
'train_set = lgb.Dataset(X, label = y)',
'',
'# Perform cross validation with 10 folds (with early stopping)',
'params = {} #default',
"r = lgb.cv(params, train_set, num_boost_round = 10000, nfold = 10, metrics = 'auc', ",
' early_stopping_rounds = 100, verbose_eval = False, seed = RANDOM_SEED)',
'',
'# Highest score',
"r_best = np.max(r['auc-mean'])",
'',
'# Standard deviation of best score',
"r_best_std = r['auc-stdv'][np.argmax(r['auc-mean'])]",
'',
"print('The maximium ROC AUC on the validation set was {:.5f} with std of {:.5f}.'.format(r_best, r_best_std))",
"print('The ideal number of iterations was {}.'.format(np.argmax(r['auc-mean']) + 1))"
]
}]
},
{
"name": "KNN with hpo",
"snippet": [
"import pandas as pd",
"import numpy as np",
"",
"from sklearn.neighbors import KNeighborsClassifier",
"from sklearn.model_selection import GridSearchCV",
"",
"from sklearn.model_selection import cross_val_score",
"from sklearn.model_selection import train_test_split",
"",
"from sklearn.metrics import accuracy_score, classification_report",
"",
"df = pd.read_csv('/datasets/diabetes/diabetes_data.csv')",
"X,y = df.drop(columns=['diabetes']), df['diabetes'].values",
"",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0, stratify=y)",
"",
"knn = KNeighborsClassifier()",
"param_grid = {'n_neighbors': np.arange(start=1, stop=25+1, step=2), 'weights': ['uniform', 'distance']}",
"",
"knn_gscv = GridSearchCV(knn, param_grid, cv=5, verbose=1) #n_jobs=4",
"knn_gscv.fit(X_train, y_train)",
"",
"print(f'best params: {knn_gscv.best_params_}, mean cv score: {knn_gscv.best_score_}')",
"knn = knn_gscv.best_estimator_",
"print(knn)",
"",
"print('')",
"pred = knn.predict(X_test)",
" ",
"# evaluate and return accuracy",
"print(f'Accuracy of best_estimator on test set: {knn.score(X_test, y_test)}')"
]
},
]
},
{
'name': 'Unsupervised',
'sub-menu': [
{
"name": "Clustering metrics",
"snippet": [
"## [clustering metrics](https://scikit-learn.org/stable/modules/clustering.html#k-means)",
"* `Inertia (within-cluster sum-of-squares)`: $\\sum_{i=0}^{n}\\min_{\\mu_j \\in C}(||x_i - \\mu_j||^2)$",
"* `(Adjusted) Rand Index`: compares `labels_true` to `labels_pred` being permutation-invariant (random labeling = 0)",
"* `(Adjusted) Mutual Information`: compares `labels_true` to `labels_pred` being permutation-invariant (random labeling = 0; upper bound = 1)",
"* `Homogeneity, completeness and V-measure` ($[0,1]$, higher is better): ",
" * `Homogeneity`: each cluster contains only members of a single class",
" * `completeness`: all members of a given class are assigned to the same cluster",
" * `V-measure`: Their harmonic mean (with $\\beta=1$)",
"* `Silhouette`: higher score relates to a model with better defined clusters.",
" * Bounded between -1 for incorrect clustering and +1 for highly dense clustering. Zero indicate overlapping clusters.",
" * The score is higher when clusters are dense and well separated, which relates to a standard concept of a cluster."
]
},
{
"name": "IsolationForest for outlier detection",
"snippet": [
"import time",
"",
"import numpy as np",
"import matplotlib",
"import matplotlib.pyplot as plt",
"",
"from sklearn.datasets import make_blobs",
"from sklearn.ensemble import IsolationForest",
"",
"RANDOM_STATE = 90210",
"",
"# Example settings",
"n_samples = 300",
"outliers_fraction = 0.15",
"n_outliers = int(outliers_fraction * n_samples)",
"n_inliers = n_samples - n_outliers",
"",
"",
"# define outlier/anomaly detection methods to be compared",
"algorithm = IsolationForest(contamination=outliers_fraction,",
" random_state=RANDOM_STATE)",
"",
"# Define datasets",
"blobs_params = dict(random_state=RANDOM_STATE, n_samples=n_inliers, n_features=2)",
"X = make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[0.5, 0.5],",
" **blobs_params)[0]",
"",
"print(f'Shape: {X.shape}')",
"",
"# Compare given classifiers under given settings",
"xx, yy = np.meshgrid(np.linspace(-7, 7, 150),",
" np.linspace(-7, 7, 150))",
"",
"rng = np.random.RandomState(RANDOM_STATE)",
"",
"",
"# Add outliers",
"X = np.concatenate([X, rng.uniform(low=-6, high=6,",
" size=(n_outliers, 2))], axis=0)",
"",
"t0 = time.time()",
"algorithm.fit(X)",
"t1 = time.time()",
"",
"",
"# fit the data and tag outliers",
"y_pred = algorithm.fit(X).predict(X) # [-1,1]",
"y_pred = (y_pred *.5 +.5).astype(int) # [0,1] # zeros are outliers",
"",
"# plot the levels lines and the points",
"Z = algorithm.predict(np.c_[xx.ravel(), yy.ravel()])",
"Z = Z.reshape(xx.shape)",
"plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='black')",
"",
"colors = np.array(['#377eb8', '#ff7f00'])",
"plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])",
"",
"plt.xlim(-7, 7)",
"plt.ylim(-7, 7)",
"plt.xticks(())",
"plt.yticks(())",
"",
"print('Duration: ' + ('%.2fs' % (t1 - t0)).lstrip('0'))",
"",
"#value_counts",
"y = np.bincount(y_pred)",
"ii = np.nonzero(y)[0]",
"np.vstack((ii, y[ii])).T"
]
},
]
},
'---',
{
'name': 'Dimensionality Reduction',
'sub-menu': [
{
"name": "PCA",
"snippet": [
"import matplotlib.pyplot as plt",
"from sklearn.datasets import make_classification",
"from sklearn.decomposition import PCA",
"X, y = make_classification(1000, 5, n_informative=1, n_classes=2, n_clusters_per_class=1, random_state=0)",
"n_components = 2",
"pca = PCA(n_components=n_components)",
"X_ = pca.fit_transform(X)",
"",
"print('explained_variance_ratio_:', pca.explained_variance_ratio_)",
"# sns.barplot(np.arange(n_components)+1, pca.explained_variance_ratio_)",
"print('singular_values_:', pca.singular_values_)",
"",
"ax = plt.figure(figsize=(8,8)).gca()",
"ax.scatter(X_[:, 0], X_[:, 1], c=y, cmap=plt.cm.Spectral)"
]
},
{
"name": "T-SNE",
"snippet": [
"import matplotlib.pyplot as plt",
"from sklearn.datasets import make_classification",
"from sklearn.manifold import TSNE",
"X, y = make_classification(1000, 5, n_informative=1, n_classes=2, n_clusters_per_class=1, random_state=0)",
"n_components = 2",
"",
"#It is highly recommended to use another dimensionality reduction method (e.g. PCA for dense data or TruncatedSVD for sparse data) ",
"# to reduce the number of dimensions to a reasonable amount (e.g. 50) if the number of features is very high.",
"",
"tsne = TSNE(n_components=n_components, init='pca',",
" random_state=0)",
"",
"X_ = tsne.fit_transform(X)",
"",
"ax = plt.figure(figsize=(8,8)).gca()",
"ax.scatter(X_[:, 0], X_[:, 1], c=y, cmap=plt.cm.Spectral)",
"",
"print('n_iter_:', tsne.n_iter_)",
"print('kl_divergence_:', tsne.kl_divergence_)"
]
}]
},
'---',
{
'name': 'binary classification metrices',
'snippet': [
'* **Sensitivity** (*TPR/recall*) - proportion of positives that are correctly classified',
'* **Specificity** (*TNR*) - proportion of negatives that are correctly classified',
'',
"* **FPR** (*1-Specificity*) - When it's actually no, how often does it predict yes?",
'',
'* **Precision** - proportion of true positives out of all detected positives'
]
},
{
'name': 'binary classification AUC ROC and PR',
'snippet': ['from sklearn.metrics import roc_auc_score, average_precision_score',
'',
'auc_roc = roc_auc_score(y_train, prob, sample_weight=None)',
"print('auc_roc', auc_roc)",
'auc_pr = average_precision_score(y_train, prob, sample_weight=None)',
"print('auc_pr', auc_pr)",
'',
'from sklearn.utils import compute_sample_weight as sklearn_compute_sample_weight',
"sample_weight = sklearn_compute_sample_weight(class_weight='balanced', y=y_train)",
'',
'auc_roc = roc_auc_score(y_train, prob, sample_weight=sample_weight)',
"print('auc_roc (using sample_weight)', auc_roc)",
'auc_pr = average_precision_score(y_train, prob, sample_weight=sample_weight)',
"print('auc_pr (using sample_weight)', auc_pr)"
]
},
{
'name': 'plotly ROC for binary classification',
'snippet': ['from sklearn.metrics import roc_curve, roc_auc_score',
'',
'y_true = np.random.randint(0, high=1+1, size=(100,))',
'y_pred = np.random.rand(*(100,))',
'# y_pred = y_true',
'',
'use_sample_weight = False',
'sample_weight=None',
'from sklearn.utils import compute_sample_weight as sklearn_compute_sample_weight',
'if use_sample_weight:',
" sample_weight = sklearn_compute_sample_weight(class_weight='balanced', y=y_true)",
'',
'fpr, tpr, thresholds = roc_curve(y_true,y_pred, sample_weight=sample_weight)',
'auc = roc_auc_score(y_true,y_pred, sample_weight=sample_weight)',
'',
'lw = 2',
'',
'trace1 = go.Scatter(',
' x=fpr,',
' y=tpr,',
" mode='lines',",
" line=dict(color='darkorange', width=lw),",
" fill='tonexty',",
" name=f'ROC curve<br>(area = {auc:.4f})',",
" customdata=[f'TH: {t:.2f}' for t in thresholds],",
" hovertemplate='FPR: %{x:.2f}<br>' + 'TPR: %{y:.2f}<br>' +",
" '%{customdata}<br>' + '<extra></extra>',",
' showlegend=True',
')',
'',
'#WARN: when classes are imbalanced this might not be accurate',
'trace2 = go.Scatter(x=[0, 1.01],',
' y=[0, 1.01],',
" mode='lines',",
" line=dict(color='navy', width=lw, dash='dash'),",
" name=f'Random classifier',",
' showlegend=True)',
'',
"layout = go.Layout(title='<b>R</b>eceiver <b>O</b>perating <b>C</b>haracteristic curve' +",
" ('<br><i>(with sample weighting)</i>' if use_sample_weight else ''),",
" xaxis=dict(title='<b>FPR</b> (1-Specificity)<br>Incorrectly predicted positives', range=[0.001, 1.01]),",
" yaxis=dict(title='<b>TPR</b> (Sensitivity)<br>Positives detected out of all positives', range=[0.001, 1.01]))",
'',
'fig = go.Figure(data=[trace1, trace2], layout=layout)',
'',
'',
"#plotly.io.write_image(fig, 'figures/sup1a.pdf')",
'',
'iplot(fig, show_link=True)'
]
},
{
'name': 'plotly PR curve for binary classification',
'snippet': ['from sklearn.metrics import precision_recall_curve, average_precision_score',
'y_true = np.random.randint(0, high=1+1, size=(100,))',
'y_pred = np.random.rand(*(100,))',
'# y_pred = y_true',
'',
'use_sample_weight = False',
'sample_weight=None',
'from sklearn.utils import compute_sample_weight as sklearn_compute_sample_weight',
'if use_sample_weight:',
" sample_weight = sklearn_compute_sample_weight(class_weight='balanced', y=y_true)",
' ',
'precision, recall, thresholds = precision_recall_curve(y_true,y_pred, ',
' sample_weight=sample_weight)',
'',
'#close the curve',
'recall = np.append([1.], recall)',
'precision = np.append([0.], precision)',
'thresholds = np.append([0.], thresholds)',
'',
'auc = average_precision_score(y_true,y_pred, sample_weight=sample_weight)',
'',
'lw = 2',
'',
'trace1 = go.Scatter(',
' x=recall,',
' y=precision,',
" mode='lines',",
" line=dict(color='darkorange', width=lw),",
" fill='tonexty',",
" name=f'PR curve<br>(area = {auc:.4f})',",
" customdata=[f'TH: {t:.2f}' for t in thresholds],",
" hovertemplate='Recall: %{x:.2f}<br>' + 'Precision: %{y:.2f}<br>' +",
" '%{customdata}<br>' + '<extra></extra>',",
' showlegend=True',
')',
'',
'#WARN: when classes are imbalanced this might not be accurate',
'trace2 = go.Scatter(x=[0, 1.01],',
' y=[.5, .5],',
" mode='lines',",
" line=dict(color='navy', width=lw, dash='dash'),",
" name=f'Random classifier',",
' showlegend=True)',
'',
'eps = np.finfo(np.float32).eps',
"layout = go.Layout(title='<b>P</b>recision-<b>R</b>ecall curve' + ",
" ('<br><i>(with sample weighting)</i>' if use_sample_weight else ''),",
" xaxis=dict(title='<b>Recall</b><br>(Positives that were correctly classified)', range=[0.001, 1.01]),",
" yaxis=dict(title='<b>Precision</b><br>(True positives out of all detected positives)', range=[0.001, 1.01]))",
'',
'',
'fig = go.Figure(data=[trace1, trace2], layout=layout)',
'',
"#plotly.io.write_image(fig, 'figures/sup1b.pdf')",
'',
'iplot(fig, show_link=True)'
]
},
{
'name': 'train_test_split',
'snippet': ['from sklearn.model_selection import train_test_split',
'X, X_val, y, y_val = train_test_split(',
' X, y, test_size=0.33, random_state=RANDOM_SEED)'
]
}
]
},
{
'name': 'DL',
'sub-menu': [
{
'name': 'PyTorch',
'sub-menu': [
{
'name': 'imports and GPU',
'snippet': ['import torch',
'import torch.nn as nn',
'import torch.nn.functional as F',
'import torch.optim as optim',
'import torchvision',
'',
"assert torch.cuda.is_available(), 'No GPU!'",
"DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')",
'print(DEVICE)',
'',
'X = torch.rand(1,5).to(DEVICE)',
'X'
]
},
{
'name': 'torch model summary',
'snippet': ['import torchsummary',
'',
"torchsummary.summary(model, (3, 224, 224), device='cpu')"
]
},
{
'name': 'linear regression on gpu',
'snippet': ['import torch',
'from torch import optim',
'from torch import nn',
'',
'def get_data():',
' from sklearn.datasets import make_regression',
'',
' n_features = 1',
' n_samples = 100',
'',
' X, y = make_regression(',
' n_samples=n_samples,',
' n_features=n_features,',
' noise=10,',
' )',
' ',
' X = torch.from_numpy(X).float()',
' y = torch.from_numpy(y.reshape((n_samples, n_features))).float()',
' ',
' X, y = X.to(device), y.to(device)',
' return X,y',
'',
'class LinReg(nn.Module):',
' def __init__(self, input_dim):',
' super().__init__()',
' self.beta = nn.Linear(input_dim, 1)',
' ',
' def forward(self, X):',
' return self.beta(X)',
'',
"device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')",
'X,y = get_data()',
'',
'n_samples, n_features = X.shape',
'print(X.shape, y.shape)',
'',
'#(Father, Son, Holy Ghost) \\equiv (Model, Loss, Optimizer)',
'model = LinReg(n_features).to(device) ',
'criterion = nn.MSELoss()',
'optimizer = optim.SGD(model.parameters(), lr=1e-1)',
'',
'',
'#Training',
'from tqdm.auto import tqdm, trange',
'for _ in trange(10):',
' # Train step',
' model.train()',
" optimizer.zero_grad() #IMPORTANT: reset (don't accumulate) gradients",
'',
' y_ = model(X)',
' loss = criterion(y_, y)',
'',
' loss.backward() #compute gradients wrt the weights',
' optimizer.step() #apply the learning rule',
'',
' # Eval (suppose to be on the validation data)',
' model.eval()',
' with torch.no_grad():',
' y_ = model(X) ',
'',
'# Vis',
'fig, ax = plt.subplots()',
"ax.plot(X.cpu().numpy(), y_.cpu().numpy(), '.', label='pred')",
"ax.plot(X.cpu().numpy(), y.cpu().numpy(), '.', label='data')",
"ax.set_title(f'MSE: {loss.item():0.1f}')",
'ax.legend();'
]
},
'---',
{
'name': 'Parsimonous MNIST',
'snippet': ["#PyTorch CNN with > 99% accuracy (after 20 epochs) on the MNIST dataset.",
"",
"# ~~~~ Boilerplate ~~~",
"import torch #1.4.0",
"from torch import nn",
"from tqdm.auto import tqdm, trange",
"import numpy as np",
"",
"# ~~~~ Options ~~~",
"opts = {",
" 'lr': 1e-3,",
" 'epochs': 1, #20 achieves 99%",
" 'batch_size': 64",
"}",
"",
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')",
"print(device)",
"",
"# ~~~~ Data loading ~~~",
"import torchvision.datasets as dset #0.5.0",
"import torchvision.transforms as transforms",
"dataroot = '/datasets'",
"ds_train = dset.MNIST(root=dataroot, train=True, download=False,",
" transform=transforms.Compose([",
" transforms.ToTensor(),",
" transforms.Normalize((0.1307,), (0.3081,))",
" ]))",
"ds_test = dset.MNIST(root=dataroot, train=False, download=False,",
" transform=transforms.Compose([",
" transforms.ToTensor(),",
" transforms.Normalize((0.1307,), (0.3081,))",
" ]))",
"train_loader = torch.utils.data.DataLoader(dataset=ds_train, batch_size=opts['batch_size'], shuffle=True)",
"test_loader = torch.utils.data.DataLoader(dataset=ds_test, batch_size=opts['batch_size'], shuffle=False)",
"",
"# ~~~~ Model, Optimizer, Loss ~~~",
"class CNN(nn.Module):",
" def __init__(self, input_size=(1,28,28), num_classes=10):",
" super(CNN, self).__init__()",
"",
" self.layer1 = nn.Sequential(",
" nn.Conv2d(input_size[0], 32, kernel_size=5),",
" nn.ReLU(),",
" nn.MaxPool2d(kernel_size=2))",
" ",
" self.layer2 = nn.Sequential(",
" nn.Conv2d(32, 64, kernel_size=5),",
" nn.ReLU(),",
" nn.MaxPool2d(kernel_size=2))",
"",
" self.fc1 = nn.Linear(4 * 4 * 64, num_classes)",
" ",
" ",
" def forward(self, x):",
" # x: (Nx1x28x28) tensor",
" x = self.layer1(x)",
" x = self.layer2(x)",
" x = x.reshape(x.size(0), -1)",
" x = self.fc1(x)",
" return x",
" ",
"model = CNN((1, 28, 28), 10).to(device)",
"optimizer = torch.optim.Adam(model.parameters(), opts['lr'])",
"criterion = torch.nn.CrossEntropyLoss() # loss function",
"",
"# ~~~~ Main loop ~~~",
"for epoch in range(opts['epochs']):",
" model.train()",
" train_loss = []",
" N = len(train_loader)",
" loss_, NUDGE = np.nan, int(N/10)",
" pbar = tqdm(enumerate(train_loader), total=N,",
" desc=f'Epoch[{epoch+1:^3}], Batch[{0+1:^4}], Loss[{loss_:.2f}]')",
" for i, (data, labels) in pbar:",
" data, labels = data.to(device), labels.to(device)",
" outputs = model(data)",
" loss = criterion(outputs, labels)",
" optimizer.zero_grad()",
" loss.backward()",
" optimizer.step()",
" loss_ = loss.item()",
" train_loss.append(loss_)",
" if i%NUDGE == NUDGE-1:",
" pbar.set_description(f'Epoch[{epoch+1:^3}], Batch[{i+1:^4}], Loss[{loss_:.2f}]')",
" ",
" ",
" model.eval()",
" test_loss = []",
" test_accuracy = []",
" for i, (data, labels) in enumerate(test_loader):",
" data, labels = data.to(device), labels.to(device)",
" outputs = model(data)",
" _, predicted = torch.max(outputs.data, 1)",
" loss = criterion(outputs, labels)",
" test_loss.append(loss.item())",
" test_accuracy.append((predicted == labels).sum().item() / predicted.size(0))",
" ",
" print(f'Epoch: {epoch}, train loss: {np.mean(train_loss):.3f}, test loss: {np.mean(test_loss):.3f}, test accuracy: {np.mean(test_accuracy):.3f}')"
]
},
{
"name": "MLP with BCE",
"snippet": [
"# ~~~~ Boilerplate ~~~",
"import torch #1.5.0",
"from torch import nn",
"from tqdm.auto import tqdm, trange",
"import numpy as np",
"",
"np.random.seed(90210)",
"torch.manual_seed(90210)",
"",
"# ~~~~ Options ~~~",
"opts = {",
" 'lr': 1e-3,",
" 'epochs': 10,",
" 'batch_size': 1",
"}",
"",
"# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')",
"device = torch.device('cpu')",
"print(device)",
"",
"# ~~~~ Data loading ~~~",
"from sklearn.datasets import make_classification",
"n_features = 2",
"X, y = make_classification(n_samples=1000, n_features=n_features, n_informative=n_features, n_redundant=0, n_classes=2, n_clusters_per_class=1, random_state=90210)",
"",
"from torch.utils.data import DataLoader, TensorDataset",
"",
"dataset = TensorDataset( torch.FloatTensor(X), torch.FloatTensor(y) )",
"",
"train_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=opts['batch_size'], shuffle=False)",
"test_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=opts['batch_size'], shuffle=False)",
"",
"# ~~~~ Model, Optimizer, Loss ~~~",
"class MLP(torch.nn.Module):",
" def __init__(self, input_size, hidden_size):",
" super(MLP, self).__init__()",
" self.fc1 = torch.nn.Linear(input_size, hidden_size)",
" self.relu = torch.nn.ReLU()",
" self.fc2 = torch.nn.Linear(hidden_size, 1)",
" self.sigmoid = torch.nn.Sigmoid()",
" def forward(self, x):",
" hidden = self.fc1(x)",
" relu = self.relu(hidden)",
" output = self.fc2(relu)",
" output = self.sigmoid(output)",
" return output",
" ",
"model = MLP(n_features, 50).to(device)",
"optimizer = torch.optim.Adam(model.parameters(), opts['lr'])",
"# optimizer = torch.optim.SGD(model.parameters(), opts['lr'])",
"criterion = torch.nn.BCELoss() # loss function",
"",
"# ~~~~ Main loop ~~~",
"for epoch in range(opts['epochs']):",
" model.train()",
" train_loss = []",
" N = len(train_loader)",
" loss_, NUDGE = np.nan, int(N/10)",
" pbar = tqdm(enumerate(train_loader), total=N,",
" desc=f'Epoch[{epoch+1:^3}], Batch[{0+1:^4}], Loss[{loss_:.2f}]')",
" for i, (data, labels) in pbar:",
" data, labels = data.to(device), labels.to(device)",
" outputs = model(data)",
" outputs = outputs.squeeze(1)",
" loss = criterion(outputs, labels)",
" optimizer.zero_grad()",
" loss.backward()",
" optimizer.step()",
" loss_ = loss.item()",
" train_loss.append(loss_)",
" if i%NUDGE == NUDGE-1:",
" pbar.set_description(f'Epoch[{epoch+1:^3}], Batch[{i+1:^4}], Loss[{loss_:.2f}]')",
" ",
" ",
" model.eval()",
" test_loss = []",
" test_accuracy = []",
" for i, (data, labels) in enumerate(test_loader):",
" data, labels = data.to(device), labels.to(device)",
" outputs = model(data)",
" outputs = outputs.squeeze(1)",
" predicted = (outputs>0.5).float()",
" loss = criterion(outputs, labels)",
" test_loss.append(loss.item())",
" test_accuracy.append((predicted == labels).sum().item() / opts['batch_size'])",
" ",
" print(f'Epoch: {epoch}, train loss: {np.mean(train_loss):.3f}, test loss: {np.mean(test_loss):.3f}, test accuracy: {np.mean(test_accuracy):.3f}')"
]
}]
},
{
'name': 'Keras',
'sub-menu': [
{
'name': 'plot_model',
'snippet': ['#requirements: graphviz (apt-get), pydot (pip)',
'from IPython.display import SVG',
'from keras.utils.vis_utils import model_to_dot',
'def plot_keras_model(model, show_shapes=True, show_layer_names=True):',
' return SVG(model_to_dot(model, show_shapes=show_shapes,',
" show_layer_names=show_layer_names).create(prog='dot',format='svg'))",
'plot_keras_model(model, show_shapes=True, show_layer_names=False)'
]
}]
},
{
'name': 'TensorFlow',
'sub-menu': [
{
'name': 'assert is using GPU',
'snippet': ['import tensorflow as tf',
"assert tf.test.gpu_device_name(), 'tf does not run on GPU!'"
]
},
{
'name': 'supress warnings',
'snippet': ['#supress tf warnings',
'#https://stackoverflow.com/a/38645250/1640414',
'import os',
"os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # or any {'0', '1', '2'}, higher is less verbose",
'#https://stackoverflow.com/a/51327615/1640414',
'tf.logging.set_verbosity(tf.logging.FATAL)'
]
},
{
'name': 'eval on CPU',
'snippet': ['import tensorflow as tf',
'config = tf.ConfigProto(',
" device_count = {'GPU': 0}",
' )',
'',
"tensor = tf.ones([3,2], dtype=tf.float32, name='ones')",
'',
'with tf.Session(config=config) as sess:',
' print(sess.run(tensor))'
]
}]
},
{
'name': 'OpenAI Gym',
'sub-menu': [
{
'name': 'no render from notebook',
'snippet': ['import gym',
'from gym import wrappers',
"env = gym.make('CartPole-v0')",
'',
"#If you'd like rendering use it outside the notebook",
'#From: https://stackoverflow.com/a/50866507/1640414',
"env = wrappers.Monitor(env, '/tmp/gym', video_callable=False ,force=True)",
'env.reset()',
'print(env.step(env.action_space.sample())) # take a random action',
'env.close()'
]
}]
}]
},
'---',
{
'name': 'Markdown',
'sub-menu': [
{
"name": "Table (with column alignment)",
"snippet": [
"* Table alignment",
"| Syntax | Description | Test Text |",
"| :--- | :----: | ---: |",
"| Header | Title | Here's this |",
"| Paragraph | Text | And more |"
]
},
{
'name': 'Table (with code)',
'snippet': ['from IPython.display import HTML, display',
'import tabulate',
"table = [['Sun',696000,1989100000],",
" ['Earth',6371,5973.6],",
" ['Moon',1737,73.5],",
" ['Mars',3390,641.85]]",
"display(HTML(tabulate.tabulate(table, headers=['h1', 'h2', 'h3'], tablefmt='html')))"
]
},
{
'name': 'add YouTube video',
'snippet': ['from IPython.lib.display import YouTubeVideo',
"YouTubeVideo('Boy3zHVrWB4', start=0)"
]
},
{
'name': 'add IFrame embedding',
'snippet': ['from IPython.display import IFrame',
"IFrame('https://www.desmos.com/calculator/osig1u1uwl?embed', width=350, height=350)"
]
},
{
'name': 'embedded code markdown',
'snippet': ['```bash',
'git clone about:blank',
'```'
]
},
{
"name": "Figure template in HTML",
"snippet": [
"<center>",
"<figure>",
"<img src='http://pyro.ai/_static/img/vae_plots/test_elbo_vae.png' style='width: 550px;'>",
"<figcaption>",
"<font size='+1'><b>Figure 3:</b> How the test ELBO evolves over the course of training. </font>",
"</figcaption>",
"</figure>",
"</center>"
]
},
'---',
{
'name': 'Cheetsheet',
'external-link': 'https://github.com/adam-p/markdown-here/wiki/Markdown-Here-Cheatsheet',
},
{
'name': 'Extended syntax',
'external-link': 'https://www.markdownguide.org/extended-syntax/',
}
]
},
{
'name': 'LaTeX',
'sub-menu': [
{
'name': 'Equations with numbers',
'snippet': ['$$',
'\\begin{equation}',
'dS_A+dS_B>0 \\\\',
'dS_A+dS_B>0',
'\\end{equation}',
'$$'
]
},
{
'name': 'Aligning multiple equations',
'snippet': ['$$\\begin{align*}',
'p_A &\\sim \\text{Uniform}[\\text{low}=0,\\text{high}=1) \\\\',
'p_B &\\sim \\text{Uniform}[\\text{low}=0,\\text{high}=1) \\\\',
'X\\ &\\sim \\text{Bernoulli}(\\text{prob}=p) \\\\',
'\\text{for } i &= 1\\ldots N: \\\\',
' X_i\\ &\\sim \\text{Bernoulli}(p_i)',
'\\end{align*}$$'
]
},
{
'name': 'Vector in matrix notation',
'snippet': ['$\\begin{bmatrix} ',
' 0 \\\\ ',
' 0 \\\\ ',
'\\end{bmatrix}\\in\\text{Null Space}$'
]
}]
},
'---',
{
'name': 'Best practices',
'sub-menu': [
{
'name': 'Static typing',
'external-link': 'https://mypy.readthedocs.io/en/stable/cheat_sheet_py3.html',
},
{
'name': 'Styling matplotlib __',
'external-link': 'https://github.com/matplotlib/matplotlib/blob/master/matplotlibrc.template',
},
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment