Skip to content

Instantly share code, notes, and snippets.

@acbart
Last active August 7, 2023 16:07
Show Gist options
  • Save acbart/c10a369d83baef8d79aa5a49fa88ea65 to your computer and use it in GitHub Desktop.
Save acbart/c10a369d83baef8d79aa5a49fa88ea65 to your computer and use it in GitHub Desktop.
Common Jupyter Header
## Generally useful built libraries
import sys
import os
import json
import math
import re
import itertools
from collections import defaultdict
from dataclasses import dataclass
from datetime import timedelta, datetime
## Core Data Science Libraries
# Main matplotlib usage is with `plt.whatever`
import matplotlib.pyplot as plt
# Sometimes you need style stuff via `mpl.whatever`
import matplotlib as mpl
# Statistic functions like `st.f_oneway`, `st.pearsonr`, etc.
import scipy.stats as st
# Your entire life is now pd.DataFrame(), pd.concat, etc.
import pandas as pd
# Real plotters use Seaborn over Matplotlib: sns.histplot, sns.jointplot, sns.lmplot, etc.
import seaborn as sns
# Sometimes you have to use numpy stuff, but almost always better to stay in Pandas
import numpy as np
## Quality of Life Libraries
# Progress bars! Just do:
# for an_item in tqdm(an_iterable):
from tqdm.notebook import tqdm
# Need to render HTML or Code? Combine them with `display`
# Sometimes you will want to `print`, and sometimes `display`
from IPython.display import display, HTML, Code
## More Situational Libraries
# Situational, but sometimes natsorting is helpful
from natsort import index_natsorted, natsorted, order_by_index, natsort_keygen
# Some more useful statistic related functions in here
import sklearn.metrics as metrics
# More stats stuff, mixed effects models, generalized linear models, ODEs, etc.
import statsmodels as sm
# Probably won't need to use hidden markov models
#from hmmlearn import hmm
### Style Stuff
# Recommend putting this in a separate cell
# At the minimum, make the background white so you can copy/paste into Discord :)
# NOTE(review): this is set *before* plt.style.use() below, so the style sheet
# may overwrite it -- confirm the figure background is still white afterwards.
mpl.rcParams['figure.facecolor'] = 'white'
# Figure widths in inches for LaTeX papers: a width in TeX points divided by
# 72.27 points per inch.
# NOTE(review): the 240pt / 504pt values were calculated once by the original
# author and never verified -- confirm them against your document template.
COLUMN_WIDTH = 240/72.27
TEXT_WIDTH = 504/72.27
# I liked this style, but use your best judgement
# NOTE(review): the 'seaborn-v0_8-' prefixed style names presumably require
# matplotlib 3.6 or newer -- confirm against the installed version.
plt.style.use('seaborn-v0_8-whitegrid')
# Setup nice fonts for an ACM LaTeX paper
nice_fonts = {
# Ideally, you should enable this if you have LaTeX installed
# It'll make the graphs match the paper font much more closely.
# "text.usetex": True,
"text.usetex": False,
"font.family": "serif",
# Use 10pt font in plots, to match 10pt font in document
"axes.labelsize": 10,
"font.size": 10,
# Make the legend/label fonts a little smaller
"legend.fontsize": 8,
"xtick.labelsize": 8,
"ytick.labelsize": 8,
}
# Applied after plt.style.use() so these font settings take precedence over
# whatever the style sheet configured.
mpl.rcParams.update(nice_fonts)
# Stop using scientific notation, show two decimal places instead.
pd.set_option('display.float_format', '{:.2f}'.format)
### Helper Functions
def save_figure(filename, fig=None, folder='reports/figures/'):
    """ Helper function to quickly save figures for the paper.

    The figure is always written in PDF format with a tight bounding box.

    Parameters
    ----------
    filename: str
        Name of the output file; it is joined onto ``folder``.
    fig: object with a ``savefig`` method, optional
        The figure to save. When omitted, the ``plt`` module itself is used,
        i.e. ``plt.savefig`` is called (pyplot's current figure).
    folder: str, optional
        Directory the file is written into. It is created (including any
        missing parents) if it does not exist yet.
    """
    if fig is None:
        fig = plt
    target = os.path.join(folder, filename)
    # A fresh project checkout usually has no reports/figures/ directory yet;
    # create it up front instead of failing inside savefig.
    parent = os.path.dirname(target)
    if parent:
        os.makedirs(parent, exist_ok=True)
    fig.savefig(target, format='pdf', bbox_inches='tight')
def display_code(data):
    """ Wrap ``data`` in an IPython ``Code`` object and hand it to ``display``.

    Returns whatever ``display`` returns.
    """
    code_block = Code(data)
    return display(code_block)


# Quick demonstration of the helper.
display_code("a = 0")
def set_size(width, fraction=1, subplots=(1, 1)):
    """ Set figure dimensions to avoid scaling in LaTeX.

    Parameters
    ----------
    width: float or string
        Document width in points, or string of predefined document type
        ('thesis', 'beamer' or 'pnas')
    fraction: float, optional
        Fraction of the width which you wish the figure to occupy
    subplots: array-like, optional
        The number of rows and columns of subplots.

    Returns
    -------
    fig_dim: tuple
        Dimensions of figure in inches, as (width, height)

    Raises
    ------
    ValueError
        If ``width`` is a string that is not one of the predefined
        document types.
    """
    # Text widths (in points) of the predefined document types.
    preset_widths_pt = {
        'thesis': 426.79135,
        'beamer': 307.28987,
        'pnas': 246.09686,
    }
    if isinstance(width, str):
        # An unknown name used to fall through and fail later with an opaque
        # "can't multiply sequence by non-int" TypeError; fail clearly instead.
        if width not in preset_widths_pt:
            known = ', '.join(sorted(preset_widths_pt))
            raise ValueError(
                f"Unknown document type {width!r}; expected one of: {known}"
            )
        width_pt = preset_widths_pt[width]
    else:
        width_pt = width

    # Width of figure (in pts)
    fig_width_pt = width_pt * fraction
    # Convert from pt to inches
    inches_per_pt = 1 / 72.27

    # Golden ratio to set aesthetic figure height
    # https://disq.us/p/2940ij3
    golden_ratio = (5**.5 - 1) / 2

    # Figure width in inches
    fig_width_in = fig_width_pt * inches_per_pt
    # Figure height in inches, scaled by the rows/columns ratio of the grid
    fig_height_in = fig_width_in * golden_ratio * (subplots[0] / subplots[1])

    return (fig_width_in, fig_height_in)
import re
# Characters with a special meaning in LaTeX, mapped to their escaped form.
_TEX_ESCAPES = {
    '&': r'\&',
    '%': r'\%',
    '$': r'\$',
    '#': r'\#',
    '_': r'\_',
    '{': r'\{',
    '}': r'\}',
    '~': r'\textasciitilde{}',
    '^': r'\^{}',
    '\\': r'\textbackslash{}',
    '<': r'\textless{}',
    '>': r'\textgreater{}',
}
# Compiled once at import time rather than on every call. Longest keys come
# first so that a multi-character key, should one ever be added, is tried
# before any shorter key it starts with.
_TEX_ESCAPE_RE = re.compile(
    '|'.join(re.escape(key) for key in sorted(_TEX_ESCAPES, key=len, reverse=True))
)


def tex_escape(text):
    """
    :param text: a plain text message
    :return: the message escaped to appear correctly in LaTeX
    """
    # Single pass over the text, so backslashes introduced by one replacement
    # are never escaped again by another.
    return _TEX_ESCAPE_RE.sub(lambda match: _TEX_ESCAPES[match.group()], text)
### Good Research Stuff
# Significance threshold (alpha) to compare p-values against; defined once
# here so every test in the notebook uses the same cut-off.
ALPHA = .05
def proportion_stats(incidences, total, label=None):
    """ Pretty prints a proportion with APA-style statistics.

    Parameters
    ----------
    incidences: number
        How many of the ``total`` observations had the property of interest.
    total: number
        Total number of observations.
    label: str, optional
        When given, the summary is printed prefixed with this label and the
        function returns None; when omitted, the summary string is returned
        instead of being printed.

    Returns
    -------
    str or None
        ``"n=<incidences>, M=<percentage>%, SD=<value>"`` where the percentage
        is rounded to one decimal place and the SD value to two.

    Raises
    ------
    ZeroDivisionError
        If ``total`` is 0.
    ValueError
        If ``total`` is negative or ``incidences`` is not within
        ``0..total``.
    """
    proportion = incidences/total
    # Out-of-range counts used to surface as a bare "math domain error" from
    # math.sqrt; report the actual problem instead (same exception type).
    if total < 0 or not 0 <= proportion <= 1:
        raise ValueError(
            "incidences must be between 0 and total; "
            f"got incidences={incidences}, total={total}"
        )
    rounded_proportion = round(100*proportion*10)/10
    # NOTE(review): sqrt(p*(1-p))/sqrt(n) is the formula usually called the
    # standard error of a proportion, yet it is reported as "SD" (and as a
    # fraction, while M is a percentage) -- confirm this is intended.
    standard_deviation = math.sqrt(proportion*(1-proportion))/math.sqrt(total)
    rounded_sd = round(100*standard_deviation)/100
    result = f"n={incidences}, M={rounded_proportion}%, SD={rounded_sd}"
    if label is None:
        return result
    else:
        print(label, result)


# Example usage: once returning the string, once printing with a label.
print(proportion_stats(50, 100))
proportion_stats(75, 103, 'Test Case')
def quantitative_summarized(dataframe, x=None, y=None, hue=None, palette='Set1', ax=None, verbose=True, swarm=False):
    """
    Print quick summary statistics for a quantitative column and box-plot it.

    Arguments
    =========
    dataframe: pandas dataframe
    x: str. column for the horizontal axis; the labels of categorical data (usually the target variable)
    y: str. column for the vertical axis; the quantitative data being summarized
    hue: str. optional second categorical column to compare against (usually the target variable if x is another variable)
    palette: array-like. Colour of the plot
    ax: axes to draw on; passed straight through to the seaborn plot calls
    verbose: if True, additionally print a separator line and the value counts of the column
    swarm: if True, a swarm plot is overlaid on top of the box plot

    Returns
    =======
    Nothing. Prints the quick stats (describe, mode) of the column and shows the box plot of its distribution.
    """
    column = dataframe[y]
    print(column.describe())
    print('mode: ', column.mode())
    if verbose:
        print('='*80)
        print(column.value_counts())

    # Both seaborn calls take exactly the same arguments, so build them once.
    plot_options = dict(x=x, y=y, hue=hue, data=dataframe, palette=palette, ax=ax)
    sns.boxplot(**plot_options)
    if swarm:
        sns.swarmplot(**plot_options)

    plt.show()
tqdm
pandas
numpy
scipy
matplotlib
seaborn
numpy
jupyterlab
natsort
scikit-learn
statsmodels
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment