Last active
August 7, 2023 16:07
-
-
Save acbart/c10a369d83baef8d79aa5a49fa88ea65 to your computer and use it in GitHub Desktop.
Common Jupyter Header
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Generally useful built-in libraries
import sys | |
import os | |
import json | |
import math | |
import re | |
import itertools | |
from collections import defaultdict | |
from dataclasses import dataclass | |
from datetime import timedelta, datetime | |
## Core Data Science Libraries | |
# Main matplotlib usage is with `plt.whatever` | |
import matplotlib.pyplot as plt | |
# Sometimes you need style stuff via `mpl.whatever` | |
import matplotlib as mpl | |
# Statistic functions like `st.f_oneway`, `st.pearsonr`, etc. | |
import scipy.stats as st | |
# Your entire life is now pd.DataFrame(), pd.concat, etc. | |
import pandas as pd | |
# Real plotters use Seaborn over Matplotlib: sns.histplot, sns.jointplot, sns.lmplot, etc.
import seaborn as sns | |
# Sometimes you have to use numpy stuff, but almost always better to stay in Pandas | |
import numpy as np | |
## Quality of Life Libraries | |
# Progress bars! Just do: | |
# for an_item in tqdm(an_iterable): | |
from tqdm.notebook import tqdm | |
# Need to render HTML or Code? Combine them with `display` | |
# Sometimes you will want to `print`, and sometimes `display` | |
from IPython.display import display, HTML, Code | |
## More Situational Libraries | |
# Situational, but sometimes natsorting is helpful | |
from natsort import index_natsorted, natsorted, order_by_index, natsort_keygen | |
# Some more useful statistic related functions in here | |
import sklearn.metrics as metrics | |
# More stats stuff, mixed effects models, generalized linear models, ODEs, etc. | |
import statsmodels as sm | |
# Probably won't need to use hidden markov models | |
#from hmmlearn import hmm | |
### Style Stuff
# Recommend putting this in a separate cell
# At the minimum, make the background white so you can copy/paste into Discord :)
mpl.rcParams['figure.facecolor'] = 'white'
# Figure widths in inches for LaTeX papers: a width in TeX points divided by
# 72.27 (TeX points per inch).
# NOTE(review): 240pt / 504pt are presumably the column width and full text
# width of the target document class -- confirm with \the\columnwidth and
# \the\textwidth before relying on them.
COLUMN_WIDTH = 240/72.27
TEXT_WIDTH = 504/72.27
# I liked this style, but use your best judgement
plt.style.use('seaborn-v0_8-whitegrid')
# Setup nice fonts for an ACM LaTeX paper.
# This runs after `plt.style.use`, so the keys below are the last values written.
nice_fonts = {
    # Ideally, you should enable this if you have LaTeX installed
    # It'll make the graphs match the paper font much more closely.
    # "text.usetex": True,
    "text.usetex": False,
    "font.family": "serif",
    # Use 10pt font in plots, to match 10pt font in document
    "axes.labelsize": 10,
    "font.size": 10,
    # Make the legend/label fonts a little smaller
    "legend.fontsize": 8,
    "xtick.labelsize": 8,
    "ytick.labelsize": 8,
}
mpl.rcParams.update(nice_fonts)
# Stop using scientific notation, show two decimal places instead.
pd.set_option('display.float_format', '{:.2f}'.format)
### Helper Functions | |
def save_figure(filename, fig=None, folder='reports/figures/'):
    """ Helper function to quickly save figures for the paper.

    The figure is written as a PDF with a tight bounding box.

    Parameters
    ----------
    filename: str
        Name of the file to create inside `folder`.
    fig: object with a `savefig` method, optional
        The figure to save. When omitted, `plt.savefig` is used instead.
    folder: str, optional
        Directory that receives the file. It is created (including any
        missing parents) if it does not exist yet.
    """
    target = plt if fig is None else fig
    # `savefig` does not create directories; without this the default folder
    # fails on a fresh checkout. An empty folder means "current directory".
    if folder:
        os.makedirs(folder, exist_ok=True)
    target.savefig(os.path.join(folder, filename), format='pdf', bbox_inches='tight')
def display_code(data):
    """ Wrap `data` in an IPython `Code` object and hand it to `display`. """
    snippet = Code(data)
    return display(snippet)


# Quick demo: render a one-line snippet.
display_code("a = 0")
def set_size(width, fraction=1, subplots=(1, 1)):
    """ Set figure dimensions to avoid scaling in LaTeX.

    Parameters
    ----------
    width: float or string
        Document width in points, or string of predefined document type
        ('thesis', 'beamer' or 'pnas')
    fraction: float, optional
        Fraction of the width which you wish the figure to occupy
    subplots: array-like, optional
        The number of rows and columns of subplots.

    Returns
    -------
    fig_dim: tuple
        Dimensions of figure in inches, as (width, height)

    Raises
    ------
    ValueError
        If `width` is a string that is not one of the predefined document types.
    """
    # Widths (in pts) of the predefined document types
    preset_widths_pt = {
        'thesis': 426.79135,
        'beamer': 307.28987,
        'pnas': 246.09686,
    }
    if isinstance(width, str):
        # An unknown name used to fall through to the arithmetic below and
        # fail with an unrelated TypeError; report the real problem instead.
        if width not in preset_widths_pt:
            known = ', '.join(sorted(preset_widths_pt))
            raise ValueError(
                f"Unknown document type {width!r}; expected one of: {known}")
        width_pt = preset_widths_pt[width]
    else:
        width_pt = width

    # Width of figure (in pts)
    fig_width_pt = width_pt * fraction
    # Convert from pt to inches
    inches_per_pt = 1 / 72.27

    # Golden ratio to set aesthetic figure height
    # https://disq.us/p/2940ij3
    golden_ratio = (5**.5 - 1) / 2

    # Figure width in inches
    fig_width_in = fig_width_pt * inches_per_pt
    # Figure height in inches, scaled by the rows/columns ratio of the grid
    fig_height_in = fig_width_in * golden_ratio * (subplots[0] / subplots[1])

    return (fig_width_in, fig_height_in)
import re | |
# Translation table for `tex_escape`, built once instead of on every call.
# Each key is a single character that is special in LaTeX.
_TEX_ESCAPE_TABLE = str.maketrans({
    '&': r'\&',
    '%': r'\%',
    '$': r'\$',
    '#': r'\#',
    '_': r'\_',
    '{': r'\{',
    '}': r'\}',
    '~': r'\textasciitilde{}',
    '^': r'\^{}',
    '\\': r'\textbackslash{}',
    '<': r'\textless{}',
    '>': r'\textgreater{}',
})


def tex_escape(text):
    """
    Escape the characters of a plain text message that are special in LaTeX.

    Each source character is translated exactly once, so the backslashes and
    braces introduced by a replacement are never escaped again.

    :param text: a plain text message
    :return: the message escaped to appear correctly in LaTeX
    """
    return text.translate(_TEX_ESCAPE_TABLE)
### Good Research Stuff | |
# Choose a consistent alpha threshold for P-Value calculations:
# compare p-values against this one constant rather than repeating the literal.
ALPHA = .05
def proportion_stats(incidences, total, label=None):
    """ Pretty prints a proportion with APA-style statistics.

    Parameters
    ----------
    incidences: number
        Count of observations showing the outcome of interest.
    total: number
        Count of all observations; must be non-zero.
    label: optional
        When given, the summary is also printed, prefixed by this label.

    Returns
    -------
    str
        The summary, e.g. "n=50, M=50.0%, SD=0.05". It is returned whether or
        not a label was given, so callers can always reuse the text.
    """
    proportion = incidences/total
    # Percentage rounded to one decimal place
    rounded_proportion = round(100*proportion*10)/10
    # NOTE(review): sqrt(p*(1-p))/sqrt(n) is the formula for the standard
    # error of a proportion, and it is on the 0-1 scale while M is a
    # percentage -- confirm that reporting it under "SD" is intended.
    standard_deviation = math.sqrt(proportion*(1-proportion))/math.sqrt(total)
    rounded_sd = round(100*standard_deviation)/100
    result = f"n={incidences}, M={rounded_proportion}%, SD={rounded_sd}"
    if label is not None:
        print(label, result)
    return result


print(proportion_stats(50, 100))
proportion_stats(75, 103, 'Test Case')
def quantitative_summarized(dataframe, x=None, y=None, hue=None, palette='Set1', ax=None, verbose=True, swarm=False):
    '''
    Print summary statistics for one quantitative column and draw its box plot.

    Arguments
    =========
    dataframe: pandas dataframe
    x: str. column with the categorical labels for the horizontal axis
    y: str. column with the quantitative data (summarized and plotted vertically)
    hue: str. optional second categorical column used to split each box
    palette: array-like. Colour of the plot
    ax: axes passed through to the seaborn plotting calls
    verbose: if True, also print a separator line and the value counts of `y`
    swarm: if True, a swarm plot is overlayed on the box plot

    Returns
    =======
    Nothing; the statistics are printed and the plot is shown via `plt.show()`
    '''
    column = dataframe[y]
    print(column.describe())
    print('mode: ', column.mode())
    if verbose:
        print('='*80)
        print(column.value_counts())

    # Both layers are drawn from the same data with the same mapping.
    plot_options = dict(x=x, y=y, hue=hue, data=dataframe, palette=palette, ax=ax)
    sns.boxplot(**plot_options)
    if swarm:
        sns.swarmplot(**plot_options)

    plt.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
tqdm | |
pandas | |
numpy | |
scipy | |
matplotlib | |
seaborn | |
numpy | |
jupyterlab | |
natsort | |
scikit-learn
statsmodels |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment