Skip to content

Instantly share code, notes, and snippets.

View twolodzko's full-sized avatar

Timothy Wolodzko twolodzko

View GitHub Profile
@twolodzko
twolodzko / matplotlib_identity_line.py
Created October 7, 2019 13:11
Identity line for matplotlib
import matplotlib.pyplot as plt
def identity_line(ax=None, ls='--', *args, **kwargs):
# see: https://stackoverflow.com/q/22104256/3986320
ax = ax or plt.gca()
identity, = ax.plot([], [], ls=ls, *args, **kwargs)
def callback(axes):
low_x, high_x = ax.get_xlim()
low_y, high_y = ax.get_ylim()
low = min(low_x, low_y)
@twolodzko
twolodzko / partial_correlation.py
Last active July 12, 2019 16:35
Partial correlation
import numpy as np
def pcor(X, rowvar=False):
"""
Partial correlation
Implemented as in pcor::pcor function in R.
Kim, S. (2015) ppcor: An R Package for a Fast Calculation to Semi-partial Correlation Coefficients.
Communications for Statistical Applications and Methods, 22(6), 665-674.
@twolodzko
twolodzko / sensivity_specifity_cutoff.py
Last active March 1, 2025 16:15
Use Youden index to determine cut-off for classification
import numpy as np
from sklearn.metrics import roc_curve
def sensivity_specifity_cutoff(y_true, y_score):
'''Find data-driven cut-off for classification
Cut-off is determined using Youden's index defined as sensitivity + specificity - 1.
Parameters
----------
@twolodzko
twolodzko / clustered_train_test_split.py
Last active October 1, 2018 10:36
Split to train and test samples by clusters
import numpy as np
def train_test_split(*arrays, test_size, random_state, clusters):
'''Split to train and test samples by clusters
Parameters
----------
test_size : float, 0 < test_size < 1
fraction of clusters to include in test set
@twolodzko
twolodzko / WeightedAverage_layer.py
Created September 20, 2018 08:57
WeightedAverage merging layer for Keras
from keras import backend as K
from keras.layers import Average
from keras.activations import softmax
class WeightedAverage(Average):
def build(self, input_shape):
self.kernel = self.add_weight(name='kernel',
shape=(1, len(input_shape)),
initializer='ones',
@twolodzko
twolodzko / lr_finder.py
Created September 12, 2018 08:16 — forked from jeremyjordan/lr_finder.py
Keras Callback for finding the optimal range of learning rates
from keras.callbacks import Callback
import matplotlib.pyplot as plt
class LRFinder(Callback):
'''
A simple callback for finding the optimal learning rate range for your model + dataset.
# Usage
```python
@twolodzko
twolodzko / one_hot_xxhash.py
Created September 7, 2018 12:19
One hot encoder using xxHash
import xxhash
from keras.preprocessing.text import hashing_trick
# one_hot and hashing_trick in Keras both use by default python's hash function
# it is unstable: https://stackoverflow.com/q/27522626/3986320
# alternatively, you could use md5, but it's not the fastest hashing function
# xxHash package offers a faster alternative
# PEP 8 (E731): use named `def`s instead of lambdas bound to names — same
# behavior, but with proper docstrings and readable tracebacks.
# NOTE(review): relies on the third-party `xxhash` package and Keras'
# `hashing_trick`, both imported at the top of this snippet.
def xxh(w):
    """Hash the string *w* to an unsigned 32-bit int via xxHash32.

    Unlike Python's builtin ``hash()`` (which is randomized per run, see the
    comment above), xxHash is stable across interpreter runs, so the encoding
    is reproducible.
    """
    return int(xxhash.xxh32(w.encode()).hexdigest(), 16)


def one_hot(x, n, **kwargs):
    """Encode text *x* into *n* hash buckets using the stable xxHash function.

    Drop-in replacement for Keras' ``one_hot``; extra keyword arguments are
    forwarded to ``hashing_trick``.
    """
    return hashing_trick(x, n, hash_function=xxh, **kwargs)
@twolodzko
twolodzko / data_cleaning.sh
Last active August 22, 2018 09:21
Handy command line data cleaning functions in bash / sed / awk
# Many of the functions are adapted from:
# https://www.polydesmida.info/cookbook/functions.html
nohead ()
{
if [[ $# -eq 0 || "$1" == "-h" ]] ; then
echo "Useage: nohead file [n=1]"
echo "Omit header (n rows) from a file."
@twolodzko
twolodzko / TopKTokenizer-class.py
Last active August 21, 2018 07:09
Tokenize Top K Words
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from six import iteritems
from tqdm import tqdm
from collections import Counter
import re
class TopKTokenizer(object):
@twolodzko
twolodzko / tqdm_function_decorator.py
Last active August 16, 2018 12:55
tqdm Function decorator
from tqdm import tqdm
def tqdm_function_decorator(*args, **kwargs):
"""
Decorate a function by adding a progress bar
Parameters
----------
*args, **kwargs