Skip to content

Instantly share code, notes, and snippets.

@amrakm
amrakm / Check if two numpy arrays are similar.py
Last active March 16, 2021 19:17
Check if two numpy arrays are similar - useful for confirming ML implementations
## source: http://nbviewer.jupyter.org/github/rasbt/algorithms_in_ipython_notebooks/blob/master/ipython_nbs/statistics/linregr_least_squares_fit.ipynb#Sections
import numpy as np
# Raises AssertionError unless the two arrays agree to about 5 decimal places.
# NOTE(review): arr_1 / arr_2 are not defined in this snippet — presumably
# placeholders for the two arrays being compared.
np.testing.assert_almost_equal(arr_1, arr_2, decimal=5)
# source: http://nbviewer.jupyter.org/github/rasbt/algorithms_in_ipython_notebooks/blob/master/ipython_nbs/statistics/linregr_least_squares_fit.ipynb#Sections
# source_2: https://github.com/rasbt/data-science-tutorial/blob/master/code/linear-reqression-leastsquares.ipynb
import numpy as np
def matrix_lstsqr(x, y):
    """Compute the least-squares straight-line fit ``y ~ slope * x + intercept``.

    Args:
        x: 1-D sequence of sample positions.
        y: Observed values, one per element of ``x``.

    Returns:
        Array ``[slope, intercept]`` minimising the sum of squared residuals.
        When the design matrix is rank-deficient (e.g. all ``x`` equal) the
        minimum-norm solution is returned instead of raising ``LinAlgError``.
    """
    # Design matrix: one column holding x, one column of ones for the intercept.
    X = np.vstack([x, np.ones(len(x))]).T
    # Use the SVD-based solver instead of forming inv(X.T @ X) explicitly:
    # the explicit inverse squares the condition number and fails outright
    # on singular input.
    return np.linalg.lstsq(X, y, rcond=None)[0]
@amrakm
amrakm / Normalised histogram
Created October 26, 2018 21:26
Normalised histogram that can be applied to Pandas plots directly
def normalised_hist(x):
    """Return a 10-bin histogram of ``x`` expressed as fractions of its length.

    Args:
        x: pandas Series of numeric values; nulls are ignored when binning.

    Returns:
        A pandas Series of per-bin fractions, indexed by
        ``(left_edge, right_edge)`` pairs with the edges truncated to int.
        Counts are divided by the full length of ``x`` (nulls included), so
        the fractions sum to the non-null share of the input.

    Raises:
        ValueError: if ``x`` contains no non-null values.
    """
    values = x.dropna()
    if values.empty:
        raise ValueError("normalised_hist() needs at least one non-null value")
    # Take the bounds from the cleaned values: builtin min()/max() over the raw
    # Series can return NaN (e.g. when the first element is NaN), which makes
    # np.histogram reject the range.
    counts, ranges = np.histogram(values, range=(values.min(), values.max()))
    normalised_counts = counts * (1.0 / x.shape[0])
    # Integer-truncated edges are only used as readable bin labels.
    ranges = ranges.astype(int)
    return pd.Series(normalised_counts, zip(ranges[:-1], ranges[1:]))
# Keyword arguments passed to DataFrame.plot.bar below.
kw = dict(stacked=True, width=1, rot=45)
# Histogram the 'numerical_att' column for rows in the flag == True group.
# (The original had a stray ')' inside the column selector — a syntax error.)
(
    df.groupby('flag')
    .get_group(True)[['numerical_att']]
    .apply(normalised_hist)
    .unstack(0)
    .plot.bar(**kw)
)
@amrakm
amrakm / Improve Pandas Memory Efficiency
Created October 27, 2018 11:02
Automatically downcast numerical columns to the smallest dtype that can hold their values
## source: https://www.kaggle.com/jeru666/did-you-think-of-these-features
# Downcast the integer columns of `df` in place: a column becomes np.int8 or
# np.int16 when all of its values fit in that type's range.
# NOTE(review): this excerpt is cut off — the body of the final `elif`
# (the 32-bit range check) and anything after it are not visible here.
def change_datatype(df):
# Only columns matched by select_dtypes(include=['int']) are considered.
int_cols = list(df.select_dtypes(include=['int']).columns)
for col in int_cols:
# The closing parenthesis sits after the comparison, so np.min() is taken over
# the boolean Series (df[col] >= bound); it is truthy only when every value
# meets the lower bound. The same pattern repeats in each branch below.
if ((np.max(df[col]) <= 127) and(np.min(df[col] >= -128))):
df[col] = df[col].astype(np.int8)
elif ((np.max(df[col]) <= 32767) and(np.min(df[col] >= -32768))):
df[col] = df[col].astype(np.int16)
elif ((np.max(df[col]) <= 2147483647) and(np.min(df[col] >= -2147483648))):
# Bucket package prices into labelled bands. pd.cut bins are right-closed by
# default, i.e. (0, 50], (50, 100], (100, 160]; values outside become NaN.
ranges = [0, 50, 100, 160]
ranges_label = ['cheap', "average", "expensive"]
df['price_cat'] = pd.cut(df.current_package_price, ranges, labels=ranges_label)

# Another example
# Bucketing age groups
ranges = [18, 25, 29, 34, 50, 70, 2020]
ranges_label = ['18-25', '26-29', "30-34", "35-50", '51-70', 'unknown']
# include_lowest=True closes the first bin on the left so that age 18 falls
# into '18-25' as the label promises, instead of becoming NaN.
df['age_group'] = pd.cut(df['age'], ranges, labels=ranges_label, include_lowest=True)
## pip install gender-guesser
import gender_guesser.detector as gender
def get_gender(x):
    """Guess the gender associated with a first name via ``gender_guesser``.

    Args:
        x: Name string; its case is normalised (``"JOHN"`` -> ``"John"``)
            before the lookup.

    Returns:
        Whatever ``gender.Detector().get_gender`` returns for the normalised
        name.
    """
    # Build the detector once and reuse it: constructing a Detector on every
    # call is wasteful when this function is applied across a whole column.
    detector = getattr(get_gender, "_detector", None)
    if detector is None:
        detector = get_gender._detector = gender.Detector()
    return detector.get_gender(x.lower().capitalize())
@amrakm
amrakm / gist:be4c89555ef5318aa6827905eb887d3f
Created January 15, 2019 14:30
Dynamically update plots in Jupyter lab
# source: https://stackoverflow.com/a/52672859/5554394
from IPython.display import clear_output
from matplotlib import pyplot as plt
import collections
%matplotlib inline
# Clears the current cell output and opens a new matplotlib figure of size
# `figsize`.
# NOTE(review): this excerpt is cut off after the figure is created — how
# `data_dict` and `title` are used is not visible here.
def live_plot(data_dict, figsize=(7,5), title=''):
# wait=True defers the clear until new output is ready to replace the old one.
clear_output(wait=True)
plt.figure(figsize=figsize)
@amrakm
amrakm / camel_case_to_snake_case.py
Created April 3, 2019 11:06 — forked from jaytaylor/camel_case_to_snake_case.py
Convert camel-case to snake-case in python.
#!/usr/bin/env python
"""
Convert camel-case to snake-case in python.
e.g.: CamelCase -> snake_case
Relevant StackOverflow question: http://stackoverflow.com/a/1176023/293064
"""
import warnings
# Silence all warnings from this point on; both calls install a blanket
# 'ignore' filter.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
import matplotlib as mpl
import matplotlib.pyplot as plt
large = 22; med = 16; small = 12
params = {'axes.titlesize': large,
'legend.fontsize': med,
'figure.figsize': (16, 10),
'axes.labelsize': med,
'axes.titlesize': med,
'xtick.labelsize': med,