Last active
November 4, 2020 21:39
-
-
Save vincefav/3eedfc562de6c77c88708016c0ac58d0 to your computer and use it in GitHub Desktop.
Common data science tasks
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def clean_column_names(cols):
    """
    Normalize a collection of column names.

    Each name is stripped of surrounding whitespace, lower-cased, has all
    punctuation except underscores removed, and has its remaining spaces
    replaced by underscores.

    Sample usage:
        data.columns = clean_column_names(data.columns)

    Parameters:
        cols: a pandas Index or Series of strings (anything exposing ``.str``).

    Returns:
        An object of the same kind as ``cols`` holding the cleaned names.
    """
    import re
    from string import punctuation

    # One character class matching every punctuation mark except "_".
    # re.escape keeps characters such as ".", "(", "[" and "\" literal, and
    # regex=True is passed explicitly because the default of `regex` has
    # changed between pandas versions.
    removable = '[' + re.escape(punctuation.replace('_', '')) + ']'

    cols = cols.str.strip().str.lower()
    cols = cols.str.replace(removable, '', regex=True)
    return cols.str.replace(' ', '_', regex=False)
# Note: importing libraries inside a function is unusual, but not necessarily bad.
# It is done here so that each function can be copy-pasted on its own.
def absolute_correlations(col, df=None):
    """
    Correlate every numeric column of ``df`` with ``df[col]`` and sort the
    result by absolute value, so the strongest relationships (positive or
    negative) appear first.

    Parameters:
        col: label of the column to correlate against.
        df: DataFrame to analyse. When omitted, the module-level ``data``
            variable is used.

    Returns:
        A one-column DataFrame named ``correlation``, indexed by column name,
        ordered from strongest to weakest. The row for ``col`` itself is
        left out.
    """
    if df is None:
        # Look the global up at call time: a `df=data` default would be
        # evaluated once when the function is defined, which fails if `data`
        # does not exist yet and silently goes stale if it is rebound later.
        df = data

    numeric_cols = df.select_dtypes(include=[np.number])
    corrs = numeric_cols.corrwith(df[col]).to_frame(name='correlation')

    # Remove the self-correlation by label. Chopping off the top row after
    # sorting removes the wrong row when `col` is not numeric, is constant
    # (NaN correlation), or ties with another column at |r| == 1.
    corrs = corrs.drop(index=col, errors='ignore')

    corrs['absol'] = corrs['correlation'].abs()
    return corrs.sort_values('absol', ascending=False).drop('absol', axis=1)
def cronbach_alpha(df):
    """
    Compute Cronbach's alpha for a DataFrame of related test items.

    Each column is treated as one item and each row as one set of answers.
    The result is k / (k - 1) * (1 - sum of item variances / variance of the
    row totals), using sample variances (ddof=1).
    """
    k = df.shape[1]
    sum_of_item_variances = df.var(axis=0, ddof=1).sum()
    total_score_variance = df.sum(axis=1).var(ddof=1)
    variance_ratio = sum_of_item_variances / total_score_variance
    return (k / (k - 1)) * (1 - variance_ratio)
# Monkey-patch: gives every DataFrame a .numeric() shortcut for selecting its numeric columns
def numeric(self):
    """Return only the numeric columns of this DataFrame."""
    wanted_dtypes = [np.number]
    return self.select_dtypes(include=wanted_dtypes)


setattr(pd.DataFrame, 'numeric', numeric)
# Monkey-patch: DataFrames and Series both gain .zscore() and .normalize() methods
def zscore(self):
    """Standardize the values: subtract the mean, then divide by the standard deviation."""
    centered = self - self.mean()
    return centered / self.std()


def normalize(self):
    """Min-max scale the values: (value - min) / (max - min)."""
    lowest = self.min()
    value_range = self.max() - lowest
    return (self - lowest) / value_range


pd.DataFrame.zscore = pd.Series.zscore = zscore
pd.DataFrame.normalize = pd.Series.normalize = normalize
def correlation_matrix(df, figsize=(15, 7)):
    """
    Draw an annotated heatmap of the pairwise correlations between the
    numeric columns of ``df``, showing each pair of columns exactly once.

    Parameters:
        df: DataFrame whose numeric columns are correlated.
        figsize: (width, height) passed to ``plt.figure``.

    Returns:
        None. The heatmap is drawn on a newly created matplotlib figure.
    """
    # Restrict to numeric columns explicitly; newer pandas versions raise
    # from DataFrame.corr() when non-numeric columns are present.
    corr = df.select_dtypes(include=[np.number]).corr()

    # Drop the first row and the LAST column. In the remaining square the
    # lower triangle (diagonal included) holds every distinct pair once and
    # no self-correlation. Dropping the first column instead would hide every
    # correlation involving the first variable and leave an empty top row.
    lower = corr.iloc[1:, :-1]

    # Hide the cells strictly above the diagonal, which are the duplicates.
    # Plain `bool` is used because the `np.bool` alias no longer exists in
    # current NumPy.
    mask = np.zeros(lower.shape, dtype=bool)
    mask[np.triu_indices_from(mask, k=1)] = True

    # Resize and display
    plt.figure(figsize=figsize)
    sns.heatmap(lower, annot=True, fmt='.2f', center=0, mask=mask, cmap='seismic_r')
def tts(x_data=None, y_data=None, test_size=.2):
    """
    NOT recommended, but quickly splits your training and testing data.

    The split is stored in the module-level globals ``xtrain``, ``xtest``,
    ``ytrain`` and ``ytest``; nothing is returned.

    Parameters:
        x_data: features to split. When omitted, the global ``x`` is used.
        y_data: targets to split. When omitted, the global ``y`` is used.
        test_size: forwarded to ``train_test_split`` as ``test_size``.
    """
    from sklearn.model_selection import train_test_split
    global xtrain, xtest, ytrain, ytest

    # Resolve the globals at call time. Defaults written as `x_data=x` are
    # evaluated once at definition time, which fails if `x`/`y` do not exist
    # yet and keeps using the old objects after they are rebound.
    if x_data is None:
        x_data = x
    if y_data is None:
        y_data = y

    xtrain, xtest, ytrain, ytest = train_test_split(x_data, y_data, test_size=test_size)
from scipy.spatial.distance import cosine | |
def cosine_similarity(a, b):
    """Return the cosine similarity of ``a`` and ``b``, i.e. 1 minus their cosine distance."""
    distance = cosine(a, b)
    return 1 - distance
from nltk import pos_tag | |
from string import punctuation | |
from nltk.tokenize import word_tokenize | |
from nltk.corpus import stopwords | |
from nltk.stem.wordnet import WordNetLemmatizer | |
from nltk.stem import SnowballStemmer, PorterStemmer | |
def prepare_text(text, aggressiveness=2):
    """
    Normalize, tokenize, lemmatize and optionally stem the input text.

    Steps: lower-case the text, replace every punctuation character with a
    space, tokenize, drop English stopwords, lemmatize each word under each
    of the WordNet part-of-speech tags in 'rasvn', then stem.

    Parameters:
        text: the string to process.
        aggressiveness: selects the stemmer.
            <= 0  no stemming
            1     PorterStemmer
            2     SnowballStemmer('english')
            > 2   LancasterStemmer
            Fractional values use the next level up (e.g. 1.5 -> Snowball).

    Returns:
        A list of processed word strings.
    """
    # Imported here because the file-level nltk imports do not bring
    # LancasterStemmer into scope.
    from nltk.stem import LancasterStemmer

    # Swap every punctuation mark for a space in a single pass.
    text = text.lower().translate(str.maketrans(punctuation, ' ' * len(punctuation)))

    # Load the stopword list once, as a set, instead of once per word.
    stop_words = set(stopwords.words('english'))
    words = [w for w in word_tokenize(text) if w not in stop_words]

    lemmatizer = WordNetLemmatizer()
    for pos in 'rasvn':
        try:
            words = [lemmatizer.lemmatize(w, pos=pos) for w in words]
        except LookupError:
            # Best effort: skip this tag when the WordNet data is unavailable
            # or the tag is not recognised, leaving the words as they are.
            continue

    if aggressiveness > 0:
        # Range checks, so every positive value selects some stemmer.
        if aggressiveness > 2:
            stemmer = LancasterStemmer()
        elif aggressiveness > 1:
            stemmer = SnowballStemmer('english')
        else:
            stemmer = PorterStemmer()
        words = [stemmer.stem(w) for w in words]
    return words
def sort_dict(user_dict, ascending=False):
    """
    Return the dictionary's (key, value) pairs as a list ordered by value.

    The largest values come first unless ``ascending`` is true.
    """
    pairs = list(user_dict.items())
    pairs.sort(key=lambda pair: pair[1], reverse=not ascending)
    return pairs
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment