Kaggle Helper Scripts
import random
import itertools                              # used by plot_confusion_matrix
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt               # used by the plotting helpers
from sklearn import preprocessing, ensemble
from sklearn.metrics import confusion_matrix  # used for the Keras confusion matrix
from scipy.stats import kendalltau
#todo change module name
from tqdm import tqdm
from multiprocessing import Pool
from keras import backend as K                # used by layer_to_visualize
def _apply_df(args):
    df, func, num, kwargs = args
    return num, df.apply(func, **kwargs)

def _apply_series(args):
    df, func, num, kwargs = args
    return num, df.apply(func)

def apply_by_multiprocessing(df, func, **kwargs):
    workers = kwargs.pop('workers')
    chunks = kwargs.pop('chunks')
    is_series = kwargs.pop('is_series')
    with Pool(workers) as p:
        apply_lst = [(d, func, i, kwargs) for i, d in enumerate(np.array_split(df, chunks))]
        if is_series:
            result = list(tqdm(p.imap(_apply_series, apply_lst), total=len(apply_lst)))
        else:
            result = list(tqdm(p.imap(_apply_df, apply_lst), total=len(apply_lst)))
    result = sorted(result, key=lambda x: x[0])
    return pd.concat([i[1] for i in result], sort=False)

def _apply_df_groupby(args):
    group, func, name, kwargs = args
    return name, func(group, **kwargs)

def multiprocessing_groupby(groupby, func, **kwargs):
    workers = kwargs.pop('workers')
    with Pool(workers) as p:
        apply_lst = [(group, func, name, kwargs) for name, group in groupby]
        result = list(tqdm(p.imap(_apply_df_groupby, apply_lst), total=len(apply_lst)))
    result = sorted(result, key=lambda x: x[0])
    return pd.concat([i[1] for i in result], sort=False)
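# Hedged usage sketch (not in the original gist): toy data and worker/chunk
# counts are illustrative only. Call this under `if __name__ == '__main__':`
# since multiprocessing re-imports this module in each worker process.
def _demo_apply_by_multiprocessing():
    toy = pd.DataFrame({'text': ['a', 'bb', 'cccc'] * 1000})
    # Apply len() to each value of a Series across 4 worker processes.
    lengths = apply_by_multiprocessing(toy['text'], len,
                                       workers=4, chunks=8, is_series=True)
    print(lengths.value_counts())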
# From Kaggle's Avito Comp
def reduce_mem_usage(df):
    """Iterate through all the columns of a dataframe and modify the data type
    to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
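# Hedged usage sketch (not in the original gist): a small frame whose int64
# column fits in int8, so the downcast is visible in the printed report.
def _demo_reduce_mem_usage():
    toy = pd.DataFrame({'small_ints': np.arange(100, dtype=np.int64),
                        'floats': np.linspace(0.0, 1.0, 100)})
    toy = reduce_mem_usage(toy)
    print(toy.dtypes)  # expect int8 and float16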
# Load a CSV file at the given path.
def load_dataframe(csv_path):
    return pd.read_csv(csv_path)  # returns a dataframe

# Show basic info about a dataframe.
def show_basic_info(dataframe):
    print("data basic info:")
    print("size: {0}".format(dataframe.size))
    print("columns: {0}".format(dataframe.columns))

# Convert a column to numeric data, optionally dropping values that fail to parse.
def convert_to_numeric(column, drop_missing_data=True):
    numeric = pd.to_numeric(column, errors='coerce')
    if drop_missing_data:
        numeric = numeric.dropna()
    return numeric

# Select the named columns from a pandas dataframe.
def get_columns(dataframe, columns_names_arr):
    return dataframe[columns_names_arr]
# Plot a basic hex graph for a pair of numerical series.
def plot_hex_graph(numerical_data1, numerical_data2):
    with sns.axes_style("white"):
        hex_plt = sns.jointplot(x=numerical_data1, y=numerical_data2,
                                kind="hex", gridsize=24, space=0, color="r")
    print("--> plotting data on hex graph....")
    plt.show()

# numerical_data needs to be a pandas.Series
def plot_histogram(numerical_data):
    print("plotting histogram for {0}".format(numerical_data.name))
    try:
        sns.distplot(numerical_data)  # sns.histplot in seaborn >= 0.11
        plt.show()
    except Exception:
        print("failed to plot histogram")
# Summarize one categorical column (a csv column).
# `column` needs to be a pandas Categorical.
def summarize_categorical_data(data_name, column):
    print("________________________")
    print("{0}:".format(data_name))
    description = column.describe()
    counts_description = description.sort_values('counts', ascending=False)
    print(counts_description)

# Summarize all categorical columns (multiple csv columns).
# `categorical_data` needs to be a dataframe of the categorical columns.
def summarize_all_categorical_data(categorical_data):
    print("--> categorical data summaries:")
    for column_name in categorical_data.columns:
        column_data = categorical_data[column_name]
        converted_to_categorical = pd.Categorical(column_data)
        summarize_categorical_data(column_name, converted_to_categorical)
# Get the names of categorical and numerical data columns.
# Heuristic: a column counts as categorical if its first ten values are all strings.
def divide_data_to_categorical_and_numerical(data):
    numerical_column_names = []
    categorical_column_names = []
    for column_name in data.columns:
        column = data[column_name]  # column type is pandas.core.series.Series
        few = 10
        few_elements = column.values[:few]  # first ten elements
        is_categorical = [type(x) for x in few_elements].count(str) == few
        #print("column: {0}. Categorical {1}".format(column_name, is_categorical))
        if is_categorical:
            categorical_column_names.append(column_name)
        else:
            numerical_column_names.append(column_name)
    return categorical_column_names, numerical_column_names
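# Hedged usage sketch (not in the original gist): split a toy frame into
# categorical and numerical columns, then summarize the categorical ones
# with the helpers above.
def _demo_column_split():
    toy = pd.DataFrame({'city': ['Oslo', 'Lima', 'Oslo', 'Pune'] * 5,
                        'price': np.random.rand(20)})
    cat_names, num_names = divide_data_to_categorical_and_numerical(toy)
    print("categorical:", cat_names, "numerical:", num_names)
    summarize_all_categorical_data(get_columns(toy, cat_names))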
# Create bivariate distributions (pair plot) for numerical data.
def plot_multiple_bivariate_distributions_grid(dataframe):
    sns.pairplot(dataframe)
#########################################################################################
# Keras visuals: plot the per-channel activations of each conv layer.
# Assumes `model` is a trained Keras CNN and `activations` holds each layer's
# output for a single input image (e.g. from a Model whose outputs are the
# intermediate layer outputs).
layer_names = []
for layer in model.layers[:-1]:
    layer_names.append(layer.name)
images_per_row = 16
for layer_name, layer_activation in zip(layer_names, activations):
    if layer_name.startswith('conv'):
        n_features = layer_activation.shape[-1]  # number of channels
        size = layer_activation.shape[1]         # feature-map side length
        n_cols = n_features // images_per_row
        display_grid = np.zeros((size * n_cols, images_per_row * size))
        for col in range(n_cols):
            for row in range(images_per_row):
                channel_image = layer_activation[0, :, :, col * images_per_row + row]
                # Normalize the channel to a displayable 0-255 range.
                channel_image -= channel_image.mean()
                channel_image /= channel_image.std()
                channel_image *= 64
                channel_image += 128
                channel_image = np.clip(channel_image, 0, 255).astype('uint8')
                display_grid[col * size : (col + 1) * size,
                             row * size : (row + 1) * size] = channel_image
        scale = 1. / size
        plt.figure(figsize=(scale * display_grid.shape[1],
                            scale * display_grid.shape[0]))
        plt.title(layer_name)
        plt.grid(False)
        plt.imshow(display_grid, aspect='auto', cmap='viridis')
########################################################################################
## https://www.kaggle.com/adityaecdrid/mnist-with-keras-for-beginners-99457/
# Look at the confusion matrix
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Predict the values from the validation dataset
# (assumes `model`, `X_val`, `Y_val` from a trained Keras classifier with one-hot labels)
Y_pred = model.predict(X_val)
# Convert prediction probabilities to class indices
Y_pred_classes = np.argmax(Y_pred, axis=1)
# Convert one-hot validation labels to class indices
Y_true = np.argmax(Y_val, axis=1)
# Compute the confusion matrix
confusion_mtx = confusion_matrix(Y_true, Y_pred_classes)
# Plot the confusion matrix
plot_confusion_matrix(confusion_mtx, classes=range(10))
########################################################################################################################
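# Hedged sketch (not in the original gist): the History object `h` used below
# typically comes from model.fit. `X_train`/`Y_train` are hypothetical names
# and the epoch/batch settings are illustrative only.
h = model.fit(X_train, Y_train,
              validation_data=(X_val, Y_val),
              epochs=30, batch_size=64)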
# Training-history curves. In newer Keras the history keys are
# 'accuracy'/'val_accuracy' rather than 'acc'/'val_acc'.
print(h.history.keys())
# "Accuracy"
plt.figure()
plt.plot(h.history['acc'])
plt.plot(h.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()
# "Loss"
plt.figure()
plt.plot(h.history['loss'])
plt.plot(h.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()
# "Learning Rate" (present only if a learning-rate callback logged it)
plt.figure()
plt.plot(h.history['lr'])
plt.title('Learning Rate')
plt.show()
###########################################################################################################
# Errors are the difference between predicted labels and true labels
errors = (Y_pred_classes - Y_true != 0)
Y_pred_classes_errors = Y_pred_classes[errors]
Y_pred_errors = Y_pred[errors]
Y_true_errors = Y_true[errors]
X_val_errors = X_val[errors]

def display_errors(errors_index, img_errors, pred_errors, obs_errors):
    """This function shows 6 images with their predicted and real labels."""
    n = 0
    nrows = 2
    ncols = 3
    fig, ax = plt.subplots(nrows, ncols, sharex=True, sharey=True)
    for row in range(nrows):
        for col in range(ncols):
            error = errors_index[n]
            ax[row, col].imshow((img_errors[error]).reshape((28, 28)))
            ax[row, col].set_title("Predicted label :{}\nTrue label :{}".format(
                pred_errors[error], obs_errors[error]))
            n += 1

# Probabilities of the wrongly predicted numbers
Y_pred_errors_prob = np.max(Y_pred_errors, axis=1)
# Predicted probabilities of the true values in the error set
true_prob_errors = np.diagonal(np.take(Y_pred_errors, Y_true_errors, axis=1))
# Difference between the probability of the predicted label and the true label
delta_pred_true_errors = Y_pred_errors_prob - true_prob_errors
# Sorted list of the delta prob errors
sorted_delta_errors = np.argsort(delta_pred_true_errors)
# Top 6 errors
most_important_errors = sorted_delta_errors[-6:]
# Show the top 6 errors
display_errors(most_important_errors, X_val_errors, Y_pred_classes_errors, Y_true_errors)
############################################################################################################
'''
K.function creates Theano/TensorFlow tensor functions, which are later used
to get the output from the symbolic graph given the input.
K.learning_phase() is required as an input because many Keras layers, such as
Dropout and BatchNormalization, depend on it to change behavior between
training and test time.
Adapted from the Keras
[docs](https://keras.io/getting-started/faq/#how-can-i-obtain-the-output-of-an-intermediate-layer)
and this [answer](https://stackoverflow.com/questions/41711190/keras-how-to-get-the-output-of-each-layer)
on Stack Overflow, wrapped up as a function.
'''
# Assumes the globals `model` (a trained Keras CNN) and `img_to_visualize`
# (a single preprocessed input image batch) are defined.
def layer_to_visualize(layer):
    inputs = [K.learning_phase()] + model.inputs
    _convout1_f = K.function(inputs, [layer.output])

    def convout1_f(X):
        # The [0] disables the training-phase flag
        return _convout1_f([0] + [X])

    convolutions = convout1_f(img_to_visualize)
    convolutions = np.squeeze(convolutions)
    print('Shape of conv:', convolutions.shape)
    n = convolutions.shape[0]
    n = int(np.ceil(np.sqrt(n)))
    # Visualization of each filter of the layer
    fig = plt.figure(figsize=(12, 12))
    for i in range(len(convolutions)):
        ax = fig.add_subplot(n, n, i + 1)
        ax.imshow(convolutions[i], cmap='gray')

# Specify the layer you want to visualize (convo1/convo2 are assumed to be
# the model's Conv2D and MaxPool2D layer objects, respectively).
layer_to_visualize(convo1)
# As convo2 is the result of a MaxPool2D layer,
# we can see that the image has blurred since
# the resolution has been reduced.
layer_to_visualize(convo2)
#################################################################################