Created
April 8, 2018 02:29
-
-
Save ravila4/1bdbf8c636841ee3164d6661b085da0b to your computer and use it in GitHub Desktop.
Snippet to calculate k-means
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.cluster import KMeans\n", | |
"from sklearn import metrics\n", | |
"from sklearn.metrics import pairwise_distances\n", | |
"from sklearn.preprocessing import StandardScaler\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"np.set_printoptions(threshold=np.nan)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df = pd.read_csv(\"all_ligand_descriptors.csv\")\n", | |
"features = df.iloc[:, 2:]\n", | |
"\n", | |
"scaler = StandardScaler()\n", | |
"scaler.fit(features)\n", | |
"features_scaled = scaler.transform(features)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def calc_kmeans(n_clusters, features):\n", | |
" kmeans = KMeans(n_clusters, random_state=13, n_jobs=-1)\n", | |
" kmeans.fit(features)\n", | |
" cluster_assignments = kmeans.labels_\n", | |
" cluster_centers = kmeans.cluster_centers_\n", | |
" return cluster_assignments, cluster_centers" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"assignments, centers = calc_kmeans(100, features_scaled,)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"17207.945644503612" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Calculate Calinski-Harabasz score (larger is better)\n", | |
"metrics.calinski_harabaz_score(features_scaled, assignments)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 63, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"732.31562691166084" | |
] | |
}, | |
"execution_count": 63, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"def sum_squares(n_clusters, cluster_assignments):\n", | |
" \"\"\"Calculates the sum of squared errors.\"\"\"\n", | |
" square_sums = np.zeros(n_clusters)\n", | |
" for i in range(n_clusters):\n", | |
" indexes = np.where(cluster_assignments == i)[0]\n", | |
" # Squared errors\n", | |
" s = np.square(np.subtract(features_scaled[indexes], centers[i]))\n", | |
" # Sum of squared errors\n", | |
" square_sums[i] = np.sum(s)\n", | |
" return np.average(square_sums)\n", | |
"\n", | |
"sum_squares(100, assignments)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment