Created
March 17, 2021 18:54
-
-
Save BioSciEconomist/6a1b642a5250461fe6bc4a75c1dc8efd to your computer and use it in GitHub Desktop.
Example k-means clustering
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# *-----------------------------------------------------------------
# | PROGRAM NAME: ex kmeans.py
# | DATE: 3/17/21
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: example code based on: https://realpython.com/k-means-clustering-python/
# *----------------------------------------------------------------
# pandas provides read_csv and the DataFrame operations used throughout this script.
import pandas as pd

# Load german_reunification.csv and remove the `code` column.
df = pd.read_csv('/Users/mattbogard/Google Drive/Python Scripts/german_reunification.csv')
df = df.drop(columns="code")

# Quick look at the raw data. Printed explicitly so the output is also visible
# when the file is run as a script rather than line-by-line in a console.
print(df.head())
print(df.tail())
print(df.columns)
print(df.describe())

# reduce to just numerics
df1 = df[["gdp", "infrate", "trade", "year"]]

# drop NAs
df1 = df1.dropna(axis=0, how='any')
print(df1.isnull().sum())  # total missing per column (all zeros after dropna)
print(df1.shape)

# get only latest year for clustering (i.e. 1992)
# .copy() makes df1 an independent frame, so adding columns to it later does
# not raise pandas' SettingWithCopyWarning.
df1 = df1[df1.year == 1992].copy()
print(df1.head())
# Import KMeans
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Cluster on the economic indicators only. `year` is constant after the 1992
# filter, so it standardizes to a column of zeros and adds nothing to the
# distances. Naming the columns explicitly also keeps any column added to df1
# later (e.g. a cluster ID) out of the features if this section is re-run.
feature_cols = ["gdp", "infrate", "trade"]

# Standardize each feature to zero mean and unit variance so that no single
# variable dominates the distances k-means works with.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df1[feature_cols])
#-----------------------------
# find optimal number of clusters
#-----------------------------
import matplotlib.pyplot as plt

# Settings shared by every candidate model in the search.
kmeans_kwargs = {
    "init": "random",
    "n_init": 10,
    "max_iter": 300,
    "random_state": 40,
}

# KMeans cannot fit more clusters than there are observations, so cap the
# search at the sample count (still at most 10 candidates).
max_k = min(10, len(scaled_features))
k_values = range(1, max_k + 1)

# Elbow method: record the within-cluster sum of squared errors (inertia_)
# for each k and look for the point where adding clusters stops helping much.
sse = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(scaled_features)
    sse.append(kmeans.inertia_)

# Plot SSE against the number of clusters.
plt.style.use("fivethirtyeight")
plt.plot(k_values, sse)
plt.xticks(k_values)
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()
# looks like 5 clusters at most
kmeans = KMeans(init="random", n_clusters=5, n_init=10, max_iter=300, random_state=42)
kmeans.fit(scaled_features)

# Per-row cluster IDs. fit() returns the estimator itself, so the labels are
# read from the fitted model's labels_ attribute.
labels = kmeans.labels_

# add cluster ID to data frame
# assign() returns a new frame, which avoids SettingWithCopyWarning on the
# year-filtered df1. labels has one entry per row of scaled_features, in row order.
df1 = df1.assign(cluster=labels)
print(df1.head())
print(df1.describe())
print(df1.shape)

# descriptives
# number of observations in each cluster
print(df1.groupby(['cluster']).size().reset_index(name='count'))
# mean of infrate within each cluster
print(df1.groupby('cluster')['infrate'].mean())
# NOTE(review): `points` and `new_points` were never defined in this script, so
# these lines raised NameError as written. The standardized features are reused
# here so the example runs end to end; replace them with real training and
# held-out data (scaled with the same scaler) as appropriate.
points = scaled_features
new_points = scaled_features

# Create a KMeans instance with 3 clusters: model
model = KMeans(n_clusters=3)

# Fit model to points
model.fit(points)

# Determine the cluster labels of new_points: labels
labels = model.predict(new_points)

# Print cluster labels of new_points
print(labels)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment