Created
February 3, 2023 22:16
-
-
Save fdovila/2ec1490e824b2ca6867dc14102153b9f to your computer and use it in GitHub Desktop.
A python script to assign clusters to entries (rows) based on the text contained in columns
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A Python script to assign clusters to entries (rows) based on the text contained in columns
# by F.B. Avila-Rencoret, MD, 2023
import re

import openpyxl
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
# Non-greedy match of anything that looks like an HTML/XML tag, e.g. "<b>" or "</p>".
_TAG_PATTERN = re.compile(r'<.*?>')


def clean_column(col):
    """Return a copy of *col* with HTML-like tags replaced by a single space.

    Only string cells are rewritten. Numbers, dates and missing values are
    passed through unchanged, so the function is safe to use with
    ``DataFrame.apply`` on frames that mix text and non-text columns.

    Args:
        col: A pandas Series (one DataFrame column).

    Returns:
        A new Series with the same index; ``col`` itself is not modified.
    """
    # An explicit regex is used instead of ``col.str.replace(pattern, ' ')``:
    # since pandas 2.0 that call treats the pattern as literal text unless
    # ``regex=True`` is given, and the ``.str`` accessor raises on non-text
    # columns.
    return col.map(
        lambda value: _TAG_PATTERN.sub(' ', value) if isinstance(value, str) else value
    )
# Load data from an Excel file and specify the sheet
entries = pd.read_excel('source.xlsx', header=0, sheet_name='Sheet1', engine='openpyxl')
# DataFrame.apply returns a new frame instead of modifying in place, so the
# cleaned result has to be assigned back or the cleaning is silently discarded.
entries = entries.apply(clean_column)
# Specify the column positions to combine
column_positions = [0, 1, 2]
# Combine the specified columns into a single string per row. Missing cells are
# skipped so they do not inject a literal "nan" token into the TF-IDF features.
entries['combined'] = entries.iloc[:, column_positions].apply(
    lambda row: ' '.join(str(value) for value in row if pd.notna(value)),
    axis=1,
)
# Transform text into numerical features using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(entries['combined'])
# Clustering using KMeans: set n_clusters to the expected number of clusters.
# random_state is fixed so the cluster labels are reproducible between runs;
# n_init is explicit because its default differs across scikit-learn versions.
model = KMeans(n_clusters=5, n_init=10, random_state=0)
model.fit(X)
# Assign the clusters back to the entries dataframe
entries['cluster'] = model.labels_
# Check the resulting clusters (number of rows per cluster)
print(entries.groupby('cluster').size())
# Save the results to the Excel file
entries.to_excel('output.xlsx', index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment