Last active
September 2, 2022 16:10
-
-
Save amrakm/afba80ae7442ee342cfc00bcf8fc3729 to your computer and use it in GitHub Desktop.
cluster images using CLIP embeddings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# https://github.com/MaartenGr/Concept
# !pip install concept umap-learn matplotlib
import glob
import os

import hdbscan
import pandas as pd
import umap
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from concept import ConceptModel
# Folder of images to cluster; every file in it is treated as an image.
imgs_folder_path = './imgs'
imgs_list = list(glob.glob(f'{imgs_folder_path}/*'))

# List of keywords used only for labelling clusters (doesn't affect clustering results).
# NOTE: fixed typo 'stree' -> 'street'.
docs = ['sea', 'mountains', 'nature', 'tree', 'street']
# Dimensionality-reduction and clustering backends handed to ConceptModel.
umap_model = umap.UMAP(n_neighbors=30, min_dist=0.01, n_components=5)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=100, min_samples=5)

# Uni/bigrams with English stop words removed; used only to label clusters.
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

concept_model = ConceptModel(
    min_concept_size=100,
    diversity=0.3,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    embedding_model='clip-ViT-B-32',
    vectorizer_model=vectorizer_model,
    ctfidf=False,
)
# Embed the images with CLIP, cluster them, and label clusters from `docs`.
concepts = concept_model.fit_transform(imgs_list, docs=docs)

topics = concept_model.topics
# If no topics were produced, fall back to the raw cluster labels,
# discounting one for the HDBSCAN noise label (-1).
n_topics = len(topics) if topics else len(concept_model.cluster_labels) - 1
print(f'n_topics={n_topics}')

## visualise up to 10 concepts
viz_object = concept_model.visualize_concepts(concepts=list(range(min(n_topics, 10))))

## visualise all concepts, scaling the figure height with the concept count
viz_object = concept_model.visualize_concepts(concepts=list(range(n_topics)),
                                              figsize=(20, int(n_topics * 1.2)))
viz_object.savefig(f'concepts_n_{len(imgs_list)}_imgs.pdf')
#### extensions ######
# Save per-image cluster assignments, then per-cluster stats and embeddings.
# (The original reindex/column re-selection was a no-op: the frame is built
# with exactly ['filename', 'concept_id'] in that order already.)
results_df = pd.DataFrame({'filename': imgs_list,
                           'concept_id': concepts})
results_df.to_csv('./img_clustering_results.csv', index=False)

# Cluster sizes (largest first), keyword labels, and cluster embeddings.
results_stats_df = (results_df.groupby('concept_id')
                    .size()
                    .sort_values(ascending=False)
                    .to_frame('size')
                    .reset_index())
results_stats_df['cluster_kw'] = results_stats_df.concept_id.map(concept_model.topics)
cluster_embeddings_dict = dict(enumerate(concept_model.cluster_embeddings))
results_stats_df['emb'] = results_stats_df.concept_id.map(cluster_embeddings_dict)
results_stats_df.to_csv('img_clustering_stats.csv', index=False)
## Alternative labelling vocabulary: a random sample of English nouns.
import random
import nltk
nltk.download("wordnet")
from nltk.corpus import wordnet as wn

# Keep single-word nouns only — multi-word WordNet lemmas use underscores.
all_nouns = [word
             for synset in wn.all_synsets('n')
             for word in synset.lemma_names()
             if "_" not in word]
docs = random.sample(all_nouns, 50_000)
## Save a collage of sample images for each cluster, one PNG per cluster.
clusters_saving_folder = './clusters_samples'
os.makedirs(clusters_saving_folder, exist_ok=True)
for cluster_id, collage in concept_model.cluster_images.items():
    # File name comes from the cluster's comma-separated keyword label.
    label = topics[cluster_id].replace(', ', '-')
    collage.save(os.path.join(clusters_saving_folder, f'{label}.png'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment