Skip to content

Instantly share code, notes, and snippets.

@amrakm
Last active September 2, 2022 16:10
Show Gist options
  • Save amrakm/afba80ae7442ee342cfc00bcf8fc3729 to your computer and use it in GitHub Desktop.
cluster images using CLIP embeddings
# https://github.com/MaartenGr/Concept
# !pip install concept umap-learn matplotlib
import glob
import os

import hdbscan
import pandas as pd
import umap
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from concept import ConceptModel
# Folder containing the images to cluster; every file inside it is used.
imgs_folder_path = './imgs'
imgs_list = list(glob.glob(f'{imgs_folder_path}/*'))

# Keywords used only to label the resulting clusters
# (they do not affect the clustering itself).
# Fixed typo: 'stree' -> 'street'.
docs = ['sea', 'mountains', 'nature', 'tree', 'street']
# Sub-models handed to ConceptModel: keyword vectorisation, dimensionality
# reduction, and density-based clustering.
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
umap_model = umap.UMAP(n_neighbors=30, min_dist=0.01, n_components=5)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=100, min_samples=5)

# CLIP-based concept model: embeds the images, clusters the embeddings,
# and (because `docs` is passed) labels each cluster with keywords.
concept_model = ConceptModel(
    min_concept_size=100,
    diversity=0.3,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    embedding_model='clip-ViT-B-32',
    vectorizer_model=vectorizer_model,
    ctfidf=False,
)

# Fit on the image paths; returns one concept id per image.
concepts = concept_model.fit_transform(imgs_list, docs=docs)
# Mapping of concept id -> keyword label produced by the fit above.
topics = concept_model.topics

# Prefer the labelled topics for the concept count; otherwise fall back to
# the raw cluster labels, dropping one (presumably the -1 outlier label —
# TODO confirm against the Concept library).
if not topics:
    n_topics = len(concept_model.cluster_labels) - 1
else:
    n_topics = len(topics)

print(f'n_topics={n_topics}')
## visualise the first 10 concepts
first_ten_ids = list(range(n_topics))[:10]
viz_object = concept_model.visualize_concepts(concepts=first_ten_ids)

## visualise all concepts, scaling the figure height with the concept count
all_concept_ids = list(range(n_topics))
fig_height = int(n_topics * 1.2)
viz_object = concept_model.visualize_concepts(concepts=all_concept_ids,
                                              figsize=(20, fig_height))
viz_object.savefig(f'concepts_n_{len(imgs_list)}_imgs.pdf')
#### extensions ######
# Persist per-image cluster assignments and per-cluster statistics.
# (Requires `pandas as pd`, added to the top-of-file imports.)

# One row per image: which concept/cluster it was assigned to.
# The frame is built with exactly these two columns, so the original
# reindex(...).copy() and re-selection before to_csv were redundant.
results_df = pd.DataFrame({'filename': imgs_list,
                           'concept_id': concepts})
results_df.to_csv('./img_clustering_results.csv', index=False)

# Cluster-level stats: size (descending), keyword label, centroid embedding.
results_stats_df = (results_df.groupby('concept_id')
                    .size()
                    .sort_values(ascending=False)
                    .to_frame('size')
                    .reset_index())
results_stats_df['cluster_kw'] = results_stats_df.concept_id.map(concept_model.topics)

# cluster_embeddings is positional; assumed to align with concept ids
# 0..k-1 — TODO confirm. Ids missing from the dict (e.g. a -1 outlier
# label) map to NaN, matching the original behaviour.
cluster_embeddings_dict = dict(enumerate(concept_model.cluster_embeddings))
results_stats_df['emb'] = results_stats_df.concept_id.map(cluster_embeddings_dict)
results_stats_df.to_csv('img_clustering_stats.csv', index=False)
## alternative labelling vocabulary: a random sample of English nouns
import random
import nltk

nltk.download("wordnet")
from nltk.corpus import wordnet as wn

# Collect every single-word noun lemma in WordNet (multi-word lemmas are
# joined with underscores, so those are skipped).
all_nouns = []
for synset in wn.all_synsets('n'):
    for word in synset.lemma_names():
        if "_" not in word:
            all_nouns.append(word)

# Keyword list for cluster labelling (overrides the hand-picked `docs`).
docs = random.sample(all_nouns, 50_000)
## save sample images for each cluster, one collage image per cluster
# (Requires `os`, added to the top-of-file imports; the pasted loop body
# had lost its indentation and is reconstructed here.)
clusters_saving_folder = './clusters_samples'
os.makedirs(clusters_saving_folder, exist_ok=True)

for cid, img in concept_model.cluster_images.items():
    # File named after the cluster's keyword label, with the
    # comma-separated keywords joined by dashes.
    # NOTE(review): assumes every cluster id has an entry in `topics` and
    # that no keyword contains a path separator — verify.
    cname = f'''{topics[cid].replace(', ', '-')}.png'''
    img.save(os.path.join(clusters_saving_folder, cname))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment