# Source: GitHub Gist by @aialenti
# (aialenti/8d6dfc0b1e7695eac341bbf3d269eff9), created July 19, 2020 16:27.
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input
import numpy as np
import os
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from PIL import Image
import pandas as pd
import pickle
import matplotlib.pyplot as plt
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from sklearn.cluster import DBSCAN
# Create the base pre-trained model: ResNet50 with ImageNet weights and
# without the classification head, so predict() returns convolutional features.
base_model = ResNet50(weights='imagenet', include_top=False)

# Lists for data collection (kept index-aligned: one entry per processed image)
all_features = []
files_column = []
files_onlyname_column = []

# Thumbnails are written to this directory; create it up front so the first
# save does not fail when the directory does not exist yet.
thumbnail_output_dir = "./data/images/resized_leaflet"
os.makedirs(thumbnail_output_dir, exist_ok=True)

# Read each image and extract ResNet50 features
for root, _dirs, files in os.walk('./data/images/images/images/'):
    for name in files:
        if not name.endswith(".jpg"):
            continue
        filename = "{}/{}".format(root, name)

        # Load image and resize to the 224x224 input used for the network
        img = image.load_img(filename, target_size=(224, 224))
        img_array = image.img_to_array(img)
        # Add a leading batch axis: the model is fed one image at a time
        img_array = np.expand_dims(img_array, axis=0)

        # Preprocess input
        preprocessed = preprocess_input(img_array)

        # Extract features and flatten in array (Shape is (7,7,2048) )
        features = base_model.predict(preprocessed, batch_size=1)
        flatten_features = features.flatten()

        all_features.append(flatten_features)
        files_column.append(filename)
        files_onlyname_column.append(name)

        # Save thumbnail.
        # NOTE: thumbnails are keyed by bare file name, so two images with the
        # same name in different sub-directories overwrite each other.
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
        with Image.open(filename) as image_resize:
            image_resize.thumbnail((64, 64), Image.LANCZOS)
            image_resize.save("{}/{}".format(thumbnail_output_dir, name), "JPEG")
# Convert all the features in a Numpy Array (one row per image)
all_features = np.asarray(all_features)
if all_features.size == 0:
    # Fail early with a clear message instead of an opaque shape error from PCA.
    raise ValueError("No image features were extracted; nothing to embed.")
# pickle.dump(all_features,open('features.pkl', 'wb'))
# all_features = pickle.load(open('features.pkl', 'rb'))

# Apply PCA to reduce features to (at most) 2048.
# PCA cannot return more components than min(n_samples, n_features), so cap
# the target; otherwise the script crashes with fewer than 2048 images.
pca_components = min(2048, all_features.shape[0], all_features.shape[1])
embedded_features = PCA(n_components=pca_components).fit_transform(all_features)

# Apply tSNE to project the PCA output down to 2 dimensions
embedded_features = TSNE(n_components=2, verbose=3, perplexity=25).fit_transform(embedded_features)

# Find Clusters using DBSCAN on the 2-D embedding
dbscan = DBSCAN(eps=1.5, min_samples=15).fit(embedded_features)

# Frame colours for the plotted thumbnails, indexed by cluster label
colors = ["black", "brown", "chocolate", "gold", "forestgreen", "teal",
          "dodgerblue", "navy", "darkviolet", "deeppink"]

# Create data frame and print the results using matplotlib
df = pd.DataFrame()
df["files"] = files_column
df["files_names"] = files_onlyname_column
df["x"] = embedded_features[:, 0]
df["y"] = embedded_features[:, 1]
df["labels"] = dbscan.labels_
# The CSV is written before sorting, so its rows stay in extraction order.
df.to_csv("output.csv", index=False)

df = df.sort_values(by=["labels"])
# Scale the embedding coordinates before plotting
x = df["x"].values * 200
y = df["y"].values * 200

fig, ax = plt.subplots(figsize=(150, 150))
ax.scatter(x, y)
def getImage(path, alpha=1):
    """Read the image at *path* and wrap it in an ``OffsetImage``.

    Args:
        path: Location of the image file to read.
        alpha: Opacity passed through to the ``OffsetImage`` (default 1).

    Returns:
        An ``OffsetImage`` holding the pixel data read from *path*.
    """
    pixels = plt.imread(path)
    return OffsetImage(pixels, alpha=alpha)
# Directory the thumbnails are read from. It must be the directory the
# extraction step saves them to ("resized_leaflet", with an underscore); the
# previous hyphenated spelling pointed at a directory nothing writes to.
thumbnail_input_dir = "data/images/resized_leaflet"

# Draw every thumbnail at its (scaled) embedding position
for x0, y0, thumb_name, label in zip(x, y, df["files_names"].values, df["labels"].values):
    thumb_path = "{}/{}".format(thumbnail_input_dir, thumb_name)
    if label == -1:
        # Label -1 is DBSCAN's noise label: draw faded and without a colour frame
        ab = AnnotationBbox(getImage(thumb_path, 0.1), (x0, y0),
                            frameon=True, pad=0)
    else:
        # Clustered points get a frame coloured by label; the modulo wraps
        # around the palette when there are more clusters than colours.
        ab = AnnotationBbox(getImage(thumb_path), (x0, y0),
                            frameon=True,
                            bboxprops=dict(edgecolor=colors[label % len(colors)], lw=10),
                            pad=0)
    ax.add_artist(ab)

plt.savefig('foo.png')

# Output as JSON (rows labelled -1 are excluded)
df[df["labels"] != -1].to_json("output.json", orient="records")
# (GitHub page footer: sign up or sign in to comment on this gist.)