aialenti · July 19, 2020 16:27
diff --git a/tSNE.py b/tSNE.py
 from tensorflow.keras.applications.resnet50 import ResNet50
 from tensorflow.keras.preprocessing import image
 from tensorflow.keras.applications.resnet50 import preprocess_input
 import numpy as np
 import os
 from sklearn.manifold import TSNE
 from sklearn.decomposition import PCA
 from PIL import Image
 import pandas as pd
 import pickle
 import matplotlib.pyplot as plt
 from matplotlib.offsetbox import OffsetImage, AnnotationBbox
 from sklearn.cluster import DBSCAN

 # Create the base pre-trained model (ResNet50 with ImageNet weights and
 # include_top=False); it is used below purely as a feature extractor.
 base_model = ResNet50(weights='imagenet', include_top=False)

 # Lists for data collection; entry i of each list refers to the same image
 all_features = []
 files_column = []
 files_onlyname_column = []

 # Thumbnails are saved here. Create the folder up front so that save()
 # does not fail when it is missing.
 thumbnail_dir = "./data/images/resized_leaflet"
 os.makedirs(thumbnail_dir, exist_ok=True)

 #   Read each image and extract ResNet50 features
 for root, dirs, files in os.walk('./data/images/images/images/'):
     for name in files:
         if not name.endswith(".jpg"):
             continue
         filename = "{}/{}".format(root, name)

         #   Load image and resize
         img = image.load_img(filename, target_size=(224, 224))
         img_array = image.img_to_array(img)
         # The model is fed one image at a time, so add a leading batch axis
         img_array = np.expand_dims(img_array, axis=0)

         #   Preprocess input
         preprocessed = preprocess_input(img_array)

         #   Extract features and flatten in array (Shape is (7,7,2048) )
         features = base_model.predict(preprocessed, batch_size=1)
         flatten_features = features.flatten()
         all_features.append(flatten_features)

         files_column.append(filename)
         files_onlyname_column.append(name)

         #   Save thumbnail. Image.ANTIALIAS was removed in Pillow 10;
         #   Image.LANCZOS is the same filter under its current name.
         #   The context manager closes the source file after saving.
         with Image.open(filename) as image_resize:
             image_resize.thumbnail((64, 64), Image.LANCZOS)
             image_resize.save("{}/{}".format(thumbnail_dir, name), "JPEG")

 #   Convert all the features in a Numpy Array
 all_features = np.asarray(all_features)
 if all_features.size == 0:
     # Fail with a clear message instead of an obscure error inside PCA
     raise ValueError("No image features were extracted; nothing to embed.")
 # Optional cache of the extracted features (uncomment to reuse between runs):
 # pickle.dump(all_features,open('features.pkl', 'wb'))
 # all_features = pickle.load(open('features.pkl', 'rb'))

 #   Apply PCA to reduce features to at most 2048 components. PCA rejects
 #   n_components larger than min(n_samples, n_features), so clamp the
 #   target for runs with fewer than 2048 images.
 n_components = min(2048, *all_features.shape)
 embedded_features = PCA(n_components=n_components).fit_transform(all_features)

 #   Apply tSNE to project the PCA output down to 2 dimensions
 embedded_features = TSNE(n_components=2, verbose=3, perplexity=25).fit_transform(embedded_features)

 #   Find Clusters using DBSCAN on the 2-D embedding
 dbscan = DBSCAN(eps=1.5, min_samples=15).fit(embedded_features)

 #   Palette of border colours for the thumbnails
 colors = [
     "black", "brown", "chocolate", "gold", "forestgreen",
     "teal", "dodgerblue", "navy", "darkviolet", "deeppink",
 ]

 #   Gather the per-image results in one data frame and write them to CSV
 df = pd.DataFrame({
     "files": files_column,
     "files_names": files_onlyname_column,
     "x": embedded_features[:, 0],
     "y": embedded_features[:, 1],
     "labels": dbscan.labels_,
 })
 df.to_csv("output.csv", index=False)
 df = df.sort_values(by=["labels"])

 #   Plot coordinates: the embedding scaled by a factor of 200
 x = 200 * df["x"].values
 y = 200 * df["y"].values

 fig, ax = plt.subplots(figsize=(150, 150))
 ax.scatter(x, y)


 def getImage(path, alpha=1):
     """Read the image file at *path* and wrap it in an OffsetImage.

     *alpha* is forwarded unchanged to OffsetImage (default 1).
     """
     pixels = plt.imread(path)
     return OffsetImage(pixels, alpha=alpha)


 #   Draw every image's thumbnail at its scaled embedding position.
 #   The thumbnails are written to "resized_leaflet" (underscore); reading
 #   from "resized-leaflet" would raise FileNotFoundError.
 thumbnail_dir = "data/images/resized_leaflet"
 for x0, y0, path, label in zip(x, y, df["files_names"].values, df["labels"].values):
     thumbnail_path = "{}/{}".format(thumbnail_dir, path)
     if label == -1:
         # Label -1 (DBSCAN's noise label): draw faded, without a coloured border
         ab = AnnotationBbox(getImage(thumbnail_path, 0.1), (x0, y0),
                             frameon=True, pad=0)
     else:
         # Clustered image: border colour picked by label, cycling the palette
         ab = AnnotationBbox(getImage(thumbnail_path), (x0, y0), frameon=True,
                             bboxprops=dict(edgecolor=colors[label % len(colors)], lw=10),
                             pad=0)

     ax.add_artist(ab)
 plt.savefig('foo.png')

 #   Output as JSON (rows labelled -1 are left out)
 df[df["labels"] != -1].to_json("output.json", orient="records")
	from tensorflow.keras.applications.resnet50 import ResNet50
	from tensorflow.keras.preprocessing import image
	from tensorflow.keras.applications.resnet50 import preprocess_input
	import numpy as np
	import os
	from sklearn.manifold import TSNE
	from sklearn.decomposition import PCA
	from PIL import Image
	import pandas as pd
	import pickle
	import matplotlib.pyplot as plt
	from matplotlib.offsetbox import OffsetImage, AnnotationBbox
	from sklearn.cluster import DBSCAN

	# Create the base pre-trained model (ResNet50 with ImageNet weights and
	# include_top=False); it is used below purely as a feature extractor.
	base_model = ResNet50(weights='imagenet', include_top=False)

	# Lists for data collection; entry i of each list refers to the same image
	all_features = []
	files_column = []
	files_onlyname_column = []

	# Thumbnails are saved here. Create the folder up front so that save()
	# does not fail when it is missing.
	thumbnail_dir = "./data/images/resized_leaflet"
	os.makedirs(thumbnail_dir, exist_ok=True)

	# Read each image and extract ResNet50 features
	# (loop nesting restored; the flattened original was not valid Python)
	for root, dirs, files in os.walk('./data/images/images/images/'):
	    for name in files:
	        if not name.endswith(".jpg"):
	            continue
	        filename = "{}/{}".format(root, name)

	        # Load image and resize
	        img = image.load_img(filename, target_size=(224, 224))
	        img_array = image.img_to_array(img)
	        # The model is fed one image at a time, so add a leading batch axis
	        img_array = np.expand_dims(img_array, axis=0)

	        # Preprocess input
	        preprocessed = preprocess_input(img_array)

	        # Extract features and flatten in array (Shape is (7,7,2048) )
	        features = base_model.predict(preprocessed, batch_size=1)
	        flatten_features = features.flatten()
	        all_features.append(flatten_features)

	        files_column.append(filename)
	        files_onlyname_column.append(name)

	        # Save thumbnail. Image.ANTIALIAS was removed in Pillow 10;
	        # Image.LANCZOS is the same filter under its current name.
	        # The context manager closes the source file after saving.
	        with Image.open(filename) as image_resize:
	            image_resize.thumbnail((64, 64), Image.LANCZOS)
	            image_resize.save("{}/{}".format(thumbnail_dir, name), "JPEG")

	# Convert all the features in a Numpy Array
	all_features = np.asarray(all_features)
	if all_features.size == 0:
	    # Fail with a clear message instead of an obscure error inside PCA
	    raise ValueError("No image features were extracted; nothing to embed.")
	# Optional cache of the extracted features (uncomment to reuse between runs):
	# pickle.dump(all_features,open('features.pkl', 'wb'))
	# all_features = pickle.load(open('features.pkl', 'rb'))

	# Apply PCA to reduce features to at most 2048 components. PCA rejects
	# n_components larger than min(n_samples, n_features), so clamp the
	# target for runs with fewer than 2048 images.
	n_components = min(2048, *all_features.shape)
	embedded_features = PCA(n_components=n_components).fit_transform(all_features)

	# Apply tSNE to project the PCA output down to 2 dimensions
	embedded_features = TSNE(n_components=2, verbose=3, perplexity=25).fit_transform(embedded_features)

	# Find Clusters using DBSCAN on the 2-D embedding
	dbscan = DBSCAN(eps=1.5, min_samples=15).fit(embedded_features)

	# Palette of border colours for the thumbnails
	colors = [
	    "black", "brown", "chocolate", "gold", "forestgreen",
	    "teal", "dodgerblue", "navy", "darkviolet", "deeppink",
	]

	# Gather the per-image results in one data frame and write them to CSV
	df = pd.DataFrame({
	    "files": files_column,
	    "files_names": files_onlyname_column,
	    "x": embedded_features[:, 0],
	    "y": embedded_features[:, 1],
	    "labels": dbscan.labels_,
	})
	df.to_csv("output.csv", index=False)
	df = df.sort_values(by=["labels"])

	# Plot coordinates: the embedding scaled by a factor of 200
	x = 200 * df["x"].values
	y = 200 * df["y"].values

	fig, ax = plt.subplots(figsize=(150, 150))
	ax.scatter(x, y)


	def getImage(path, alpha=1):
	    """Read the image file at *path* and wrap it in an OffsetImage.

	    *alpha* is forwarded unchanged to OffsetImage (default 1).
	    """
	    # Body indentation restored; the flattened original did not parse.
	    return OffsetImage(plt.imread(path), alpha=alpha)


	# Draw every image's thumbnail at its scaled embedding position.
	# The thumbnails are written to "resized_leaflet" (underscore); reading
	# from "resized-leaflet" would raise FileNotFoundError.
	# (loop and branch nesting restored; the flattened original did not parse)
	thumbnail_dir = "data/images/resized_leaflet"
	for x0, y0, path, label in zip(x, y, df["files_names"].values, df["labels"].values):
	    thumbnail_path = "{}/{}".format(thumbnail_dir, path)
	    if label == -1:
	        # Label -1 (DBSCAN's noise label): draw faded, without a coloured border
	        ab = AnnotationBbox(getImage(thumbnail_path, 0.1), (x0, y0),
	                            frameon=True, pad=0)
	    else:
	        # Clustered image: border colour picked by label, cycling the palette
	        ab = AnnotationBbox(getImage(thumbnail_path), (x0, y0), frameon=True,
	                            bboxprops=dict(edgecolor=colors[label % len(colors)], lw=10),
	                            pad=0)

	    ax.add_artist(ab)
	plt.savefig('foo.png')

	# Output as JSON (rows labelled -1 are left out)
	df[df["labels"] != -1].to_json("output.json", orient="records")