Created
December 11, 2013 09:40
-
-
Save parosky/7907612 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import json | |
import urllib2 | |
import urllib | |
import os | |
import glob | |
import PIL | |
import leargist | |
import numpy as np | |
import sklearn.decomposition | |
import sklearn.mixture | |
# Azure Marketplace Bing Search API credentials.
# These are placeholders, not valid Python names -- replace them with your
# actual account ID / key strings before running, or import fails at load time.
bing_id = YOUR_BING_ID
bing_key = YOUR_BING_KEY
def get_images(query):
    """Download up to 100 Bing image-search results for `query`.

    Creates a directory named after the query, saves each raw JSON response
    as <query>/<skip>.txt and each downloaded image as <query>/<count>.jpg.
    Requires the module-level `bing_id` / `bing_key` credentials.
    """
    if not os.path.exists(query):
        os.mkdir(query)
    # The API returns at most 50 results per request, so page twice.
    skips = [0, 50]
    count = 0
    for skip in skips:
        url = u'https://api.datamarket.azure.com/Bing/Search/Image?$format=json&Query=%27{{query}}%27&$skip={{skip}}'
        url = url.replace('{{query}}', urllib.quote(query.encode('utf-8', 'ignore')))
        url = url.replace('{{skip}}', str(skip))
        # The Azure Marketplace API authenticates with HTTP basic auth.
        password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
        password_mgr.add_password(None, 'http://api.datamarket.azure.com', bing_id, bing_key)
        handler = urllib2.HTTPBasicAuthHandler(password_mgr)
        opener = urllib2.build_opener(handler)
        urllib2.install_opener(opener)
        line = urllib2.urlopen(url).read()
        # Keep the raw response on disk for debugging / later re-parsing.
        # Fix: the original leaked this file handle; close it explicitly.
        f = open('%s/%d.txt' % (query, skip), 'w')
        try:
            f.write(line)
        finally:
            f.close()
        r = json.loads(line)
        for item in r['d']['results']:
            img_url = item['MediaUrl']
            print('%d %s' % (count, img_url))
            try:
                urllib.urlretrieve(img_url.encode('utf-8', 'ignore'), '%s/%d.jpg' % (query, count))
            except Exception as e:
                # Narrowed from a bare `except:` so Ctrl-C still interrupts;
                # a single broken download should not abort the whole crawl.
                print('error: %s' % e)
                continue
            count += 1
def extract_features(directory):
    """Compute a color-GIST descriptor for every numbered .jpg in `directory`
    and save the stacked result to <directory>/features.npy.

    Unreadable images are skipped entirely.
    """
    features = []
    filenames = glob.glob('%s/*.jpg' % directory)
    # Sort numerically by the basename "<n>.jpg" so features line up with
    # the download order, not lexicographic order (10.jpg after 9.jpg).
    filenames.sort(key=lambda f: int(f[f.index('/') + 1:f.rindex('.')]))
    for filename in filenames:
        try:
            im = PIL.Image.open(filename)
        except Exception:
            print('load error')
            # Bug fix: the original fell through after a failed load and
            # re-used the previously opened image (or raised NameError on
            # the first file). Skip the broken file instead.
            continue
        features.append(leargist.color_gist(im))
    features = np.array(features)
    np.save('%s/features' % directory, features)
def kl(g1, g2):
    """Return the Kullback-Leibler divergence KL(g1 || g2).

    `g1` and `g2` are fitted single-component, diagonal-covariance GMMs;
    only `means_[0]` and `covars_[0]` are read. Uses the closed form for
    multivariate Gaussians: 1/2 * [ln(det C2/det C1) + tr(C2^-1 C1)
    + (m2-m1)^T C2^-1 (m2-m1) - k].
    """
    mean_a = g1.means_[0]
    mean_b = g2.means_[0]
    cov_a = np.diag(g1.covars_[0])
    cov_b = np.diag(g2.covars_[0])
    cov_b_inv = np.linalg.inv(cov_b)
    diff = mean_b - mean_a
    log_det_ratio = np.log(np.linalg.det(cov_b) / np.linalg.det(cov_a))
    trace_term = np.trace(np.dot(cov_b_inv, cov_a))
    mahalanobis = np.dot(np.dot(diff.T, cov_b_inv), diff)
    return (log_det_ratio + trace_term + mahalanobis - len(mean_a)) / 2.0
def run():
    """Rank towns by visual similarity to the first town in `towns`.

    Pipeline: download images per town -> GIST features per town ->
    joint PCA to 20 dims -> one diagonal Gaussian per town ->
    symmetrized KL divergence -> print towns sorted by divergence
    from towns[0].
    """
    towns = [u'高円寺', u'下北沢', u'霞ヶ関', u'池袋', u'渋谷', u'浅草', u'新宿', u'神楽坂', u'代官山', u'表参道', u'原宿', u'バラナシ', u'デリー', u'ムンバイ', u'川越', u'京都']
    # get images
    for town in towns:
        get_images(town)
    # extract features
    for town in towns:
        extract_features(town)
    # load extracted features
    d = []
    for town in towns:
        d.append(np.load(u'%s/features.npy' % town))
    # PCA fitted jointly on all towns so every town shares one projection
    pca = sklearn.decomposition.PCA(n_components=20)
    pca.fit(np.vstack(d))
    # fit one single-component diagonal Gaussian per town in PCA space
    g = []
    for i, town in enumerate(towns):
        g.append(sklearn.mixture.GMM(n_components=1, covariance_type='diag'))
        g[i].fit(pca.transform(d[i]))
    similarities = np.zeros((len(towns), len(towns)))
    # compute symmetrized KL divergence for every pair of towns
    for i in range(len(towns)):
        for j in range(len(towns)):
            # Bug fix: the original read `kl(g[j],g[i]) + kl(g[i],g[j])/2`
            # (missing parentheses), which halves only the second term.
            # The symmetrized KL is the mean of both directions.
            similarities[i, j] = (kl(g[j], g[i]) + kl(g[i], g[j])) / 2
    # show towns ranked by increasing divergence from towns[0]
    r = []
    for i in range(1, len(towns)):
        r.append((similarities[0, i], towns[i]))
    r.sort()
    for rr in r:
        print('%s %s' % (rr[1], rr[0]))
# Script entry point: run the full download/feature/ranking pipeline.
if __name__ == '__main__':
    run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment