Last active
December 23, 2015 20:59
-
-
Save sergeyk/6693204 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Code to deal with the ImageNet dataset. | |
Feature details (computed by Yangqing): | |
- VLfeat SIFT dense extraction: with SIFT patch size 16, 32 and 64, | |
and a stride of 4 pixels. The images are reduced to size 500*500 | |
(smaller images are not resized up though). | |
- LLC-coded features with 5-nearest neighbors, and a codebook of size 16k | |
- Max-pooled on a 1x1 grid and 3x3 grid, so a total of 10 bins. | |
This gives us 10 * 16k = 160k dimensional features. | |
- Adagrad (multinomial) logistic regression on all training data. | |
""" | |
import os | |
import cPickle as pickle | |
import numpy as np | |
import copy | |
import networkx as nx | |
import matplotlib.pyplot as plt | |
from scipy.io import loadmat | |
from sklearn.preprocessing import StandardScaler | |
import h5py | |
import tc | |
from tc import DataSource | |
# data/imagenet should contain classifier output data. | |
imagenet_dir = os.path.join(tc.repo_dir, 'data/imagenet') | |
imagenet_pickle_filename = os.path.join(os.path.dirname(__file__), 'imagenet_%d_tax.pickle') | |
# The Hedging Your Bets code release should be extracted to the ext/ folder. | |
# (Download from http://www.image-net.org/projects/hedging/). | |
ilsvrc65_meta_filename = tc.repo_dir + '/ext/hedging-1.0/code/ilsvrc65_meta.mat' | |
# I ran the MATLAB demo of the Hedging release and outut their classifier | |
# outputs on the validation and test sets of ILSVRC65. | |
ilsvrc65_clf_outputs_val = tc.repo_dir + '/test/support/ilsvrc65_clf_outputs_val.mat' | |
ilsvrc65_clf_outputs_test = tc.repo_dir + '/test/support/ilsvrc65_clf_outputs_test.mat' | |
def process_graph(g, nodes): | |
""" | |
Parameters | |
---------- | |
g : networkx.DiGraph | |
nodes : list | |
List of node names, all of which are in g. | |
The first K nodes in this list correspond to the leaf nodes, and are in | |
the same order as observations that we may see. | |
Returns | |
------- | |
graph : dict | |
'g': the original graph, augmented with 'leaves' field | |
""" | |
assert(len(g.nodes()) == len(nodes)) | |
N = len(nodes) | |
# augment graph with 'leaves' field | |
g = copy.deepcopy(g) | |
compute_leave_sets(g, nodes) | |
heights = np.array([g.node[node]['height'] for node in nodes]).flatten() | |
leaf_mask = np.atleast_2d(heights == 0) | |
K = leaf_mask.sum() | |
# Construct K x N matrix of leaf membership. | |
leaf_membership = np.zeros((K, N)) | |
leaf_membership[:, :K] = np.eye(K) | |
for i in range(K, N): | |
leaf_membership[g.node[nodes[i]]['leaf_inds'], i] = 1 | |
num_leaves = leaf_membership.sum(0) | |
rewards = np.log2(K / num_leaves).flatten() | |
graph = { | |
'g': g, 'nodes': nodes, 'heights': heights, 'rewards': rewards, | |
'leaf_membership': leaf_membership} | |
return graph | |
def compute_leave_sets(graph, nodes): | |
""" | |
For each node, compute the set of leaves under it, and store it under | |
the given key. | |
This function is able to handle large graphs, as it is not recursive. | |
Code originally by my labmate Yangqing Jia. | |
Parameters | |
---------- | |
graph : nx.DiGraph | |
nodes : list of strings | |
""" | |
leaves_key = 'leaves' | |
leaf_inds_key = 'leaf_inds' | |
if len(graph.nodes()) == 0: | |
return | |
stack = [n for n in graph.nodes() if len(graph.predecessors(n)) == 0] | |
if len(stack) == 0: | |
raise ValueError('The graph does not have any root: possible cycles?') | |
expanded = set() | |
while len(stack) > 0: | |
node = stack[-1] | |
if stack[-1] in expanded: | |
# all children in this stack has been computed, so we can compute | |
# the node now. | |
if len(graph.successors(node)) == 0: | |
# I am a leaf | |
graph.node[node][leaves_key] = set([node]) | |
else: | |
leaves = set() | |
for c in graph.successors(node): | |
leaves.update(graph.node[c][leaves_key]) | |
graph.node[node][leaves_key] = leaves | |
del stack[-1] | |
else: | |
expanded.add(node) | |
stack += graph.successors(node) | |
# also store indices to nodes list | |
for node in nodes: | |
inds = [nodes.index(n) for n in graph.node[node][leaves_key]] | |
graph.node[node][leaf_inds_key] = sorted(inds) | |
class ILSVRC65(DataSource): | |
""" | |
The ILSVRC65 dataset, as described in the Hedging Your Bets CVPR 2012 paper. | |
""" | |
def __init__(self, dirname, max_budget=None): | |
self.dirname = dirname | |
self.graph = self.load_graph() | |
g = self.graph['g'] | |
# TODO: is this the right way to determine names? | |
self.labels = [g.node[node]['word'] for node in self.graph['nodes']] | |
self.actions = [g.node[node]['word'] for node in np.array(self.graph['nodes'])[self.graph['heights'] == 0]] | |
self.action_dims = np.ones(len(self.actions), dtype=int) | |
self.action_costs = np.ones(len(self.actions), dtype=float) | |
self.max_budget = max_budget | |
if max_budget is None: | |
self.max_budget = int(sum(self.action_costs) / 2) | |
self.data_filename = self.dirname + '/ilsvrc65.h5' | |
if not os.path.exists(self.data_filename): | |
X, y, X_test, y_test = self.load_data() | |
# NOTE: imagenet should not be standardized, because | |
# the features are already all in [0,1] and the classifier | |
# can be doing simple argmax over average of feature channels. | |
# standardization ruins performance in that case. | |
#ss = StandardScaler() | |
#X = ss.fit_transform(X) | |
#X_test = ss.transform(X_test) | |
with h5py.File(self.data_filename, 'w') as f: | |
f.create_dataset('X', data=X) | |
f.create_dataset('y', data=y) | |
f.create_dataset('X_test', data=X_test) | |
f.create_dataset('y_test', data=y_test) | |
self.N = self.X.shape[0] | |
self.N_test = self.X_test.shape[0] | |
@property | |
def name(self): | |
return 'ilsvrc65_{}'.format(self.max_budget) | |
@staticmethod | |
def load_graph(): | |
ilsvrc65 = loadmat(ilsvrc65_meta_filename) | |
g65 = nx.DiGraph() | |
# Load nodes in the order of the meta file (leaves first). | |
nodes = [] | |
for x in ilsvrc65['synsets']: | |
nodes.append(x['WNID'][0][0]) | |
g65.add_node(nodes[-1], {'word': x['words'][0][0], 'height': x['height'][0][0]}) | |
for x in ilsvrc65['synsets']: | |
node = x['WNID'][0][0] | |
for child in x['children'][0][0]: | |
g65.add_edge(node, nodes[child - 1]) | |
graph = process_graph(g65, nodes) | |
return graph | |
@staticmethod | |
def load_data(): | |
""" | |
Load the classifier outputs for val and test sets. | |
""" | |
data = loadmat(ilsvrc65_clf_outputs_val) | |
X = data['leaf_probs'] | |
y = data['labels'] - 1 | |
data = loadmat(ilsvrc65_clf_outputs_test) | |
X_test = data['leaf_probs'] | |
y_test = data['labels'] - 1 | |
return X, y, X_test, y_test | |
class ImageNet(object): | |
""" | |
DataSource for the ImageNet dataset. | |
Parameters | |
---------- | |
max_leaves : int, optional | |
The desired number of leaf nodes per group. | |
Getting exactly this number in every group is extremely unlikely: instead, expect | |
different smaller numbers close to this. | |
method : string in ['random', 'size'], optional | |
The method used to split the classes into groups. | |
- 'random': leaf nodes are selected completely randomly. | |
- 'size': all nodes are sorted by the number of leaf nodes they are | |
ancestors to, and groups are formed by taking nodes smaller than | |
max_leaves and removing the selected leaf nodes from all other nodes, | |
iteratively. | |
""" | |
def __init__(self, max_leaves=200, method='size'): | |
num = 1000 # can be 7404 (for ImageNet10K?) | |
self.g = pickle.load(open(imagenet_pickle_filename % num)) | |
self.nodes = self.g.nodes() | |
if method == 'size': | |
self.groups = self.select_groups_by_size(self.g, self.nodes, max_leaves) | |
elif method == 'random': | |
self.groups = self.select_groups_random | |
else: | |
raise('Unknown method!') | |
self.labels = self.g.nodes() | |
#self.X, self.y = self.load_data('val') | |
@staticmethod | |
def load_data(s): | |
""" | |
Parameters | |
---------- | |
s : string in ['train', 'val', 'test'] | |
""" | |
assert(s in ['train', 'val', 'test']) | |
X = np.load('data/imagenet/{0}_predict/{0}_prob.npy'.format(s)) | |
y = np.load('data/imagenet/{}_predict/labels_ascii_sorted.npy'.format(s)) | |
return X, y | |
@staticmethod | |
def select_groups_by_size(g, nodes, max_leaves): | |
""" | |
Select leaf node groups by inner-node sizes. | |
Parameters | |
---------- | |
g : networkx.DiGraph | |
nodes : list of strings | |
max_leaves : int | |
The desired number of leaf nodes per group. | |
Getting exactly this number in every group is extremely unlikely: | |
instead, expect different smaller numbers close to this. | |
Returns | |
------- | |
selected_groups : list of (node name, list of leaf node names) tuples. | |
""" | |
g = copy.deepcopy(g) | |
compute_leave_sets(g, nodes) | |
selected_groups = [] | |
while True: | |
all_num_leaves = [len(g.node[node]['leaves']) for node in g.nodes()] | |
if sum(all_num_leaves) == 0: | |
break | |
ind = np.argsort(-np.array(all_num_leaves)) | |
sorted_nodes = np.array(g.nodes())[ind] | |
for node in sorted_nodes: | |
leaves = g.node[node]['leaves'] | |
if len(leaves) > 0 and len(leaves) <= max_leaves: | |
selected_groups.append((node, leaves)) | |
for node_prime in np.array(g.nodes()): | |
if node_prime == node: | |
continue | |
g.node[node_prime]['leaves'] -= leaves | |
g.node[node]['leaves'] = set([]) | |
break | |
assert(sum(len(x[-1]) for x in selected_groups) == 1000) | |
for group in selected_groups: | |
node, leaves = group | |
print(node, g.node[node]['word'], len(leaves)) | |
return selected_groups | |
@staticmethod | |
def select_groups_randomly(g, max_leaves): | |
""" | |
Select leaf node groups randomly, such that all groups have max_leaves | |
leaves (if not evenly divisible, the last group will be smaller). | |
""" | |
# TODO implement | |
pass | |
def plot_groups(self, filename=None): | |
return self.plot_selected_groups(self.g, self.groups, filename) | |
@staticmethod | |
def plot_selected_groups(g, selected_groups, filename=None): | |
""" | |
Display the ImageNet DAG and highlight the selected groups | |
in different colors. | |
Parameters | |
---------- | |
g : networkx.DiGraph | |
selected_groups : list of ('node', list of leaf nodes) tuples | |
filename : string, optional | |
If given, write plot out to filename. | |
""" | |
fig = plt.figure(figsize=(12, 12)) | |
gray = (0.85, 0.85, 0.85, 1) | |
graydark = (0.5, 0.5, 0.5, 1) | |
ns = 30 | |
pos = nx.graphviz_layout(g, prog='sfdp') | |
nx.draw_networkx_nodes( | |
g, pos, with_labels=False, node_size=ns, | |
node_color=gray, linewidth=None).set_edgecolor(graydark) | |
root = [n for n in g.nodes() if g.predecessors(n) == []] | |
nx.draw_networkx_nodes( | |
g, pos, nodelist=root, with_labels=False, | |
node_size=ns * 2, node_color=gray, | |
linewidth=None).set_edgecolor(graydark) | |
nx.draw_networkx_edges(g, pos, arrows=False, edge_color=[gray] * len(g.edges())) | |
cmap = plt.get_cmap('Accent') | |
colors = np.linspace(0, 1, len(selected_groups)) | |
darken = lambda c, r: (c[0] * r, c[1] * r, c[2] * r, c[3]) | |
for i, group in enumerate(selected_groups): | |
nx.draw_networkx_nodes(g, pos, node_size=ns, nodelist=group[-1], node_color=cmap(colors[i])).set_edgecolor(darken(cmap(colors[i]), 0.75)) | |
plt.axis('equal') | |
plt.axis('off') | |
if filename is not None: | |
plt.savefig(filename, dpi=300, facecolor='none') | |
return fig | |
if __name__ == '__main__': | |
ds = ImageNet() | |
print(ds) | |
fig = ds.plot_groups(filename="test.pdf") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment