'''
Modified from
https://github.com/fchollet/keras/blob/master/examples/neural_style_transfer.py
to demonstrate a problem with layer output_shape when using the Theano backend.
Neural style transfer with Keras.

Run the script with:
```
python neural_style_transfer.py path_to_your_base_image.jpg path_to_your_reference.jpg prefix_for_results
```
e.g.:
```
python neural_style_transfer.py img/tuebingen.jpg img/starry_night.jpg results/my_result
```
It is preferable to run this script on GPU, for speed.
Example result: https://twitter.com/fchollet/status/686631033085677568

# Details
Style transfer consists of generating an image
with the same "content" as a base image, but with the
"style" of a different picture (typically artistic).

This is achieved through the optimization of a loss function
that has 3 components: "style loss", "content loss",
and "total variation loss":

- The total variation loss imposes local spatial continuity between
the pixels of the combination image, giving it visual coherence.
- The style loss is where the deep learning kicks in: it is defined
using a deep convolutional neural network. Precisely, it consists of a sum of
L2 distances between the Gram matrices of the representations of
the style reference image and the combination image, extracted from
different layers of a convnet (trained on ImageNet). The general idea
is to capture color/texture information at different spatial
scales (fairly large scales, defined by the depth of the layer considered);
the exact per-layer formula is spelled out in the note after this list.
- The content loss is an L2 distance between the features of the base
image (extracted from a deep layer) and the features of the combination image,
keeping the generated image close enough to the original one.
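
Note added for clarity (not part of the original example): if a layer's feature
map is flattened to a matrix F of shape (channels, positions), its Gram matrix
is G = F.F^T, and the per-layer style loss used below is
sum((G_style - G_combination)^2) / (4 * channels^2 * size^2),
matching the `gram_matrix` and `style_loss` functions in the code.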

# References
- [A Neural Algorithm of Artistic Style](http://arxiv.org/abs/1508.06576)
'''
from __future__ import print_function

import argparse
import sys
import time

import numpy as np
from scipy.misc import imsave
from scipy.optimize import fmin_l_bfgs_b

from keras import backend as K
from keras.applications import vgg16
from keras.preprocessing.image import load_img, img_to_array

parser = argparse.ArgumentParser(description='Neural style transfer with Keras.')
parser.add_argument('base_image_path', metavar='base', type=str,
                    help='Path to the image to transform.')
parser.add_argument('style_reference_image_path', metavar='ref', type=str,
                    help='Path to the style reference image.')
parser.add_argument('result_prefix', metavar='res_prefix', type=str,
                    help='Prefix for the saved results.')
args = parser.parse_args()

base_image_path = args.base_image_path
style_reference_image_path = args.style_reference_image_path
result_prefix = args.result_prefix

# these are the weights of the different loss components
total_variation_weight = 1.
style_weight = 1.
content_weight = 0.025

# dimensions of the generated picture.
img_nrows = 400
img_ncols = 400
assert img_ncols == img_nrows, 'Due to the use of the Gram matrix, width and height must match.'


# util function to open, resize and format pictures into appropriate tensors
def preprocess_image(image_path):
    img = load_img(image_path, target_size=(img_nrows, img_ncols))
    img = img_to_array(img)
    img = np.expand_dims(img, axis=0)
    img = vgg16.preprocess_input(img)
    return img


# util function to convert a tensor into a valid image
def deprocess_image(x):
    if K.image_dim_ordering() == 'th':
        x = x.reshape((3, img_nrows, img_ncols))
        x = x.transpose((1, 2, 0))
    else:
        x = x.reshape((img_nrows, img_ncols, 3))
    # Remove zero-center by mean pixel
    x[:, :, 0] += 103.939
    x[:, :, 1] += 116.779
    x[:, :, 2] += 123.68
    # 'BGR' -> 'RGB'
    x = x[:, :, ::-1]
    x = np.clip(x, 0, 255).astype('uint8')
    return x


# get tensor representations of our images
base_image = K.variable(preprocess_image(base_image_path))
style_reference_image = K.variable(preprocess_image(style_reference_image_path))

# this will contain our generated image
if K.image_dim_ordering() == 'th':
    combination_image = K.placeholder((1, 3, img_nrows, img_ncols))
else:
    combination_image = K.placeholder((1, img_nrows, img_ncols, 3))

# combine the 3 images into a single Keras tensor
input_tensor = K.concatenate([base_image,
                              style_reference_image,
                              combination_image], axis=0)
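
# Note (added for clarity): along the batch axis of input_tensor, index 0 is the
# base image, index 1 the style reference image, and index 2 the generated
# (combination) image; the loss terms below rely on this ordering.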

# build the VGG16 network with our 3 images as input
# the model will be loaded with pre-trained ImageNet weights
model = vgg16.VGG16(input_tensor=input_tensor,
                    weights='imagenet', include_top=False)
print('Model loaded.')

# get the symbolic outputs of each "key" layer (we gave them unique names).
outputs_dict = dict([(layer.name, layer.output) for layer in model.layers])
shape_dict = dict([(layer.name, layer.output_shape) for layer in model.layers])
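
# The loop below is the point of this modified gist: it prints the output_shape
# reported for every VGG16 layer (this is where the Theano-backend problem
# described in the docstring shows up) and then exits before the rest of the
# style-transfer code runs.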
for layer in model.layers:
    s = shape_dict[layer.name]
    print(layer.name, s)
sys.exit(0)


# compute the neural style loss
# first we need to define 4 util functions

# the gram matrix of an image tensor (feature-wise outer product)
def gram_matrix(x):
    assert K.ndim(x) == 3
    if K.image_dim_ordering() == 'th':
        features = K.batch_flatten(x)
    else:
        features = K.batch_flatten(K.permute_dimensions(x, (2, 0, 1)))
    gram = K.dot(features, K.transpose(features))
    return gram
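
# Illustrative sanity check (not part of the original script; kept commented out
# so the demo's behaviour is unchanged): the Gram matrix is square, with one
# row/column per channel, regardless of spatial size. With 'tf' dim ordering:
#   _fmap = K.variable(np.zeros((img_nrows, img_ncols, 64)))
#   print(K.eval(gram_matrix(_fmap)).shape)  # -> (64, 64)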


# the "style loss" is designed to maintain
# the style of the reference image in the generated image.
# It is based on the gram matrices (which capture style) of
# feature maps from the style reference image
# and from the generated image
def style_loss(style, combination):
    assert K.ndim(style) == 3
    assert K.ndim(combination) == 3
    S = gram_matrix(style)
    C = gram_matrix(combination)
    channels = 3
    size = img_nrows * img_ncols
    return K.sum(K.square(S - C)) / (4. * (channels ** 2) * (size ** 2))


# an auxiliary loss function
# designed to maintain the "content" of the
# base image in the generated image
def content_loss(base, combination):
    return K.sum(K.square(combination - base))


# the 3rd loss function, total variation loss,
# designed to keep the generated image locally coherent
def total_variation_loss(x):
    assert K.ndim(x) == 4
    if K.image_dim_ordering() == 'th':
        a = K.square(x[:, :, :img_nrows-1, :img_ncols-1] - x[:, :, 1:, :img_ncols-1])
        b = K.square(x[:, :, :img_nrows-1, :img_ncols-1] - x[:, :, :img_nrows-1, 1:])
    else:
        a = K.square(x[:, :img_nrows-1, :img_ncols-1, :] - x[:, 1:, :img_ncols-1, :])
        b = K.square(x[:, :img_nrows-1, :img_ncols-1, :] - x[:, :img_nrows-1, 1:, :])
    return K.sum(K.pow(a + b, 1.25))
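
# Note (added for clarity): `a` and `b` above are squared differences between each
# pixel and its neighbour one row down and one column to the right, respectively,
# so this term penalizes abrupt local changes in the generated image.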


# combine these loss functions into a single scalar
loss = K.variable(0.)
layer_features = outputs_dict['block4_conv2']
base_image_features = layer_features[0, :, :, :]
combination_features = layer_features[2, :, :, :]
loss += content_weight * content_loss(base_image_features,
                                      combination_features)

feature_layers = ['block1_conv1', 'block2_conv1',
                  'block3_conv1', 'block4_conv1',
                  'block5_conv1']
for layer_name in feature_layers:
    layer_features = outputs_dict[layer_name]
    style_reference_features = layer_features[1, :, :, :]
    combination_features = layer_features[2, :, :, :]
    sl = style_loss(style_reference_features, combination_features)
    loss += (style_weight / len(feature_layers)) * sl
loss += total_variation_weight * total_variation_loss(combination_image)

# get the gradients of the generated image wrt the loss
grads = K.gradients(loss, combination_image)

outputs = [loss]
if type(grads) in {list, tuple}:
    outputs += grads
else:
    outputs.append(grads)
f_outputs = K.function([combination_image], outputs)


def eval_loss_and_grads(x):
    if K.image_dim_ordering() == 'th':
        x = x.reshape((1, 3, img_nrows, img_ncols))
    else:
        x = x.reshape((1, img_nrows, img_ncols, 3))
    outs = f_outputs([x])
    loss_value = outs[0]
    if len(outs[1:]) == 1:
        grad_values = outs[1].flatten().astype('float64')
    else:
        grad_values = np.array(outs[1:]).flatten().astype('float64')
    return loss_value, grad_values
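
# Note (added for clarity): fmin_l_bfgs_b works on flat float64 vectors, so
# eval_loss_and_grads reshapes the flat pixel vector back into an image batch,
# runs a single forward/backward pass via f_outputs, and returns the loss
# together with the flattened gradient.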


# this Evaluator class makes it possible
# to compute loss and gradients in one pass
# while retrieving them via two separate functions,
# "loss" and "grads". This is done because scipy.optimize
# requires separate functions for loss and gradients,
# but computing them separately would be inefficient.
class Evaluator(object):

    def __init__(self):
        self.loss_value = None
        self.grad_values = None

    def loss(self, x):
        assert self.loss_value is None
        loss_value, grad_values = eval_loss_and_grads(x)
        self.loss_value = loss_value
        self.grad_values = grad_values
        return self.loss_value

    def grads(self, x):
        assert self.loss_value is not None
        grad_values = np.copy(self.grad_values)
        self.loss_value = None
        self.grad_values = None
        return grad_values


evaluator = Evaluator()
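
# Note (added for clarity): Evaluator.loss must be called before Evaluator.grads
# (the asserts above enforce this); fmin_l_bfgs_b evaluates the objective before
# its gradient at each point, so one combined pass in eval_loss_and_grads serves
# both callbacks.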

# run scipy-based optimization (L-BFGS) over the pixels of the generated image
# so as to minimize the neural style loss
if K.image_dim_ordering() == 'th':
    x = np.random.uniform(0, 255, (1, 3, img_nrows, img_ncols)) - 128.
else:
    x = np.random.uniform(0, 255, (1, img_nrows, img_ncols, 3)) - 128.

for i in range(10):
    print('Start of iteration', i)
    start_time = time.time()
    x, min_val, info = fmin_l_bfgs_b(evaluator.loss, x.flatten(),
                                     fprime=evaluator.grads, maxfun=20)
    print('Current loss value:', min_val)
    # save current generated image
    img = deprocess_image(x.copy())
    fname = result_prefix + '_at_iteration_%d.png' % i
    imsave(fname, img)
    end_time = time.time()
    print('Image saved as', fname)
    print('Iteration %d completed in %ds' % (i, end_time - start_time))