# Machine Translation in the Future
#
# INTRODUCTION
# ------------
#
# Bart's logistic regression one-liner has served us well
# by forcing us to think about how to make the framework
# suitable for quick-and-dirty experimentation with small
# standard models. It did not force us to think about the
# scalability and flexibility necessary for big
# experiments with big models.
#
# To fill this gap I made this sketch of how the machine
# translation code could look in the framework of my dreams.
# I present three things:
# - the machine translation model as a single brick called Translator
# - an example routine `create_and_configure`
#   that creates a Translator and configures it
# - an example routine that takes a Translator, applies it and
#   modifies the theano graph along the lines of
#   standard and advanced regularization methods
#
# As we have diverged significantly in our understanding of what "bricks"
# and "blocks" are, a quick recap of the assumptions made
# in this code follows.
#
# Bricks and Blocks
# -----------------
#
# The basic thing is a brick. A brick is an
# object that can generate a theano graph given inputs and using
# its parameters. This is called "application of a brick".
# A brick can own subordinate bricks called "slaves". When applied,
# a brick can use its slaves to generate the theano graph.
# Two restrictions are made:
# (1) a brick can be a slave of only one brick,
# (2) when applied, a brick can only use its slave bricks.
# Thus bricks form a tree-shaped hierarchy.
#
# The theano graph generated during application of a brick is called
# a block. Thanks to restrictions (1) and (2) we know that the blocks
# also form a tree-like structure.
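#
# As a tiny illustrative sketch (a hypothetical usage, relying only on the
# conventions assumed in this file), a brick with a single slave could be
# built and applied as follows:
#
#     mlp = MLP(name='mlp')                       # a brick
#     linear = Linear(name='linear', owner=mlp)   # its slave
#     y = mlp.apply(x)    # application: builds a theano graph, i.e. a block;
#                         # `mlp` may call `linear.apply` here
#
# Restriction (1) means `linear` cannot also become the slave of another
# brick, and restriction (2) means that only `mlp` may apply `linear`.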
#
# Brick life-cycle
# ----------------
#
# Creation: a brick is created with a minimal number of required
# parameters. This corresponds to the __init__ method in the code.
# The brick gradually receives the rest of its configuration
# from the user and from the chain of its owners.
#
# Initialization: a configured brick fine-tunes the configuration of its
# slaves (and possibly slaves of slaves), initializes its own parameters
# and hands the initialization request down to its slaves.
#
# Application: the brick is applied by the user or by its owner.
# Initialization can be triggered from here if not done yet.
#
# NOTE: the life-cycle above is a few steps back from
# the bulky 6-stage scheme, towards what Bart proposed
# much earlier. I no longer understand why that is bad -
# at least for NMT we can postpone all applications until
# all the initialization is done.
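#
# A minimal sketch of this life-cycle (hypothetical, mirroring the
# `Translator` brick defined below):
#
#     t = Translator(encoder_transition=..., ...)  # creation with minimal
#                                                   # arguments (lazy)
#     t.weights_init = IsotropicGaussian(0.01)      # gradual configuration
#     cost = t.log_likelihood(x, y)                 # application; triggers
#                                                   # initialization if needed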
#
# Names
# -----
#
# All bricks and blocks have names. The only restriction is
# (3) slaves of a brick have different names.
# This restriction allows us to refer to a brick by the chain of
# names of its owners followed by its own name.
# In this way we avoid long composite names like "A_B_C_D".
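#
# For example, with the structure built below, the forward transition of
# the encoder is referred to by the chain of names
#
#     /translator/encoder/forward/transition
#
# rather than by a composite name such as "translator_encoder_forward_transition".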
import theano.tensor as TT
# I expect that all the components from the list below
# will be standard in the future framework.
from blocks import (
    UNDEF,
    Brick,
    Identity,
    Tanh,
    Maxout,
    Softmax,
    Linear,
    MLP,
    NamedDaisyChain,
    EmbeddingDict,
    RecurrentNetwork,
    GatedRecurrentTransition,
    LSTMTransition,
    BidirectionalNetwork,
    SearchAttention,
    AttentionBasedSequenceGenerator,
    Dropout)
from blocks.initialization import Constant, IsotropicGaussian
from blocks.access import BrickSelector, BlockSelector
import blocks.modification


class Translator(Brick):
    """Machine Translation Model as a single Brick!

    Parameters
    ----------
    encoder_transition : Brick
        A Brick to be used as the recurrent transition in the encoder.
    decoder_transition : Brick
        A Brick to be used as the recurrent transition in the decoder.
    num_input_words : int
        Input vocabulary size.
    num_output_words : int
        Output vocabulary size.
    dim : int
        Main dimensionality of the model.
    word_emb_dim : int
        Dimensionality of word embeddings.
    weights_init : blocks.Initialization
        Default initialization for weights.
    recurrent_weights_init : blocks.Initialization
        Default initialization for recurrent weights.
    biases_init : blocks.Initialization
        Default initialization for biases.

    Notes
    -----
    The choice of `encoder_transition` and `decoder_transition` as
    mandatory constructor parameters is not arbitrary: we need to know
    the exact number of inputs of these transitions in order to
    instantiate the other Bricks. For instance, if `encoder_transition`
    is a gated RNN, then the bricks splitting the input signal
    into three parts should be created. This is done behind the
    scenes in `BidirectionalNetwork` and
    `AttentionBasedSequenceGenerator`.
    """
    @Brick.lazy
    def __init__(self,
                 encoder_transition,
                 decoder_transition,
                 num_input_words, num_output_words,
                 dim, word_emb_dim,
                 weights_init, recurrent_weights_init, biases_init,
                 **kwargs):
        super(Translator, self).__init__(**kwargs)
        self.num_input_words = num_input_words
        self.num_output_words = num_output_words
        self.dim = dim
        self.word_emb_dim = word_emb_dim
        self.weights_init = weights_init
        self.recurrent_weights_init = recurrent_weights_init
        self.biases_init = biases_init

        # In order to be able to pass a Brick object as an argument,
        # we should allow specifying the owner after construction.
        # However I believe that
        # 1) we should only allow setting the `owner` attribute once.
        #    It is not hard to check.
        # 2) a Brick should not be allowed to call `apply` of any Brick
        #    except its slaves. This is harder to check... perhaps a
        #    thread-local Brick attribute storing a reference to the
        #    `self` of the last apply call?
        # We also expect that if a Brick receives a slave without a name
        # as an argument, it should set a default one.
        # For instance, after the calls below the following structure
        # should emerge in the Brick namespace (assuming that
        # encoder_transition has three inputs, like a gated RNN does):
        #
        # /input_dict
        # /encoder
        #     /splitter
        #         /part0
        #         /part1
        #         /part2
        #     /forward
        #         /transition
        #     /backward
        #         /transition
        #
        self.input_dict = EmbeddingDict(owner=self, name='input_dict')
        self.encoder = BidirectionalNetwork(
            owner=self,
            name="encoder",
            input_transform=MLP(),
            transition=encoder_transition)

        # This call should build the following structure
        #
        # /decoder
        #     /attention
        #     /transition
        #     /generator
        #     /feedback
        #     /feedback_splitter
        #         /part0
        #         /part1
        #         /part2
        #
        # under the assumption that decoder_transition is a gated RNN.
        self.decoder = AttentionBasedSequenceGenerator(
            owner=self,
            name="decoder",
            transition=decoder_transition,
            attention=SearchAttention(),
            generator=NamedDaisyChain(
                # The call MLP() is expected to create
                # an MLP with a single hidden layer
                # and an Identity() activation.
                presoftmax=MLP(),
                softmax=Softmax()),
            feedback=EmbeddingDict(),
            feedback_transformer=MLP())

    def _initialize(self):
        """Configure the slaves assuming the Brick itself is configured.

        Notes
        -----
        This is something like the merged `_push_prealloc_config`,
        `_allocate`, `_push_preinit_config` and `_initialize`
        from my last proposal.
        """
        self.input_dict.num_words = self.num_input_words
        self.input_dict.dim = self.word_emb_dim

        self.encoder.input_dim = self.word_emb_dim
        self.encoder.dim = self.dim

        self.decoder.input_dim = 2 * self.dim
        self.decoder.hidden_dim = self.dim
        self.decoder.feedback_dim = self.word_emb_dim
        # The default `presoftmax` Brick is MLP(), which is just
        # a convenience wrapper around Linear. But the user might
        # want to have something deeper. The old machine translation code
        # allowed that via the `deep_out` switch, which in practice was
        # always turned on.
        # Here, we configure `presoftmax` very carefully to allow
        # maximum flexibility. It can be replaced by any MLP.
        self.decoder.generator.presoftmax.dims[0] = self.dim
        self.decoder.generator.presoftmax.dims[-1] = self.num_output_words
        # Note: we do not configure the dimensionalities of
        # `decoder.attention` because `decoder` is responsible for that.
        # The same applies to `decoder.feedback_transformer`.
        self.decoder.feedback.num_words = self.num_output_words
        self.decoder.feedback.dim = self.word_emb_dim

        self.input_dict.weights_init = self.weights_init
        init_attrs = ['weights_init', 'biases_init',
                      'recurrent_weights_init']
        for slave in [self.encoder, self.decoder]:
            for attr in init_attrs:
                setattr(slave, attr, getattr(self, attr))

    # Note that Brick.wrapped_apply has to look for input variables
    # also in keyword arguments like `in_mask` and `out_mask`.
    @Brick.wrapped_apply
    def log_likelihood(self, in_sents, out_sents,
                       in_mask=None, out_mask=None, **kwargs):
        """Compute the log-likelihood of generating `out_sents` given `in_sents`.

        Parameters
        ----------
        in_sents : theano matrix of int64
            Batch of input sentences.
        out_sents : theano matrix of int64
            Batch of output sentences.
        in_mask : theano matrix of float32
            Input mask.
        out_mask : theano matrix of float32
            Output mask.

        Returns
        -------
        log_likelihood : theano vector of float32
            Log-likelihoods of the generation event for each sentence pair.
        """
        return self.decoder.log_likelihood(
            input_seqs=self.encoder.apply(
                self.input_dict.lookup(in_sents),
                in_mask),
            output_seqs=out_sents,
            in_mask=in_mask,
            out_mask=out_mask)

    @Brick.wrapped_apply
    def sample(self, in_sents, in_mask=None, n_steps=None,
               temperature=1, **kwargs):
        """Sample translations of the input sentences."""
        return self.decoder.sample(
            input_seqs=self.encoder.apply(
                self.input_dict.lookup(in_sents),
                in_mask),
            in_mask=in_mask,
            n_steps=n_steps,
            temperature=temperature)


def create_and_configure(config):
    """Create and configure a Translator.

    This is a demonstration of how a global configuration of an
    experiment can be interpreted to create and initialize a Translator.

    Here I should make a note: at first I thought that a python
    script that modifies the Translator could serve as a configuration.
    But that would be awful, because any change in the main code would
    invalidate all available configurations. This issue is, by the way,
    present in PyLearn2: the YAML-file approach works only with frozen
    interfaces.

    Thus we need middle-ware like this function, which interprets a
    configuration stored in a very simple format: a python dictionary
    (see the example after this function). In case the code of the
    experiment (or even the library) changes, one should write a new
    function like this one.

    This is very close to the old groundhog "state" approach, the
    difference being that in the old NMT code the Encoder and Decoder
    objects had direct access to the state, freezing it forever.
    """
    t = Translator(
        encoder_transition=GatedRecurrentTransition(
            use_reset_gate=config['enc_reset_gate'],
            use_update_gate=config['enc_update_gate']),
        decoder_transition=GatedRecurrentTransition(
            use_reset_gate=config['dec_reset_gate'],
            use_update_gate=config['dec_update_gate']),
        num_input_words=config['num_input_words'],
        num_output_words=config['num_output_words'],
        dim=config['dim'],
        word_emb_dim=config['word_emb_dim'],
        weights_init=eval(config['weights_init']),
        recurrent_weights_init=eval(config['recurrent_weights_init']),
        biases_init=Constant(0))

    # Let's change the initialization scheme of a particular matrix.
    a = t.decoder.attention
    a.input_projection.weights_init = eval(config['att_inp_proj_weights_init'])
    # But this is not enough, because this change will be overridden
    # by the owner at the configuration propagation stage. We should
    # inform the owner that this brick is an exception.
    a.init_exceptions.append(a.input_projection.name)
    # Now it's fine: `a` will not propagate initialization information
    # to the brick named `a.input_projection.name`.

    # This is how structural changes can be performed:
    if config['deep_out']:
        g = t.decoder.generator
        new_presoftmax = MLP(
            name=g.presoftmax.name,
            activations=[Maxout(2), Identity()],
            # We can be lazy and specify only what we have to specify.
            dims=[UNDEF, t.word_emb_dim, UNDEF])
        # Slave replacement procedure.
        # Always three steps - long but explicit.
        g.presoftmax.leave_owner()
        new_presoftmax.set_owner(g)
        # In `set_owner` we will check that the owner Brick does not
        # have two slaves with the same name. Thus forgetting to call
        # `leave_owner` will not go unnoticed.
        # Similarly, forgetting to call `set_owner` should be
        # noticed at some point.
        g.presoftmax = new_presoftmax

    # Two ways to apply weight decay. The first one: using
    # Brick __dict__'s:
    rate = config['weight_decay_encoder']
    if rate:
        # BrickSelector is a convenient way to access
        # the subtree of a Brick.
        for param in BrickSelector(t.encoder).get_params():
            param.weight_decay_rate = rate
        # Weight decay was applied to all parameters of `encoder`
        # and its slaves.

    # The second one, Jan's style:
    rules = config['weight_decay_rules']
    for path, param_pattern, rate in rules:
        # Examples of paths (trying to build on Jan's syntax):
        #
        # /                  - the top-most brick
        # /encoder           - the `encoder` brick
        # /encoder//         - the `encoder` brick and all its descendants
        # /decoder/feedback* - captures `decoder/feedback` and
        #                      `decoder/feedback_transformer`
        # /decoder/.../part* - captures `decoder/feedback_splitter/part{0,1,2}`
        #
        # `param_pattern` is a regular expression to choose a subset of params.
        for param in BrickSelector(t).get_bricks(path).get_params(param_pattern):
            param.weight_decay_rate = rate

    return t
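

# A hypothetical example of the plain-dictionary configuration consumed by
# `create_and_configure` above. The keys mirror the ones read in that
# function; the values are only an illustration, not a recommended setup.
example_config = {
    'enc_reset_gate': True,
    'enc_update_gate': True,
    'dec_reset_gate': True,
    'dec_update_gate': True,
    'num_input_words': 30000,
    'num_output_words': 30000,
    'dim': 1000,
    'word_emb_dim': 620,
    'weights_init': "IsotropicGaussian(0.01)",
    'recurrent_weights_init': "IsotropicGaussian(0.01)",
    'att_inp_proj_weights_init': "IsotropicGaussian(0.001)",
    'deep_out': True,
    'weight_decay_encoder': 1e-6,
    # (path, parameter name pattern, rate) triples, Jan's style.
    'weight_decay_rules': [('/decoder//', 'W.*', 1e-6)],
    'output_weights_dropout': 0.5,
    'output_weights_noise': None,
}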


def get_cost(translator, config):
    """Build the regularized cost for `translator` according to `config`."""
    in_sents = TT.lmatrix('x')
    out_sents = TT.lmatrix('y')
    in_mask = TT.matrix('x_mask')
    out_mask = TT.matrix('y_mask')
    cost = translator.log_likelihood(in_sents, out_sents, in_mask, out_mask)
    # Now `cost` is a variable from a computation graph enriched with
    # references to blocks. The block structure of the computation graph
    # can be retrieved by traversal.

    # BlockSelector is an accessor to the hierarchical structure of blocks.
    # It can be built by specifying the top-most variables or
    # by giving a set of `end_variables`.
    bs = BlockSelector(end_variables=[cost])
    # Thanks to the strict hierarchy that we imposed on bricks, we can
    # refer to a block by an alternating concatenation of brick names
    # and block names. Examples:
    #
    # /translator:log_likelihood/input_dict:lookup
    # /translator:log_likelihood/encoder:apply
    #
    # One should of course allow all sorts of pattern matching, e.g. the
    # `:<name>` part can be omitted, meaning "take all blocks of this brick".
    # Regular expressions and the "//" syntax can also be added.

    # Let's apply dropout to the output weights.
    dropout = config.get('output_weights_dropout')
    if dropout:
        block = bs.get_blocks(
            "/translator/decoder/generator/presoftmax/linear1")
        # Thanks to the thoughtful copying that we do in `Brick.wrapped_apply`,
        # the input and output variables of the block are always copies of
        # something. It is very easy to replace a theano Identity Apply node
        # by something more interesting.
        blocks.modification.replace_io_variable(
            block.outputs[0],
            Dropout(prob=dropout).apply)
        # Note that at the exploitation stage the activations should then
        # be divided by 2. Thus this simple way of adding dropout is
        # beautiful but very dangerous. We should probably instead insert
        # a Dropout brick beforehand by manipulating the Brick tree,
        # in a way similar to the `deep_out` implementation (see above).

    # A noise addition routine can be implemented as follows.
    # One can find all uses of a shared variable in a block by
    # traversing the part of the theano graph between its inputs
    # and outputs. Then the shared variable can be replaced by its sum
    # with noise.
    noise = config.get('output_weights_noise')
    if noise:
        block = bs.get_blocks(
            "/translator/decoder/generator/presoftmax/linear1")
        blocks.modification.add_gaussian_noise(
            block=block,
            parameters=[block.W],
            level=noise)

    # `cost` without dropout and noise addition is returned.
    # Some upper level class (Model?) is still responsible for adding
    # weight decay terms to it as described by the `weight_decay_rate` tags.
    return cost
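

# A minimal sketch of how the two routines above could be glued together
# (hypothetical; `example_config` is the illustrative dictionary defined
# after `create_and_configure`, and the actual training loop is omitted).
def example_usage():
    import theano

    translator = create_and_configure(example_config)
    cost = get_cost(translator, example_config)
    # Use the average negative log-likelihood over the batch as the objective.
    objective = -cost.mean()
    # Collect the parameters from the brick tree, as in `create_and_configure`,
    # and take gradients with respect to them.
    params = BrickSelector(translator).get_params()
    grads = theano.grad(objective, params)
    return objective, params, grads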