Bart van Merriënboer (bartvm)
1 1 2.2973299250376 2.2973299250376
1 2 2.0864643271487 2.2762433652487
1 3 2.1500752827293 2.2636265569968
1 4 1.9636232815994 2.2336262294571
1 5 2.0671982139671 2.2169834279081
1 6 2.1094124953551 2.2062263346528
1 7 1.9331442971793 2.1789181309054
1 8 4.0584201436111 2.366868332176
1 9 1.8639954152644 2.3165810404848
1 10 1.6428338575222 2.2492063221886
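The fourth column of the log tracks the third as a running statistic; the numbers match an exponential moving average with decay 0.9 (an assumption inferred from the values, not stated in the log). A minimal Python sketch:

```python
def ema(values, decay=0.9):
    """Exponential moving average: avg = decay * avg + (1 - decay) * x."""
    avg = None
    out = []
    for x in values:
        avg = x if avg is None else decay * avg + (1 - decay) * x
        out.append(avg)
    return out

# First three per-step losses from the log above
losses = [2.2973299250376, 2.0864643271487, 2.1500752827293]
print([round(a, 4) for a in ema(losses)])  # → [2.2973, 2.2762, 2.2636]
```

The rounded outputs reproduce the fourth column of the first three log lines, which is what suggests the 0.9 decay.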
local dict_size = probs:size(2)
local max_out_arcs = out_arcs:size(2)
local max_in_arcs = in_arcs:size(2)
local state_probs = torch.narrow(params.state_probs, 1, 1, seq_len)
state_probs.value:fill(0)
for i = 2, seq_len do
local starts = torch.ones(max_in_arcs):long() * i
local origins = starts:add(-lengths:index(1, in_arcs[i]))
local indices = ((origins - 1) * dict_size + 1):add(in_arcs[i])
local in_arc_probs = torch.index(probs:view(seq_len * dict_size), 1, indices)
-- Vectorized approach
local dict_size = probs:size(2)
local max_out_arcs = out_arcs:size(2)
local max_in_arcs = in_arcs:size(2)
local state_probs = torch.narrow(params.state_probs, 1, 1, seq_len + 1)
state_probs.value:fill(0)
for i = 2, seq_len + 1 do
local starts = torch.ones(max_in_arcs):long() * i
local origins = starts:add(-lengths:index(1, in_arcs[i]))
local indices = ((origins - 1) * dict_size + 1):add(in_arcs[i] - 1)
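The indexing trick in the Lua above — viewing the (seq_len, dict_size) probability matrix as a flat vector and gathering with (origin - 1) * dict_size + token in 1-based arithmetic — is easier to check in NumPy, where the 0-based equivalent is origin * dict_size + token. A sketch with made-up shapes (none of these values come from the gist):

```python
import numpy as np

seq_len, dict_size = 4, 5
probs = np.arange(seq_len * dict_size, dtype=float).reshape(seq_len, dict_size)

origins = np.array([0, 2, 3])  # 0-based time step of each incoming arc
tokens = np.array([1, 4, 0])   # 0-based symbol on each incoming arc

# Gather probs[origin, token] through the flattened view, mirroring
# probs:view(seq_len * dict_size) followed by an index() in Torch.
flat_indices = origins * dict_size + tokens
gathered = probs.reshape(-1)[flat_indices]

assert np.array_equal(gathered, probs[origins, tokens])
print(gathered)  # → [ 1. 14. 15.]
```

Sanity-checking the flat gather against direct fancy indexing, as the assert does, is a cheap way to catch the off-by-one errors the two Lua revisions above differ on.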
if opt.hogwild > 1 then
local ipc = require 'libipc'
local q = ipc.workqueue('examples')
local q2 = ipc.workqueue('done')
local ids = ipc.workqueue('ids')
for i = 1, opt.hogwild do
ids:write(i)
end
-- Initialize the states
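The libipc pattern above — one queue feeding examples to workers, one collecting results, and one handing each worker an id at startup — maps onto Python's thread-safe queues. A rough sketch; the queue contents and the worker body are illustrative, not from the gist:

```python
import queue
import threading

def worker(examples, done, ids):
    wid = ids.get()                   # each worker claims an id once
    while True:
        item = examples.get()
        if item is None:              # sentinel: no more work
            break
        done.put((wid, item * item))  # placeholder "work"

def run(n_workers=2, jobs=(0, 1, 2, 3)):
    examples, done, ids = queue.Queue(), queue.Queue(), queue.Queue()
    for i in range(1, n_workers + 1):  # mirrors ids:write(i) in the gist
        ids.put(i)
    threads = [threading.Thread(target=worker, args=(examples, done, ids))
               for _ in range(n_workers)]
    for t in threads:
        t.start()
    for job in jobs:
        examples.put(job)
    for _ in range(n_workers):         # one sentinel per worker
        examples.put(None)
    results = sorted(done.get()[1] for _ in jobs)
    for t in threads:
        t.join()
    return results

print(run())  # → [0, 1, 4, 9]
```

Handing out ids through a queue, rather than passing them as arguments, is what lets a pool of identical workers each grab a distinct slot — the same reason the gist writes 1..opt.hogwild into the 'ids' workqueue.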
#define LUA_LIB
#include "lua.h"
#include "lauxlib.h"
static int createtable_create(lua_State *L) {
lua_createtable(L, luaL_optinteger(L, 1, 0), luaL_optinteger(L, 2, 0));
return 1;
}
static const luaL_Reg createtablelib[] = {
local nn = require 'nn'
Flatten, parent = torch.class('nn.Flatten', 'nn.Container')
function Flatten:__init(flattenDims)
-- This container wraps a module and flattens the first N dimensions before
-- unflattening them again (in this case time and batches)
self.flattenDims = flattenDims
parent.__init(self)
end
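The idea behind nn.Flatten — merge the leading time and batch dimensions into one, run the wrapped module, then split them apart again — can be sketched with plain NumPy reshapes (the shapes and function here are illustrative):

```python
import numpy as np

def with_flattened_dims(x, n_dims, fn):
    """Collapse the first n_dims axes of x, apply fn, then restore them."""
    lead = x.shape[:n_dims]
    flat = x.reshape((-1,) + x.shape[n_dims:])  # (time*batch, features)
    out = fn(flat)
    return out.reshape(lead + out.shape[1:])

x = np.ones((3, 4, 5))  # (time, batch, features)
y = with_flattened_dims(x, 2, lambda t: t * 2.0)
print(y.shape)  # → (3, 4, 5)
```

This is useful for modules that only understand 2-D input: the wrapper makes a 3-D sequence batch look like one big 2-D batch for the duration of the call.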
#include <cstdio>
#include <thread>
#include "tbb/flow_graph.h"
#include "TH.h"
#include <array>
#include <tuple>
// This is an op that gets added to the TBB graph
class Cadd {
public:

Myia (Theano 2.0)

Automatic differentiation vs. symbolic

In the literature the term automatic differentiation (AD) is reserved for a specific technique in which the gradient of a program is calculated by constructing an adjoint program that performs the gradient computation. Note that this adjoint program includes all control flow statements. There are two approaches to implementing AD: source code transformation (SCT) and operator overloading (OO). With source code transformation we generate the adjoint program in the host language: e.g., given a Python function, we manipulate its abstract syntax tree (AST) directly to produce a new Python function that performs the gradient computation. Operator overloading, on the other hand, overloads each operator so that it adds an entry to a tape (a Wengert list). Once the function exits, the gradient is calculated by traversing the tape in reverse order and applying the gradient operators.
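The operator-overloading approach can be sketched in a few lines of Python: each overloaded operation appends an entry to a tape, and the backward pass walks the tape in reverse, applying local gradient rules. This toy handles only addition and multiplication of scalars, but the structure is the same as a real tape-based reverse-mode AD system:

```python
class Var:
    """Scalar variable that records operations on a shared tape."""
    tape = []

    def __init__(self, value):
        self.value = value
        self.grad = 0.0

    def __add__(self, other):
        out = Var(self.value + other.value)
        # d(out)/d(self) = 1, d(out)/d(other) = 1
        Var.tape.append((out, [(self, 1.0), (other, 1.0)]))
        return out

    def __mul__(self, other):
        out = Var(self.value * other.value)
        # d(out)/d(self) = other.value, d(out)/d(other) = self.value
        Var.tape.append((out, [(self, other.value), (other, self.value)]))
        return out

def backward(output):
    """Traverse the tape in reverse, accumulating gradients (reverse mode)."""
    output.grad = 1.0
    for out, parents in reversed(Var.tape):
        for parent, local_grad in parents:
            parent.grad += out.grad * local_grad

x, y = Var(3.0), Var(4.0)
z = x * y + x          # dz/dx = y + 1 = 5, dz/dy = x = 3
backward(z)
print(x.grad, y.grad)  # → 5.0 3.0
```

Because the tape records whatever operations actually executed, control flow in the original program is handled for free — exactly the property the paragraph above attributes to AD.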

Theano does not employ AD but "[a highly optimized form of

import os
import weakref
import logging
import atexit
import threading
import queue
import itertools
_threads_queues = weakref.WeakKeyDictionary()
_shutdown = False
#!/usr/bin/env bash
# Usage: msub-batch.sh -l walltime=73:00:00 [-w 43200] -- -l nodes=1:gpus=2
# Automatically submits a series of jobs that are dependent on each other.
# By default it submits jobs of 24 hours, but a different interval can be
# given using the -w flag (in seconds). Each job can look at the MOAB_JOBID
# and MOAB_DEPEND variables to see what its own ID is and whether it has
# dependencies.
MAX_RUNTIME=$((24 * 60 * 60))
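Splitting a long walltime into a chain of dependent jobs reduces to simple arithmetic: cover the requested time in max-runtime chunks, with the last job possibly shorter. A Python sketch of just the split (the actual gist does this, plus the msub submission and dependency wiring, in bash):

```python
MAX_RUNTIME = 24 * 60 * 60  # seconds per job, as in the script

def split_walltime(total_seconds, max_runtime=MAX_RUNTIME):
    """Return the per-job runtimes needed to cover total_seconds."""
    chunks = []
    while total_seconds > 0:
        chunks.append(min(total_seconds, max_runtime))
        total_seconds -= chunks[-1]
    return chunks

# walltime=73:00:00 from the usage line -> three 24 h jobs plus 1 h
print(split_walltime(73 * 60 * 60))  # → [86400, 86400, 86400, 3600]
```

Each chunk would become one submitted job, with job N+1 declared dependent on job N so the chain runs back to back within the scheduler's per-job runtime limit.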