sjmielke · December 15, 2017 19:14
diff --git a/sgld.py b/sgld.py
 import matplotlib.pyplot as plt
 import numpy as np
 import torch

 a = -.1
 b = -.3
 c = 1.0
 d = 1.5

 def p(x):
  return np.exp(a*x**4 + b*x**3 + c*x**2 + d*x - 5.1574089034945)

 def logp(x):
  return a*x**4 + b*x**3 + c*x**2 + d*x # - 5.1574089034945

 def dlogp(x):
  # return np.exp(a*x**4 + b*x**3 + c*x**2 + d*x + e) * (4*a*x**3 + 3*b*x**2 + 2*c*x + d)
  if isinstance(x, int) and x > 2.5:
    return -1
  elif isinstance(x, int) and x < -4:
    return 1
  else:
    # print(4*a*x**3 + 3*b*x**2 + 2*c*x + d)
    return 4*a*x**3 + 3*b*x**2 + 2*c*x + d

 def do_sgld():
  xax = plt.subplot(3, 1, 1)
  plt.xticks([])
  plt.yticks([])

  changeiter = None
  changethreshold = 0.01
  def getstepsize(i):
    global changeiter
    ss = max(0.9 ** i, changethreshold)
    if changeiter is None and ss == changethreshold:
      changeiter = i
    return ss

  x = -2
  xs = []
  ys = []
  maxiters = 200000
  for i in range(maxiters):
    if i % 10000 == 0:
      print(i, "/", maxiters, " ->  lr =", getstepsize(i))
    xs.append(x)
    ys.append(logp(x))
    # plt.plot(xs[-2:], np.clip(ys[-2:], -10, None), 'g-', alpha = 0.01 + 0.05 * float(i) / maxiters)
    # Pure gradient
    grad = dlogp(x)
    grad = max(grad, -5 + np.random.normal(0, 3, 1)[0])
    grad = min(grad, 5 + np.random.normal(0, 3, 1)[0])
    # Noise it up
    grad = getstepsize(i) * 0.5 * dlogp(x)
    noise = np.random.normal(0, np.sqrt(getstepsize(i)), 1)[0]
    x += grad + noise
    # Clamp
    oldx = x
    x = max(x, -4.5 - 2)
    x = min(x,  3   + 2)
    # if np.abs(x - xs[-1]) < 0.0001 and oldx == x:
    #   break

  plt.axvline(xs[-1])
  plt.plot(xs[::50], np.clip(ys[::50], -10, None), 'bo', alpha = 400.0 / maxiters)

  plt.subplot(3, 1, 2, sharex = xax)
  plt.xticks([])
  plt.yticks([])

  ycoords = np.arange(0, len(xs)) / (-.1 * len(xs))
  plt.plot(xs, ycoords, 'ko', markersize = .5, alpha = 0.025)
  plt.axhline(ycoords[changeiter])


  plt.subplot(3, 1, 3, sharex = xax)
  plt.xticks([])
  plt.yticks([])

  X = np.arange(-4.5, 3, 0.01)
  plt.plot(X, p(X))
  # plt.plot(X, logp(X))
  # plt.plot(X, dlogp(X))

  # now my binning
  binsize = 0.15
  bins = np.arange(-4.5 - 2, 3 + 2, binsize)
  counters = np.zeros(len(bins))
  counters_burnedin = np.zeros(len(bins))

  for i, x in enumerate(xs):
    where = min((max(min(x, 3 + 2), -4.5 - 2) + 4.5 + 2) / (4.5 + 2 + 3 + 2), 0.9999999)
    counters[int(where * len(bins))] += getstepsize(i)
    if i > changeiter:
      counters_burnedin[int(where * len(bins))] += getstepsize(i)

  # counters[0] = 0
  # counters[-1] = 0
  plt.bar(bins, counters / (binsize * sum(counters)), alpha = 0.4, width = binsize, color = 'red')
  plt.bar(bins, counters_burnedin / (binsize * sum(counters_burnedin)), alpha = 0.4, width = binsize, color = 'green')

  plt.show()

 def do_bbb():
  torch.manual_seed(42)

  gaussian_mu = torch.autograd.Variable(torch.Tensor([-1]), requires_grad = True)
  gaussian_rho = torch.autograd.Variable(torch.Tensor([1.0]), requires_grad = True) # close enough

  unifgaussian = torch.distributions.Normal(torch.Tensor([0.0]), 1.0)

  xax = plt.subplot(2, 1, 1)
  X = np.arange(-4.5, 3, 0.01)
  plt.plot(X, logp(X))

  xs = []
  ys = []
  maxiters = 1000
  for i in range(maxiters):
    # TODO: torch.distributions.Normal may also do the reparametrization?
    eps = torch.autograd.Variable(torch.clamp(unifgaussian.sample(), -2, 2), requires_grad = False)
    w = gaussian_mu + torch.log(1 + torch.exp(gaussian_rho)) * eps
    xs.append(w.data[0])
    ys.append(logp(w.data[0]))

    plt.plot(xs[-2:], np.clip(ys[-2:], -5, None), 'g-', alpha = 0.01 + 0.05 * float(i) / maxiters)

    q = torch.distributions.Normal(gaussian_mu, torch.log(1 + torch.exp(gaussian_rho)))
    log_q = q.log_prob(w)
    log_p = logp(w)
    f = log_q - log_p

    w_grad = []
    def set_w_grad(g):
      w_grad.append(g)
    w.register_hook(set_w_grad)
    f.backward()
    w_grad = w_grad[0]
    
    update_mu = w_grad + gaussian_mu.grad
    update_rho = w_grad * (eps / (1.0 + torch.exp(-gaussian_rho))) + gaussian_rho.grad

    gaussian_mu.data -= 0.01 * update_mu.data
    gaussian_rho.data -= 0.01 * update_rho.data

    if i % 100 == 0:
      print("Mu: {:6.3f} / Rho: {:6.3f}".format(gaussian_mu.data[0], gaussian_rho.data[0]))

    for param in [gaussian_mu, gaussian_rho]:
      param.grad.data.fill_(0)

  # print(xs)
  # print(ys)

  plt.subplot(2, 1, 2, sharex = xax)
  plt.plot(X, p(X))

  X_ = torch.autograd.Variable(torch.from_numpy(X)).type(torch.FloatTensor)
  plt.plot(X, np.exp(q.log_prob(X_).data.numpy()))
  
  plt.show()

 do_sgld()
 # do_bbb()
	import matplotlib.pyplot as plt
	import numpy as np
	import torch

	a = -.1
	b = -.3
	c = 1.0
	d = 1.5

	def p(x):
	return np.exp(ax4 + bx*3 + cx*2 + dx - 5.1574089034945)

	def logp(x):
	return ax4 + bx*3 + cx*2 + dx # - 5.1574089034945

	def dlogp(x):
	# return np.exp(ax4 + bx*3 + cx*2 + dx + e) * (4ax*3 + 3bx2 + 2c*x + d)
	if isinstance(x, int) and x > 2.5:
	return -1
	elif isinstance(x, int) and x < -4:
	return 1
	else:
	# print(4ax*3 + 3bx2 + 2c*x + d)
	return 4ax*3 + 3bx2 + 2c*x + d

	def do_sgld():
	xax = plt.subplot(3, 1, 1)
	plt.xticks([])
	plt.yticks([])

	changeiter = None
	changethreshold = 0.01
	def getstepsize(i):
	global changeiter
	ss = max(0.9 ** i, changethreshold)
	if changeiter is None and ss == changethreshold:
	changeiter = i
	return ss

	x = -2
	xs = []
	ys = []
	maxiters = 200000
	for i in range(maxiters):
	if i % 10000 == 0:
	print(i, "/", maxiters, " -> lr =", getstepsize(i))
	xs.append(x)
	ys.append(logp(x))
	# plt.plot(xs[-2:], np.clip(ys[-2:], -10, None), 'g-', alpha = 0.01 + 0.05 * float(i) / maxiters)
	# Pure gradient
	grad = dlogp(x)
	grad = max(grad, -5 + np.random.normal(0, 3, 1)[0])
	grad = min(grad, 5 + np.random.normal(0, 3, 1)[0])
	# Noise it up
	grad = getstepsize(i) * 0.5 * dlogp(x)
	noise = np.random.normal(0, np.sqrt(getstepsize(i)), 1)[0]
	x += grad + noise
	# Clamp
	oldx = x
	x = max(x, -4.5 - 2)
	x = min(x, 3 + 2)
	# if np.abs(x - xs[-1]) < 0.0001 and oldx == x:
	# break

	plt.axvline(xs[-1])
	plt.plot(xs[::50], np.clip(ys[::50], -10, None), 'bo', alpha = 400.0 / maxiters)

	plt.subplot(3, 1, 2, sharex = xax)
	plt.xticks([])
	plt.yticks([])

	ycoords = np.arange(0, len(xs)) / (-.1 * len(xs))
	plt.plot(xs, ycoords, 'ko', markersize = .5, alpha = 0.025)
	plt.axhline(ycoords[changeiter])


	plt.subplot(3, 1, 3, sharex = xax)
	plt.xticks([])
	plt.yticks([])

	X = np.arange(-4.5, 3, 0.01)
	plt.plot(X, p(X))
	# plt.plot(X, logp(X))
	# plt.plot(X, dlogp(X))

	# now my binning
	binsize = 0.15
	bins = np.arange(-4.5 - 2, 3 + 2, binsize)
	counters = np.zeros(len(bins))
	counters_burnedin = np.zeros(len(bins))

	for i, x in enumerate(xs):
	where = min((max(min(x, 3 + 2), -4.5 - 2) + 4.5 + 2) / (4.5 + 2 + 3 + 2), 0.9999999)
	counters[int(where * len(bins))] += getstepsize(i)
	if i > changeiter:
	counters_burnedin[int(where * len(bins))] += getstepsize(i)

	# counters[0] = 0
	# counters[-1] = 0
	plt.bar(bins, counters / (binsize * sum(counters)), alpha = 0.4, width = binsize, color = 'red')
	plt.bar(bins, counters_burnedin / (binsize * sum(counters_burnedin)), alpha = 0.4, width = binsize, color = 'green')

	plt.show()

	def do_bbb():
	torch.manual_seed(42)

	gaussian_mu = torch.autograd.Variable(torch.Tensor([-1]), requires_grad = True)
	gaussian_rho = torch.autograd.Variable(torch.Tensor([1.0]), requires_grad = True) # close enough

	unifgaussian = torch.distributions.Normal(torch.Tensor([0.0]), 1.0)

	xax = plt.subplot(2, 1, 1)
	X = np.arange(-4.5, 3, 0.01)
	plt.plot(X, logp(X))

	xs = []
	ys = []
	maxiters = 1000
	for i in range(maxiters):
	# TODO: torch.distributions.Normal may also do the reparametrization?
	eps = torch.autograd.Variable(torch.clamp(unifgaussian.sample(), -2, 2), requires_grad = False)
	w = gaussian_mu + torch.log(1 + torch.exp(gaussian_rho)) * eps
	xs.append(w.data[0])
	ys.append(logp(w.data[0]))

	plt.plot(xs[-2:], np.clip(ys[-2:], -5, None), 'g-', alpha = 0.01 + 0.05 * float(i) / maxiters)

	q = torch.distributions.Normal(gaussian_mu, torch.log(1 + torch.exp(gaussian_rho)))
	log_q = q.log_prob(w)
	log_p = logp(w)
	f = log_q - log_p

	w_grad = []
	def set_w_grad(g):
	w_grad.append(g)
	w.register_hook(set_w_grad)
	f.backward()
	w_grad = w_grad[0]

	update_mu = w_grad + gaussian_mu.grad
	update_rho = w_grad * (eps / (1.0 + torch.exp(-gaussian_rho))) + gaussian_rho.grad

	gaussian_mu.data -= 0.01 * update_mu.data
	gaussian_rho.data -= 0.01 * update_rho.data

	if i % 100 == 0:
	print("Mu: {:6.3f} / Rho: {:6.3f}".format(gaussian_mu.data[0], gaussian_rho.data[0]))

	for param in [gaussian_mu, gaussian_rho]:
	param.grad.data.fill_(0)

	# print(xs)
	# print(ys)

	plt.subplot(2, 1, 2, sharex = xax)
	plt.plot(X, p(X))

	X_ = torch.autograd.Variable(torch.from_numpy(X)).type(torch.FloatTensor)
	plt.plot(X, np.exp(q.log_prob(X_).data.numpy()))

	plt.show()

	do_sgld()
	# do_bbb()