// From: "Character-Aware Neural Language Models" https://arxiv.org/abs/1508.06615
val kernelsPerWidth = 5
val numKernels = kernelWidths.length * kernelsPerWidth
val biasInterval = scala.util.Random.nextGaussian() / 0.5
log.info("Build model....")
val confWithoutConv = new NeuralNetConfiguration.Builder()
  .learningRate(0.01)
  .graphBuilder()
  .addInputs("input")
// One convolution + max-over-time pooling branch per kernel width
val confWithConv = kernelWidths.zipWithIndex.foldLeft(confWithoutConv) { case (c, (w, i)) =>
  val cn = s"C$i"                   // convolution layer name for this width
  val pn = s"P$i"                   // pooling layer name for this width
  val huLength = w2ohLength - w + 1 // width of the feature map produced by a width-w kernel
  // Convolution over the character one-hot matrix
  c.addLayer(cn, new ConvolutionLayer.Builder(c2ohLength, w)
    .nIn(1)                // number of channels in
    .stride(1, 1)
    .nOut(kernelsPerWidth) // number of kernels (filters) out
    .activation("tanh")
    .build(), "input")
  // Max-over-time pooling collapses each feature map to a single value
  .addLayer(pn, new SubsamplingLayer.Builder(SubsamplingLayer.PoolingType.MAX)
    .kernelSize(1, huLength)
    .stride(1, 1)
    .build(), cn)
}
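// Sketch, not part of the graph: for a (c2ohLength x w2ohLength) one-hot input, a "valid"
// stride-1 convolution with a (c2ohLength x w) kernel leaves a 1 x (w2ohLength - w + 1) feature
// map (huLength above), which the (1, huLength) max-pool collapses to one value per filter, so
// each P$i branch contributes kernelsPerWidth activations and "merge" carries numKernels in total.
// The helper below only illustrates that arithmetic; it is not used by the model.
def convFeatureMapWidth(inputWidth: Int, kernelWidth: Int): Int =
  inputWidth - kernelWidth + 1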
// Concatenate the pooled features from every kernel-width branch
val conf = confWithConv.addVertex("merge", new MergeVertex(), (0 until kernelWidths.length).map(i => s"P$i").toArray: _*)
  // Highway Network behaviour: H1 = t*g + (1 - t)*y (elementwise), where y is the "merge" output
  .addLayer("t_sigmoid", new DenseLayer.Builder() // transform gate t
    .activation("sigmoid")
    .biasInit(-2.0 + biasInterval)                // negative bias so the gate initially favours carrying the input
    .nOut(numKernels)
    .build(), "merge")
  .addLayer("g", new DenseLayer.Builder()         // nonlinear transform g
    .activation("relu")
    .nOut(numKernels)
    .build(), "merge")
  // Take the Hadamard product of the transform gate and the nonlinearity
  .addVertex("t hammard g", new ElementWiseVertex(ElementWiseVertex.Op.Product), "t_sigmoid", "g")
  // Build (1 - t) via a scale by -1 and a shift by +1, then gate the carried input
  .addVertex("-t", new ScaleVertex(-1), "t_sigmoid")
  .addVertex("1 - t", new ShiftVertex(1), "-t")
  .addVertex("(1 - t) hammard merge", new ElementWiseVertex(ElementWiseVertex.Op.Product), "1 - t", "merge")
  .addVertex("H1", new ElementWiseVertex(ElementWiseVertex.Op.Add), "t hammard g", "(1 - t) hammard merge")
.addLayer("out-intent", new RnnOutputLayer.Builder()
.activation("sigmoid")
.lossFunction(LossFunctions.LossFunction.XENT)
.nIn(numKernels)
.nOut(TrainingExample.IntentVecSize)
.build(), "H1")
.addLayer("out-iobe", new RnnOutputLayer.Builder()
.activation("sigmoid")
.lossFunction(LossFunctions.LossFunction.XENT)
.nIn(numKernels)
.nOut(TrainingExample.InsideOutsideVecSize)
.build(), "H1")
.addLayer("out-entityType", new RnnOutputLayer.Builder()
.activation("sigmoid")
.lossFunction(LossFunctions.LossFunction.XENT)
.nIn(numKernels)
.nOut(TrainingExample.EntityTypeVecSize)
.build(), "H1")
.addLayer("out-onehotword", new RnnOutputLayer.Builder()
.activation("sigmoid")
.lossFunction(LossFunctions.LossFunction.XENT)
.nIn(numKernels)
.nOut(w2ohLength)
.build(), "H1")
.setOutputs("out-intent","out-iobe", "out-entityType", "out-onehotword")
.setInputTypes(InputType.convolutionalFlat(c2ohLength, w2ohLength, 1))
.build();
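// Sketch only: what the H1 vertex computes for one example, written with ND4J ops to mirror the
// vertex wiring above (Product, then ScaleVertex(-1) + ShiftVertex(1) for "1 - t", then Add).
// The helper name is illustrative and not part of the original gist.
import org.nd4j.linalg.api.ndarray.INDArray
def highwayCombine(t: INDArray, g: INDArray, y: INDArray): INDArray =
  t.mul(g).add(t.mul(-1).add(1).mul(y)) // H1 = t*g + (1 - t)*y, all elementwise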
val model = new ComputationGraph(conf)
model.init() // allocate the network's parameters before training or inference
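// A minimal training sketch, assuming the caller supplies a MultiDataSetIterator whose features
// are the flattened character matrix and whose four label arrays follow the setOutputs(...) order
// above. The helper name, listener frequency and epoch handling are illustrative, not taken from
// the original gist.
import org.deeplearning4j.optimize.listeners.ScoreIterationListener
import org.nd4j.linalg.dataset.api.iterator.MultiDataSetIterator

def trainCharCnn(net: ComputationGraph, trainIter: MultiDataSetIterator, nEpochs: Int): Unit = {
  net.setListeners(new ScoreIterationListener(100)) // log the score every 100 iterations
  for (_ <- 0 until nEpochs) {
    net.fit(trainIter) // one pass over the training data
    trainIter.reset()
  }
}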