Created December 12, 2015 07:42
An implementation of an Evaluator that uses logarithmic loss (LogLoss) as its evaluation metric.
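For reference, the metric implemented below is the standard binary log loss over N examples, with labels y_i in {0, 1} and predicted positive-class probabilities p_i:

\[
\mathrm{LogLoss} = -\frac{1}{N} \sum_{i=1}^{N} \left[\, y_i \log p_i + (1 - y_i) \log(1 - p_i) \,\right]
\]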
package org.apache.spark.ml.evaluation

import org.apache.commons.math3.util.FastMath
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.{HasLabelCol, HasProbabilityCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.{DataFrame, Row}

/**
 * Evaluator for probability prediction, using logarithmic loss, which expects
 * two input columns: probability and label.
 */
class BinaryLogLossEvaluator(override val uid: String)
  extends Evaluator with HasProbabilityCol with HasLabelCol {

  def this() = this(Identifiable.randomUID("binLogLossEval"))

  /** @group setParam */
  def setProbabilityCol(value: String): this.type = set(probabilityCol, value)

  /** @group setParam */
  def setLabelCol(value: String): this.type = set(labelCol, value)

  override def evaluate(dataset: DataFrame): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnType(schema, $(probabilityCol), new VectorUDT)
    SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)

    // Clamp predicted probabilities to [epsilon, 1 - epsilon] so the
    // logarithm never sees exactly 0 or 1.
    val epsilon = 1e-15

    val minusLogLoss = dataset.select($(probabilityCol), $(labelCol))
      .map { case Row(probabilities: Vector, label: Double) =>
        // probabilities(1) is the predicted probability of the positive class.
        val probability = Math.max(epsilon, Math.min(1 - epsilon, probabilities(1)))
        label * FastMath.log(probability) + (1 - label) * FastMath.log(1 - probability)
      }
      .mean()

    // The mean above is the average log likelihood; log loss is its negation.
    -1.0 * minusLogLoss
  }

  override def isLargerBetter: Boolean = false

  override def copy(extra: ParamMap): BinaryLogLossEvaluator = defaultCopy(extra)
}
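A minimal usage sketch (the fitted model and the testData DataFrame are assumptions, not part of the gist; "probability" and "label" are the Spark ML default column names):

// Usage sketch: `model` is a fitted binary classifier and `testData` a
// DataFrame with a DoubleType "label" column; both are assumed to exist.
val evaluator = new BinaryLogLossEvaluator()
  .setProbabilityCol("probability")
  .setLabelCol("label")

val logLoss = evaluator.evaluate(model.transform(testData))
println(s"LogLoss = $logLoss") // lower is better, since isLargerBetter is false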
Nice code. I had some issues with the probabilities(1) match on the Vector column (for unknown reasons; possibly because in Spark 2.x the probability column holds org.apache.spark.ml.linalg.Vector values rather than the org.apache.spark.mllib.linalg.Vector imported above). I solved it by converting the vector column to a double column with a UDF:

val getPOne = udf((v: org.apache.spark.ml.linalg.Vector) => v(1))
...
.withColumn("probability", getPOne($"probability"))

and then matching with

case Row(probabilities: Double, label: Integer)

instead of

case Row(probabilities: Vector, label: Double)
If I remember correctly, log loss should use the rawPrediction column, not the probability column. Could you please clarify that? By the way, thank you for the implementation; it helps a lot.
@alwaysprep: the rawPrediction definition depends on the algorithm used. For logistic regression, for example, it is the result of the dot product (the margin). It is therefore the "probability" column that needs to be used.
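To make that distinction concrete, a small check sketch (assuming Spark 2.x LogisticRegression output, where rawPrediction for binary classification is the two-element vector (-margin, margin) and probability is its sigmoid; the predictions DataFrame is assumed to exist):

import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.Row

predictions.select("rawPrediction", "probability").take(3).foreach {
  case Row(raw: Vector, prob: Vector) =>
    // For logistic regression, probability(1) == sigmoid(rawPrediction(1)),
    // so the log loss must be computed on probability, not the raw margin.
    val sigmoid = 1.0 / (1.0 + math.exp(-raw(1)))
    println(f"margin=${raw(1)}%.4f sigmoid=$sigmoid%.4f probability=${prob(1)}%.4f")
}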