Created
August 24, 2016 05:35
-
-
Save RobColeman/1dcc8a9bd27317b89fdde2c22c049100 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.chartboost.adrel.preprocessing.featureComputers | |
import com.chartboost.adrel.dataModels.EcpmPredictionRequest | |
import com.chartboost.adrel.util.JsonSaving | |
import org.apache.spark.mllib.linalg.{Vector => SparkVector, Vectors => SparkVectors} | |
import scala.util.hashing.MurmurHash3 | |
/**
 * Metadata describing one feature block of the feature vector. Kept as a plain case class
 * (mixing in `JsonSaving`) so that it can be saved and loaded alongside models.
 *
 * @param featureName     the name of the feature; `FeatureComputerRouter` matches on this string
 *                        to pick the feature computer
 * @param blockNumber     ordinal position of this block within the feature vector
 * @param offset          vector index at which this block starts; feature computers add it to
 *                        the block-local index
 * @param length          number of indices in this block; hashing computers use it as the hash
 *                        modulus, so block-local indices fall in `[0, length)` and the block ends
 *                        just before vector index `offset + length`
 * @param totalVectorSize total size of the feature vector; defaults to -1 (possibly redundant,
 *                        may be removed)
 */
case class FeatureComputerParams(
  featureName: String,
  blockNumber: Int,
  offset: Int,
  length: Int,
  totalVectorSize: Int = -1
) extends FeatureBlock with JsonSaving
/**
 * A contiguous run of indices inside the feature vector.
 *
 * Implementations supply the position of the block (`blockNumber`, `offset`), its extent
 * (`length`) and the size of the vector that contains it (`totalVectorSize`).
 */
abstract class FeatureBlock {

  /** Ordinal: the order of this block within the feature vector. */
  val blockNumber: Int

  /** The vector index at which this block starts. */
  val offset: Int

  /** The number of indices in this block; the block ends at vector index `offset + length`. */
  val length: Int

  /** The total size of the feature vector (maybe redundant, might be removed). */
  val totalVectorSize: Int
}
/**
 * Selects the feature computer to run by pattern matching on the feature name, and converts
 * the accumulated feature map into a Spark vector.
 */
object FeatureComputerRouter {

  /**
   * Run the feature computer named by `computerParams.featureName`.
   *
   * @param predictionRequest the request and campaign data from which to compute features
   * @param computedFeatures  the features computed so far, keyed by vector index
   * @param computerParams    the block metadata; its `featureName` selects the computer
   * @return the feature map with the selected feature inserted, or `computedFeatures` unchanged
   *         when the feature name is not recognised
   */
  def apply(
    predictionRequest: EcpmPredictionRequest,
    computedFeatures: Map[Int, Double],
    computerParams: FeatureComputerParams
  ): Map[Int, Double] = {
    // NOTE(review): ReachabilityFourthDegreeHashedFeature and CountryFourthDegreeHashedFeature
    // are not routed here and fall through to the default case -- confirm this is intended.
    computerParams.featureName match {
      case "PubAppFourthDegreeHashedFeature" =>
        PubAppFourthDegreeHashedFeature.compute(predictionRequest, computedFeatures, computerParams)
      case "ModelFourthDegreeHashedFeature" =>
        ModelFourthDegreeHashedFeature.compute(predictionRequest, computedFeatures, computerParams)
      case _ => computedFeatures
    }
  }

  /**
   * Convert a map of (vector index -> value) into a sparse Spark vector.
   *
   * @param computedFeatures the computed features keyed by vector index
   * @param numFeatures      the size of the resulting vector
   * @return a sparse vector of size `numFeatures` holding the computed features
   */
  def toSparkVector(computedFeatures: Map[Int, Double], numFeatures: Int): SparkVector = {
    // Map iteration order is not sorted by key, while the (indices, values) overload of
    // Vectors.sparse requires strictly increasing indices. The Seq[(Int, Double)] overload
    // accepts unordered pairs, so the resulting vector is always well-formed.
    SparkVectors.sparse(numFeatures, computedFeatures.toSeq)
  }
}
/**
 * Base class for feature computers. A computer derives a block-local index and a value from a
 * prediction request and merges them into the map of already computed features.
 */
abstract class FeatureComputer {

  /** The feature value type, e.g. categorical, count, real-valued. */
  val valueType: String // use enumeration

  /** The feature name. */
  val name: String

  /**
   * Compute this feature and merge it into the supplied feature map.
   *
   * @param predictionRequest the request and campaign data from which to compute the feature
   * @param computedFeatures  the features computed before this one, possibly an empty map
   * @param computerParams    the metadata of the block this feature belongs to
   * @return the feature map with the newly computed feature merged in
   */
  def compute(
    predictionRequest: EcpmPredictionRequest,
    computedFeatures: Map[Int, Double],
    computerParams: FeatureComputerParams
  ): Map[Int, Double] =
    insertIntoFeatureMap(
      computeIdx(predictionRequest, computerParams),
      computeValue(predictionRequest, computerParams),
      computedFeatures,
      computerParams
    )

  /**
   * Compute the index, relative to the start of this feature block, at which the feature belongs.
   *
   * @param predictionRequest the request and campaign data
   * @param computerParams    the metadata of the block this feature belongs to
   * @return the feature index within this feature block
   */
  def computeIdx(predictionRequest: EcpmPredictionRequest, computerParams: FeatureComputerParams): Int

  /**
   * Compute the feature value from the request and campaign data.
   *
   * @param predictionRequest the request and campaign data
   * @param computerParams    the metadata of the block this feature belongs to
   * @return the value to be inserted into, or added to, the feature vector at the computed index
   */
  def computeValue(predictionRequest: EcpmPredictionRequest, computerParams: FeatureComputerParams): Double

  /**
   * Merge a block-local index and its value into the feature map.
   *
   * @param idx              the feature index within this feature block
   * @param value            the computed feature value
   * @param computedFeatures the features computed before this one, possibly an empty map
   * @param computerParams   the block metadata; its `offset` shifts `idx` to a vector index
   * @return a map in which the entry at `idx + offset` holds its previous value (0.0 when absent)
   *         plus `value`
   */
  protected def insertIntoFeatureMap(
    idx: Int,
    value: Double,
    computedFeatures: Map[Int, Double],
    computerParams: FeatureComputerParams
  ): Map[Int, Double] = {
    val vectorIndex = idx + computerParams.offset
    // Values landing on the same index are summed rather than overwritten.
    val accumulated = computedFeatures.getOrElse(vectorIndex, 0.0) + value
    computedFeatures.updated(vectorIndex, accumulated)
  }
}
/**
 * Feature computers that choose the block-local index by hashing a generated feature name.
 */
abstract class HashingFeatureComputer extends FeatureComputer {

  /**
   * Generate the feature name whose hash determines the feature vector index.
   *
   * @param predictionRequest the request and campaign data
   * @param computerParams    the metadata of the block this feature belongs to
   * @return the name to be hashed
   */
  def genFeatureName(predictionRequest: EcpmPredictionRequest, computerParams: FeatureComputerParams): String

  /**
   * Compute the feature index by hashing the generated feature name with MurmurHash3 and
   * reducing the hash modulo the block length.
   *
   * @param predictionRequest the request and campaign data
   * @param computerParams    the block metadata; its `length` is the modulus
   * @return the feature index within this feature block
   */
  def computeIdx(predictionRequest: EcpmPredictionRequest, computerParams: FeatureComputerParams): Int = {
    val hashedName = MurmurHash3.stringHash(genFeatureName(predictionRequest, computerParams))
    val remainder = hashedName % computerParams.length
    // `%` keeps the sign of the hash, so shift negative remainders up by one block length.
    if (remainder >= 0) remainder else remainder + computerParams.length
  }
}
/**
 * One-hot encodes features, with the categories hashed together into a shared block.
 */
abstract class OneHotHashingFeatureComputer extends HashingFeatureComputer {

  /**
   * Membership in a category is represented by 1.0 at the index assigned to that category.
   *
   * @param predictionRequest the request and campaign data (not consulted)
   * @param computerParams    the metadata of the block this feature belongs to (not consulted)
   * @return always 1.0
   */
  def computeValue(
    predictionRequest: EcpmPredictionRequest,
    computerParams: FeatureComputerParams
  ): Double = {
    1.0
  }
}
/**
 * One-hot hashed feature whose name combines four fields: the advertiser campaign id, the
 * advertiser condition set, the ad type and one further category supplied by the subclass.
 */
abstract class CategoricalFourthDegreeHashedFeature extends OneHotHashingFeatureComputer {

  /**
   * The name to hash for this feature; delegates to [[genFourthDegreeFeatureName]].
   *
   * @param predictionRequest the request and campaign data
   * @param computerParams    the metadata of the block this feature belongs to
   * @return the fourth-degree feature name
   */
  def genFeatureName(predictionRequest: EcpmPredictionRequest, computerParams: FeatureComputerParams): String =
    genFourthDegreeFeatureName(predictionRequest, computerParams)

  /**
   * Extract the category that forms the fourth component of the feature name.
   *
   * @param predictionRequest the request and campaign data
   * @param computerParams    the metadata of the block this feature belongs to
   * @return the category as a string
   */
  def retrieveThisCategory(
    predictionRequest: EcpmPredictionRequest,
    computerParams: FeatureComputerParams
  ): String

  /**
   * Build the `#`-separated name `advCampaignId#advConditionSet#adType#category`.
   *
   * @param predictionRequest the request and campaign data
   * @param computerParams    the metadata of the block this feature belongs to
   * @return the four components joined with `#`
   */
  def genFourthDegreeFeatureName(
    predictionRequest: EcpmPredictionRequest,
    computerParams: FeatureComputerParams
  ): String = {
    val campaignId = predictionRequest.advCampaignId
    val conditionSet = predictionRequest.advConditionSet
    val adType = predictionRequest.adType
    val category = retrieveThisCategory(predictionRequest, computerParams)
    s"${campaignId}#${conditionSet}#${adType}#${category}"
  }
}
/**
 * Fourth-degree hashed categorical feature whose category is the request's publisher app.
 */
object PubAppFourthDegreeHashedFeature extends CategoricalFourthDegreeHashedFeature {

  /** The feature name. */
  val name: String = "PubAppFourthDegreeHashedFeature"

  /** The feature value type. */
  val valueType: String = "Categorical"

  /**
   * @param predictionRequest the request and campaign data
   * @param computerParams    the metadata of the block this feature belongs to (not consulted)
   * @return the request's `publisherApp`
   */
  def retrieveThisCategory(
    predictionRequest: EcpmPredictionRequest,
    computerParams: FeatureComputerParams
  ): String = {
    predictionRequest.publisherApp
  }
}
/**
 * Fourth-degree hashed categorical feature whose category is the request's `model` field.
 */
object ModelFourthDegreeHashedFeature extends CategoricalFourthDegreeHashedFeature {

  /** The feature name. */
  val name: String = "ModelFourthDegreeHashedFeature"

  /** The feature value type. */
  val valueType: String = "Categorical"

  /**
   * @param predictionRequest the request and campaign data
   * @param computerParams    the metadata of the block this feature belongs to (not consulted)
   * @return the request's `model`
   */
  def retrieveThisCategory(
    predictionRequest: EcpmPredictionRequest,
    computerParams: FeatureComputerParams
  ): String = {
    predictionRequest.model
  }
}
/**
 * Fourth-degree hashed categorical feature whose category is the string form of the request's
 * `reachability` field.
 */
object ReachabilityFourthDegreeHashedFeature extends CategoricalFourthDegreeHashedFeature {

  /** The feature name. */
  val name: String = "ReachabilityFourthDegreeHashedFeature"

  /** The feature value type. */
  val valueType: String = "Categorical"

  /**
   * @param predictionRequest the request and campaign data
   * @param computerParams    the metadata of the block this feature belongs to (not consulted)
   * @return the request's `reachability` converted with `toString`
   */
  def retrieveThisCategory(
    predictionRequest: EcpmPredictionRequest,
    computerParams: FeatureComputerParams
  ): String = {
    predictionRequest.reachability.toString
  }
}
/**
 * Fourth-degree hashed categorical feature whose category is the request's country.
 */
object CountryFourthDegreeHashedFeature extends CategoricalFourthDegreeHashedFeature {

  /** The feature value type. */
  val valueType: String = "Categorical"

  /**
   * The feature name. It matches this object's own name; it previously duplicated the
   * reachability feature's name, which made the two computers indistinguishable by name.
   */
  val name: String = "CountryFourthDegreeHashedFeature"

  /**
   * @param predictionRequest the request and campaign data
   * @param computerParams    the metadata of the block this feature belongs to (not consulted)
   * @return the request's `country`
   */
  def retrieveThisCategory(
    predictionRequest: EcpmPredictionRequest,
    computerParams: FeatureComputerParams
  ): String = predictionRequest.country
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment