Skip to content

Instantly share code, notes, and snippets.

@cindywu
Created May 25, 2020 06:38
Show Gist options
  • Save cindywu/8b0af7a677ab078a758413f8dc9b2d6e to your computer and use it in GitHub Desktop.
Save cindywu/8b0af7a677ab078a758413f8dc9b2d6e to your computer and use it in GitHub Desktop.
machine learning systems chapter 4
// import Tokenizer and HashingTF... you might need IDF later?
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
// import Pipeline
import org.apache.spark.ml.Pipeline
// 4.1 extracting features
// Signature sketch for feature extraction: takes a RawData and yields a Feature.
// The body is `???`, so calling this throws scala.NotImplementedError.
// NOTE(review): RawData is not defined anywhere in the visible file, and Feature
// is only declared further down — confirm this line is meant as illustration only.
def extract(rawData: RawData): Feature = ???
// 4.2 extracting word features from squawks
/** One squawk record: an integer id together with its text. */
case class Squawk(id: Int, text: String)
// Builds a three-row DataFrame of example squawks and renames its columns to
// "squawkID" and "squawk". Uses `spark.createDataFrame` (the book's listing
// used `session.createDataFrame`); `spark` must already be in scope.
val squawks = spark.createDataFrame(Seq(
  Squawk(123, "Clouds sure make it hard to look on the bright side of things."),
  Squawk(124, "Who really cares who gets the worm? I'm fine with sleeping in."),
  Squawk(125, "Why don't french fries grow on tress?")
)).toDF("squawkID", "squawk")
// Tokenizer configured to read the "squawk" column and write to a "words" column.
val tokenizer = new Tokenizer().setInputCol("squawk").setOutputCol("words")
// Applies the tokenizer, producing a DataFrame that also carries "words".
val tokenized = tokenizer.transform(squawks)
// Select the id column with exactly the name passed to toDF ("squawkID").
// The earlier spelling "squawkId" only resolved because Spark SQL matches
// column names case-insensitively by default.
tokenized.select("words", "squawkID").show()
// the above code works!
// 4.3 extracting word features from squawks
/**
 * Base trait for all types of features.
 *
 * Implementations supply a name and fix the type of value the feature produces.
 */
trait FeatureType {
  /** Name of the feature; every feature type must provide one. */
  val name: String

  /** Abstract type member: the type of values generated by this feature. */
  type V
}
/**
 * Base trait for all features, extending [[FeatureType]] with a concrete value.
 */
trait Feature extends FeatureType {
  /** The feature's value, of the type V specified by the feature type. */
  val value: V
}
/**
 * A feature whose value is a sequence of words.
 *
 * @param name  name of the feature
 * @param value the words, as a sequence of strings
 */
case class WordSequenceFeature(name: String, value: Seq[String]) extends Feature {
  // fixes the abstract value type V to a sequence of strings (words)
  type V = Seq[String]
}
// Turns every row of the "words" column into a WordSequenceFeature named "words".
val wordsFeatures = tokenized
  .select("words")
  .map { wordsRow =>
    // column 0 is the only selected column: the extracted words
    val extractedWords = wordsRow.getSeq[String](0)
    WordSequenceFeature("words", extractedWords)
  }
// prints the resulting features for inspection
wordsFeatures.show()
// above code works!
// 4.4 transforming features
// Signature sketch for a feature transformation: takes a Feature and yields a Feature.
// The body is `???`, so calling this throws scala.NotImplementedError.
def transform(feature: Feature): Feature = ???
// 4.5 transforming words to term frequencies
// Configures a HashingTF that reads from the "words" column and writes its
// output to a "termFrequencies" column.
// Defined exactly once: the multi-line and one-line spellings were previously
// both present as separate `val hashingTF` definitions, which is a
// duplicate-definition error anywhere outside the line-by-line REPL.
val hashingTF = new HashingTF().setInputCol("words").setOutputCol("termFrequencies")
// executes the transformation on the tokenized squawks
val tfs = hashingTF.transform(tokenized)
// prints term frequencies for inspection
tfs.select("termFrequencies").show()
// 4.6 using spark ml pipelines
// A pipeline with two stages, in order: the tokenizer, then the hashing TF.
val pipeline = new Pipeline()
  .setStages(Array(tokenizer, hashingTF))
// Fits the pipeline on the squawks DataFrame (the book's listing used squawksDF).
val pipelineHashed = pipeline
  .fit(squawks)
// Prints the runtime class of the fitted result.
println(pipelineHashed.getClass)
// above code works!
/**
 * A numerical feature whose value is an integer.
 *
 * @param name  name of the feature
 * @param value integer value carried by the feature
 */
case class IntFeature(name: String, value: Int) extends Feature {
  // fixes the abstract value type V to Int
  type V = Int
}
/**
 * A feature whose value is a boolean.
 *
 * @param name  name of the feature
 * @param value boolean value carried by the feature
 */
case class BooleanFeature(name: String, value: Boolean) extends Feature {
  // fixes the abstract value type V to Boolean
  type V = Boolean
}
/**
 * Converts an integer feature into a boolean feature by comparing it to a threshold.
 *
 * @param feature   the integer feature to binarize
 * @param threshold cutoff; the result is true only when the value is strictly greater
 * @return a BooleanFeature whose name is the input name prefixed with "binarized-"
 */
def binarize(feature: IntFeature, threshold: Double): BooleanFeature = {
  val exceedsThreshold = feature.value > threshold
  // records the transform in the resulting feature's name
  val binarizedName = s"binarized-${feature.name}"
  BooleanFeature(binarizedName, exceedsThreshold)
}
// Cutoff number of followers above which a squawker counts as "super".
val SUPER_THRESHOLD = 1000000
// raw numbers of followers for the squirrel and the sloth
val squirrelFollowers = 12
val slothFollowers = 23584166
// Wraps the raw counts as IntFeature values named "followers".
// (Fixes the misspelled reference `squireelFollowers`.)
val squirrelFollowersFeature = IntFeature("followers", squirrelFollowers)
val slothFollowersFeature = IntFeature("followers", slothFollowers)
// binarize takes an IntFeature, so pass the wrapped features rather than the
// raw Int counts; passing the Ints was the source of the
// "type mismatch; found: Int, required: IntFeature" error.
// 12 is not above the threshold, so the squirrel is not a super squawker.
val squirrelIsSuper = binarize(squirrelFollowersFeature, SUPER_THRESHOLD)
// 23584166 is above the threshold, so the sloth is a super squawker.
val slothIsSuper = binarize(slothFollowersFeature, SUPER_THRESHOLD)
// 4.8 creating concept labels from features
/** Marker trait for labels: a label is a subtype of Feature and adds no members. */
trait Label extends Feature
/**
 * A label whose value is a boolean.
 *
 * @param name  name of the label
 * @param value boolean value of the label
 */
case class BooleanLabel(name: String, value: Boolean) extends Label {
  // fixes the abstract value type V to Boolean
  type V = Boolean
}
/**
 * Simple conversion from a boolean feature to a boolean label.
 *
 * @param feature the boolean feature to convert
 * @return a BooleanLabel carrying the same name and value as the feature
 */
def toBooleanLabel(feature: BooleanFeature): BooleanLabel =
  BooleanLabel(name = feature.name, value = feature.value)
// Converts the super-squawker feature values into concept labels.
// These lines only compile once squirrelIsSuper and slothIsSuper have been
// defined successfully as BooleanFeature values earlier in the script.
val squirrelLabel = toBooleanLabel(squirrelIsSuper)
val slothLabel = toBooleanLabel(slothIsSuper)
// prints each label for inspection, squirrel first
for (label <- Seq(squirrelLabel, slothLabel)) println(label)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment