Created
May 25, 2020 06:38
-
-
Save cindywu/8b0af7a677ab078a758413f8dc9b2d6e to your computer and use it in GitHub Desktop.
machine learning systems chapter 4
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// import Tokenizer and HashingTF (IDF is imported too, in case it is needed later)
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer} | |
// import Pipeline | |
import org.apache.spark.ml.Pipeline | |
// 4.1 extracting features
/**
 * Signature sketch for feature extraction: derive one feature from one raw datum.
 *
 * Intentionally unimplemented: the body is `???`, which throws
 * `scala.NotImplementedError` if the method is ever invoked.
 * NOTE(review): `RawData` is not declared in the visible part of this file.
 *
 * @param rawData the raw input a feature should be derived from
 * @return the extracted feature
 */
def extract(rawData: RawData): Feature = ???
// 4.2 extracting word features from squawks
/** One squawk: an integer identifier together with the squawk's text. */
case class Squawk(id: Int, text: String)

// Sample squawks used by the rest of the script.
// Author's note: session.createDataFrame was changed to spark.createDataFrame.
val squawks = {
  val samples = Seq(
    Squawk(123, "Clouds sure make it hard to look on the bright side of things."),
    Squawk(124, "Who really cares who gets the worm? I'm fine with sleeping in."),
    Squawk(125, "Why don't french fries grow on tress?"))
  // Rename the case-class fields (id, text) to the column names used downstream.
  spark.createDataFrame(samples).toDF("squawkID", "squawk")
}
// Tokenizer stage: reads the "squawk" text column and writes the tokens to "words".
val tokenizer = new Tokenizer()
  .setInputCol("squawk")
  .setOutputCol("words")
// Applies the tokenizer, producing the input DataFrame plus the "words" column.
val tokenized = tokenizer.transform(squawks)
// The id column is declared as "squawkID" in toDF above; use the same casing here so the
// lookup does not rely on Spark's default case-insensitive column resolution
// (it would fail with spark.sql.caseSensitive=true).
tokenized.select("words", "squawkID").show()
// 4.3 extracting word features from squawks
/** Root of the feature hierarchy: every kind of feature declares a name and a value type. */
trait FeatureType {
  /** The type of the values produced by this kind of feature. */
  type V

  /** The name identifying the feature. */
  val name: String
}
/** A feature proper: a [[FeatureType]] that additionally carries a value. */
trait Feature extends FeatureType {
  /** The feature's value; its type is the `V` member declared by [[FeatureType]]. */
  val value: V
}
/**
 * A feature whose value is a sequence of words.
 *
 * @param name  the feature's name
 * @param value the words making up the feature
 */
case class WordSequenceFeature(name: String, value: Seq[String]) extends Feature {
  // Pins the abstract value type to a sequence of strings (the words).
  type V = Seq[String]
}
// Builds one WordSequenceFeature per row of the tokenized DataFrame.
val wordsFeatures = {
  // Keep only the "words" column, so it sits at index 0 of every row.
  val wordsColumn = tokenized.select("words")
  wordsColumn.map { row =>
    // Pull the token sequence out of the row and wrap it in a feature named "words".
    val words = row.getSeq[String](0)
    WordSequenceFeature("words", words)
  }
}
// Print the features for inspection.
wordsFeatures.show()
// Author's note: the above code works.
// 4.4 transforming features
/**
 * Signature sketch for a feature transform: take one feature and produce another.
 *
 * Intentionally unimplemented: the body is `???`, which throws
 * `scala.NotImplementedError` if the method is ever invoked.
 *
 * @param feature the feature to transform
 * @return the transformed feature
 */
def transform(feature: Feature): Feature = ???
// 4.5 transforming words to term frequencies
// HashingTF stage: reads token sequences from the "words" column and writes the
// resulting term frequencies to the "termFrequencies" column.
val hashingTF = new HashingTF()
  .setInputCol("words")
  .setOutputCol("termFrequencies")
// The same builder chain can be written on a single line:
//   new HashingTF().setInputCol("words").setOutputCol("termFrequencies")
// It is kept as a comment because declaring `val hashingTF` a second time is a
// duplicate-definition error whenever this script is compiled or pasted as one unit.

// Execute the transformation on the tokenized squawks.
val tfs = hashingTF.transform(tokenized)
// Print the term frequencies for inspection.
tfs.select("termFrequencies").show()
// 4.6 using spark ml pipelines
// A pipeline with two stages, run in order: tokenizer, then hashingTF.
val pipeline = new Pipeline()
  .setStages(Array(tokenizer, hashingTF))
// Fit the pipeline on the squawks DataFrame.
// Author's note: the argument was changed from squawksDF to squawks.
val pipelineHashed = pipeline.fit(squawks)
// Print the runtime class of the fitted result (a pipeline model).
println(pipelineHashed.getClass)
// Author's note: the above code works.
/**
 * A numeric feature whose value is an integer.
 *
 * @param name  the feature's name
 * @param value the integer value
 */
case class IntFeature(name: String, value: Int) extends Feature {
  // Pins the abstract value type to Int.
  type V = Int
}
/**
 * A feature whose value is a boolean.
 *
 * @param name  the feature's name
 * @param value the boolean value
 */
case class BooleanFeature(name: String, value: Boolean) extends Feature {
  // Pins the abstract value type to Boolean.
  type V = Boolean
}
/**
 * Turns an integer feature into a boolean feature by comparing it with a threshold.
 *
 * @param feature   the integer feature to binarize
 * @param threshold the cutoff; the result is true only when the value is strictly greater
 * @return a boolean feature named "binarized-" followed by the source feature's name
 */
def binarize(feature: IntFeature, threshold: Double): BooleanFeature = {
  val exceedsThreshold = feature.value > threshold
  // Prefix the name with the transform that produced the new feature.
  BooleanFeature(s"binarized-${feature.name}", exceedsThreshold)
}
// Cutoff on follower count for a squawker to count as "super"
// (binarize tests for strictly greater than this value).
val SUPER_THRESHOLD = 1000000
// Raw follower counts for the squirrel and the sloth.
val squirrelFollowers = 12
val slothFollowers = 23584166
// Integer features wrapping the raw follower counts.
val squirrelFollowersFeature = IntFeature("followers", squirrelFollowers)
val slothFollowersFeature = IntFeature("followers", slothFollowers)
// binarize takes an IntFeature, not a raw Int: passing squirrelFollowers / slothFollowers
// directly is what produced "type mismatch; found: Int, required: IntFeature".
// Pass the wrapped features instead.
// Boolean feature for the squirrel: 12 followers is not above the threshold, so value is false.
val squirrelIsSuper = binarize(squirrelFollowersFeature, SUPER_THRESHOLD)
// Boolean feature for the sloth: 23584166 followers is above the threshold, so value is true.
val slothIsSuper = binarize(slothFollowersFeature, SUPER_THRESHOLD)
// 4.8 creating concept labels from features
/** A label is modelled as a subtype of [[Feature]]. */
trait Label extends Feature

/**
 * A label whose value is a boolean.
 *
 * @param name  the label's name
 * @param value the boolean value
 */
case class BooleanLabel(name: String, value: Boolean) extends Label {
  // Pins the abstract value type to Boolean.
  type V = Boolean
}
/**
 * Converts a boolean feature into a boolean label with the same name and value.
 *
 * @param feature the boolean feature to convert
 * @return a label carrying the feature's name and value unchanged
 */
def toBooleanLabel(feature: BooleanFeature): BooleanLabel =
  BooleanLabel(name = feature.name, value = feature.value)
// Convert the super-squawker boolean features into concept labels.
// These lines depend on squirrelIsSuper and slothIsSuper having been defined
// successfully earlier in the script; if those definitions fail, so do these.
val squirrelLabel = toBooleanLabel(squirrelIsSuper)
val slothLabel = toBooleanLabel(slothIsSuper)
// Print each label for inspection.
for (label <- Seq(squirrelLabel, slothLabel)) println(label)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment