cindywu · May 25, 2020 06:38
diff --git a/mls-ch-4.scala b/mls-ch-4.scala
 // import Tokenizer and HashingTF... you might need IDF later?
 import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
 // import Pipeline
 import org.apache.spark.ml.Pipeline

 // 4.1 extracting features

 // Signature-only sketch of feature extraction: takes raw data and yields a Feature.
 // The body is ???, so calling it throws scala.NotImplementedError.
 // NOTE(review): RawData is not defined anywhere in the visible source — confirm where it comes from.
 def extract(rawData: RawData): Feature = ???

 // 4.2 extracting word features from squawks

 /** A single squawk: an integer identifier paired with its text. */
 case class Squawk(
   id: Int,
   text: String
 )

 // builds the example DataFrame; session.createDataFrame from the original listing
 // was changed to spark.createDataFrame
 // NOTE(review): `spark` is not defined in this file — presumably the session provided by spark-shell.
 val squawks = spark.createDataFrame(Seq(
  Squawk(123, "Clouds sure make it hard to look on the bright side of things."),
  Squawk(124, "Who really cares who gets the worm? I'm fine with sleeping in."),
  Squawk(125, "Why don't french fries grow on tress?"))).toDF("squawkID", "squawk")

 // tokenizer reading the "squawk" column and writing its output to "words"
 val tokenizer = new Tokenizer().setInputCol("squawk").setOutputCol("words")

 val tokenized = tokenizer.transform(squawks)

 // selects the id column by the exact name given to toDF above ("squawkID");
 // the earlier "squawkId" spelling relied on case-insensitive column resolution
 tokenized.select("words", "squawkID").show()

 // the above code works!

 // 4.3 extracting word features from squawks

 /** Base trait for all types of features. */
 trait FeatureType {
   /** The type of values generated by this kind of feature. */
   type V

   /** Every feature type must have a name. */
   val name: String
 }

 /** Base trait for all features: a FeatureType that also carries a value. */
 trait Feature extends FeatureType {

   /** The feature's value, of the type V specified in the feature type. */
   val value: V

 }

 /** Feature whose value is a sequence of words. */
 case class WordSequenceFeature(
   name: String,
   value: Seq[String]
 ) extends Feature {
   // values generated by this feature are sequences of strings (words)
   type V = Seq[String]
 }

 // selects the "words" column and wraps each row's words in a WordSequenceFeature named "words"
 val wordsFeatures = tokenized.select("words").map { row =>
   // index 0 is the only selected column: the extracted words
   WordSequenceFeature("words", row.getSeq[String](0))
 }

 // prints features for inspection
 wordsFeatures.show()

 // above code works!

 // 4.4 transforming features

 // Signature-only sketch of a feature transform: takes one Feature and yields another.
 // The body is ???, so calling it throws scala.NotImplementedError.
 def transform(feature: Feature): Feature = ???

 // 4.5 transforming words to term frequencies

 // instantiates a HashingTF to calculate term frequencies, reading from the "words"
 // column and writing to the "termFrequencies" column.
 // Declared exactly once: the listing previously defined `hashingTF` twice (a
 // multi-line chain and a one-line form), which is a duplicate definition when the
 // file is compiled or pasted as a single unit. The equivalent multi-line form is:
 //   new HashingTF()
 //     .setInputCol("words")
 //     .setOutputCol("termFrequencies")
 val hashingTF = new HashingTF().setInputCol("words").setOutputCol("termFrequencies")

 // executes the transformation
 val tfs = hashingTF.transform(tokenized)

 // prints term frequencies for inspection
 tfs.select("termFrequencies").show()

 // 4.6 using spark ml pipelines

 // builds a pipeline whose two stages are the tokenizer followed by hashingTF
 val pipeline = new Pipeline().setStages(
   Array(tokenizer, hashingTF)
 )

 // fits the pipeline on squawks (the original listing used squawksDF here)
 val pipelineHashed = pipeline.fit(squawks)

 // prints the class of the fitted result, a pipeline model
 println(pipelineHashed.getClass())

 // above code works!

 /** Numerical feature whose value is an integer. */
 case class IntFeature(
   name: String,
   value: Int
 ) extends Feature {
   // values of this feature are integers
   type V = Int
 }

 /** Feature whose value is a boolean. */
 case class BooleanFeature(
   name: String,
   value: Boolean
 ) extends Feature {
   // values of this feature are booleans
   type V = Boolean
 }

 /**
  * Turns an integer feature into a boolean feature by comparing it to a threshold.
  *
  * @param feature   the integer feature to binarize
  * @param threshold cutoff; the result is true only when the value is strictly greater
  * @return a BooleanFeature named "binarized-" followed by the input feature's name
  */
 def binarize(feature: IntFeature, threshold: Double): BooleanFeature = {
   val aboveThreshold = feature.value > threshold
   // prefixes the transform's name onto the resulting feature name
   BooleanFeature("binarized-" + feature.name, aboveThreshold)
 }

 // constant defining the cutoff for a squawker to be super
 val SUPER_THRESHOLD = 1000000

 // raw numbers of followers for the squirrel and the sloth
 val squirrelFollowers = 12
 val slothFollowers = 23584166

 // numeric integer features representing the number of followers
 // (fixes the misspelled identifier `squireelFollowers`, which was undefined)
 val squirrelFollowersFeature = IntFeature("followers", squirrelFollowers)
 val slothFollowersFeature = IntFeature("followers", slothFollowers)

 // binarize takes an IntFeature, not a raw Int: passing squirrelFollowers /
 // slothFollowers directly caused "type mismatch; found: Int, required: IntFeature",
 // so the wrapped *Feature values are passed instead.

 // boolean feature indicating the squirrel is not a super squawker
 val squirrelIsSuper = binarize(squirrelFollowersFeature, SUPER_THRESHOLD)

 // boolean feature indicating the sloth is a super squawker
 val slothIsSuper = binarize(slothFollowersFeature, SUPER_THRESHOLD)

 // the above code type-checks now that binarize receives IntFeature values

 // 4.8 creating concept labels from features

 /** Labels are modelled as a subtype of features. */
 trait Label extends Feature

 /** Label whose value is a boolean. */
 case class BooleanLabel(
   name: String,
   value: Boolean
 ) extends Label {
   // values of this label are booleans
   type V = Boolean
 }

 /**
  * Simple conversion from a boolean feature to a boolean label.
  *
  * @param feature the boolean feature to convert
  * @return a BooleanLabel carrying the same name and value as the feature
  */
 def toBooleanLabel(feature: BooleanFeature): BooleanLabel =
   BooleanLabel(feature.name, feature.value)

 // converts the super squawker feature values into concept labels;
 // this requires squirrelIsSuper and slothIsSuper to be valid BooleanFeature values
 val squirrelLabel = toBooleanLabel(squirrelIsSuper)
 val slothLabel = toBooleanLabel(slothIsSuper)

 // prints label values for inspection
 for (label <- Seq(squirrelLabel, slothLabel)) println(label)

 // the above code only works when the super squawker features it reads type-check
	// import Tokenizer and HashingTF... you might need IDF later?
	import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
	// import Pipeline
	import org.apache.spark.ml.Pipeline

	// 4.1 extracting features

	// Signature-only sketch of feature extraction: takes raw data and yields a Feature.
	// The body is ???, so calling it throws scala.NotImplementedError.
	// NOTE(review): RawData is not defined anywhere in the visible source — confirm where it comes from.
	def extract(rawData: RawData): Feature = ???

	// 4.2 extracting word features from squawks

	/** A single squawk: an integer identifier paired with its text. */
	case class Squawk(
		id: Int,
		text: String
	)

	// builds the example DataFrame; session.createDataFrame from the original listing
	// was changed to spark.createDataFrame
	// NOTE(review): `spark` is not defined in this file — presumably the session provided by spark-shell.
	val squawks = spark.createDataFrame(Seq(
	Squawk(123, "Clouds sure make it hard to look on the bright side of things."),
	Squawk(124, "Who really cares who gets the worm? I'm fine with sleeping in."),
	Squawk(125, "Why don't french fries grow on tress?"))).toDF("squawkID", "squawk")

	// tokenizer reading the "squawk" column and writing its output to "words"
	val tokenizer = new Tokenizer().setInputCol("squawk").setOutputCol("words")

	val tokenized = tokenizer.transform(squawks)

	// selects the id column by the exact name given to toDF above ("squawkID");
	// the earlier "squawkId" spelling relied on case-insensitive column resolution
	tokenized.select("words", "squawkID").show()

	// the above code works!

	// 4.3 extracting word features from squawks

	/** Base trait for all types of features. */
	trait FeatureType {
		/** The type of values generated by this kind of feature. */
		type V

		/** Every feature type must have a name. */
		val name: String
	}

	/** Base trait for all features: a FeatureType that also carries a value. */
	trait Feature extends FeatureType {

		/** The feature's value, of the type V specified in the feature type. */
		val value: V

	}

	/** Feature whose value is a sequence of words. */
	case class WordSequenceFeature(
		name: String,
		value: Seq[String]
	) extends Feature {
		// values generated by this feature are sequences of strings (words)
		type V = Seq[String]
	}

	// selects the "words" column and wraps each row's words in a WordSequenceFeature named "words"
	val wordsFeatures = tokenized.select("words").map { row =>
		// index 0 is the only selected column: the extracted words
		WordSequenceFeature("words", row.getSeq[String](0))
	}

	// prints features for inspection
	wordsFeatures.show()

	// above code works!

	// 4.4 transforming features

	// Signature-only sketch of a feature transform: takes one Feature and yields another.
	// The body is ???, so calling it throws scala.NotImplementedError.
	def transform(feature: Feature): Feature = ???

	// 4.5 transforming words to term frequencies

	// instantiates a HashingTF to calculate term frequencies, reading from the "words"
	// column and writing to the "termFrequencies" column.
	// Declared exactly once: the listing previously defined `hashingTF` twice (a
	// multi-line chain and a one-line form), which is a duplicate definition when the
	// file is compiled or pasted as a single unit. The equivalent multi-line form is:
	//   new HashingTF()
	//     .setInputCol("words")
	//     .setOutputCol("termFrequencies")
	val hashingTF = new HashingTF().setInputCol("words").setOutputCol("termFrequencies")

	// executes the transformation
	val tfs = hashingTF.transform(tokenized)

	// prints term frequencies for inspection
	tfs.select("termFrequencies").show()

	// 4.6 using spark ml pipelines

	// builds a pipeline whose two stages are the tokenizer followed by hashingTF
	val pipeline = new Pipeline().setStages(
		Array(tokenizer, hashingTF)
	)

	// fits the pipeline on squawks (the original listing used squawksDF here)
	val pipelineHashed = pipeline.fit(squawks)

	// prints the class of the fitted result, a pipeline model
	println(pipelineHashed.getClass())

	// above code works!

	/** Numerical feature whose value is an integer. */
	case class IntFeature(
		name: String,
		value: Int
	) extends Feature {
		// values of this feature are integers
		type V = Int
	}

	/** Feature whose value is a boolean. */
	case class BooleanFeature(
		name: String,
		value: Boolean
	) extends Feature {
		// values of this feature are booleans
		type V = Boolean
	}

	/**
	 * Turns an integer feature into a boolean feature by comparing it to a threshold.
	 *
	 * @param feature   the integer feature to binarize
	 * @param threshold cutoff; the result is true only when the value is strictly greater
	 * @return a BooleanFeature named "binarized-" followed by the input feature's name
	 */
	def binarize(feature: IntFeature, threshold: Double): BooleanFeature = {
		val aboveThreshold = feature.value > threshold
		// prefixes the transform's name onto the resulting feature name
		BooleanFeature("binarized-" + feature.name, aboveThreshold)
	}

	// constant defining the cutoff for a squawker to be super
	val SUPER_THRESHOLD = 1000000

	// raw numbers of followers for the squirrel and the sloth
	val squirrelFollowers = 12
	val slothFollowers = 23584166

	// numeric integer features representing the number of followers
	// (fixes the misspelled identifier `squireelFollowers`, which was undefined)
	val squirrelFollowersFeature = IntFeature("followers", squirrelFollowers)
	val slothFollowersFeature = IntFeature("followers", slothFollowers)

	// binarize takes an IntFeature, not a raw Int: passing squirrelFollowers /
	// slothFollowers directly caused "type mismatch; found: Int, required: IntFeature",
	// so the wrapped *Feature values are passed instead.

	// boolean feature indicating the squirrel is not a super squawker
	val squirrelIsSuper = binarize(squirrelFollowersFeature, SUPER_THRESHOLD)

	// boolean feature indicating the sloth is a super squawker
	val slothIsSuper = binarize(slothFollowersFeature, SUPER_THRESHOLD)

	// the above code type-checks now that binarize receives IntFeature values

	// 4.8 creating concept labels from features

	/** Labels are modelled as a subtype of features. */
	trait Label extends Feature

	/** Label whose value is a boolean. */
	case class BooleanLabel(
		name: String,
		value: Boolean
	) extends Label {
		// values of this label are booleans
		type V = Boolean
	}

	/**
	 * Simple conversion from a boolean feature to a boolean label.
	 *
	 * @param feature the boolean feature to convert
	 * @return a BooleanLabel carrying the same name and value as the feature
	 */
	def toBooleanLabel(feature: BooleanFeature): BooleanLabel =
		BooleanLabel(feature.name, feature.value)

	// converts the super squawker feature values into concept labels;
	// this requires squirrelIsSuper and slothIsSuper to be valid BooleanFeature values
	val squirrelLabel = toBooleanLabel(squirrelIsSuper)
	val slothLabel = toBooleanLabel(slothIsSuper)

	// prints label values for inspection
	for (label <- Seq(squirrelLabel, slothLabel)) println(label)

	// the above code only works when the super squawker features it reads type-check