This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Last space-separated field of the final line printed by
 * `hdfs dfs -ls /data/dab/prod-view`.
 *
 * The listing is piped through `tail -1` so only its last line is kept.
 * Yields an empty string when that line has no fields at all.
 */
val path : String = {
  import scala.sys.process._
  // Explicit `.!!` (instead of a postfix operator spanning a line break) runs the
  // pipeline, blocks until it finishes, and returns everything written to stdout.
  val lastLine : String =
    ("/pio/hadoop/hadoop-2.5.2/bin/hdfs dfs -ls /data/dab/prod-view" #| "tail -1").!!
  // `split` can return an empty array for a whitespace-only line, so avoid indexing.
  lastLine.replace("\n", "").split(" ").lastOption.getOrElse("")
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Last space-separated field of the final line printed by
 * `hdfs dfs -ls /data/dab/prod-view`.
 *
 * The listing is piped through `tail -1` so only its last line is kept.
 * Yields an empty string when that line has no fields at all.
 */
val path : String = {
  import scala.sys.process._
  // Explicit `.!!` (instead of a postfix operator spanning a line break) runs the
  // pipeline, blocks until it finishes, and returns everything written to stdout.
  val lastLine : String =
    ("/pio/hadoop/hadoop-2.5.2/bin/hdfs dfs -ls /data/dab/prod-view" #| "tail -1").!!
  // `split` can return an empty array for a whitespace-only line, so avoid indexing.
  lastLine.replace("\n", "").split(" ").lastOption.getOrElse("")
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def splitDataTest ( | |
sqlContext : SQLContext, | |
data : DataFrame, | |
rowCol : String, | |
colCol : String, | |
tokenizer : String => Array[String], | |
idf : Boolean = true, | |
numFolds : Int | |
) : Seq[(AssociatedData, RDD[TestObservation])] = { |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def aggHomogeneity (input : RDD[(Int, Int, Double)]) : RDD[(Int, Int, Double)] = { | |
input.cache | |
// Get row sums. | |
val rowSums: HashMap[Int, Double] = HashMap( | |
input.groupBy(_._1) | |
.map(e => (e._1, e._2.foldLeft(0.0)((b, a) => b + a._3))) | |
.collect : _* | |
) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def aggHomogeneity (input : RDD[(Int, Int, Double)]) : RDD[(Int, Int, Double)] = { | |
input.cache | |
// Get row sums. | |
val rowSums: HashMap[Int, Double] = HashMap( | |
input.groupBy(_._1) | |
.map(e => (e._1, e._2.foldLeft(0.0)((b, a) => b + a._3))) | |
.collect : _* | |
) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Row sums: key each entry by its first component, add up the third component
// per key with reduceByKey, then collect the (key, total) pairs into a HashMap.
val rowSums: HashMap[Int, Double] = HashMap(
  input.map { case (row, _, value) => (row, value) }
    .reduceByKey(_ + _)
    .collect : _*
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def aggHomogeneity (input : RDD[(Int, Int, Double)]) : RDD[(Int, Int, Double)] = { | |
input.cache | |
// Get row sums. | |
val rowSums: HashMap[Int, Double] = HashMap( | |
input.map(e => (e._1, e._3)) | |
.reduceByKey(_ + _) | |
.collect : _* | |
) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def timePartition ( | |
data: DataFrame, | |
timeCol : String, | |
startTime : Long = 1404198000000L | |
) : Seq[DataFrame] = { | |
val addTimeMilli : Long = 604800000L // Discretize time into weeks. | |
val rightNowMilli : Long = new Date().getTime | |
(1 to (rightNowMilli / addTimeMilli).toInt).map(k => new Timestamp(startTime + k.toLong * addTimeMilli)) | |
.map(time => data.filter(data(timeCol) < time)) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def timePartition ( | |
data: DataFrame, | |
timeCol : String, | |
outputCol : String, | |
startTime : Long = 1404198000000L | |
) : DataFrame = { | |
val addTime : Long = 604800000L // Discretize time into weeks. | |
val udf : UserDefinedFunction = functions.udf( | |
(t : Timestamp) => ((t.getTime - startTime) / addTime).toInt | |
) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Given a threshold `l`, builds a two-argument UDF over timestamps.
 *
 * The UDF returns true when `s.getTime - t.getTime` is strictly less than `l`
 * (which includes every case where `s` is earlier than `t`), false otherwise.
 */
val filterRecentTimeUDF : Long => UserDefinedFunction = {
  (l : Long) => {
    functions.udf(
      // s refers to eventTime, t refers to userEventTime
      // The comparison is already a Boolean; no if/else wrapper is needed.
      (s : Timestamp, t: Timestamp) => s.getTime - t.getTime < l
    )
  }
}
OlderNewer