import sys
from pyspark.context import SparkContext
from numpy import array, random as np_random
from sklearn import linear_model as lm
import copy  # stdlib copy; importing it via sklearn.base is fragile

N = 10000  # Number of data points
D = 10     # Number of dimensions
ITERATIONS = 5
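
The imports above point at the gist's approach: data-parallel SGD, where a scikit-learn SGD model is fitted on each partition and the resulting weight vectors are averaged across partitions on each iteration. Below is a minimal sketch of the same pattern, with a hand-rolled gradient step standing in for SGDRegressor; every name and constant in it is illustrative, not the gist's code.

import spark.SparkContext
import spark.SparkContext._
import scala.util.Random

object ParallelSGDSketch {
  val N = 10000       // number of data points
  val D = 10          // number of dimensions
  val ITERATIONS = 5

  // One SGD pass over a partition's points, starting from weights w.
  def localSgd(points: Iterator[(Array[Double], Double)],
               w: Array[Double], lr: Double): Array[Double] = {
    val weights = w.clone()
    points.foreach { case (x, y) =>
      val err = (weights, x).zipped.map(_ * _).sum - y
      for (j <- weights.indices) weights(j) -= lr * err * x(j)
    }
    weights
  }

  def main(args: Array[String]) {
    val sc = new SparkContext("local[4]", "ParallelSGDSketch")
    val rand = new Random(42)
    val trueW = Array.fill(D)(rand.nextGaussian())
    // Synthetic linear data, generated on the driver and partitioned.
    val data = sc.parallelize(Seq.fill(N) {
      val x = Array.fill(D)(rand.nextGaussian())
      (x, (trueW, x).zipped.map(_ * _).sum)
    }, 4).cache()

    var w = Array.fill(D)(0.0)
    for (i <- 1 to ITERATIONS) {
      // Fit one local model per partition, then average the weight vectors.
      val models = data.mapPartitions(it => Iterator(localSgd(it, w, 0.01))).collect()
      w = models.transpose.map(_.sum / models.length)
    }
    println(w.mkString(", "))
  }
}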
import spark.streaming.StreamingContext._
import spark.streaming.{Seconds, StreamingContext}
import spark.SparkContext._
import spark.storage.StorageLevel
import spark.streaming.examples.twitter.TwitterInputDStream
import com.twitter.algebird.HyperLogLog._
import com.twitter.algebird._

/**
 * Example of using the HyperLogLog monoid from Twitter's Algebird together with Spark Streaming's
 * TwitterInputDStream.
 */
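
The core of the combination is small: HyperLogLog sketches form a monoid, so per-partition sketches can be merged with `+`. A condensed sketch of the pattern follows; BIT_SIZE and the `users` stream of user IDs are assumptions, and the API shown (HyperLogLogMonoid, estimatedSize) matches the Algebird releases of that era.

val BIT_SIZE = 12
val hll = new HyperLogLogMonoid(BIT_SIZE)

// Build one small HLL per user id, then merge them with the monoid's `+`.
// Merging is associative and commutative, so it parallelises cleanly.
val approxUsers = users.mapPartitions { ids =>
  ids.map(id => hll(id))
}.reduce(_ + _)

// Each batch now carries a single merged sketch; estimatedSize approximates
// the number of distinct user ids seen in that batch.
approxUsers.foreach(rdd => rdd.foreach { h =>
  println("approx distinct users: " + h.estimatedSize)
})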
import spark.streaming.{Seconds, StreamingContext}
import spark.storage.StorageLevel
import spark.streaming.examples.twitter.TwitterInputDStream
import com.twitter.algebird._
import spark.streaming.StreamingContext._
import spark.SparkContext._

/**
 * Example of using the CountMinSketch monoid from Twitter's Algebird together with Spark Streaming's
 * TwitterInputDStream.
 */
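
Count-Min Sketch composes the same way HyperLogLog does: per-partition sketches merge monoidally, and the merged sketch answers approximate frequency queries such as heavy hitters. A condensed sketch of the pattern, where the error/confidence constants and the `users` stream are assumptions:

val DELTA = 1E-3    // confidence
val EPS = 0.01      // error bound
val SEED = 1
val PERC = 0.001    // heavy-hitter threshold
val cms = new CountMinSketchMonoid(EPS, DELTA, SEED, PERC)

// One sketch per user id, merged per batch with the monoid's `++`.
val approxTopUsers = users.mapPartitions { ids =>
  ids.map(id => cms.create(id))
}.reduce(_ ++ _)

// The merged sketch exposes approximate frequencies and heavy hitters.
approxTopUsers.foreach(rdd => rdd.foreach { sketch =>
  println("heavy hitters: " + sketch.heavyHitters.mkString(", "))
})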
import spark.SparkContext
import SparkContext._

/**
 * A port of [[http://blog.echen.me/2012/02/09/movie-recommendations-and-more-via-mapreduce-and-scalding/]]
 * to Spark.
 * Uses movie ratings data from the MovieLens 100k dataset found at [[http://www.grouplens.org/node/73]]
 */
object MovieSimilarities {
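
The heart of that port is a self-join on user: every pair of movies rated by the same user yields a co-rating, and each movie pair's co-ratings fold into a similarity score. A condensed sketch of that shape using cosine similarity; the MovieLens `u.data` fields are user, movie, rating, timestamp, and the code below is illustrative rather than the gist's own:

// (user, (movie, rating)) pairs parsed from the MovieLens data.
val ratings = sc.textFile("u.data").map { line =>
  val Array(user, movie, rating, _) = line.split("\t")
  (user.toInt, (movie.toInt, rating.toDouble))
}

// Self-join on user; keep each unordered movie pair once.
val coRatings = ratings.join(ratings)
  .filter { case (_, ((m1, _), (m2, _))) => m1 < m2 }
  .map { case (_, ((m1, r1), (m2, r2))) => ((m1, m2), (r1, r2)) }

// Cosine similarity between the co-rating vectors of each movie pair.
val similarities = coRatings.groupByKey().mapValues { rs =>
  val dot = rs.map { case (x, y) => x * y }.sum
  val nx = math.sqrt(rs.map { case (x, _) => x * x }.sum)
  val ny = math.sqrt(rs.map { case (_, y) => y * y }.sum)
  dot / (nx * ny)
}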
// An Example is an observation with an optional target value and features in the form of a vector of Doubles
case class Example(target: Option[Double] = None, features: Vector[Double])

// The base model API looks something like:
abstract class BaseModel(val modelSettings: Settings)
  extends Serializable
  with Logging {

  def fit(data: RDD[Example])
}
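
To make the shape concrete, here is a hypothetical subclass; `Settings`, the `Logging` stub, and `MeanModel` itself are all assumptions for illustration, not part of the gist:

// Minimal stand-ins for types the snippet references but does not show.
case class Settings(params: Map[String, String] = Map.empty)
trait Logging { def log(msg: String): Unit = println(msg) }

// A toy concrete model: fit memorises the mean target, predict returns it.
class MeanModel(settings: Settings) extends BaseModel(settings) {
  private var mean = 0.0

  def fit(data: RDD[Example]): Unit = {
    val targets = data.flatMap(_.target).cache()
    mean = targets.sum / targets.count()
    log("fitted mean = " + mean)
  }

  def predict(example: Example): Double = mean
}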
trait Configured {
  val config = ConfigFactory.load().getConfig("spark")
}

object ConfigUtils {
  def asOption[T](t: => T): Option[T] = {
    try {
      Option(t)
    } catch {
      case e: ConfigException.Missing => None
    }
  }
}
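
Usage then reads naturally: wrap any Typesafe Config getter, and a missing key becomes None instead of a thrown ConfigException.Missing. A small example, where the "master" key is illustrative:

import com.typesafe.config.{ConfigException, ConfigFactory}

object AppConfig extends Configured {
  // None if spark.master is absent from application.conf, Some(value) otherwise.
  val master: Option[String] = ConfigUtils.asOption(config.getString("master"))
  val masterOrDefault: String = master.getOrElse("local[2]")
}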
def newHadoopFileAsText[K, V, F <: NewInputFormat[K, V]](
    path: String,
    inputFormatClazz: String,
    keyClazz: String,
    valueClazz: String,
    delimiter: String): JavaRDD[String] = {
  implicit val kcm = ClassManifest.fromClass(Class.forName(keyClazz))
  implicit val vcm = ClassManifest.fromClass(Class.forName(valueClazz))
  implicit val fcm = ClassManifest.fromClass(Class.forName(inputFormatClazz))
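  // The snippet is truncated here; a hedged guess at how the body continues:
  // with the manifests in scope, the generic new-API Hadoop method becomes
  // callable, and each (key, value) record is flattened to a single
  // delimiter-joined string so the Python side only ever deals with text.
  val rdd = sc.newAPIHadoopFile[K, V, F](path)
  JavaRDD.fromRDD(rdd.map { case (k, v) => k.toString + delimiter + v.toString })
}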
// Python RDD creation functions //

// SequenceFile converted to Text and then to String
def sequenceFileAsText(path: String) = {
  implicit val kcm = ClassManifest.fromClass(classOf[Text])
  implicit val fcm = ClassManifest.fromClass(classOf[SequenceFileAsTextInputFormat])
  new JavaPairRDD(sc
    .newAPIHadoopFile[Text, Text, SequenceFileAsTextInputFormat](path)
    .map { case (k, v) => (k.toString, v.toString) })
}
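
SequenceFileAsTextInputFormat does the heavy lifting here: it calls toString on whatever Writable key and value types the file holds, so the pair RDD handed to Python is uniformly (String, String) regardless of the original types. Calling it is then a one-liner; the path below is illustrative:

// Every record arrives as a (String, String) pair, whatever the original
// Writable key/value classes were.
val pairs: JavaPairRDD[String, String] = sequenceFileAsText("hdfs:///data/ratings.seq")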
13/12/09 23:01:02 INFO spark.SparkContext: Job finished: runJob at PythonRDD.scala:288, took 0.175237 s
Count raw: 683; Count svml: 683
Raw sample: [u'2.000000 1:1000025.000000 2:5.000000 3:1.000000 4:1.000000 5:1.000000 6:2.000000 7:1.000000 8:3.000000 9:1.000000 10:1.000000', u'2.000000 1:1002945.000000 2:5.000000 3:4.000000 4:4.000000 5:5.000000 6:7.000000 7:10.000000 8:3.000000 9:2.000000 10:1.000000', u'2.000000 1:1015425.000000 2:3.000000 3:1.000000 4:1.000000 5:1.000000 6:2.000000 7:2.000000 8:3.000000 9:1.000000 10:1.000000', u'2.000000 1:1016277.000000 2:6.000000 3:8.000000 4:8.000000 5:1.000000 6:3.000000 7:4.000000 8:3.000000 9:7.000000 10:1.000000', u'2.000000 1:1017023.000000 2:4.000000 3:1.000000 4:1.000000 5:3.000000 6:2.000000 7:1.000000 8:3.000000 9:1.000000 10:1.000000', u'4.000000 1:1017122.000000 2:8.000000 3:10.000000 4:10.000000 5:8.000000 6:7.000000 7:10.000000 8:9.000000 9:7.000000 10:1.000000', u'2.000000 1:1018099.000000 2:1.000000 3:1.000000 4:1.000000 5:1.000000 6:2.000000 7 |
Elasticsearch: "1.1.1"
Spark: "0.9.1-hadoop1"
Shark: "0.9.1-hadoop1"
elasticsearch-hadoop-hive: "elasticsearch-hadoop-hive-2.0.0.RC1.jar"
elasticsearch-hadoop: 2.0.0.RC1

- Spark using ESInputFormat works fine. However, the type returned for the "date" field ("_ts") is Text. I convert that with toString, then toLong, to get the timestamp, which I can then use as I wish within Spark.
- Shark returns NULL for the timestamp field. There's nothing funny about the timestamps themselves: I can access them in Spark (as Text), and I can do date math on that field in Elasticsearch queries. Shark also returns NULLs on EC2, so it's not just my machine.
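
For reference, the Spark-side conversion described above looks roughly like this; EsInputFormat is the elasticsearch-hadoop 2.0 class name, and the index/type is illustrative:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.{MapWritable, Text}
import org.elasticsearch.hadoop.mr.EsInputFormat

val conf = new Configuration()
conf.set("es.resource", "myindex/mytype")  // illustrative index/type

// (id, document) pairs straight from Elasticsearch.
val es = sc.newAPIHadoopRDD(conf, classOf[EsInputFormat[Text, MapWritable]],
  classOf[Text], classOf[MapWritable])

// "_ts" arrives as Text; toString then toLong recovers the epoch timestamp.
val timestamps = es.map { case (_, doc) =>
  doc.get(new Text("_ts")).toString.toLong
}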