Skip to content

Instantly share code, notes, and snippets.

View josep2's full-sized avatar
🎯
Focusing

Jowanza Joseph josep2

🎯
Focusing
View GitHub Profile
// Compute strongly connected components on the graph `g`, with the iteration limit set to 5.
val result = g.stronglyConnectedComponents.maxIter(5).run()
// Print the result; the sample output below shows a `component` column next to the vertex columns.
result.show()
+---+-------+---+--------+----------+---------+
| id| name|age| cash| fruit|component|
+---+-------+---+--------+----------+---------+
| a| Alice| 34| 234| Apples| 0|
| g| Gabby| 60| 23433| Oranges| 6|
| f| Fanny| 36| 333| Apples| 5|
| b| Bob| 36|23232323| Bananas| 1|
// Run PageRank on the graph `g`.
//   resetProbability(0.15): the reset probability (alpha), i.e. the chance of jumping to a random vertex.
//   tol(0.05): the tolerance allowed at convergence (smaller => more accurate).
val results = g.pageRank.resetProbability(0.15).tol(0.05).run()
// Show the src, dst and weight columns of the resulting graph's edges.
results.edges.select("src", "dst", "weight").show()
+---+---+------+
|src|dst|weight|
+---+---+------+
| d| a| 1.0|
| a| b| 0.5|
| a| e| 0.5|
// Parts of this example are borrowed from the GraphFrames user guide, for my blog: https://graphframes.github.io/user-guide.html
import org.graphframes._
val v = sqlContext.createDataFrame(List(
("a", "Alice", 34, 234, "Apples"),
("b", "Bob", 36, 23232323, "Bananas"),
("c", "Charlie", 30, 2123, "Grapefruit"),
("d", "David", 29, 2321111, "Bananas"),
("e", "Esther", 32, 1, "Watermelon"),
("f", "Fanny", 36, 333, "Apples" ),
// From https://github.com/amplab/succinct
import edu.berkeley.cs.succinct.sql._
// Schema for city records: one StructField per column, with nullability spelled out by name.
val citySchema = StructType(
  Seq(
    StructField("Name", StringType, nullable = false),
    StructField("Length", IntegerType, nullable = true),
    StructField("Area", DoubleType, nullable = false),
    StructField("Airport", BooleanType, nullable = true)
  )
)
import edu.berkeley.cs.succinct._
// Spark configuration and context for the ranking example.
val conf = new SparkConf().setAppName("Ranking Example")
val sc = new SparkContext(conf)
// A large file of raw hip hop lyrics ~ 100 GB, turned into one byte array per line.
// Encode explicitly as UTF-8: a bare getBytes uses the JVM's default charset, so the
// resulting bytes could differ from one machine to another.
val hipHopRDD = sc.textFile("/hiphopcorpus").map(
  _.getBytes(java.nio.charset.StandardCharsets.UTF_8))
import edu.berkeley.cs.succinct._
// Spark configuration and context for the ranking example.
val conf = new SparkConf().setAppName("Ranking Example")
val sc = new SparkContext(conf)
// A large file of raw hip hop lyrics ~ 100 GB, turned into one byte array per line.
// Encode explicitly as UTF-8: a bare getBytes uses the JVM's default charset, so the
// resulting bytes could differ from one machine to another.
val hipHopRDD = sc.textFile("/hiphopcorpus").map(
  _.getBytes(java.nio.charset.StandardCharsets.UTF_8))
// Create the GLM parameters object, here using the Gaussian family.
val glmParams = new GLMParameters(Family.gaussian)
// Point the model at the training data via the key of `h2oData`.
// NOTE(review): the original note says "Lock and Update" is a way around using the key attribute; confirm.
glmParams._train = h2oData.key
// Set the dependent (response) variable by column name.
glmParams._response_column = "Some Dependent Variable"
+-------+-----------+
| x | y |
+-------+-----------+
|9285958|0.492819875|
|6295780|0.334126593|
| 549166|0.029145073|
| 408830|0.021697228|
| 362166|0.019220699|
| 305267|0.016200983|
| 270418|0.014351493|
import org.apache.spark.h2o.H2OContext
import org.apache.spark.sql.SparkSession
object PartyStarted extends App {
val sparkSession = SparkSession.builder
.master("local[*]")
.appName("Try H2o")
.config("spark.sql.crossJoin.enabled", "true")
"ai.h2o" %% "sparkling-water-core" % "2.0.3"