Skip to content

Instantly share code, notes, and snippets.

View invkrh's full-sized avatar

Hao Ren invkrh

View GitHub Profile
import org.apache.spark._
import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}
// Micro-benchmark entry point: records a start timestamp, then builds a local
// Spark context for an mllib Word2Vec run.
// NOTE(review): gist excerpt is truncated here — the training call, elapsed-time
// reporting, and sc.stop() are not visible in this view.
object Word2VecPerfTest extends App {
// Wall-clock start, in milliseconds; presumably diffed against a later timestamp.
val start = System.currentTimeMillis()
// local[*]: use all available cores on this machine.
val conf = new SparkConf().setMaster("local[*]").setAppName("word2vec")
val sc = new SparkContext(conf)
@invkrh
invkrh / quote.scala
Created July 5, 2016 22:17
Spark SQL single quote escape problem
package me.invkrh.ad2vec
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
// Repro snippet: builds a sparse weight vector and reads one entry back.
// NOTE(review): truncated — closing braces and any assertions are outside this view.
object Test extends App {
// Local 2-core Spark context; the HiveContext below is what the gist title
// ("Spark SQL single quote escape problem") suggests is under test.
val sc = new SparkContext("local[2]", "test", new SparkConf)
val hiveContext = new HiveContext(sc)
// Index probed below; also the first active index of the sparse array.
val i = 164545
// SparseArray(indices, values, activeSize, length, default).
// NOTE(review): no import for SparseArray is visible — presumably
// breeze.collection.mutable.SparseArray; breeze expects the index array to be
// sorted ascending, but Array(164545, 3657, 165734) is not — TODO confirm intent.
val weights = new SparseArray[Double](Array(164545, 3657, 165734),
Array(1.0d, 2.0d, 3.0d), 3, 262144, 0d
)
val res = weights.apply(i)
package fr.leboncoin.ad2vec
import org.apache.spark.ml.feature.Word2VecModel
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.unsafe.hash.Murmur3_x86_32._
import org.apache.spark.unsafe.types.UTF8String
/** Sorts the given array ascending with a top-down merge sort.
  *
  * Fix: the original base case was `input.size == 1`, so an EMPTY array fell
  * into the recursive branch where `splitAt(0)` returns an empty left half and
  * the unchanged array as the right half — infinite recursion / stack overflow.
  * `<= 1` covers both the empty and the single-element base cases.
  *
  * The two-way merge is inlined as a private helper so the function is
  * self-contained (the original relied on an external `merge` not shown here).
  *
  * @param input array to sort; not mutated — a new sorted array is returned
  * @return a sorted copy of `input`
  */
def mergeSort(input: Array[Int]): Array[Int] = {
  // Merge two individually-sorted arrays into one sorted array, stably.
  def merge(left: Array[Int], right: Array[Int]): Array[Int] = {
    val out = Array.ofDim[Int](left.length + right.length)
    var i = 0
    var j = 0
    var k = 0
    while (i < left.length && j < right.length) {
      // <= keeps the merge stable (left element wins ties).
      if (left(i) <= right(j)) { out(k) = left(i); i += 1 }
      else { out(k) = right(j); j += 1 }
      k += 1
    }
    while (i < left.length) { out(k) = left(i); i += 1; k += 1 }
    while (j < right.length) { out(k) = right(j); j += 1; k += 1 }
    out
  }

  if (input.length <= 1) {
    input
  } else {
    val (left, right) = input.splitAt(input.length / 2)
    merge(mergeSort(left), mergeSort(right))
  }
}
# Example .scalafmt, comments are supported.
style = default
maxColumn = 100
docstrings = JavaDoc
continuationIndent.defnSite = 2
continuationIndent.callSite = 2
package fr.leboncoin.botdet.job.offline
object Test extends App {
/** Enriches Int with an exponentiation operator.
  * `x ** p` multiplies `x` by itself `p` times; `p <= 0` yields 1
  * (the empty product). Uses plain Int arithmetic, so large results
  * wrap around exactly as repeated `*` would.
  */
implicit class IntOps(x: Int) {
  def **(p: Int): Int = Iterator.fill(p)(x).product
}
@invkrh
invkrh / MultivariateGaussianCheck.scala
Last active December 3, 2016 20:39
Test case that checks the numerical stability of MultivariateGaussian.pdf when sigma is near singular
import breeze.linalg.{det, DenseMatrix}
import org.apache.spark.mllib.linalg.{Matrices, Vectors}
import org.apache.spark.mllib.stat.distribution.MultivariateGaussian
// Numerical-stability check for MultivariateGaussian.pdf with a near-singular
// covariance (per the gist description above).
// NOTE(review): truncated — the MultivariateGaussian construction and the pdf
// evaluation/printout are outside this view.
object GaussianTest extends App {
// Evaluation point, mean, and 3x3 covariance (column-major Array).
val x = Vectors.dense(2.1, 3.1, 3.9)
val mu = Vectors.dense(2, 3, 4)
// Base matrix has a zero row/column (singular); the +0.0001 shift on every
// entry nudges it to NEAR-singular, which is the condition under test.
val sigma =
Matrices.dense(3, 3, Array(1.0, 0.0, 3.0, 0.0, 0.0, 0.0, 3.0, 0.0, 11.0).map(_ + 0.0001))
import scala.concurrent.Future
import scala.concurrent.ExecutionContext.Implicits.global
import scala.util.{Failure, Success}
// Prints thread ids from the main thread and from inside a Future, then throws
// inside the Future body. Presumably demonstrates that the surrounding try
// cannot catch an exception raised on the Future's execution-context thread —
// TODO confirm against the missing tail of the snippet.
// NOTE(review): truncated — the Future/try closing braces and any catch or
// onComplete handler are outside this view.
object Test extends App {
println("1 " + Thread.currentThread().getId)
try {
Future {
// Runs on a global-ExecutionContext thread, not the main thread above.
println("2 " + Thread.currentThread().getId)
throw new RuntimeException()
// Tukey-fence outlier rule: a point is an outlier when it falls
// below Q1 – 1.5 × IQR or above Q3 + 1.5 × IQR.
// Collects the uplift column to the driver as a sorted List[Double].
// NOTE(review): deltaDF is defined outside this excerpt — schema assumed to
// have a double column "delta_exploit_relative_yield_uplift"; verify at caller.
val yieldUplift = deltaDF.select("delta_exploit_relative_yield_uplift").collect().map(_.getDouble(0)).toList.sorted
// Median of an ALREADY-SORTED list: mean of the two middle elements when the
// size is even, the single middle element otherwise.
// NOTE(review): assumes `sorted` is non-empty — an empty list would throw
// IndexOutOfBoundsException on sorted(len / 2). Snippet is truncated below.
def getMedian(sorted: List[Double]): Double = {
val len = sorted.size
if (len % 2 == 0) {
(sorted(len / 2 - 1) + sorted(len / 2)) / 2
} else {
sorted(len / 2)