Skip to content

Instantly share code, notes, and snippets.

View gbraccialli's full-sized avatar

Gui Braccialli gbraccialli

View GitHub Profile
// Hierarchical client record; `parent` presumably references another Client's `id`.
// NOTE(review): java.lang.Integer (not Int) is used — likely to admit nulls coming
// from a DataFrame source; confirm, otherwise Option[Int] would be the idiomatic choice.
case class Client(id: Integer, name: String, parent: Integer, value: Integer)
/**
 * Adds `element` to `arr` when it is a non-null Int; otherwise returns `arr` unchanged.
 *
 * NOTE: despite the name, the element is PREPENDED (`+:`), matching the original
 * behavior relied on by the existing UDF usage.
 *
 * Fix: the original match was non-exhaustive — any argument that was neither null
 * nor an Int (e.g. a String) threw scala.MatchError at runtime. A wildcard case now
 * falls back to returning `arr` unchanged.
 */
def append(arr: Seq[Int], element: Any): Seq[Int] = {
  element match {
    case null   => arr
    case i: Int => i +: arr
    case _      => arr // previously a runtime MatchError; treat unknown types as "nothing to add"
  }
}
// Wrap `append` as a Spark SQL UDF (requires org.apache.spark.sql.functions.udf in scope).
val udfAppend = udf(append _)
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, TextInputFormat}
import org.apache.spark.rdd.{NewHadoopRDD}
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.expressions.Window
// InputFormat class token — presumably used to construct a NewHadoopRDD
// (see imports above) so per-split metadata such as file paths is accessible.
val fc = classOf[TextInputFormat]
Do you have a communicable disease of public health significance? (Communicable diseases of public significance include chancroid, gonorrhea, granuloma inguinale, infectious leprosy, lymphogranuloma venereum, infectious stage syphilis, active tuberculosis, and other diseases as determined by the Department of Health and Human Services.)
Do you have a mental or physical disorder that poses or is likely to pose a threat to the safety or welfare of yourself or others?
Are you or have you ever been a drug abuser or addict?
Have you ever been arrested or convicted for any offense or crime, even though subject of a pardon, amnesty, or other similar action?
Have you ever violated, or engaged in a conspiracy to violate, any law relating to controlled substances?
Are you coming to the United States to engage in prostitution or unlawful commercialized vice or have you been engaged in prostitution or procuring prostitutes within the past 10 years?
Have you ever been involved in, or do you seek to engage in, money laundering?
# Start a local MongoDB instance used by Spline as its persistence store.
Downloads/mongodb-osx-x86_64-3.4.13/bin/mongod --dbpath /Users/guilherme_braccialli/mongo_data

# Start the Spline web UI.
# Fix: -D system properties must come BEFORE -jar — everything after -jar is passed
# to the application as plain program arguments, so the original command never set them.
java -Dspline.mongodb.url=mongodb://localhost:27017 -Dspline.mongodb.name=spline -jar Downloads/spline-web-0.2.5-exec-war.jar

# Spark shell with the Spline lineage listener: driver-side -D properties select the
# Mongo-backed persistence factory; --packages pulls the Spline artifacts.
spark-shell --conf 'spark.driver.extraJavaOptions=-Dspline.mongodb.url=mongodb://localhost:27017 -Dspline.mongodb.name=spline -Dspline.persistence.factory=za.co.absa.spline.persistence.mongo.MongoPersistenceFactory' --packages za.co.absa.spline:spline-core:0.2.5,za.co.absa.spline:spline-persistence-mongo:0.2.5 --jars /Users/guilherme_braccialli/IdeaProjects/untitled/target/scala-2.11/test_listener_2.11-1.0.jar
import za.co.absa.spline.core.SparkLineageInitializer._
// Registers the Spline lineage listener on this SparkSession
// (extension method from za.co.absa.spline.core.SparkLineageInitializer).
spark.enableLineageTracking()
http://localhost:8080/
----------------------
# AWS EXAMPLE
# Launch PySpark with the Hadoop S3A connector on the classpath.
pyspark --packages org.apache.hadoop:hadoop-aws:2.7.5
# Route s3a:// URIs through the S3AFileSystem implementation.
spark.conf.set("fs.s3a.impl","org.apache.hadoop.fs.s3a.S3AFileSystem")
# NOTE(review): credentials hard-coded inline — prefer environment variables,
# a credentials provider chain, or an instance profile in real use.
spark.conf.set("fs.s3a.access.key","XXXXXXXXXXXX")
spark.conf.set("fs.s3a.secret.key","XXXXXXXXXXXXXX")
# Read a CSV straight from S3 into a DataFrame.
df = spark.read.csv("s3a://yourbucket/tmp_o3001uc.csv")
# AZURE BLOB EXAMPLE
pyspark --packages org.apache.hadoop:hadoop-azure:2.7.5,com.microsoft.azure:azure-storage:7.0.0
import findspark
# Locate the local Spark installation and add it to sys.path so `import pyspark` works
# from a plain Python/Jupyter process.
findspark.init()
#findspark.init(or path to spark binaries)
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder\
.master("local[*]")\
.appName("jupyter")\
@gbraccialli
gbraccialli / create_random_data.scala
Last active September 20, 2018 13:51
spark_scala_python_udf_battle
//scala create datasets
/**
 * Returns a random alphanumeric string of exactly `size` characters.
 * `size = 0` (or negative) yields the empty string.
 *
 * Fix: dropped the explicit `return` — the last expression already is the result,
 * and `return` is a Scala anti-pattern (nonlocal-return semantics inside lambdas,
 * deprecated in Scala 3). `mkString("")` simplified to `mkString`.
 */
def randomStr(size: Int): String = {
  import scala.util.Random
  Random.alphanumeric.take(size).mkString
}
// Register randomStr as a Spark SQL UDF (requires org.apache.spark.sql.functions.udf in scope).
val udfRandomStr = udf(randomStr _)
// 30,000 rows spread over 3,000 partitions — a deliberately partition-heavy dataset
// (this gist is a scala-vs-python UDF benchmark).
val dfRnd = (1 to 30000).toDF.repartition(3000)
// Small second DataFrame; "value" renamed to "value2" — presumably so a later join
// keeps distinct column names; verify against the benchmark code that uses it.
val dfRnd2 = (1 to 10).toDF.withColumnRenamed("value", "value2")
source,target,value
Burma,Alpharetta,1200
Alpharetta,pqp,120000
Burma,Atlanta,2337
Burma,Avondale Estates,112
Burma,Clarkston,1576
Burma,Decatur,2140
Burma,Duluth,99
Burma,Lawrenceville,22
Burma,Norcross,18
$ cat ~/.ssh/config
host your_internal_hostname
UserKnownHostsFile=/dev/null
StrictHostKeyChecking=no
Identityfile ~/.ssh/PEM_FILE_HERE
User your_username
Hostname SERVER_INTERNAL_IP
ProxyCommand ssh -i ~/.ssh/PEM_FILE_HERE your_username@BASTION_ENDPOINT nc %h %p
import seaborn as sns
# Build a palette of 50 distinguishable colors from seaborn's default cycle.
pal = sns.color_palette(n_colors=50)
# Convert the RGB tuples to '#rrggbb' hex strings for use outside matplotlib.
pal.as_hex()