Follow the uninstall guide (link). Also remove the existing folders and users (a sketch of the user cleanup follows the commands below).
# Remove conf and logs
rm -rf /etc/hadoop
rm -rf /etc/hbase
rm -rf /etc/hive
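The note also mentions removing the existing users, but the original does not show those commands. A minimal sketch, assuming the stock service accounts are named hadoop, hbase, and hive (adjust to your installation):
# Remove service users and their home directories (user names are assumptions)
userdel -r hadoop
userdel -r hbase
userdel -r hive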
import org.apache.spark.sql.functions.udf

// Allow pivoting on a column with a very large number of distinct values
spark.conf.set("spark.sql.pivotMaxValues", Int.MaxValue.toString)

val csv = spark.read.format("csv").option("header", true).load("/Users/tilak/Downloads/Pam/SalesAnalysis/data/store_sales_unified_2017.csv")

// Composite key identifying a single receipt: receipt_id + cash_register_id + sale_time + date
val uniqueKey: (String, String, String, String) => String = (x, y, z, v) => x + "_" + y + "_" + z + "_" + v
val someFn = udf(uniqueKey)
val newData = csv.withColumn("unique", someFn(csv.col("receipt_id"), csv.col("cash_register_id"), csv.col("sale_time"), csv.col("date")))

// Count each article per receipt, then collect the distinct article ids
val countArticles = newData.groupBy("unique", "article_id").count()
val articles = countArticles.select("article_id").distinct()
val articleIds = articles.collect.map(x => x(0))
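The raised spark.sql.pivotMaxValues and the collected articleIds suggest the snippet is building toward a pivot of articles per receipt; the original stops before that step, so the following is only a sketch of what that pivot could look like:
// Hypothetical continuation: one row per receipt, one column per article id
val basket = newData.groupBy("unique").pivot("article_id", articleIds).count()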
import org.apache.spark.sql.functions.udf

val csv = spark.read.format("csv").option("header", true).load("/Users/tilak/Downloads/Pam/SalesAnalysis/data/store_sales_unified_2017.csv")

// Same composite receipt key as above
val uniqueKey: (String, String, String, String) => String = (x, y, z, v) => x + "_" + y + "_" + z + "_" + v
val someFn = udf(uniqueKey)
val newData = csv.withColumn("unique", someFn(csv.col("receipt_id"), csv.col("cash_register_id"), csv.col("sale_time"), csv.col("date")))
val countArticles = newData.groupBy("unique", "article_id").count()

// Pair up different articles that appear on the same receipt (self cross join, same key, different article ids)
val sameBill = countArticles.crossJoin(countArticles).filter(x => x.getString(0) == x.getString(3) && x.getString(1) != x.getString(4))

// The cross join duplicates column names, so suffix each with its index to make them unique
val newNames = sameBill.columns.toList.zipWithIndex.map(x => x._1 + "_" + x._2)
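The renamed column list is presumably meant to be applied back to sameBill so the duplicated columns become individually addressable; a minimal sketch of that step, not shown in the original:
// Hypothetical next step: apply the index-suffixed names to the cross-joined DataFrame
val sameBillRenamed = sameBill.toDF(newNames: _*)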
Follow the uninstall guide (link). Also remove the existing folders and users.
# Remove conf and logs
rm -rf /etc/hadoop
rm -rf /etc/hbase
rm -rf /etc/hive
# Install build dependencies (RHEL/CentOS package names)
yum install -y ant gcc gcc-c++ krb5-devel mysql-devel
yum install -y openssl-devel cyrus-sasl-devel cyrus-sasl-gssapi
yum install -y sqlite-devel libtidy libxml2-devel libxslt-devel
yum install -y maven openldap-devel python-devel python-simplejson python-setuptools
yum install -y libffi-devel
# Run postgres instance
docker run --name postgres -p 5000:5432 debezium/postgres
# Run zookeeper instance
docker run -it --name zookeeper -p 2181:2181 -p 2888:2888 -p 3888:3888 debezium/zookeeper
# Run kafka instance
docker run -it --name kafka -p 9092:9092 --link zookeeper:zookeeper debezium/kafka
# Run kafka connect
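The original stops before the Connect container command. The following is a sketch based on the documented debezium/connect image; the storage topic names (my_connect_configs, my_connect_offsets, my_connect_statuses) are placeholders chosen here:
# Sketch: env vars follow the debezium/connect image docs; topic names are placeholders
docker run -it --name connect -p 8083:8083 -e GROUP_ID=1 -e CONFIG_STORAGE_TOPIC=my_connect_configs -e OFFSET_STORAGE_TOPIC=my_connect_offsets -e STATUS_STORAGE_TOPIC=my_connect_statuses --link zookeeper:zookeeper --link kafka:kafka --link postgres:postgres debezium/connect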
-- Find rows where the same key has different values in tables A and B
SELECT * FROM A a, B b WHERE a.id = b.id AND a.name <> b.name;
-- Returns one row for each CHECK, UNIQUE, PRIMARY KEY, and/or FOREIGN KEY constraint
SELECT *
FROM INFORMATION_SCHEMA.TABLE_CONSTRAINTS
WHERE CONSTRAINT_NAME = 'XYZ';
-- Returns one row for each FOREIGN KEY constraint
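The query for that last comment is missing in the original; a minimal sketch using the standard INFORMATION_SCHEMA view that holds one row per foreign key:
-- Sketch: REFERENTIAL_CONSTRAINTS lists foreign key constraints
SELECT *
FROM INFORMATION_SCHEMA.REFERENTIAL_CONSTRAINTS
WHERE CONSTRAINT_NAME = 'XYZ';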
# ====================================================================
# PullCsvFromS3
# Pull CSV data from an S3 directory to our local system
# ====================================================================
job.name=PullCsvFromS3
job.description=Pull CSV data from an S3 directory to our local system
fs.uri=file:///
# Set working directory
# ====================================================================
# PullCsvFromS3
# Pull CSV data from an S3 directory to our local system and write it as Avro files
# ====================================================================
job.name=PullCsvFromS3
job.description=Pull CSV data from an S3 directory to our local system and write it as Avro files
fs.uri=file:///
# Set working directory
# ====================================================================
# PullCsvFromS3
# Pull CSV data from an S3 directory to MySQL
# ====================================================================
job.name=PullCsvFromS3
job.description=Pull CSV data from an S3 directory to MySQL
fs.uri=file:///
# Set working directory
import spark.implicits._
import org.apache.spark.sql.SaveMode

// Products already present in MySQL
val products = spark.read.format("jdbc").option("driver", "com.mysql.jdbc.Driver").option("dbtable", "products").option("user", "gobblin").option("password", "gobblin").option("url", "jdbc:mysql://localhost/mopar_demo").load()

// Products extracted by the Gobblin job (ORC output)
val newProducts = spark.read.format("orc").load("/Users/tilak/gobblin/mopar-demo/output/org/apache/gobblin/copy/user/tilak/pricing.products_1521799535.csv/20180325023900_append/part.task_PullCsvFromS3_1521945534992_0_0.orc")

// Keep only the rows that are not in MySQL yet and prepare an append
val newnewProducts = newProducts.except(products)
val dfWriter = newnewProducts.write.mode(SaveMode.Append)
val connectionProperties = new java.util.Properties()
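The snippet ends before the write is issued. A minimal sketch of the remaining step, assuming the same gobblin/gobblin credentials and mopar_demo database used in the read above:
// Sketch: finish the append into the products table
connectionProperties.setProperty("driver", "com.mysql.jdbc.Driver")
connectionProperties.setProperty("user", "gobblin")
connectionProperties.setProperty("password", "gobblin")
dfWriter.jdbc("jdbc:mysql://localhost/mopar_demo", "products", connectionProperties)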