Follow the uninstall guide (link). Also remove the existing folders and users.
# Remove conf and logs
rm -rf /etc/hadoop
rm -rf /etc/hbase
rm -rf /etc/hive

import org.apache.spark.sql.functions.udf
// Raise the pivot-value cap so wide pivots on this dataset do not fail
// with "The pivot column has more than N distinct values".
import spark.sessionState.conf
conf.setConfString("spark.sql.pivotMaxValues", "" + Int.MaxValue)

// Load the raw sales export; first row is the header.
val csv = spark.read.format("csv").option("header", true)
  .load("/Users/tilak/Downloads/Pam/SalesAnalysis/data/store_sales_unified_2017.csv")

// A single receipt is identified by receipt id, register, sale time and date combined.
val uniqueKey: (String, String, String, String) => String =
  (x, y, z, v) => x + "_" + y + "_" + z + "_" + v
val someFn = udf(uniqueKey)

// Tag every row with its receipt key, then count occurrences of each article per receipt.
val newData = csv.withColumn("unique", someFn(csv.col("receipt_id"), csv.col("cash_register_id"), csv.col("sale_time"), csv.col("date")))
val countArticles = newData.groupBy("unique", "article_id").count()

// Distinct article ids present in the data, collected to the driver.
// Fixed: was `var` but is never reassigned here — use `val` (least power).
val articles = countArticles.select("article_id").distinct()
val articleIds = articles.collect.map(x => x(0))
import org.apache.spark.sql.functions.udf

// Load the raw sales export; first row is the header.
val csv = spark.read.format("csv").option("header", true)
  .load("/Users/tilak/Downloads/Pam/SalesAnalysis/data/store_sales_unified_2017.csv")

// A single receipt is identified by receipt id, register, sale time and date combined.
val uniqueKey: (String, String, String, String) => String =
  (x, y, z, v) => x + "_" + y + "_" + z + "_" + v
val someFn = udf(uniqueKey)

// Tag every row with its receipt key, then count occurrences of each article per receipt.
val newData = csv.withColumn("unique", someFn(csv.col("receipt_id"), csv.col("cash_register_id"), csv.col("sale_time"), csv.col("date")))
val countArticles = newData.groupBy("unique", "article_id").count()

// Pair up articles appearing on the same bill: self cross join, keep rows
// with the same receipt key (col 0 vs 3) but different article ids (col 1 vs 4).
val sameBill = countArticles.crossJoin(countArticles).filter { row =>
  row.getString(0) == row.getString(3) && row.getString(1) != row.getString(4)
}

// The self join duplicates column names; suffix each with its position to disambiguate.
val newNames = sameBill.columns.toList.zipWithIndex.map { case (name, idx) => name + "_" + idx }
Follow the uninstall guide (link). Also remove the existing folders and users.
# Remove conf and logs
rm -rf /etc/hadoop
rm -rf /etc/hbase
rm -rf /etc/hive

yum install -y ant gcc g++ libkrb5-dev libmysqlclient-dev
yum install -y libssl-dev libsasl2-dev libsasl2-modules-gssapi-mit
yum install -y libsqlite3-dev libtidy-0.99-0 libxml2-dev libxslt-dev
yum install -y maven libldap2-dev python-dev python-simplejson python-setuptools
yum install -y libxslt-devel libxml++-devel libxml2-devel libffi-devel

# Run postgres instance
# PostgreSQL (Debezium image); host port 5000 maps to container port 5432.
docker run --name postgres -p 5000:5432 debezium/postgres

# ZooKeeper: client (2181), peer (2888) and leader-election (3888) ports published.
docker run -it --name zookeeper -p 2181:2181 -p 2888:2888 -p 3888:3888 debezium/zookeeper

# Kafka broker, linked to the zookeeper container started above.
docker run -it --name kafka -p 9092:9092 --link zookeeper:zookeeper debezium/kafka

# Run kafka connect
-- Find rows that share an id across A and b but disagree on name.
-- Rewritten with an explicit JOIN and <> instead of the implicit
-- comma join with the non-idiomatic "NOT a.name = b.name".
SELECT *
FROM A a
JOIN b ON a.id = b.id
WHERE a.name <> b.name;

-- Returns one row for each CHECK, UNIQUE, PRIMARY KEY, and/or FOREIGN KEY
SELECT *
FROM INFORMATION_SCHEMA.TABLE_CONSTRAINTS
WHERE CONSTRAINT_NAME = 'XYZ';

-- Returns one row for each FOREIGN KEY constraint
# ====================================================================
# PullCsvFromS3
# Pull CSV data from a directory S3 to our local system
# ====================================================================
job.name=PullCsvFromS3
job.description=Pull CSV data from a directory S3 to our local system
fs.uri=file:///
# Set working directory
# ====================================================================
# PullCsvFromS3
# Pull CSV data from a directory S3 to our local system
# ====================================================================
job.name=PullCsvFromS3
job.description=Pull CSV data from a directory S3 to our local system and write as AVRO files
fs.uri=file:///
# Set working directory
# ====================================================================
# PullCsvFromS3
# Pull CSV data from a directory S3 to MySQL
# ====================================================================
job.name=PullCsvFromS3
job.description=Pull CSV data from a directory S3 to MySQL
fs.uri=file:///
# Set working directory
import spark.implicits._
import org.apache.spark.sql.SaveMode

// Current contents of the MySQL `products` table, read over JDBC.
val products = spark.sqlContext.read
  .format("jdbc")
  .option("driver", "com.mysql.jdbc.Driver")
  .option("dbtable", "products")
  .option("user", "gobblin")
  .option("password", "gobblin")
  .option("url", "jdbc:mysql://localhost/mopar_demo")
  .load()

// Freshly ingested rows produced by the Gobblin PullCsvFromS3 job (ORC output).
val newProducts = spark.sqlContext.read.format("orc").load("/Users/tilak/gobblin/mopar-demo/output/org/apache/gobblin/copy/user/tilak/pricing.products_1521799535.csv/20180325023900_append/part.task_PullCsvFromS3_1521945534992_0_0.orc")

// Rows present in the new extract but not yet in MySQL.
val newnewProducts = newProducts.except(products)

// Append-mode writer; connection properties for the JDBC sink follow.
// NOTE(review): snippet appears truncated — dfWriter/connectionProperties
// are presumably consumed by a later dfWriter.jdbc(...) call; confirm.
val dfWriter = newnewProducts.write.mode(SaveMode.Append)
val connectionProperties = new java.util.Properties()