Note: reference article
Basic query example:
SELECT status_code,
COUNT(status_code) AS requests
FROM fastly_logs.example_com
GROUP BY status_code
ORDER BY requests DESC

CREATE EXTERNAL TABLE IF NOT EXISTS default.table
(
  `id` int,
  `name` string,
  `timestamp` string,
  `is_debug` boolean
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
  'escapeChar'='\\',
1. Use mapPartitions() instead of map() when an expensive initialization, such as opening a DB connection, needs to happen once per partition rather than once per element (see the sketch after this list).
2. RDD parallelism: for RDDs with no parent (e.g. sc.parallelize(data, 4)), unless a slice count is specified, YARN will try to use as many CPU cores as are available.
   - This can be tuned with the spark.default.parallelism property.
   - To find the default parallelism, use sc.defaultParallelism; to inspect a specific RDD, use rdd.getNumPartitions().
   - For example, rdd = sc.parallelize(<value>, numSlices=4) makes rdd.getNumPartitions() return 4.
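A minimal PySpark sketch of both points; get_db_connection(), conn.lookup(), and enrich_partition() are hypothetical stand-ins for any expensive per-partition setup:

# Sketch only: get_db_connection() is a hypothetical, expensive-to-create resource.
def enrich_partition(rows):
    conn = get_db_connection()   # created once per partition, not once per row
    for row in rows:
        yield conn.lookup(row)
    conn.close()

rdd = sc.parallelize(range(100), numSlices=4)  # explicit slice count
print(sc.defaultParallelism)                   # cluster default parallelism
print(rdd.getNumPartitions())                  # 4
enriched = rdd.mapPartitions(enrich_partition)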
#include <stdlib.h>

void sixteenRandomBytes(unsigned char buf[16]) {
    for (int i = 0; i < 16; i++) {
        buf[i] = rand() & 0xff;
    }
}

// PKCS #7 padding
// Do this before encrypting to get the message
// up to a multiple of 16 bytes.
size_t pad(unsigned char *buf, size_t messageLength) {
    size_t padLength = 16 - (messageLength % 16);
    for (size_t i = 0; i < padLength; i++) {
        buf[messageLength + i] = (unsigned char)padLength;
    }
    return messageLength + padLength;
}
from pyspark.sql.types import StringType, FloatType, StructField, StructType
from pyspark.sql import SparkSession, SQLContext, Row
import pyspark

# spark initialization
spark_session = SparkSession.builder \
    .enableHiveSupport() \
    .getOrCreate()
spark_context = spark_session.sparkContext
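The type imports above are not used in the snippet itself; a small hedged example of how they are typically combined (the column names and values here are made up):

# Hypothetical schema and rows, just to show the imported types in use.
schema = StructType([
    StructField("name", StringType(), True),
    StructField("score", FloatType(), True),
])
df = spark_session.createDataFrame(
    [Row(name="alice", score=1.5), Row(name="bob", score=2.0)],
    schema=schema,
)
df.show()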
package com.test;

import com.test.schema.ContactType;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.*;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
package com.experiments.calvin

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.{Seconds, StreamingContext}

object ExampleApp extends App {
## Python 2.7
## GlueLastRunDuration.py
## Version 1
## by Lydon Carter October 2018
## USE
# Script to look up a specific AWS Glue job and report the duration of
# its last run.
# Notes:
# -- The script will use the location you set up for your Glue Context in the "Needed stuff"
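The script body is not included here; a minimal sketch of the core lookup using boto3 (the job name and region below are placeholders):

import boto3

# Placeholder job name and region; adjust for your environment.
glue = boto3.client("glue", region_name="us-east-1")
runs = glue.get_job_runs(JobName="my-glue-job", MaxResults=1)
# Most recent run comes back first; ExecutionTime is in seconds for a completed run.
last_run = runs["JobRuns"][0]
print("Last run took %s seconds" % last_run["ExecutionTime"])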
# AWS Version 4 signing example
# taken from:
# http://docs.aws.amazon.com/general/latest/gr/sigv4-signed-request-examples.html
# https://docs.aws.amazon.com/general/latest/gr/sigv4_signing.html
# https://www.javaquery.com/2016/01/aws-version-4-signing-process-complete.html
# Lambda API (InvokeAsync)
# http://docs.aws.amazon.com/lambda/latest/dg/API_InvokeAsync.html
# This version makes a POST request and passes request parameters
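The request-construction code is not included above; as a reference point, the signing-key derivation described in the linked AWS docs is a chain of HMAC-SHA256 operations, sketched here with placeholder secret, date, region, and service values:

import hmac, hashlib

def _sign(key, msg):
    # One HMAC-SHA256 step in the SigV4 key-derivation chain.
    return hmac.new(key, msg.encode("utf-8"), hashlib.sha256).digest()

def get_signature_key(secret_key, date_stamp, region, service):
    # date_stamp is YYYYMMDD, e.g. "20181001" (placeholder).
    k_date = _sign(("AWS4" + secret_key).encode("utf-8"), date_stamp)
    k_region = _sign(k_date, region)
    k_service = _sign(k_region, service)
    return _sign(k_service, "aws4_request")

# Placeholder values: derive the key used to sign the string-to-sign.
signing_key = get_signature_key("EXAMPLE_SECRET", "20181001", "us-east-1", "lambda")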