Sources: Kafka, file systems (CSV, delimited text, Parquet, ORC, Avro, JSON), socket
Sinks: Kafka, console, memory, foreach
# IMP: Schema definition is mandatory to process the data
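A minimal sketch of why the schema matters: a streaming file read fails without one, since streaming file sources do not infer schemas by default. Everything below (object name, columns, paths) is an assumption for illustration, not from the notes.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._

object FileStreamDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("FileStreamDemo")
      .master("local[3]")
      .getOrCreate()

    // The schema must be supplied up front; streaming file sources
    // do not infer it by default
    val schema = StructType(List(
      StructField("id", IntegerType),
      StructField("name", StringType)))

    val df = spark.readStream
      .format("csv")
      .option("header", "true")
      .schema(schema)  // mandatory for streaming file sources
      .load("input/")  // hypothetical input directory

    df.writeStream
      .format("console")
      .outputMode("append")
      .option("checkpointLocation", "chk-point-dir") // hypothetical path
      .start()
      .awaitTermination()
  }
}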
################################################################################################
# Listing files
# Recursively list all files under the current directory
find .
# Recursively list all files ending with the extension .txt
find . | grep ".txt"
# List files in the current directory only (non-recursive) with a specific extension
find . -maxdepth 1 -name "*.txt"
# Scripts to add a header and static values to a CSV file
# Prepend "env," to the header (first) row only
awk '{if(NR==1){$0="env,"$0; print $0} ;if(NR!=1){print $0}}' input.csv > output.csv
# Overwrite the first field of every row with a static timestamp
awk -F"," 'BEGIN { OFS = "," } {$1="2012-02-29 16:13:00"; print}' input.csv > output.csv
# Prepend a static value as a new first column on every row
awk -F"," 'BEGIN { OFS = "," } {$0="XXXXX,"$0; print $0}' input.csv > output.csv
import org.apache.spark.sql.Row
import spark.implicits._ // already in scope in spark-shell; needed in an app for toDF and the map encoders

// Generate a test DataFrame with 2 rows
val df = Seq((1, "Red Green"), (2, "Blue White")).toDF("id", "colors")
df.show
// Show ONLY the colors
df.map { case Row(id: Int, colors: String) => colors }.show
// Map to a tuple to keep both fields
df.map { case Row(id: Int, colors: String) => (id, colors) }.show
// Imports
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
// Generate sample data
val rows = Seq((1, "Alpha", "10/12/1990 12:10:10"),
  (2, "Beta", "11/12/1990 13:10:10"),
  (3, "Tango", "12/12/1990 14:10:10"))
// Define schema for the rows
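The schema definition itself is missing from the notes. A sketch follows with assumed field names (id, name, event_time), plus the createDataFrame call such a schema typically feeds.

import org.apache.spark.sql.functions.{col, to_timestamp}

val schema = StructType(List(
  StructField("id", IntegerType),
  StructField("name", StringType),
  StructField("event_time", StringType)))

// Pair the sample rows with the schema to build a DataFrame
val rowRDD = spark.sparkContext.parallelize(rows.map(r => Row(r._1, r._2, r._3)))
val df = spark.createDataFrame(rowRDD, schema)

// Parse the string column into a real timestamp
val parsed = df.withColumn("event_time",
  to_timestamp(col("event_time"), "d/M/yyyy HH:mm:ss"))
parsed.printSchema()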
package org.ajp.kafkaserdeserdemo
import org.apache.log4j.Logger
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.avro.functions.to_avro
object kafkaserdeser extends Serializable {
  @transient lazy val logger: Logger = Logger.getLogger(getClass.getName)
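The body of the object is not in the notes. The sketch below is one plausible shape given the to_avro import: a Kafka-to-Kafka pipeline that re-serializes the value column as Avro. Broker address, topic names, and checkpoint path are placeholders.

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("KafkaSerDeSerDemo")
      .master("local[3]")
      .getOrCreate()

    // Read raw records from the source topic
    val kafkaDF = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092") // placeholder broker
      .option("subscribe", "input-topic")                  // placeholder topic
      .option("startingOffsets", "earliest")
      .load()

    // Re-serialize the value column as Avro before writing back to Kafka
    val avroDF = kafkaDF.select(to_avro(col("value").cast(StringType)).alias("value"))

    val query = avroDF.writeStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("topic", "output-topic")                     // placeholder topic
      .option("checkpointLocation", "chk-point-dir")       // placeholder path
      .start()

    logger.info("Avro serde stream started")
    query.awaitTermination()
  }
}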
package org.ajp.kafkademo
import org.apache.log4j.Logger
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.types._
object kafkademo extends Serializable {
  @transient lazy val logger: Logger = Logger.getLogger(getClass.getName)
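Only the header of this object survives in the notes. A sketch of a plausible body follows, assuming a JSON-over-Kafka source (matching the Data 1.json sample and schema below) and using the Trigger import for a micro-batch interval; broker, topic, and checkpoint path are placeholders.

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("KafkaDemo")
      .master("local[3]")
      .getOrCreate()

    // Assumed schema for the JSON payload in the Kafka value
    val schema = StructType(List(
      StructField("fullname", StringType),
      StructField("sex", StringType),
      StructField("address", StringType)))

    val kafkaDF = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092") // placeholder broker
      .option("subscribe", "people")                       // placeholder topic
      .option("startingOffsets", "earliest")
      .load()

    // Kafka delivers bytes: cast the value to string, then parse it as JSON
    val valueDF = kafkaDF
      .select(from_json(col("value").cast("string"), schema).alias("value"))
      .select("value.*")

    val query = valueDF.writeStream
      .format("console")
      .outputMode("append")
      .trigger(Trigger.ProcessingTime("10 seconds")) // micro-batch every 10 seconds
      .option("checkpointLocation", "chk-point-dir") // placeholder path
      .start()

    logger.info("Kafka JSON stream started")
    query.awaitTermination()
  }
}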
Data 1.json
{"fullname": "Paisley Hoover", "sex": "female", "address": "Third Court Dr. Windermere, FL34786"}
{"fullname": "Paisley Hoover", "sex": "female", "address": "Third Court Dr. Windermere, FL34786"}
{"fullname": "Paisley Hoover", "sex": "female", "address": "Third Court Dr. Windermere, FL34786"}
Matching schema
val schema = StructType(List(
  StructField("fullname", StringType),
  StructField("sex", StringType),
  StructField("address", StringType)))
with ten as (
select 0 as number union
select 1 union
select 2 union
select 3 union
select 4 union
select 5 union
select 6 union
select 7 union
select 8 union
select 9
)
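-- The consuming query is missing from the notes. A common pattern
-- (an assumption, not the original) is to cross-join the digit CTE
-- with itself to generate a longer sequence, e.g. the numbers 0..99:
select t1.number * 10 + t2.number as n
from ten t1
cross join ten t2
order by n;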