Skip to content

Instantly share code, notes, and snippets.

@kdgregory
Last active December 3, 2018 19:42
Show Gist options
  • Save kdgregory/42f810438301592e2b1b004e3bf11ed6 to your computer and use it in GitHub Desktop.
Save kdgregory/42f810438301592e2b1b004e3bf11ed6 to your computer and use it in GitHub Desktop.
Examples for Logging presentation
# logging pipeline
#
# Defines an appender named "kinesis" that is implemented by KinesisAppender and
# writes to the stream named by streamName. This snippet only defines the appender;
# it does not attach it to any logger, so the rootLogger line of the target
# log4j.properties must be updated separately to reference "kinesis".
log4j.appender.kinesis=com.kdgregory.log4j.aws.KinesisAppender
log4j.appender.kinesis.streamName=LoggingExample
# NOTE(review): batchDelay is presumably the time, in milliseconds, that the appender
# waits to accumulate a batch of messages before sending - confirm against appender docs.
log4j.appender.kinesis.batchDelay=200
# Events are formatted with JsonLayout. "tags" is a comma-separated list of name=value
# pairs; {date} looks like a substitution placeholder (presumably expanded by the
# layout - confirm). The enableHostname and enableLocation flags are both turned on here.
log4j.appender.kinesis.layout=com.kdgregory.log4j.aws.JsonLayout
log4j.appender.kinesis.layout.tags=applicationName=Example,runDate={date},env=spark-dev
log4j.appender.kinesis.layout.enableHostname=true
log4j.appender.kinesis.layout.enableLocation=true
# Application code (everything under com.kdgregory.sandbox) logs at DEBUG level.
log4j.logger.com.kdgregory.sandbox=DEBUG
// Copyright (c) Keith D Gregory, all rights reserved
package com.kdgregory.sandbox.spark.concordance;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.List;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.MDC;
import scala.Tuple2;
/**
 * Creates a simple concordance: a list of words with the files that they came from.
 * <p>
 * To invoke, pass two arguments: a directory containing the files to be parsed, and
 * an output file. Note that the input files must be available on all nodes (eg, an
 * HDFS or S3 filesystem), and that the output file will be written to the master.
 * <p>
 * Each line of the output file has the form <code>word: file1 file2 ...</code>, and
 * the file is encoded as UTF-8.
 */
public class Main
{
    private static final Logger logger = LoggerFactory.getLogger(Main.class);

    /**
     * Runs the concordance job.
     *
     * @param argv  <code>argv[0]</code> is the input directory and
     *              <code>argv[1]</code> is the output file.
     *
     * @throws IllegalArgumentException if fewer than two arguments are provided.
     * @throws Exception if the Spark job fails or the output file cannot be written.
     */
    public static void main(String[] argv)
    throws Exception
    {
        // fail fast with a usable message rather than an ArrayIndexOutOfBoundsException
        // after the Spark context has already been created
        if (argv.length < 2)
        {
            throw new IllegalArgumentException("usage: Main INPUT_DIRECTORY OUTPUT_FILE");
        }

        SparkContext sc = new SparkContext();
        try
        {
            // these values are attached to every message logged from the driver thread
            MDC.put("appName", sc.appName());
            MDC.put("applicationId", sc.applicationId());

            logger.info("starting job: {}", argv[0]);

            JavaRDD<Tuple2<String,String>> rdd = sc.wholeTextFiles(argv[0], 8).toJavaRDD();

            // the parser emits (filename, word); after de-duplication the pairs are swapped
            // so that the word becomes the key, then grouped and sorted by word
            List<Tuple2<String,Iterable<String>>> results =
                rdd.flatMap(new Parser(sc.appName(), sc.applicationId()))
                   .distinct()
                   .mapToPair(t -> t.swap())
                   .groupByKey()
                   .sortByKey()
                   .collect();

            logger.debug("total number of distinct words: {}", results.size());
            logger.debug("writing output file: {}", argv[1]);

            // the writer must be managed by try-with-resources as well as the stream:
            // closing only the stream discards whatever is still buffered in the writer
            try (FileOutputStream fos = new FileOutputStream(argv[1]);
                 PrintWriter out = new PrintWriter(new OutputStreamWriter(fos, "UTF-8")))
            {
                for (Tuple2<String,Iterable<String>> entry : results)
                {
                    out.print(entry._1());
                    out.print(": ");
                    for (String value : entry._2())
                    {
                        out.print(value);
                        out.print(" ");
                    }
                    out.println();
                }

                // PrintWriter swallows IOExceptions; checkError() flushes and reports them
                if (out.checkError())
                {
                    throw new java.io.IOException("failed to write output file: " + argv[1]);
                }
            }

            logger.info("finished job");
        }
        finally
        {
            // release cluster resources whether or not the job succeeded
            sc.stop();
        }
    }
}
// Copyright (c) Keith D Gregory, all rights reserved
package com.kdgregory.sandbox.spark.concordance;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.MDC;
import scala.Tuple2;
/**
 * This function accepts a tuple containing filename and file text and extracts
 * the words from that text. A word consists of alphanumeric characters and
 * dashes, and is returned in lowercase. Tokens that contain no such characters
 * (for example, free-standing punctuation) are not returned.
 * <p>
 * Each returned tuple is <code>(filename, word)</code>, where the filename has had
 * its leading path and any trailing <code>.txt</code> extension removed.
 */
public class Parser
implements FlatMapFunction<Tuple2<String,String>,Tuple2<String,String>>
{
    private static final long serialVersionUID = 1L;

    private static final Logger logger = LoggerFactory.getLogger(Parser.class);

    // compiled once rather than for every word; static, so not part of the serialized function
    private static final java.util.regex.Pattern NON_WORD_CHARS = java.util.regex.Pattern.compile("[^a-z0-9-]");

    private static final String TEXT_EXTENSION = ".txt";

    private final String appName;
    private final String applicationId;

    /**
     * @param appName        The Spark application name, added to the logging context.
     * @param applicationId  The Spark application ID, added to the logging context.
     */
    public Parser(String appName, String applicationId)
    {
        this.appName = appName;
        this.applicationId = applicationId;
    }

    /**
     * Splits the text of a single file into words.
     *
     * @param arg  A tuple of (file path, file content).
     *
     * @return An iterator over (filename, word) tuples, one per non-empty word,
     *         in the order the words appear in the text; duplicates are retained.
     */
    @Override
    public Iterator<Tuple2<String,String>> call(Tuple2<String,String> arg)
    throws Exception
    {
        // this code runs on an executor thread, which does not see the driver's MDC
        MDC.put("appName", appName);
        MDC.put("applicationId", applicationId);

        String filename = extractFilename(arg._1());
        MDC.put("filename", filename);
        try
        {
            logger.info("source file: {}", arg._1());

            String text = arg._2();
            logger.debug("{} characters", text.length());

            // a double dash is treated as a separator, not as part of a word
            String[] words = text.replaceAll("--", " ").split("\\s+");
            logger.debug("{} words", words.length);

            List<Tuple2<String,String>> output = new ArrayList<>(words.length);
            for (String word : words)
            {
                // Locale.ROOT keeps the conversion independent of the executor's default
                // locale (eg, Turkish would map "I" to a character the filter removes)
                String lowercased = word.toLowerCase(java.util.Locale.ROOT);
                String cleanWord = NON_WORD_CHARS.matcher(lowercased).replaceAll("");

                // split() yields an empty token for leading whitespace, and punctuation-only
                // tokens become empty after cleaning; neither is a word
                if (! cleanWord.isEmpty())
                {
                    output.add(new Tuple2<>(filename, cleanWord));
                }
            }

            logger.debug("file complete");
            return output.iterator();
        }
        finally
        {
            // executor threads are pooled: don't let this file's name leak into the
            // logging context of whatever runs on the thread next
            MDC.remove("filename");
        }
    }

    /**
     * Strips everything up to and including the last slash, along with a trailing
     * ".txt" extension if present (and only a literal ".txt").
     */
    private static String extractFilename(String path)
    {
        String name = path.substring(path.lastIndexOf('/') + 1);
        return name.endsWith(TEXT_EXTENSION)
             ? name.substring(0, name.length() - TEXT_EXTENSION.length())
             : name;
    }
}
2018-05-16 12:04:03 INFO CoarseGrainedExecutorBackend:2608 - Started daemon with process name: 23900@ip-172-30-1-93
2018-05-16 12:04:03 INFO SignalUtils:54 - Registered signal handler for TERM
2018-05-16 12:04:03 INFO SignalUtils:54 - Registered signal handler for HUP
2018-05-16 12:04:03 INFO SignalUtils:54 - Registered signal handler for INT
2018-05-16 12:04:04 WARN NativeCodeLoader:62 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2018-05-16 12:04:04 INFO SecurityManager:54 - Changing view acls to: ec2-user
2018-05-16 12:04:04 INFO SecurityManager:54 - Changing modify acls to: ec2-user
2018-05-16 12:04:04 INFO SecurityManager:54 - Changing view acls groups to:
2018-05-16 12:04:04 INFO SecurityManager:54 - Changing modify acls groups to:
2018-05-16 12:04:04 INFO SecurityManager:54 - SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(ec2-user); groups with view permissions: Set(); users with modify permissions: Set(ec2-user); groups with modify permissions: Set()
2018-05-16 12:04:05 INFO TransportClientFactory:267 - Successfully created connection to master/172.30.1.17:43871 after 110 ms (0 ms spent in bootstraps)
2018-05-16 12:04:05 INFO SecurityManager:54 - Changing view acls to: ec2-user
2018-05-16 12:04:05 INFO SecurityManager:54 - Changing modify acls to: ec2-user
2018-05-16 12:04:05 INFO SecurityManager:54 - Changing view acls groups to:
2018-05-16 12:04:05 INFO SecurityManager:54 - Changing modify acls groups to:
2018-05-16 12:04:05 INFO SecurityManager:54 - SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(ec2-user); groups with view permissions: Set(); users with modify permissions: Set(ec2-user); groups with modify permissions: Set()
2018-05-16 12:04:05 INFO TransportClientFactory:267 - Successfully created connection to master/172.30.1.17:43871 after 2 ms (0 ms spent in bootstraps)
2018-05-16 12:04:05 INFO DiskBlockManager:54 - Created local directory at /tmp/spark-1ebaea1b-9509-4c93-bfdf-6600f351d290/executor-70e1e187-2494-4996-b0b0-e441ba4f71d4/blockmgr-f93725aa-1880-42f6-99d8-cd5d68efc551
2018-05-16 12:04:05 INFO MemoryStore:54 - MemoryStore started with capacity 413.9 MB
2018-05-16 12:04:05 INFO CoarseGrainedExecutorBackend:54 - Connecting to driver: spark://CoarseGrainedScheduler@master:43871
2018-05-16 12:04:05 INFO WorkerWatcher:54 - Connecting to worker spark://Worker@172.30.1.93:37499
2018-05-16 12:04:05 INFO TransportClientFactory:267 - Successfully created connection to /172.30.1.93:37499 after 26 ms (0 ms spent in bootstraps)
2018-05-16 12:04:05 INFO WorkerWatcher:54 - Successfully connected to spark://Worker@172.30.1.93:37499
2018-05-16 12:04:05 INFO CoarseGrainedExecutorBackend:54 - Successfully registered with driver
2018-05-16 12:04:05 INFO Executor:54 - Starting executor ID 0 on host 172.30.1.93
2018-05-16 12:04:05 INFO Utils:54 - Successfully started service 'org.apache.spark.network.netty.NettyBlockTransferService' on port 45229.
2018-05-16 12:04:05 INFO NettyBlockTransferService:54 - Server created on 172.30.1.93:45229
2018-05-16 12:04:05 INFO BlockManager:54 - Using org.apache.spark.storage.RandomBlockReplicationPolicy for block replication policy
2018-05-16 12:04:05 INFO BlockManagerMaster:54 - Registering BlockManager BlockManagerId(0, 172.30.1.93, 45229, None)
2018-05-16 12:04:05 INFO BlockManagerMaster:54 - Registered BlockManager BlockManagerId(0, 172.30.1.93, 45229, None)
2018-05-16 12:04:05 INFO BlockManager:54 - Initialized BlockManager: BlockManagerId(0, 172.30.1.93, 45229, None)
2018-05-16 12:04:06 INFO CoarseGrainedExecutorBackend:54 - Got assigned task 0
2018-05-16 12:04:06 INFO Executor:54 - Running task 0.0 in stage 0.0 (TID 0)
2018-05-16 12:04:06 INFO Executor:54 - Fetching spark://master:43871/jars/sandbox-spark-2.3.0-SNAPSHOT.jar with timestamp 1526472242478
2018-05-16 12:04:06 INFO TransportClientFactory:267 - Successfully created connection to master/172.30.1.17:43871 after 9 ms (0 ms spent in bootstraps)
2018-05-16 12:04:06 INFO Utils:54 - Fetching spark://master:43871/jars/sandbox-spark-2.3.0-SNAPSHOT.jar to /tmp/spark-1ebaea1b-9509-4c93-bfdf-6600f351d290/executor-70e1e187-2494-4996-b0b0-e441ba4f71d4/spark-2d118c23-7f61-45cd-be48-760a67bb620e/fetchFileTemp8957172668352263863.tmp
2018-05-16 12:04:06 INFO Utils:54 - Copying /tmp/spark-1ebaea1b-9509-4c93-bfdf-6600f351d290/executor-70e1e187-2494-4996-b0b0-e441ba4f71d4/spark-2d118c23-7f61-45cd-be48-760a67bb620e/13083391631526472242478_cache to /home/ec2-user/spark-2.3.0-bin-hadoop2.7/work/app-20180516120402-0004/0/./sandbox-spark-2.3.0-SNAPSHOT.jar
2018-05-16 12:04:06 INFO Executor:54 - Adding file:/home/ec2-user/spark-2.3.0-bin-hadoop2.7/work/app-20180516120402-0004/0/./sandbox-spark-2.3.0-SNAPSHOT.jar to class loader
2018-05-16 12:04:06 INFO TorrentBroadcast:54 - Started reading broadcast variable 1
2018-05-16 12:04:06 INFO TransportClientFactory:267 - Successfully created connection to master/172.30.1.17:42887 after 2 ms (0 ms spent in bootstraps)
2018-05-16 12:04:06 INFO MemoryStore:54 - Block broadcast_1_piece0 stored as bytes in memory (estimated size 2.6 KB, free 413.9 MB)
2018-05-16 12:04:06 INFO TorrentBroadcast:54 - Reading broadcast variable 1 took 248 ms
2018-05-16 12:04:06 INFO MemoryStore:54 - Block broadcast_1 stored as values in memory (estimated size 4.5 KB, free 413.9 MB)
2018-05-16 12:04:06 INFO WholeTextFileRDD:54 - Input split: Paths:/tmp/FolgerShakespeare/Henry_IV_Part_2.txt:0+159845,/tmp/FolgerShakespeare/The_Comedy_of_Errors.txt:0+93232,/tmp/FolgerShakespeare/Twelfth_Night.txt:0+119203,/tmp/FolgerShakespeare/Cymbeline.txt:0+169065,/tmp/FolgerShakespeare/Measure_for_Measure.txt:0+132746,/tmp/FolgerShakespeare/Romeo_and_Juliet.txt:0+146692
2018-05-16 12:04:06 INFO TorrentBroadcast:54 - Started reading broadcast variable 0
2018-05-16 12:04:06 INFO MemoryStore:54 - Block broadcast_0_piece0 stored as bytes in memory (estimated size 23.0 KB, free 413.9 MB)
2018-05-16 12:04:06 INFO TorrentBroadcast:54 - Reading broadcast variable 0 took 18 ms
2018-05-16 12:04:07 INFO MemoryStore:54 - Block broadcast_0 stored as values in memory (estimated size 321.3 KB, free 413.6 MB)
2018-05-16 12:04:07 INFO Parser:48 - source file: file:/tmp/FolgerShakespeare/Henry_IV_Part_2.txt
2018-05-16 12:04:08 INFO Parser:48 - source file: file:/tmp/FolgerShakespeare/The_Comedy_of_Errors.txt
2018-05-16 12:04:08 INFO Parser:48 - source file: file:/tmp/FolgerShakespeare/Twelfth_Night.txt
2018-05-16 12:04:08 INFO Parser:48 - source file: file:/tmp/FolgerShakespeare/Cymbeline.txt
2018-05-16 12:04:09 INFO Parser:48 - source file: file:/tmp/FolgerShakespeare/Measure_for_Measure.txt
2018-05-16 12:04:09 INFO Parser:48 - source file: file:/tmp/FolgerShakespeare/Romeo_and_Juliet.txt
2018-05-16 12:04:09 INFO Executor:54 - Finished task 0.0 in stage 0.0 (TID 0). 1196 bytes result sent to driver
2018-05-16 12:04:09 INFO CoarseGrainedExecutorBackend:54 - Got assigned task 3
2018-05-16 12:04:09 INFO Executor:54 - Running task 3.0 in stage 0.0 (TID 3)
2018-05-16 12:04:09 INFO WholeTextFileRDD:54 - Input split: Paths:/tmp/FolgerShakespeare/Antony_and_Cleopatra.txt:0+156324,/tmp/FolgerShakespeare/Timon_of_Athens.txt:0+115153,/tmp/FolgerShakespeare/Much_Ado_About_Nothing.txt:0+127764,/tmp/FolgerShakespeare/The_Two_Noble_Kinsmen.txt:0+147285,/tmp/FolgerShakespeare/As_You_Like_It.txt:0+131850,/tmp/FolgerShakespeare/Hamlet.txt:0+182863
2018-05-16 12:04:09 INFO Parser:48 - source file: file:/tmp/FolgerShakespeare/Antony_and_Cleopatra.txt
2018-05-16 12:04:10 INFO Parser:48 - source file: file:/tmp/FolgerShakespeare/Timon_of_Athens.txt
2018-05-16 12:04:10 INFO Parser:48 - source file: file:/tmp/FolgerShakespeare/Much_Ado_About_Nothing.txt
2018-05-16 12:04:10 INFO Parser:48 - source file: file:/tmp/FolgerShakespeare/The_Two_Noble_Kinsmen.txt
2018-05-16 12:04:10 INFO Parser:48 - source file: file:/tmp/FolgerShakespeare/As_You_Like_It.txt
2018-05-16 12:04:10 INFO Parser:48 - source file: file:/tmp/FolgerShakespeare/Hamlet.txt
2018-05-16 12:04:10 INFO Executor:54 - Finished task 3.0 in stage 0.0 (TID 3). 1153 bytes result sent to driver
2018-05-16 12:04:10 INFO CoarseGrainedExecutorBackend:54 - Got assigned task 5
2018-05-16 12:04:10 INFO Executor:54 - Running task 5.0 in stage 0.0 (TID 5)
2018-05-16 12:04:10 INFO WholeTextFileRDD:54 - Input split: Paths:/tmp/FolgerShakespeare/Lucrece.txt:0+87916,/tmp/FolgerShakespeare/Richard_III.txt:0+181578,/tmp/FolgerShakespeare/The_Taming_of_the_Shrew.txt:0+130309,/tmp/FolgerShakespeare/Othello.txt:0+159401,/tmp/FolgerShakespeare/Sonnets.txt:0+99254,/tmp/FolgerShakespeare/Henry_V.txt:0+157455
2018-05-16 12:04:10 INFO Parser:48 - source file: file:/tmp/FolgerShakespeare/Lucrece.txt
2018-05-16 12:04:11 INFO Parser:48 - source file: file:/tmp/FolgerShakespeare/Richard_III.txt
2018-05-16 12:04:11 INFO Parser:48 - source file: file:/tmp/FolgerShakespeare/The_Taming_of_the_Shrew.txt
2018-05-16 12:04:11 INFO Parser:48 - source file: file:/tmp/FolgerShakespeare/Othello.txt
2018-05-16 12:04:11 INFO Parser:48 - source file: file:/tmp/FolgerShakespeare/Sonnets.txt
2018-05-16 12:04:11 INFO Parser:48 - source file: file:/tmp/FolgerShakespeare/Henry_V.txt
2018-05-16 12:04:11 INFO Executor:54 - Finished task 5.0 in stage 0.0 (TID 5). 1153 bytes result sent to driver
2018-05-16 12:04:12 INFO CoarseGrainedExecutorBackend:54 - Got assigned task 8
2018-05-16 12:04:12 INFO Executor:54 - Running task 1.0 in stage 1.0 (TID 8)
2018-05-16 12:04:12 INFO MapOutputTrackerWorker:54 - Updating epoch to 1 and clearing cache
2018-05-16 12:04:12 INFO TorrentBroadcast:54 - Started reading broadcast variable 2
2018-05-16 12:04:12 INFO MemoryStore:54 - Block broadcast_2_piece0 stored as bytes in memory (estimated size 2.7 KB, free 413.6 MB)
2018-05-16 12:04:12 INFO TorrentBroadcast:54 - Reading broadcast variable 2 took 10 ms
2018-05-16 12:04:12 INFO MemoryStore:54 - Block broadcast_2 stored as values in memory (estimated size 5.0 KB, free 413.6 MB)
2018-05-16 12:04:12 INFO MapOutputTrackerWorker:54 - Don't have map outputs for shuffle 1, fetching them
2018-05-16 12:04:12 INFO MapOutputTrackerWorker:54 - Doing the fetch; tracker endpoint = NettyRpcEndpointRef(spark://MapOutputTracker@master:43871)
2018-05-16 12:04:12 INFO MapOutputTrackerWorker:54 - Got the output locations
2018-05-16 12:04:12 INFO ShuffleBlockFetcherIterator:54 - Getting 7 non-empty blocks out of 7 blocks
2018-05-16 12:04:12 INFO TransportClientFactory:267 - Successfully created connection to /172.30.1.61:40663 after 2 ms (0 ms spent in bootstraps)
2018-05-16 12:04:12 INFO ShuffleBlockFetcherIterator:54 - Started 1 remote fetches in 19 ms
2018-05-16 12:04:12 INFO Executor:54 - Finished task 1.0 in stage 1.0 (TID 8). 1368 bytes result sent to driver
2018-05-16 12:04:12 INFO CoarseGrainedExecutorBackend:54 - Got assigned task 9
2018-05-16 12:04:12 INFO Executor:54 - Running task 2.0 in stage 1.0 (TID 9)
2018-05-16 12:04:12 INFO ShuffleBlockFetcherIterator:54 - Getting 7 non-empty blocks out of 7 blocks
2018-05-16 12:04:12 INFO ShuffleBlockFetcherIterator:54 - Started 1 remote fetches in 10 ms
2018-05-16 12:04:12 INFO Executor:54 - Finished task 2.0 in stage 1.0 (TID 9). 1368 bytes result sent to driver
2018-05-16 12:04:12 INFO CoarseGrainedExecutorBackend:54 - Got assigned task 11
2018-05-16 12:04:12 INFO Executor:54 - Running task 4.0 in stage 1.0 (TID 11)
2018-05-16 12:04:12 INFO ShuffleBlockFetcherIterator:54 - Getting 7 non-empty blocks out of 7 blocks
2018-05-16 12:04:12 INFO ShuffleBlockFetcherIterator:54 - Started 1 remote fetches in 2 ms
2018-05-16 12:04:13 INFO Executor:54 - Finished task 4.0 in stage 1.0 (TID 11). 1325 bytes result sent to driver
2018-05-16 12:04:13 INFO CoarseGrainedExecutorBackend:54 - Got assigned task 15
2018-05-16 12:04:13 INFO Executor:54 - Running task 1.0 in stage 2.0 (TID 15)
2018-05-16 12:04:13 INFO MapOutputTrackerWorker:54 - Updating epoch to 2 and clearing cache
2018-05-16 12:04:13 INFO TorrentBroadcast:54 - Started reading broadcast variable 3
2018-05-16 12:04:13 INFO MemoryStore:54 - Block broadcast_3_piece0 stored as bytes in memory (estimated size 3.2 KB, free 413.6 MB)
2018-05-16 12:04:13 INFO TorrentBroadcast:54 - Reading broadcast variable 3 took 13 ms
2018-05-16 12:04:13 INFO MemoryStore:54 - Block broadcast_3 stored as values in memory (estimated size 6.3 KB, free 413.6 MB)
2018-05-16 12:04:13 INFO MapOutputTrackerWorker:54 - Don't have map outputs for shuffle 0, fetching them
2018-05-16 12:04:13 INFO MapOutputTrackerWorker:54 - Doing the fetch; tracker endpoint = NettyRpcEndpointRef(spark://MapOutputTracker@master:43871)
2018-05-16 12:04:13 INFO MapOutputTrackerWorker:54 - Got the output locations
2018-05-16 12:04:13 INFO ShuffleBlockFetcherIterator:54 - Getting 7 non-empty blocks out of 7 blocks
2018-05-16 12:04:13 INFO ShuffleBlockFetcherIterator:54 - Started 1 remote fetches in 5 ms
2018-05-16 12:04:13 INFO Executor:54 - Finished task 1.0 in stage 2.0 (TID 15). 2115 bytes result sent to driver
2018-05-16 12:04:13 INFO CoarseGrainedExecutorBackend:54 - Got assigned task 17
2018-05-16 12:04:13 INFO Executor:54 - Running task 3.0 in stage 2.0 (TID 17)
2018-05-16 12:04:13 INFO ShuffleBlockFetcherIterator:54 - Getting 7 non-empty blocks out of 7 blocks
2018-05-16 12:04:13 INFO ShuffleBlockFetcherIterator:54 - Started 1 remote fetches in 6 ms
2018-05-16 12:04:13 INFO Executor:54 - Finished task 3.0 in stage 2.0 (TID 17). 2062 bytes result sent to driver
2018-05-16 12:04:13 INFO CoarseGrainedExecutorBackend:54 - Got assigned task 19
2018-05-16 12:04:13 INFO Executor:54 - Running task 5.0 in stage 2.0 (TID 19)
2018-05-16 12:04:13 INFO ShuffleBlockFetcherIterator:54 - Getting 7 non-empty blocks out of 7 blocks
2018-05-16 12:04:13 INFO ShuffleBlockFetcherIterator:54 - Started 1 remote fetches in 6 ms
2018-05-16 12:04:13 INFO Executor:54 - Finished task 5.0 in stage 2.0 (TID 19). 2088 bytes result sent to driver
2018-05-16 12:04:13 INFO CoarseGrainedExecutorBackend:54 - Got assigned task 21
2018-05-16 12:04:13 INFO Executor:54 - Running task 0.0 in stage 5.0 (TID 21)
2018-05-16 12:04:13 INFO TorrentBroadcast:54 - Started reading broadcast variable 4
2018-05-16 12:04:13 INFO MemoryStore:54 - Block broadcast_4_piece0 stored as bytes in memory (estimated size 3.4 KB, free 413.6 MB)
2018-05-16 12:04:13 INFO TorrentBroadcast:54 - Reading broadcast variable 4 took 42 ms
2018-05-16 12:04:13 INFO MemoryStore:54 - Block broadcast_4 stored as values in memory (estimated size 6.4 KB, free 413.6 MB)
2018-05-16 12:04:13 INFO ShuffleBlockFetcherIterator:54 - Getting 7 non-empty blocks out of 7 blocks
2018-05-16 12:04:13 INFO ShuffleBlockFetcherIterator:54 - Started 1 remote fetches in 8 ms
2018-05-16 12:04:14 INFO Executor:54 - Finished task 0.0 in stage 5.0 (TID 21). 1325 bytes result sent to driver
2018-05-16 12:04:14 INFO CoarseGrainedExecutorBackend:54 - Got assigned task 23
2018-05-16 12:04:14 INFO Executor:54 - Running task 2.0 in stage 5.0 (TID 23)
2018-05-16 12:04:14 INFO ShuffleBlockFetcherIterator:54 - Getting 7 non-empty blocks out of 7 blocks
2018-05-16 12:04:14 INFO ShuffleBlockFetcherIterator:54 - Started 1 remote fetches in 9 ms
2018-05-16 12:04:14 INFO Executor:54 - Finished task 2.0 in stage 5.0 (TID 23). 1368 bytes result sent to driver
2018-05-16 12:04:14 INFO CoarseGrainedExecutorBackend:54 - Got assigned task 25
2018-05-16 12:04:14 INFO Executor:54 - Running task 4.0 in stage 5.0 (TID 25)
2018-05-16 12:04:14 INFO ShuffleBlockFetcherIterator:54 - Getting 7 non-empty blocks out of 7 blocks
2018-05-16 12:04:14 INFO ShuffleBlockFetcherIterator:54 - Started 1 remote fetches in 2 ms
2018-05-16 12:04:14 INFO Executor:54 - Finished task 4.0 in stage 5.0 (TID 25). 1325 bytes result sent to driver
2018-05-16 12:04:14 INFO CoarseGrainedExecutorBackend:54 - Got assigned task 27
2018-05-16 12:04:14 INFO Executor:54 - Running task 6.0 in stage 5.0 (TID 27)
2018-05-16 12:04:14 INFO ShuffleBlockFetcherIterator:54 - Getting 7 non-empty blocks out of 7 blocks
2018-05-16 12:04:14 INFO ShuffleBlockFetcherIterator:54 - Started 1 remote fetches in 3 ms
2018-05-16 12:04:14 INFO Executor:54 - Finished task 6.0 in stage 5.0 (TID 27). 1325 bytes result sent to driver
2018-05-16 12:04:14 INFO CoarseGrainedExecutorBackend:54 - Got assigned task 28
2018-05-16 12:04:14 INFO Executor:54 - Running task 0.0 in stage 6.0 (TID 28)
2018-05-16 12:04:14 INFO MapOutputTrackerWorker:54 - Updating epoch to 3 and clearing cache
2018-05-16 12:04:14 INFO TorrentBroadcast:54 - Started reading broadcast variable 5
2018-05-16 12:04:14 INFO MemoryStore:54 - Block broadcast_5_piece0 stored as bytes in memory (estimated size 2.1 KB, free 413.6 MB)
2018-05-16 12:04:14 INFO TorrentBroadcast:54 - Reading broadcast variable 5 took 10 ms
2018-05-16 12:04:14 INFO MemoryStore:54 - Block broadcast_5 stored as values in memory (estimated size 3.5 KB, free 413.6 MB)
2018-05-16 12:04:14 INFO MapOutputTrackerWorker:54 - Don't have map outputs for shuffle 2, fetching them
2018-05-16 12:04:14 INFO MapOutputTrackerWorker:54 - Doing the fetch; tracker endpoint = NettyRpcEndpointRef(spark://MapOutputTracker@master:43871)
2018-05-16 12:04:14 INFO MapOutputTrackerWorker:54 - Got the output locations
2018-05-16 12:04:14 INFO ShuffleBlockFetcherIterator:54 - Getting 7 non-empty blocks out of 7 blocks
2018-05-16 12:04:14 INFO ShuffleBlockFetcherIterator:54 - Started 1 remote fetches in 1 ms
2018-05-16 12:04:14 INFO Executor:54 - Finished task 0.0 in stage 6.0 (TID 28). 418968 bytes result sent to driver
2018-05-16 12:04:14 INFO CoarseGrainedExecutorBackend:54 - Got assigned task 30
2018-05-16 12:04:14 INFO Executor:54 - Running task 2.0 in stage 6.0 (TID 30)
2018-05-16 12:04:14 INFO ShuffleBlockFetcherIterator:54 - Getting 7 non-empty blocks out of 7 blocks
2018-05-16 12:04:14 INFO ShuffleBlockFetcherIterator:54 - Started 1 remote fetches in 1 ms
2018-05-16 12:04:15 INFO Executor:54 - Finished task 2.0 in stage 6.0 (TID 30). 564993 bytes result sent to driver
2018-05-16 12:04:15 INFO CoarseGrainedExecutorBackend:54 - Got assigned task 32
2018-05-16 12:04:15 INFO Executor:54 - Running task 4.0 in stage 6.0 (TID 32)
2018-05-16 12:04:15 INFO ShuffleBlockFetcherIterator:54 - Getting 7 non-empty blocks out of 7 blocks
2018-05-16 12:04:15 INFO ShuffleBlockFetcherIterator:54 - Started 1 remote fetches in 3 ms
2018-05-16 12:04:15 INFO Executor:54 - Finished task 4.0 in stage 6.0 (TID 32). 549159 bytes result sent to driver
2018-05-16 12:04:15 INFO CoarseGrainedExecutorBackend:54 - Got assigned task 34
2018-05-16 12:04:15 INFO Executor:54 - Running task 6.0 in stage 6.0 (TID 34)
2018-05-16 12:04:15 INFO ShuffleBlockFetcherIterator:54 - Getting 7 non-empty blocks out of 7 blocks
2018-05-16 12:04:15 INFO ShuffleBlockFetcherIterator:54 - Started 1 remote fetches in 2 ms
2018-05-16 12:04:15 INFO Executor:54 - Finished task 6.0 in stage 6.0 (TID 34). 706956 bytes result sent to driver
2018-05-16 12:04:16 INFO CoarseGrainedExecutorBackend:54 - Driver commanded a shutdown
2018-05-16 12:04:16 ERROR CoarseGrainedExecutorBackend:43 - RECEIVED SIGNAL TERM
@kdgregory
Copy link
Author

kdgregory commented May 16, 2018

Setup machine:

  • Add EMR private key
  • Add /etc/hosts entries for master, workers
  • sudo yum remove java-1.7.0-openjdk.x86_64
  • sudo yum install java-1.8.0-openjdk-devel.x86_64

Download dependencies:

  • aws s3 cp --recursive s3://com-kdgregory-transfer/FolgerShakespeare/ /tmp
  • aws s3 cp s3://com-kdgregory-transfer/spark-2.3.0-bin-hadoop2.7.tgz /tmp
  • aws s3 cp s3://com-kdgregory-transfer/logging-dependencies-spark.tgz /tmp

Install Spark:

  • cd
  • tar zxvf /tmp/spark-2.3.0-bin-hadoop2.7.tgz
  • cd spark-2.3.0-bin-hadoop2.7/

Update logging configuration:

  • pushd jars/
  • tar zxvf /tmp/logging-dependencies-spark.tgz
  • popd
  • cp conf/log4j.properties.template conf/log4j.properties
  • vi conf/log4j.properties
  • Add content of log4j.properties.update, update rootLogger

Run Master:

  • sbin/start-master.sh
  • tail -f logs/spark-ec2-user-org.apache.spark.deploy.master.*.out

Run Workers:

  • sbin/start-slave.sh spark://master:7077
  • tail -f logs/spark-ec2-user-org.apache.spark.deploy.worker.*.out

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment