# Using the pre-built Hadoop 2 distribution of Spark; upload README.md to HDFS first
[hadoop@umcc97-44 spark-1.0.0-bin-hadoop2]$ hadoop fs -put README.md ./
# References: http://spark.apache.org/docs/latest/quick-start.html
# http://spark.apache.org/examples.html
# https://github.com/apache/spark/tree/master/examples/src/main/java/org/apache/spark/examples
[hadoop@umcc97-44 spark-1.0.0-bin-hadoop2]$ bin/spark-shell
...
scala> var textFile=sc.textFile("README.md")
14/06/25 15:10:45 INFO storage.MemoryStore: ensureFreeSpace(139131) called with curMem=0, maxMem=309225062
14/06/25 15:10:45 INFO storage.MemoryStore: Block broadcast_0 stored as values to memory (estimated size 135.9 KB, free 294.8 MB)
textFile: org.apache.spark.rdd.RDD[String] = MappedRDD[1] at textFile at <console>:12
scala> textFile.count()
14/06/25 15:10:48 INFO mapred.FileInputFormat: Total input paths to process : 1
...
14/06/25 15:10:48 INFO scheduler.TaskSetManager: Finished TID 1 in 255 ms on localhost (progress: 2/2)
14/06/25 15:10:48 INFO scheduler.DAGScheduler: Stage 0 (count at <console>:15) finished in 0.284 s
14/06/25 15:10:48 INFO scheduler.TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
14/06/25 15:10:48 INFO spark.SparkContext: Job finished: count at <console>:15, took 0.429361 s
res0: Long = 127
scala> textFile.first()
14/06/25 15:11:45 INFO spark.SparkContext: Starting job: first at <console>:15
14/06/25 15:11:45 INFO scheduler.DAGScheduler: Got job 1 (first at <console>:15) with 1 output partitions (allowLocal=true)
14/06/25 15:11:45 INFO scheduler.DAGScheduler: Final stage: Stage 1(first at <console>:15)
14/06/25 15:11:45 INFO scheduler.DAGScheduler: Parents of final stage: List()
14/06/25 15:11:45 INFO scheduler.DAGScheduler: Missing parents: List()
14/06/25 15:11:45 INFO scheduler.DAGScheduler: Computing the requested partition locally
14/06/25 15:11:45 INFO rdd.HadoopRDD: Input split: hdfs://umcc97-44:9000/user/hadoop/README.md:0+2110
14/06/25 15:11:45 INFO spark.SparkContext: Job finished: first at <console>:15, took 0.034659 s
res1: String = # Apache Spark
scala> textFile.filter(line=>line.contains("Spark"))
res2: org.apache.spark.rdd.RDD[String] = FilteredRDD[2] at filter at <console>:15
scala> textFile.filter(line=>line.contains("Spark")).count()
14/06/25 15:12:42 INFO spark.SparkContext: Starting job: count at <console>:15
...
14/06/25 15:12:42 INFO scheduler.DAGScheduler: Completed ResultTask(2, 0)
14/06/25 15:12:42 INFO scheduler.TaskSetManager: Finished TID 2 in 42 ms on localhost (progress: 1/2)
14/06/25 15:12:42 INFO scheduler.DAGScheduler: Completed ResultTask(2, 1)
14/06/25 15:12:42 INFO scheduler.TaskSetManager: Finished TID 3 in 40 ms on localhost (progress: 2/2)
14/06/25 15:12:42 INFO scheduler.TaskSchedulerImpl: Removed TaskSet 2.0, whose tasks have all completed, from pool
14/06/25 15:12:42 INFO scheduler.DAGScheduler: Stage 2 (count at <console>:15) finished in 0.044 s
14/06/25 15:12:42 INFO spark.SparkContext: Job finished: count at <console>:15, took 0.057826 s
res3: Long = 15
scala> val wordCounts=textFile.flatMap(line=>line.split(" ")).map(word=>(word, 1)).reduceByKey((a,b)=>a+b)
wordCounts: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[8] at reduceByKey at <console>:14
scala> wordCounts.collect()
14/06/25 15:13:51 INFO spark.SparkContext: Starting job: collect at <console>:17
...
14/06/25 15:13:51 INFO scheduler.DAGScheduler: Stage 3 (collect at <console>:17) finished in 0.105 s
14/06/25 15:13:51 INFO scheduler.TaskSchedulerImpl: Removed TaskSet 3.0, whose tasks have all completed, from pool
14/06/25 15:13:51 INFO spark.SparkContext: Job finished: collect at <console>:17, took 0.387106 s
res4: Array[(String, Int)] = Array((means,1), (under,2), (this,4), (Because,1), (Python,2), (agree,1), (cluster.,1), (its,1), (YARN,,3), (have,2), (pre-built,1), (MRv1,,1), (locally.,1), (locally,2), (changed,1), (several,1), (only,1), (sc.parallelize(1,1), (This,2), (basic,1), (first,1), (requests,1), (documentation,1), (Configuration,1), (MapReduce,2), (without,1), (setting,1), ("yarn-client",1), ([params]`.,1), (any,2), (application,1), (prefer,1), (SparkPi,2), (<http://spark.apache.org/>,1), (version,3), (file,1), (documentation,,1), (test,1), (MASTER,1), (entry,1), (example,3), (are,2), (systems.,1), (params,1), (scala>,1), (<artifactId>hadoop-client</artifactId>,1), (refer,1), (configure,1), (Interactive,2), (artifact,1), (can,7), (file's,1), (build,3), (when,2), (2.0.X,,1), (Apac...
scala>
# Write a simple hello-world app with Maven and submit it to YARN (a sketch of SimpleApp follows)
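# The source of simple-project-1.0.jar is not part of this gist. Below is a minimal sketch
# of what such a SimpleApp might look like, in the style of the Spark 1.0 quick-start Java
# example; the class name, input path and output format are assumptions inferred from the
# log lines ("count at SimpleApp.java:16", "Lines with a: ..., lines with b: ...").
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

public class SimpleApp {
  public static void main(String[] args) {
    // README.md was uploaded to HDFS earlier with `hadoop fs -put README.md ./`
    String logFile = "README.md";
    SparkConf conf = new SparkConf().setAppName("Simple Application");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> logData = sc.textFile(logFile).cache();

    // Count lines containing "a" and lines containing "b"
    long numAs = logData.filter(new Function<String, Boolean>() {
      public Boolean call(String s) { return s.contains("a"); }
    }).count();
    long numBs = logData.filter(new Function<String, Boolean>() {
      public Boolean call(String s) { return s.contains("b"); }
    }).count();

    System.out.println("Lines with a: " + numAs + ", lines with b: " + numBs);
    sc.stop();
  }
}
# Package the class with Maven (the quick-start guide uses a dependency on
# spark-core_2.10, version 1.0.0) and pass the resulting jar to spark-submit as shown below.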
[hadoop@umcc97-44 spark-1.0.0-bin-hadoop2]$ bin/spark-submit --class "SimpleApp" --master local[2] hello/simple-project-1.0.jar
Spark assembly has been built with Hive, including Datanucleus jars on classpath
14/06/25 15:37:19 INFO spark.SecurityManager: Changing view acls to: hadoop
...
14/06/25 15:37:22 INFO spark.SparkContext: Job finished: count at SimpleApp.java:16, took 0.029177 s
Lines with a: 73, lines with b: 35
[hadoop@umcc97-44 spark-1.0.0-bin-hadoop2]$ bin/spark-submit --class "SimpleApp" --master yarn-cluster hello/simple-project-1.0.jar
Spark assembly has been built with Hive, including Datanucleus jars on classpath
14/06/25 15:31:28 INFO client.RMProxy: Connecting to ResourceManager at umcc97-79/10.18.97.79:8032
...
14/06/25 15:31:49 INFO yarn.Client: Application report from ASM:
application identifier: application_1403006477300_3408
appId: 3408
clientToAMToken: null
appDiagnostics:
appMasterHost: umcc97-142
appQueue: default
appMasterRpcPort: 0
appStartTime: 1403681493261
yarnAppState: FINISHED
distributedFinalState: SUCCEEDED
appTrackingUrl: umcc97-79:8088/proxy/application_1403006477300_3408/A
appUser: hadoop
# Job logs can be viewed through the YARN web UI at yarnserver:8088
[hadoop@umcc97-44 spark-1.0.0-bin-hadoop2]$ bin/spark-submit --class "SimpleApp" --master yarn-client hello/simple-project-1.0.jar
Spark assembly has been built with Hive, including Datanucleus jars on classpath
14/06/25 17:34:45 INFO spark.SecurityManager: Changing view acls to: hadoop
...
14/06/25 17:35:09 INFO cluster.YarnClientClusterScheduler: Adding task set 1.0 with 2 tasks
14/06/25 17:35:09 INFO scheduler.TaskSetManager: Starting task 1.0:0 as TID 2 on executor 2: umcc97-142 (PROCESS_LOCAL)
14/06/25 17:35:09 INFO scheduler.TaskSetManager: Serialized task 1.0:0 as 1902 bytes in 1 ms
14/06/25 17:35:09 INFO scheduler.TaskSetManager: Starting task 1.0:1 as TID 3 on executor 1: umcc97-144 (PROCESS_LOCAL)
14/06/25 17:35:09 INFO scheduler.TaskSetManager: Serialized task 1.0:1 as 1902 bytes in 0 ms
14/06/25 17:35:09 INFO scheduler.DAGScheduler: Completed ResultTask(1, 0)
14/06/25 17:35:09 INFO scheduler.TaskSetManager: Finished TID 2 in 32 ms on umcc97-142 (progress: 1/2)
14/06/25 17:35:09 INFO scheduler.DAGScheduler: Completed ResultTask(1, 1)
14/06/25 17:35:09 INFO scheduler.TaskSetManager: Finished TID 3 in 33 ms on umcc97-144 (progress: 2/2)
14/06/25 17:35:09 INFO cluster.YarnClientClusterScheduler: Removed TaskSet 1.0, whose tasks have all completed, from pool
14/06/25 17:35:09 INFO scheduler.DAGScheduler: Stage 1 (count at SimpleApp.java:16) finished in 0.036 s
14/06/25 17:35:09 INFO spark.SparkContext: Job finished: count at SimpleApp.java:16, took 0.054881 s
Lines with a: 73, lines with b: 35
[hadoop@umcc97-44 spark-1.0.0-bin-hadoop2]$
# Reference: http://spark.apache.org/docs/latest/spark-standalone.html
[hadoop@umcc97-44 spark-1.0.0-bin-hadoop2]$ sbin/start-master.sh
starting org.apache.spark.deploy.master.Master, logging to /home/hadoop/spark-1.0.0-bin-hadoop2/sbin/../logs/spark-hadoop-org.apache.spark.deploy.master.Master-1-umcc97-44.out
[hadoop@umcc97-44 spark-1.0.0-bin-hadoop2]$ bin/spark-class org.apache.spark.deploy.worker.Worker spark://umcc97-44:7077
Spark assembly has been built with Hive, including Datanucleus jars on classpath
14/06/25 16:01:27 INFO spark.SecurityManager: Changing view acls to: hadoop
14/06/25 16:01:27 INFO spark.SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(hadoop)
14/06/25 16:01:27 INFO slf4j.Slf4jLogger: Slf4jLogger started
14/06/25 16:01:27 INFO Remoting: Starting remoting
14/06/25 16:01:28 INFO Remoting: Remoting started; listening on addresses :[akka.tcp://sparkWorker@umcc97-44:16211]
14/06/25 16:01:28 INFO worker.Worker: Starting Spark worker umcc97-44:16211 with 4 cores, 14.7 GB RAM
14/06/25 16:01:28 INFO worker.Worker: Spark home: /home/hadoop/spark-1.0.0-bin-hadoop2
14/06/25 16:01:28 INFO server.Server: jetty-8.y.z-SNAPSHOT
14/06/25 16:01:28 INFO server.AbstractConnector: Started [email protected]:8081
14/06/25 16:01:28 INFO ui.WorkerWebUI: Started WorkerWebUI at http://umcc97-44:8081
14/06/25 16:01:28 INFO worker.Worker: Connecting to master spark://umcc97-44:7077...
14/06/25 16:01:28 INFO worker.Worker: Successfully registered with master spark://umcc97-44:7077
[hadoop@umcc97-44 spark-1.0.0-bin-hadoop2]$ bin/spark-shell --master spark://umcc97-44:7077
Spark assembly has been built with Hive, including Datanucleus jars on classpath
...
Spark context available as sc.
...
scala> var textFile=sc.textFile("README.md")
14/06/25 16:11:48 INFO storage.MemoryStore: ensureFreeSpace(70553) called with curMem=139131, maxMem=309225062
14/06/25 16:11:48 INFO storage.MemoryStore: Block broadcast_1 stored as values to memory (estimated size 68.9 KB, free 294.7 MB)
textFile: org.apache.spark.rdd.RDD[String] = MappedRDD[8] at textFile at <console>:12
scala> val wordCounts=textFile.flatMap(line=>line.split(" ")).map(word=>(word, 1)).reduceByKey((a,b)=>a+b)
14/06/25 16:11:51 INFO mapred.FileInputFormat: Total input paths to process : 1
wordCounts: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[13] at reduceByKey at <console>:14
scala> wordCounts.collect()
14/06/25 16:11:54 INFO spark.SparkContext: Starting job: collect at <console>:17
...
14/06/25 16:11:54 INFO scheduler.DAGScheduler: Stage 2 (collect at <console>:17) finished in 0.081 s
14/06/25 16:11:54 INFO spark.SparkContext: Job finished: collect at <console>:17, took 0.407533 s
res1: Array[(String, Int)] = Array((means,1), (under,2), (this,4), (Because,1), (Python,2), (agree,1), (cluster.,1), (its,1), (YARN,,3), (have,2), (pre-built,1), (MRv1,,1), (locally.,1), (locally,2), (changed,1), (several,1), (only,1), (sc.parallelize(1,1), (This,2), (basic,1), (first,1), (requests,1), (documentation,1), (Configuration,1), (MapReduce,2), (without,1), (setting,1), ("yarn-client",1), ([params]`.,1), (any,2), (application,1), (prefer,1), (SparkPi,2), (<http://spark.apache.org/>,1), (version,3), (file,1), (documentation,,1), (test,1), (MASTER,1), (entry,1), (example,3), (are,2), (systems.,1), (params,1), (scala>,1), (<artifactId>hadoop-client</artifactId>,1), (refer,1), (configure,1), (Interactive,2), (artifact,1), (can,7), (file's,1), (build,3), (when,2), (2.0.X,,1), (Apac...
# Jobs can be viewed through the standalone master web UI at http://umcc97-44:8080/
scala> textFile.count()
# Run count() a few more times; at http://umcc97-44:4040/stages/ a new entry appears for each run