Last active
April 14, 2017 23:48
-
-
Save jeongho/d09a82bdf1ec6dced23208b2010ca9ad to your computer and use it in GitHub Desktop.
Spark terasort
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# https://github.com/ehiggs/spark-terasort
# https://samanaghazadeh.wordpress.com/2015/04/18/terasort-equivalent-for-apache-spark/
10gb | |
application_1492200106363_0096 TeraGen (10GB) 2017/04/14 20:33:33 2017/04/14 20:34:41 1.1 min hdfs 2017/04/14 20:34:41 | |
application_1492200106363_0098 TeraSort 2017/04/14 20:46:55 2017/04/14 20:48:07 1.2 min hdfs 2017/04/14 20:48:07 | |
application_1492200106363_0099 TeraValidate 2017/04/14 20:48:51 2017/04/14 20:49:20 29 s hdfs 2017/04/14 20:49:20 | |
100gb | |
application_1492200106363_0097 TeraGen (100GB) 2017/04/14 20:35:37 2017/04/14 20:48:15 13 min hdfs 2017/04/14 20:48:15 | |
application_1492200106363_0101 TeraSort 2017/04/14 20:53:16 2017/04/14 20:57:42 4.4 min hdfs 2017/04/14 20:57:43 | |
application_1492200106363_0102 TeraValidate 2017/04/14 20:58:21 2017/04/14 20:59:11 50 s hdfs 2017/04/14 20:59:11 | |
100gb yarn terasort mr2 | |
application_1492200106363_0328 TeraGen Fri Apr 14 16:26:03 -0700 2017 2m41.059s | |
application_1492200106363_0307 TeraSort Fri Apr 14 16:20:30 -0700 2017 5m27.424s | |
application_1492200106363_0298 TeraValidate Fri Apr 14 16:26:03 -0700 2017 0m21.955s | |
400gb | |
application_1492200106363_0107 TeraGen (400GB) 2017/04/14 21:05:40 2017/04/14 21:55:03 49 min hdfs 2017/04/14 21:55:03 | |
application_1492200106363_0296 TeraSort 2017/04/14 22:52:30 2017/04/14 23:32:59 40 min hdfs 2017/04/14 23:41:06 | |
>>> 5505/5964 (192 failed) | |
1000gb | |
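900gb (TeraGen aborts: 9000000000 records / 2 output partitions = 4500000000 records per partition, which exceeds the 2147483647 limit checked by the assertion in the output below)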
500gb | |
=========================================================================== | |
=========================================================================== | |
Input size: 900GB | |
Total number of records: 9000000000 | |
Number of output partitions: 2 | |
Number of records/output partition: 4500000000 | |
=========================================================================== | |
=========================================================================== | |
Exception in thread "main" java.lang.AssertionError: assertion failed: records per partition > 2147483647 | |
at scala.Predef$.assert(Predef.scala:179) | |
at com.github.ehiggs.spark.terasort.TeraGen$.main(TeraGen.scala:64) | |
at com.github.ehiggs.spark.terasort.TeraGen.main(TeraGen.scala) | |
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) | |
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) | |
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) | |
at java.lang.reflect.Method.invoke(Method.java:498) | |
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:731) | |
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181) | |
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206) | |
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121) | |
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala) | |
# spark-terasort yarn hdfs | |
spark-submit --class com.github.ehiggs.spark.terasort.TeraGen \
--master yarn \
spark-terasort-1.1-SNAPSHOT.jar \
10g terasort_in
spark-submit --class com.github.ehiggs.spark.terasort.TeraSort \
--master yarn \
spark-terasort-1.1-SNAPSHOT.jar \
terasort_in \
terasort_out
spark-submit --class com.github.ehiggs.spark.terasort.TeraValidate \
--master yarn \
spark-terasort-1.1-SNAPSHOT.jar \
terasort_out \
terasort_validate
# spark-terasort yarn alluxio | |
spark-submit --class com.github.ehiggs.spark.terasort.TeraGen \
--master yarn \
--jars /opt/alluxio/alluxio/client/spark/alluxio-1.4.0-spark-client-jar-with-dependencies.jar \
spark-terasort-1.1-SNAPSHOT.jar \
10g alluxio://${ALLUXIO_MASTER_HOSTNAME}:19998/user/hdfs/terasort_in
spark-submit --class com.github.ehiggs.spark.terasort.TeraSort \
--master yarn \
--jars /opt/alluxio/alluxio/client/spark/alluxio-1.4.0-spark-client-jar-with-dependencies.jar \
spark-terasort-1.1-SNAPSHOT.jar \
alluxio://${ALLUXIO_MASTER_HOSTNAME}:19998/user/hdfs/terasort_in \
alluxio://${ALLUXIO_MASTER_HOSTNAME}:19998/user/hdfs/terasort_out
spark-submit --class com.github.ehiggs.spark.terasort.TeraValidate \
--master yarn \
--jars /opt/alluxio/alluxio/client/spark/alluxio-1.4.0-spark-client-jar-with-dependencies.jar \
spark-terasort-1.1-SNAPSHOT.jar \
alluxio://${ALLUXIO_MASTER_HOSTNAME}:19998/user/hdfs/terasort_out \
alluxio://${ALLUXIO_MASTER_HOSTNAME}:19998/user/hdfs/terasort_validate
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment