Last active
November 30, 2018 19:49
-
-
Save ruebot/228e6b5b83c9b56c42d71ed43c308090 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ ./spark-shell --master local\[10\] --driver-memory 30G --conf spark.network.timeout='10000000' --conf spark.executor.heartbeatInterval='600s' --conf spark.driver.maxResultSize='4G' --jars ~/git/aut/target/aut-0.17.1-SNAPSHOT-fatjar.jar | |
2018-11-30 09:08:03 WARN Utils:66 - Your hostname, wombat resolves to a loopback address: 127.0.1.1; using 10.0.1.44 instead (on interface enp0s31f6) | |
2018-11-30 09:08:03 WARN Utils:66 - Set SPARK_LOCAL_IP if you need to bind to another address | |
2018-11-30 09:08:04 WARN NativeCodeLoader:62 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable | |
Setting default log level to "WARN". | |
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). | |
Spark context Web UI available at http://10.0.1.44:4040 | |
Spark context available as 'sc' (master = local[10], app id = local-1543586887449). | |
Spark session available as 'spark'. | |
Welcome to | |
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.4.0
      /_/
Using Scala version 2.11.12 (OpenJDK 64-Bit Server VM, Java 1.8.0_191) | |
Type in expressions to have them evaluated. | |
Type :help for more information. | |
scala> :paste | |
// Entering paste mode (ctrl-D to finish) | |
import io.archivesunleashed._ | |
import io.archivesunleashed.matchbox._ | |
val r = RecordLoader.loadArchives("/home/nruest/tmp/test-warcs/231/*gz", sc) | |
.keepValidPages() | |
.map(r => ExtractDomain(r.getUrl)) | |
.countItems() | |
.take(10) | |
// Exiting paste mode, now interpreting. | |
import io.archivesunleashed._ | |
import io.archivesunleashed.matchbox._ | |
r: Array[(String, Int)] = Array((fairvotecanada.org,86336), (afn.ca,16348), (ppforum.com,9250), (canadiancrc.com,7178), (coat.ncf.ca,4779), (ccla.org,3870), (nosharia.com,3848), (web.net,3268), (gca.ca,2632), (equalvoice.ca,2070)) | |
scala> :paste | |
// Entering paste mode (ctrl-D to finish) | |
import io.archivesunleashed._ | |
import io.archivesunleashed.matchbox._ | |
val r = RecordLoader.loadArchives("/home/nruest/tmp/test-warcs/231/*gz", sc) | |
.keepValidPages() | |
.map(r => r.getUrl) | |
.take(10) | |
// Exiting paste mode, now interpreting. | |
import io.archivesunleashed._ | |
import io.archivesunleashed.matchbox._ | |
r: Array[String] = Array(http://web.net/%7Eccr/, http://nosharia.com/, http://davidsuzuki.org/, http://policyalternatives.ca/, http://watserv1.uwaterloo.ca/%7Eplough/cnanw/cnanw.html, http://www.canadianlandmine.org/, http://egale.ca/, http://equalvoice.ca/index.htm, http://acp-cpa.ca/, http://westcan.org/) | |
scala> :paste | |
// Entering paste mode (ctrl-D to finish) | |
import io.archivesunleashed._ | |
import io.archivesunleashed.matchbox._ | |
val r = RecordLoader.loadArchives("/home/nruest/tmp/test-warcs/231/*gz", sc) | |
.keepValidPages() | |
.map(r => r.getUrl) | |
.saveAsTextFile("/tmp/aut-test-3") | |
// Exiting paste mode, now interpreting. | |
import io.archivesunleashed._ | |
import io.archivesunleashed.matchbox._ | |
r: Unit = () | |
scala> :paste | |
// Entering paste mode (ctrl-D to finish) | |
import io.archivesunleashed._ | |
import io.archivesunleashed.matchbox._ | |
val r = | |
RecordLoader.loadArchives("/home/nruest/tmp/test-warcs/231/*gz", sc) | |
.keepValidPages() | |
.map(r => ExtractDomain(r.getUrl)) | |
.countItems() | |
.take(10) | |
// Exiting paste mode, now interpreting. | |
import io.archivesunleashed._ | |
import io.archivesunleashed.matchbox._ | |
r: Array[(String, Int)] = Array((fairvotecanada.org,86336), (afn.ca,16348), (ppforum.com,9250), (canadiancrc.com,7178), (coat.ncf.ca,4779), (ccla.org,3870), (nosharia.com,3848), (web.net,3268), (gca.ca,2632), (equalvoice.ca,2070)) | |
scala> :paste | |
// Entering paste mode (ctrl-D to finish) | |
import io.archivesunleashed._ | |
import io.archivesunleashed.matchbox._ | |
RecordLoader.loadArchives("/home/nruest/tmp/test-warcs/231/*gz", sc) | |
.keepValidPages() | |
.map(r => (r.getCrawlDate, r.getDomain, r.getUrl, RemoveHTML(r.getContentString))) | |
.saveAsTextFile("/tmp/aut-test-6") | |
// Exiting paste mode, now interpreting. | |
import io.archivesunleashed._ | |
import io.archivesunleashed.matchbox._ | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment