Created
May 14, 2020 15:54
-
-
Save ruebot/d14d15f43da723ba6143807df047e09c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from aut import *" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+-----------------------------------------------------------------------------------+----------------+\n", | |
"|url |http_status_code|\n", | |
"+-----------------------------------------------------------------------------------+----------------+\n", | |
"|http://geocities.com/babiekaos/Links.html |200 |\n", | |
"|http://geocities.com/cloneaccount3/6490/ |200 |\n", | |
"|http://www.geocities.com/coledale28/hi-power-soldiers-music.html |200 |\n", | |
"|http://www.geocities.com/orvilleduncan811/12-day-of-christmas-sheet-music.html |200 |\n", | |
"|http://geocities.com/jtbm71/fotos/2000/ |200 |\n", | |
"|http://geocities.com/cancmay/s/sunshine.html |200 |\n", | |
"|http://www.talent-direct.com/cgi-bin/tal_pro.cgi?profile=ARZCdYbJU5KsMARKdUxiO4l3DY|200 |\n", | |
"|http://geocities.com/akimi919/sp_ph/?M=A |200 |\n", | |
"|http://geocities.com/cancmay/s/save-tonight.html |200 |\n", | |
"|http://www.geocities.com/orvilleduncan811/child-youth-elbow-knee-pad.html |200 |\n", | |
"+-----------------------------------------------------------------------------------+----------------+\n", | |
"only showing top 10 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"WebArchive(sc, sqlContext, \"/home/nruest/Projects/au/sample-data/geocities\")\\\n", | |
" .all()\\\n", | |
" .select(\"url\", \"http_status_code\")\\\n", | |
" .show(10, False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"WebArchive(sc, sqlContext, \"/home/nruest/Projects/au/sample-data/geocities\")\\\n", | |
" .all()\\\n", | |
" .select(\"url\", \"archive_filename\")\\\n", | |
" .show(10, True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+---------------------------+-----+\n", | |
"|Domain |count|\n", | |
"+---------------------------+-----+\n", | |
"|geocities.com |93886|\n", | |
"|www.geocities.com |29223|\n", | |
"|www.infocastfn.com |430 |\n", | |
"|rcm.amazon.com |201 |\n", | |
"|www.bagus.com |133 |\n", | |
"|www.globalimagegallery.com |130 |\n", | |
"|www.physforum.com |124 |\n", | |
"|www.internetarchaeology.org|121 |\n", | |
"|us.geocities.com |121 |\n", | |
"|www.spb.tvoe.tv |108 |\n", | |
"+---------------------------+-----+\n", | |
"only showing top 10 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"from pyspark.sql.functions import desc\n", | |
"\n", | |
"WebArchive(sc, sqlContext, \"/home/nruest/Projects/au/sample-data/geocities\")\\\n", | |
" .webpages()\\\n", | |
" .select(Udf.extract_domain(\"url\").alias(\"Domain\"))\\\n", | |
" .groupBy(\"Domain\")\\\n", | |
" .count()\\\n", | |
" .sort(desc(\"count\"))\\\n", | |
" .show(10, False)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+----------+--------------------+--------------------+--------------------+\n", | |
"|crawl_date| domain| url| content|\n", | |
"+----------+--------------------+--------------------+--------------------+\n", | |
"| 20091027| geocities.com|http://geocities....|Sushi Land Sushi ...|\n", | |
"| 20091027| geocities.com|http://geocities....|Andrea Cruz Welco...|\n", | |
"| 20091027| www.geocities.com|http://www.geocit...|Hi Power Soldiers...|\n", | |
"| 20091027| www.geocities.com|http://www.geocit...|12 Day Of Christm...|\n", | |
"| 20091027| geocities.com|http://geocities....|Index of /jtbm71/...|\n", | |
"| 20091027| geocities.com|http://geocities....|sunshine CanCMay ...|\n", | |
"| 20091027|www.talent-direct...|http://www.talent...|talent direct voi...|\n", | |
"| 20091027| geocities.com|http://geocities....|Index of /akimi91...|\n", | |
"| 20091027| geocities.com|http://geocities....|stardust CanCMay ...|\n", | |
"| 20091027| www.geocities.com|http://www.geocit...|Child Youth Elbow...|\n", | |
"+----------+--------------------+--------------------+--------------------+\n", | |
"only showing top 10 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"WebArchive(sc, sqlContext, \"/home/nruest/Projects/au/sample-data/geocities\")\\\n", | |
" .webpages()\\\n", | |
" .select(\"crawl_date\", Udf.extract_domain(\"url\").alias(\"domain\"), \"url\", Udf.remove_html(\"content\").alias(\"content\"))\\\n", | |
" .show(10, True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+--------------------+\n", | |
"| content|\n", | |
"+--------------------+\n", | |
"|Sushi Land Sushi ...|\n", | |
"|Andrea Cruz Welco...|\n", | |
"|Hi Power Soldiers...|\n", | |
"|12 Day Of Christm...|\n", | |
"|Index of /jtbm71/...|\n", | |
"|sunshine CanCMay ...|\n", | |
"|talent direct voi...|\n", | |
"|Index of /akimi91...|\n", | |
"|stardust CanCMay ...|\n", | |
"|Child Youth Elbow...|\n", | |
"+--------------------+\n", | |
"only showing top 10 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"WebArchive(sc, sqlContext, \"/home/nruest/Projects/au/sample-data/geocities\")\\\n", | |
" .webpages()\\\n", | |
" .select(Udf.remove_html(Udf.remove_http_header(\"content\")).alias(\"content\"))\\\n", | |
" .show(10, True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+----------+--------------------+--------------------+--------------------+\n", | |
"|crawl_date| domain| url| content|\n", | |
"+----------+--------------------+--------------------+--------------------+\n", | |
"| 20091027| geocities.com|http://geocities....|Sushi Land Sushi ...|\n", | |
"| 20091027| geocities.com|http://geocities....|Andrea Cruz Welco...|\n", | |
"| 20091027| www.geocities.com|http://www.geocit...|Hi Power Soldiers...|\n", | |
"| 20091027| www.geocities.com|http://www.geocit...|12 Day Of Christm...|\n", | |
"| 20091027| geocities.com|http://geocities....|Index of /jtbm71/...|\n", | |
"| 20091027| geocities.com|http://geocities....|sunshine CanCMay ...|\n", | |
"| 20091027|www.talent-direct...|http://www.talent...|talent direct voi...|\n", | |
"| 20091027| geocities.com|http://geocities....|Index of /akimi91...|\n", | |
"| 20091027| geocities.com|http://geocities....|stardust CanCMay ...|\n", | |
"| 20091027| www.geocities.com|http://www.geocit...|Child Youth Elbow...|\n", | |
"+----------+--------------------+--------------------+--------------------+\n", | |
"only showing top 10 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"WebArchive(sc, sqlContext, \"/home/nruest/Projects/au/sample-data/geocities\")\\\n", | |
" .webpages()\\\n", | |
" .select(\"crawl_date\", Udf.extract_domain(\"url\").alias(\"domain\"), \"url\", Udf.remove_html(Udf.remove_http_header(\"content\")).alias(\"content\"))\\\n", | |
" .show(10, True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+----------+--------------------+--------------------+--------------------+\n", | |
"|crawl_date| domain| url| content|\n", | |
"+----------+--------------------+--------------------+--------------------+\n", | |
"| 20091027| geocities.com|http://geocities....|Nori (seaweed) wa...|\n", | |
"| 20091027| geocities.com|http://geocities....| This site is about:|\n", | |
"| 20091027| www.geocities.com|http://www.geocit...|Hi Power Soldiers...|\n", | |
"| 20091027| www.geocities.com|http://www.geocit...|12 Day Of Christm...|\n", | |
"| 20091027| geocities.com|http://geocities....| |\n", | |
"| 20091027| geocities.com|http://geocities....|CanCMay Sunshine ...|\n", | |
"| 20091027|www.talent-direct...|http://www.talent...| |\n", | |
"| 20091027| geocities.com|http://geocities....| |\n", | |
"| 20091027| geocities.com|http://geocities....|Save Tonight Mind...|\n", | |
"| 20091027| www.geocities.com|http://www.geocit...|Child Youth Elbow...|\n", | |
"+----------+--------------------+--------------------+--------------------+\n", | |
"only showing top 10 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"WebArchive(sc, sqlContext, \"/home/nruest/Projects/au/sample-data/geocities\")\\\n", | |
" .webpages()\\\n", | |
" .select(\"crawl_date\", Udf.extract_domain(\"url\").alias(\"domain\"), \"url\", Udf.extract_boilerplate(Udf.remove_http_header(\"content\")).alias(\"content\"))\\\n", | |
" .show(10, True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+----------+--------------------+--------------------+--------------------+\n", | |
"|crawl_date| domain| url| content|\n", | |
"+----------+--------------------+--------------------+--------------------+\n", | |
"| 20091027| geocities.com|http://geocities....|\r\n", | |
"<html>\r\n", | |
"\r\n", | |
"<head...|\n", | |
"| 20091027| geocities.com|http://geocities....|<html><head><titl...|\n", | |
"| 20091027| www.geocities.com|http://www.geocit...|<html>\r\n", | |
"\r\n", | |
"<head>\r", | |
"...|\n", | |
"| 20091027| www.geocities.com|http://www.geocit...|<html>\r\n", | |
"\r\n", | |
"<head>\r", | |
"...|\n", | |
"| 20091027| geocities.com|http://geocities....|<!DOCTYPE HTML PU...|\n", | |
"| 20091027| geocities.com|http://geocities....|<html>\r\n", | |
"<head><ti...|\n", | |
"| 20091027|www.talent-direct...|http://www.talent...|\r\n", | |
"\r\n", | |
"<!DOCTYPE htm...|\n", | |
"| 20091027| geocities.com|http://geocities....|<!DOCTYPE HTML PU...|\n", | |
"| 20091027| geocities.com|http://geocities....|<html>\r\n", | |
"<head><ti...|\n", | |
"| 20091027| www.geocities.com|http://www.geocit...|<html>\r\n", | |
"\r\n", | |
"<head>\r", | |
"...|\n", | |
"+----------+--------------------+--------------------+--------------------+\n", | |
"only showing top 10 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"WebArchive(sc, sqlContext, \"/home/nruest/Projects/au/sample-data/geocities\")\\\n", | |
" .webpages()\\\n", | |
" .select(\"crawl_date\", Udf.extract_domain(\"url\").alias(\"domain\"), \"url\", Udf.remove_http_header(\"content\").alias(\"content\"))\\\n", | |
" .show(10, True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+--------------------+--------------------+--------------------+--------------------+--------------+------+-----+\n", | |
"| udf_sha1| sha1| udf_md5| md5|udf_image_size|height|width|\n", | |
"+--------------------+--------------------+--------------------+--------------------+--------------+------+-----+\n", | |
"|99d74a3b4fbd6d7cd...|9ca2bc31550f9369e...|ce1b5ab3e51fd9f6b...|ce4c718e925105232...| [0, 0]| 432| 288|\n", | |
"|245b94c90eac0dcd9...|ff0467d8d2cbc5d50...|9b8909a52d94b6d17...|f6b631a4db5f4c7a3...| [0, 0]| 103| 1200|\n", | |
"|cd19e4e7e2dd9e090...|faa81452f0c19b304...|a97c139a3a31467ae...|4f59788bde58d15d5...| [0, 0]| 1| 1|\n", | |
"|fd5eb52badba72a29...|0720946d3ced04976...|83ca84887072a62b9...|2677171223600bf34...| [0, 0]| 480| 1050|\n", | |
"|9333370d1f79af66c...|f9aa611fc62b735c3...|586628aaae7e0076a...|0a089830419a5c0ed...| [0, 0]| 315| 217|\n", | |
"|676b4a596a901024a...|5bb4bf5dfe39520a3...|dcec4d3ffac515f73...|a0210969ba9fac53a...| [0, 0]| 156| 136|\n", | |
"|4c99aa50462f84723...|b8a56b4dc015bdcc2...|fb51b7a1e1c25dc87...|c7d81ae036f502cf3...| [0, 0]| 32| 200|\n", | |
"|2bd92aea1b6370079...|0075394d3de702d27...|ad30253c36cb51e8a...|835fa6581c493ad15...| [0, 0]| 60| 600|\n", | |
"|86a7bcceae53c92b6...|2d99c303d7e8ca75f...|00628da87d6300e7c...|0134e45aca6297e8c...| [0, 0]| 36| 140|\n", | |
"|c5fa5c7c1a897f136...|429e3558e2b579426...|f9411618cf0a1c858...|a7b85484410cde43e...| [0, 0]| 640| 480|\n", | |
"+--------------------+--------------------+--------------------+--------------------+--------------+------+-----+\n", | |
"only showing top 10 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"WebArchive(sc, sqlContext, \"/home/nruest/Projects/au/sample-data/geocities\")\\\n", | |
" .images()\\\n", | |
" .select(Udf.compute_sha1(\"bytes\").alias(\"udf_sha1\"), \"sha1\", Udf.compute_md5(\"bytes\").alias(\"udf_md5\"), \"md5\", Udf.compute_image_size(\"bytes\").alias(\"udf_image_size\"), \"height\", \"width\")\\\n", | |
" .show(10, True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+----------+------------+--------+\n", | |
"|crawl_date|udf_language|language|\n", | |
"+----------+------------+--------+\n", | |
"| 20091027| en| en|\n", | |
"| 20091027| en| en|\n", | |
"| 20091027| en| en|\n", | |
"| 20091027| en| en|\n", | |
"| 20091027| en| en|\n", | |
"| 20091027| en| en|\n", | |
"| 20091027| en| en|\n", | |
"| 20091027| en| ms|\n", | |
"| 20091027| en| en|\n", | |
"| 20091027| en| en|\n", | |
"+----------+------------+--------+\n", | |
"only showing top 10 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"WebArchive(sc, sqlContext, \"/home/nruest/Projects/au/sample-data/geocities\")\\\n", | |
" .webpages()\\\n", | |
" .select(\"crawl_date\", Udf.detect_language(\"content\").alias(\"udf_language\"), \"language\")\\\n", | |
" .show(10, True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+----------+--------------------+--------------------+\n", | |
"|crawl_date| udf_tika| mime_type_tika|\n", | |
"+----------+--------------------+--------------------+\n", | |
"| 20091027| text/html| text/html|\n", | |
"| 20091027| text/html| text/html|\n", | |
"| 20091027| text/html| text/html|\n", | |
"| 20091027| text/html| text/html|\n", | |
"| 20091027| text/html| text/html|\n", | |
"| 20091027| text/html| text/html|\n", | |
"| 20091027|application/xhtml...|application/xhtml...|\n", | |
"| 20091027| text/html| text/html|\n", | |
"| 20091027| text/html| text/html|\n", | |
"| 20091027| text/html| text/html|\n", | |
"+----------+--------------------+--------------------+\n", | |
"only showing top 10 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"WebArchive(sc, sqlContext, \"/home/nruest/Projects/au/sample-data/geocities\")\\\n", | |
" .all()\\\n", | |
" .select(\"crawl_date\", Udf.detect_mime_type_tika(\"bytes\").alias(\"udf_tika\"), \"mime_type_tika\")\\\n", | |
" .show(10, True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"ename": "Py4JJavaError", | |
"evalue": "An error occurred while calling o397.apply.\n: java.lang.ClassCastException: io.archivesunleashed.UdfLoader$$anonfun$hasContent$1 cannot be cast to scala.Function1\n\tat org.apache.spark.sql.catalyst.expressions.ScalaUDF.<init>(ScalaUDF.scala:104)\n\tat org.apache.spark.sql.expressions.UserDefinedFunction.apply(UserDefinedFunction.scala:85)\n\tat sun.reflect.GeneratedMethodAccessor70.invoke(Unknown Source)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\n\tat java.lang.Thread.run(Thread.java:748)\n", | |
"output_type": "error", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mPy4JJavaError\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-25-20fdfdb2bd6b>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mWebArchive\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msqlContext\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"/home/nruest/Projects/au/sample-data/geocities\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mwebpages\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;34m.\u001b[0m\u001b[0mfilter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mUdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhas_content\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"content\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcontent\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mselect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"crawl_date\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mUdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextract_domain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"url\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0malias\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"domain\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"url\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mUdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mremove_http_header\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"content\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0malias\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"content\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mshow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m/tmp/spark-ebab34db-5e87-43ba-9304-301861215262/userFiles-90a2a3e8-a131-4bcc-91fe-e9e52f8a8941/aut.zip/aut/udfs.py\u001b[0m in \u001b[0;36mhas_content\u001b[0;34m(col, content)\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 81\u001b[0m )\n\u001b[0;32m---> 82\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mColumn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mudf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_to_seq\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mcol\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_to_java_column\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcontent\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 83\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mremove_http_header\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcol\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/bin/spark-2.4.5-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1255\u001b[0m \u001b[0manswer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1256\u001b[0m return_value = get_return_value(\n\u001b[0;32m-> 1257\u001b[0;31m answer, self.gateway_client, self.target_id, self.name)\n\u001b[0m\u001b[1;32m 1258\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1259\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtemp_arg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtemp_args\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/bin/spark-2.4.5-bin-hadoop2.7/python/pyspark/sql/utils.py\u001b[0m in \u001b[0;36mdeco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdeco\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mpy4j\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprotocol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPy4JJavaError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0ms\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjava_exception\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoString\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/bin/spark-2.4.5-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py\u001b[0m in \u001b[0;36mget_return_value\u001b[0;34m(answer, gateway_client, target_id, name)\u001b[0m\n\u001b[1;32m 326\u001b[0m raise Py4JJavaError(\n\u001b[1;32m 327\u001b[0m \u001b[0;34m\"An error occurred while calling {0}{1}{2}.\\n\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 328\u001b[0;31m format(target_id, \".\", name), value)\n\u001b[0m\u001b[1;32m 329\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 330\u001b[0m raise Py4JError(\n", | |
"\u001b[0;31mPy4JJavaError\u001b[0m: An error occurred while calling o397.apply.\n: java.lang.ClassCastException: io.archivesunleashed.UdfLoader$$anonfun$hasContent$1 cannot be cast to scala.Function1\n\tat org.apache.spark.sql.catalyst.expressions.ScalaUDF.<init>(ScalaUDF.scala:104)\n\tat org.apache.spark.sql.expressions.UserDefinedFunction.apply(UserDefinedFunction.scala:85)\n\tat sun.reflect.GeneratedMethodAccessor70.invoke(Unknown Source)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\n\tat java.lang.Thread.run(Thread.java:748)\n" | |
] | |
} | |
], | |
"source": [ | |
"content = [\"radio\"]\n", | |
"\n", | |
"WebArchive(sc, sqlContext, \"/home/nruest/Projects/au/sample-data/geocities\")\\\n", | |
" .webpages()\\\n", | |
" .filter(Udf.has_content(\"content\", content))\\\n", | |
" .select(\"crawl_date\", Udf.extract_domain(\"url\").alias(\"domain\"), \"url\", Udf.remove_http_header(\"content\").alias(\"content\"))\\\n", | |
" .show(10, True)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment