Created
January 7, 2020 08:05
-
-
Save dk14/4d6e3d03a33d70c88e1af312b15718f2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Loading spark-stubs\n", | |
"Getting spark JARs\n", | |
"Creating SparkSession\n", | |
"[ WARN] Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<a href=\"http://192.168.8.100:4040\">Spark UI</a>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<script>\n", | |
"var comm = Jupyter.notebook.kernel.comm_manager.new_comm('cancel-stage-14b8f141-94cf-481e-8057-4173aca4f4ea', {});\n", | |
"\n", | |
"function cancelStage(stageId) {\n", | |
" console.log('Cancelling stage ' + stageId);\n", | |
" comm.send({ 'stageId': stageId });\n", | |
"}\n", | |
"</script>\n", | |
" " | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
" <span style=\"float: left;\">parquet at cmd0.sc:17</span>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div class=\"progress\">\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n", | |
" 1 / 1\n", | |
" </div>\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"\u001b[32mimport \u001b[39m\u001b[36m$ivy.$ \n", | |
"\u001b[39m\n", | |
"\u001b[32mimport \u001b[39m\u001b[36mcom.johnsnowlabs.nlp.pretrained.PretrainedPipeline\n", | |
"\u001b[39m\n", | |
"\u001b[32mimport \u001b[39m\u001b[36mcom.johnsnowlabs.nlp.SparkNLP\n", | |
"\u001b[39m\n", | |
"\u001b[32mimport \u001b[39m\u001b[36m$ivy.$ \n", | |
"\u001b[39m\n", | |
"\u001b[32mimport \u001b[39m\u001b[36m$ivy.$ \n", | |
"\u001b[39m\n", | |
"\u001b[32mimport \u001b[39m\u001b[36m$ivy.$ \n", | |
"\n", | |
"\u001b[39m\n", | |
"\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark.sql._\n", | |
"\n", | |
"\u001b[39m\n", | |
"\u001b[36mspark\u001b[39m: \u001b[32mSparkSession\u001b[39m = org.apache.spark.sql.SparkSession@176936a7\n", | |
"\u001b[36mtestData\u001b[39m: \u001b[32mDataFrame\u001b[39m = [text: string]" | |
] | |
}, | |
"execution_count": 1, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import $ivy.`com.johnsnowlabs.nlp::spark-nlp:2.3.5`\n", | |
"import com.johnsnowlabs.nlp.pretrained.PretrainedPipeline\n", | |
"import com.johnsnowlabs.nlp.SparkNLP\n", | |
"import $ivy.`org.apache.spark::spark-sql:2.4.0` \n", | |
"import $ivy.`org.apache.spark::spark-mllib:2.4.0` \n", | |
"import $ivy.`sh.almond::almond-spark:0.6.0`\n", | |
"\n", | |
"import org.apache.spark.sql._\n", | |
"\n", | |
"// Notebook-aware Spark session from almond-spark, using the local[*] master.\n", | |
"val spark = NotebookSparkSession\n", | |
"  .builder()\n", | |
"  .master(\"local[*]\")\n", | |
"  .getOrCreate()\n", | |
"\n", | |
"// Sample corpus read from a parquet file next to the notebook.\n", | |
"val testData = spark.read.parquet(\"./sample_text.parquet\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"glove_100d download started this may take some time.\n", | |
"Approximate size to download 144.3 MB\n", | |
"Download done! Loading the resource.\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
" <span style=\"float: left;\">first at ReadWrite.scala:615</span>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div class=\"progress\">\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n", | |
" 1 / 1\n", | |
" </div>\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"pos_anc download started this may take some time.\n", | |
"Approximate size to download 4.3 MB\n", | |
"Download done! Loading the resource.\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
" <span style=\"float: left;\">first at ReadWrite.scala:615</span>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div class=\"progress\">\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n", | |
" 1 / 1\n", | |
" </div>\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
" <span style=\"float: left;\">first at Feature.scala:120</span>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div class=\"progress\">\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n", | |
" 1 / 1\n", | |
" </div>\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
" <span style=\"float: left;\">first at Feature.scala:120</span>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div class=\"progress\">\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n", | |
" 4 / 4\n", | |
" </div>\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"ner_crf download started this may take some time.\n", | |
"Approximate size to download 10.1 MB\n", | |
"Download done! Loading the resource.\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
" <span style=\"float: left;\">first at ReadWrite.scala:615</span>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div class=\"progress\">\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n", | |
" 1 / 1\n", | |
" </div>\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
" <span style=\"float: left;\">first at Feature.scala:120</span>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div class=\"progress\">\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n", | |
" 1 / 1\n", | |
" </div>\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
" <span style=\"float: left;\">first at Feature.scala:120</span>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div class=\"progress\">\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n", | |
" 4 / 4\n", | |
" </div>\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
" <span style=\"float: left;\">collect at Feature.scala:161</span>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div class=\"progress\">\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n", | |
" 12 / 12\n", | |
" </div>\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"\u001b[32mimport \u001b[39m\u001b[36mcom.johnsnowlabs.nlp.annotator._\n", | |
"\u001b[39m\n", | |
"\u001b[32mimport \u001b[39m\u001b[36mcom.johnsnowlabs.nlp.base._\n", | |
"\n", | |
"\u001b[39m\n", | |
"\u001b[36mdocumentAssembler\u001b[39m: \u001b[32mDocumentAssembler\u001b[39m = document_0e03e3f48692\n", | |
"\u001b[36mtokenizer\u001b[39m: \u001b[32mTokenizer\u001b[39m = REGEX_TOKENIZER_813151f3fe72\n", | |
"\u001b[36membed\u001b[39m: \u001b[32mWordEmbeddingsModel\u001b[39m = WORD_EMBEDDINGS_MODEL_2f4ad586e8a2\n", | |
"\u001b[36mtagger_pos\u001b[39m: \u001b[32mPerceptronModel\u001b[39m = POS_29fd848601e6\n", | |
"\u001b[36mner_crf\u001b[39m: \u001b[32mNerCrfModel\u001b[39m = NER_cb957d2304b3\n", | |
"\u001b[36mrecursivePipeline\u001b[39m: \u001b[32mRecursivePipeline\u001b[39m = RECURSIVE_PIPELINE_0853188169e5" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import com.johnsnowlabs.nlp.annotator._\n", | |
"import com.johnsnowlabs.nlp.base._\n", | |
"\n", | |
"// Reads the raw `text` column and emits the `document` annotation column.\n", | |
"val documentAssembler = new DocumentAssembler()\n", | |
"  .setInputCol(\"text\")\n", | |
"  .setOutputCol(\"document\")\n", | |
"\n", | |
"// Produces the `token` column from `document`.\n", | |
"val tokenizer = new Tokenizer()\n", | |
"  .setInputCols(Array(\"document\"))\n", | |
"  .setOutputCol(\"token\")\n", | |
"\n", | |
"// Pretrained word embeddings; the model name suggests 100-dimensional GloVe vectors (TODO confirm).\n", | |
"val embed = WordEmbeddingsModel\n", | |
"  .pretrained(\"glove_100d\")\n", | |
"  .setInputCols(\"document\", \"token\")\n", | |
"  .setOutputCol(\"embed_100\")\n", | |
"\n", | |
"// Pretrained English part-of-speech tagger writing to the `POS` column.\n", | |
"val tagger_pos = PerceptronModel\n", | |
"  .pretrained(\"pos_anc\", lang=\"en\")\n", | |
"  .setInputCols(Array(\"token\", \"document\"))\n", | |
"  .setOutputCol(\"POS\")\n", | |
"\n", | |
"// Pretrained English CRF named-entity recognizer; consumes tokens, POS tags and embeddings.\n", | |
"val ner_crf = NerCrfModel\n", | |
"  .pretrained(\"ner_crf\", lang = \"en\")\n", | |
"  .setInputCols(Array(\"document\", \"token\", \"POS\", \"embed_100\"))\n", | |
"  .setOutputCol(\"NER\")\n", | |
"\n", | |
"// Stages run in this order, so each annotator's input columns exist before it is applied.\n", | |
"val recursivePipeline = new RecursivePipeline()\n", | |
"  .setStages(Array(documentAssembler, tokenizer, embed, tagger_pos, ner_crf))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark.sql.functions._\n", | |
"\n", | |
"\u001b[39m\n", | |
"\u001b[36mlearningPipeline\u001b[39m: \u001b[32morg\u001b[39m.\u001b[32mapache\u001b[39m.\u001b[32mspark\u001b[39m.\u001b[32mml\u001b[39m.\u001b[32mPipelineModel\u001b[39m = pipeline_092ba3ab09ce\n", | |
"\u001b[36mannotatorPipeline\u001b[39m: \u001b[32mLightPipeline\u001b[39m = com.johnsnowlabs.nlp.LightPipeline@920f364" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import org.apache.spark.sql.functions._\n", | |
"\n", | |
"// Fit the pipeline on the sample data; the result is a PipelineModel.\n", | |
"val learningPipeline = recursivePipeline.fit(testData)\n", | |
"// Wrap the fitted model in a LightPipeline, annotate the sample data and expose the result\n", | |
"// to Spark SQL as the temp view `analysis`, which the following cells query.\n", | |
"val annotatorPipeline = new LightPipeline(learningPipeline)\n", | |
"annotatorPipeline.transform(testData).createOrReplaceTempView(\"analysis\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
" <span style=\"float: left;\">show at cmd3.sc:11</span>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div class=\"progress\">\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n", | |
" 1 / 1\n", | |
" </div>\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+--------------------+----------+---+\n", | |
"| input| word|POS|\n", | |
"+--------------------+----------+---+\n", | |
"|Reuters historica...| Reuters|NNP|\n", | |
"|Reuters historica...|historical| JJ|\n", | |
"|Reuters historica...| calendar| NN|\n", | |
"|Reuters historica...| -| -|\n", | |
"|Reuters historica...| September|NNP|\n", | |
"|Reuters historica...| 7| CD|\n", | |
"|Reuters historica...| .| .|\n", | |
"|Following are som...| Following|VBG|\n", | |
"|Following are som...| are|VBP|\n", | |
"|Following are som...| some| DT|\n", | |
"|Following are som...| of| IN|\n", | |
"|Following are som...| the| DT|\n", | |
"|Following are som...| major| JJ|\n", | |
"|Following are som...| events|NNS|\n", | |
"|Following are som...| to| TO|\n", | |
"|Following are som...| have| VB|\n", | |
"|Following are som...| occurred|VBN|\n", | |
"|Following are som...| on| IN|\n", | |
"|Following are som...| September|NNP|\n", | |
"|Following are som...| 7| CD|\n", | |
"+--------------------+----------+---+\n", | |
"only showing top 20 rows\n", | |
"\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
" <span style=\"float: left;\">show at cmd3.sc:12</span>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div class=\"progress\">\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n", | |
" 1 / 1\n", | |
" </div>\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+--------------------+----------+-----+\n", | |
"| input| word| NER|\n", | |
"+--------------------+----------+-----+\n", | |
"|Reuters historica...| Reuters|I-ORG|\n", | |
"|Reuters historica...|historical| O|\n", | |
"|Reuters historica...| calendar| O|\n", | |
"|Reuters historica...| -| O|\n", | |
"|Reuters historica...| September| O|\n", | |
"|Reuters historica...| 7| O|\n", | |
"|Reuters historica...| .| O|\n", | |
"|Following are som...| Following| O|\n", | |
"|Following are som...| are| O|\n", | |
"|Following are som...| some| O|\n", | |
"|Following are som...| of| O|\n", | |
"|Following are som...| the| O|\n", | |
"|Following are som...| major| O|\n", | |
"|Following are som...| events| O|\n", | |
"|Following are som...| to| O|\n", | |
"|Following are som...| have| O|\n", | |
"|Following are som...| occurred| O|\n", | |
"|Following are som...| on| O|\n", | |
"|Following are som...| September| O|\n", | |
"|Following are som...| 7| O|\n", | |
"+--------------------+----------+-----+\n", | |
"only showing top 20 rows\n", | |
"\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark.sql.Row\n", | |
"\u001b[39m\n", | |
"defined \u001b[32mfunction\u001b[39m \u001b[36mextract\u001b[39m\n", | |
"\u001b[36mres3_2\u001b[39m: \u001b[32mexpressions\u001b[39m.\u001b[32mUserDefinedFunction\u001b[39m = \u001b[33mUserDefinedFunction\u001b[39m(\n", | |
" <function1>,\n", | |
" \u001b[33mArrayType\u001b[39m(\n", | |
" \u001b[33mList\u001b[39m(\n", | |
" \u001b[33mStructField\u001b[39m(\u001b[32m\"_1\"\u001b[39m, StringType, true, {}),\n", | |
" \u001b[33mStructField\u001b[39m(\u001b[32m\"_2\"\u001b[39m, StringType, true, {})\n", | |
" ),\n", | |
" true\n", | |
" ),\n", | |
" \u001b[32mNone\u001b[39m\n", | |
")" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import org.apache.spark.sql.Row\n", | |
"// Converts an array of Spark NLP annotation structs into (word, tag) pairs.\n", | |
"// Fields are read by name instead of by position: `metadata` (formerly index 4) is the\n", | |
"// string map holding the token under the `word` key, `result` (formerly index 3) is the tag.\n", | |
"// Throws NoSuchElementException if an annotation's metadata has no `word` entry.\n", | |
"def extract(in: Seq[Row]) = in.map { annotation =>\n", | |
"  val word = annotation.getAs[Map[String, String]](\"metadata\")(\"word\")\n", | |
"  val tag = annotation.getAs[String](\"result\")\n", | |
"  word -> tag\n", | |
"}\n", | |
"// Make the function callable from Spark SQL as extract(<annotation column>).\n", | |
"spark.udf.register(\"extract\", extract _)\n", | |
"\n", | |
"// For each tag column, register two temp views:\n", | |
"//   <TAG>_analysis_raw - one (word, tag) struct per row, exploded from the annotation array\n", | |
"//   <TAG>_analysis     - the same rows flattened into input / word / <TAG> columns\n", | |
"for (tag <- Seq(\"POS\", \"NER\")) {\n", | |
"  spark.sql(s\"select text as input, explode(extract($tag)) as $tag from analysis\").createOrReplaceTempView(s\"${tag}_analysis_raw\")\n", | |
"  spark.sql(s\"select input, $tag._1 as word, $tag._2 as $tag from ${tag}_analysis_raw\").createOrReplaceTempView(s\"${tag}_analysis\")\n", | |
"}\n", | |
"\n", | |
"// Preview the first rows of both flattened views.\n", | |
"Seq(\"POS_analysis\", \"NER_analysis\").foreach(view => spark.sql(s\"select * from $view\").show())\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**Hypothesis: I-ORG, I-LOC named entities are usually proper nouns (tagged as NNP)**" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[ WARN] Lost task 3.0 in stage 15.0 (TID 41, localhost, executor driver): TaskKilled (Stage cancelled)\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
" <span style=\"float: left;\">run at ThreadPoolExecutor.java:1149</span>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div class=\"progress\">\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n", | |
" 4 / 4\n", | |
" </div>\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[ WARN] Lost task 2.0 in stage 15.0 (TID 40, localhost, executor driver): TaskKilled (Stage cancelled)\n", | |
"[ WARN] Lost task 1.0 in stage 15.0 (TID 39, localhost, executor driver): TaskKilled (Stage cancelled)\n", | |
"[ WARN] Lost task 0.0 in stage 15.0 (TID 38, localhost, executor driver): TaskKilled (Stage cancelled)\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
" <span style=\"float: left;\">show at cmd7.sc:2</span>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div class=\"progress\">\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n", | |
" 4 / 4\n", | |
" </div>\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
" <span style=\"float: left;\">show at cmd7.sc:2</span>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div class=\"progress\">\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n", | |
" 1 / 1\n", | |
" </div>\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+-----------+---+------+\n", | |
"| word|POS| NER|\n", | |
"+-----------+---+------+\n", | |
"| of| IN| I-LOC|\n", | |
"| between| IN| O|\n", | |
"| Quayle|NNP| I-PER|\n", | |
"| Congress|NNP| I-ORG|\n", | |
"| gendarmes|NNS| O|\n", | |
"| Belgian|NNP|I-MISC|\n", | |
"| :| :| O|\n", | |
"| 37.3| CD| O|\n", | |
"| extended|VBD| O|\n", | |
"| Labour|NNP|I-MISC|\n", | |
"| 275| CD| O|\n", | |
"| dignity| NN| O|\n", | |
"| department| NN| O|\n", | |
"| removing|VBG| O|\n", | |
"| prayerful| JJ| O|\n", | |
"|application| NN| O|\n", | |
"| VS|NNP| O|\n", | |
"| AGENT|NNP| O|\n", | |
"| jump| NN| O|\n", | |
"| Shulman|NNP| I-PER|\n", | |
"+-----------+---+------+\n", | |
"only showing top 20 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"// Pair every token's POS tag with its NER tag inside the same row of `analysis`.\n", | |
"// Joining POS_analysis and NER_analysis on `word` alone matched each occurrence of a word with\n", | |
"// every other occurrence in the corpus, which mixes tags from unrelated contexts and turns the\n", | |
"// join into a many-to-many product.\n", | |
"// Index 4 is the metadata map holding the token under `word`, index 3 is the predicted tag\n", | |
"// (the same positions `extract` reads).\n", | |
"// NOTE(review): assumes the POS and NER arrays hold one annotation per token in the same order;\n", | |
"// zip silently drops any unmatched tail - confirm against the annotators.\n", | |
"def zipTags(pos: Seq[Row], ner: Seq[Row]) = pos.zip(ner).map { case (posAnnotation, nerAnnotation) =>\n", | |
"  (posAnnotation.getAs[Map[String, String]](4)(\"word\"), posAnnotation.getAs[String](3), nerAnnotation.getAs[String](3))\n", | |
"}\n", | |
"spark.udf.register(\"zipTags\", zipTags _)\n", | |
"\n", | |
"// `relation` keeps its shape: distinct (word, POS, NER) triples.\n", | |
"spark.sql(\"select distinct tagged._1 as word, tagged._2 as POS, tagged._3 as NER from (select explode(zipTags(POS, NER)) as tagged from analysis) as pairs\").createOrReplaceTempView(\"relation\")\n", | |
"spark.sql(\"select * from relation\").show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[ WARN] Lost task 1.0 in stage 19.0 (TID 52, localhost, executor driver): TaskKilled (Stage cancelled)\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
" <span style=\"float: left;\">run at ThreadPoolExecutor.java:1149</span>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div class=\"progress\">\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n", | |
" 4 / 4\n", | |
" </div>\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[ WARN] Lost task 2.0 in stage 19.0 (TID 53, localhost, executor driver): TaskKilled (Stage cancelled)\n", | |
"[ WARN] Lost task 0.0 in stage 19.0 (TID 51, localhost, executor driver): TaskKilled (Stage cancelled)\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
" <span style=\"float: left;\">show at cmd10.sc:1</span>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div class=\"progress\">\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n", | |
" 4 / 4\n", | |
" </div>\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
" <span style=\"float: left;\">show at cmd10.sc:1</span>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div class=\"progress\">\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n", | |
" 200 / 200\n", | |
" </div>\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
" <span style=\"float: left;\">show at cmd10.sc:1</span>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div class=\"progress\">\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n", | |
" 200 / 200\n", | |
" </div>\n", | |
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n", | |
"</div>\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+---+------+----+\n", | |
"|POS| NER| cnt|\n", | |
"+---+------+----+\n", | |
"| NN| O|1213|\n", | |
"|NNP| I-PER| 990|\n", | |
"| JJ| O| 707|\n", | |
"| CD| O| 598|\n", | |
"|NNP| I-ORG| 590|\n", | |
"|NNS| O| 499|\n", | |
"|NNP| I-LOC| 400|\n", | |
"|NNP| O| 373|\n", | |
"|VBD| O| 274|\n", | |
"|VBN| O| 259|\n", | |
"| VB| O| 243|\n", | |
"|VBG| O| 214|\n", | |
"|NNP|I-MISC| 205|\n", | |
"| RB| O| 148|\n", | |
"| IN| O| 91|\n", | |
"|VBZ| O| 66|\n", | |
"|VBP| O| 48|\n", | |
"| JJ|I-MISC| 41|\n", | |
"|PRP| O| 24|\n", | |
"| DT| O| 24|\n", | |
"+---+------+----+\n", | |
"only showing top 20 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"// Count the rows of `relation` (distinct word/POS/NER triples) per (POS, NER) pair, largest first.\n", | |
"spark.sql(\"select POS, NER, count(*) as cnt from relation group by POS, NER order by cnt desc\").show()\n", | |
"// TODO: calculate a correlation matrix using MLlib" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Scala", | |
"language": "scala", | |
"name": "scala" | |
}, | |
"language_info": { | |
"codemirror_mode": "text/x-scala", | |
"file_extension": ".scala", | |
"mimetype": "text/x-scala", | |
"name": "scala", | |
"nbconvert_exporter": "script", | |
"version": "2.11.12" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment