Skip to content

Instantly share code, notes, and snippets.

@dk14
Created January 7, 2020 08:05
Show Gist options
  • Save dk14/4d6e3d03a33d70c88e1af312b15718f2 to your computer and use it in GitHub Desktop.
Save dk14/4d6e3d03a33d70c88e1af312b15718f2 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loading spark-stubs\n",
"Getting spark JARs\n",
"Creating SparkSession\n",
"[ WARN] Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
]
},
{
"data": {
"text/html": [
"<a href=\"http://192.168.8.100:4040\">Spark UI</a>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<script>\n",
"var comm = Jupyter.notebook.kernel.comm_manager.new_comm('cancel-stage-14b8f141-94cf-481e-8057-4173aca4f4ea', {});\n",
"\n",
"function cancelStage(stageId) {\n",
" console.log('Cancelling stage ' + stageId);\n",
" comm.send({ 'stageId': stageId });\n",
"}\n",
"</script>\n",
" "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
" <span style=\"float: left;\">parquet at cmd0.sc:17</span>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div class=\"progress\">\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
" 1 / 1\n",
" </div>\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"\u001b[32mimport \u001b[39m\u001b[36m$ivy.$ \n",
"\u001b[39m\n",
"\u001b[32mimport \u001b[39m\u001b[36mcom.johnsnowlabs.nlp.pretrained.PretrainedPipeline\n",
"\u001b[39m\n",
"\u001b[32mimport \u001b[39m\u001b[36mcom.johnsnowlabs.nlp.SparkNLP\n",
"\u001b[39m\n",
"\u001b[32mimport \u001b[39m\u001b[36m$ivy.$ \n",
"\u001b[39m\n",
"\u001b[32mimport \u001b[39m\u001b[36m$ivy.$ \n",
"\u001b[39m\n",
"\u001b[32mimport \u001b[39m\u001b[36m$ivy.$ \n",
"\n",
"\u001b[39m\n",
"\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark.sql._\n",
"\n",
"\u001b[39m\n",
"\u001b[36mspark\u001b[39m: \u001b[32mSparkSession\u001b[39m = org.apache.spark.sql.SparkSession@176936a7\n",
"\u001b[36mtestData\u001b[39m: \u001b[32mDataFrame\u001b[39m = [text: string]"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import $ivy.`com.johnsnowlabs.nlp::spark-nlp:2.3.5`\n",
"import com.johnsnowlabs.nlp.pretrained.PretrainedPipeline\n",
"import com.johnsnowlabs.nlp.SparkNLP\n",
"import $ivy.`org.apache.spark::spark-sql:2.4.0` \n",
"import $ivy.`org.apache.spark::spark-mllib:2.4.0` \n",
"import $ivy.`sh.almond::almond-spark:0.6.0`\n",
"\n",
"import org.apache.spark.sql._\n",
"\n",
"val spark = {\n",
" NotebookSparkSession.builder()\n",
" .master(\"local[*]\")\n",
" .getOrCreate()\n",
"}\n",
"\n",
"\n",
"val testData = spark.read.parquet(\"./sample_text.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"glove_100d download started this may take some time.\n",
"Approximate size to download 144.3 MB\n",
"Download done! Loading the resource.\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
" <span style=\"float: left;\">first at ReadWrite.scala:615</span>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div class=\"progress\">\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
" 1 / 1\n",
" </div>\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"pos_anc download started this may take some time.\n",
"Approximate size to download 4.3 MB\n",
"Download done! Loading the resource.\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
" <span style=\"float: left;\">first at ReadWrite.scala:615</span>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div class=\"progress\">\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
" 1 / 1\n",
" </div>\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
" <span style=\"float: left;\">first at Feature.scala:120</span>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div class=\"progress\">\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
" 1 / 1\n",
" </div>\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
" <span style=\"float: left;\">first at Feature.scala:120</span>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div class=\"progress\">\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
" 4 / 4\n",
" </div>\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"ner_crf download started this may take some time.\n",
"Approximate size to download 10.1 MB\n",
"Download done! Loading the resource.\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
" <span style=\"float: left;\">first at ReadWrite.scala:615</span>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div class=\"progress\">\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
" 1 / 1\n",
" </div>\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
" <span style=\"float: left;\">first at Feature.scala:120</span>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div class=\"progress\">\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
" 1 / 1\n",
" </div>\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
" <span style=\"float: left;\">first at Feature.scala:120</span>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div class=\"progress\">\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
" 4 / 4\n",
" </div>\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
" <span style=\"float: left;\">collect at Feature.scala:161</span>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div class=\"progress\">\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
" 12 / 12\n",
" </div>\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"\u001b[32mimport \u001b[39m\u001b[36mcom.johnsnowlabs.nlp.annotator._\n",
"\u001b[39m\n",
"\u001b[32mimport \u001b[39m\u001b[36mcom.johnsnowlabs.nlp.base._\n",
"\n",
"\u001b[39m\n",
"\u001b[36mdocumentAssembler\u001b[39m: \u001b[32mDocumentAssembler\u001b[39m = document_0e03e3f48692\n",
"\u001b[36mtokenizer\u001b[39m: \u001b[32mTokenizer\u001b[39m = REGEX_TOKENIZER_813151f3fe72\n",
"\u001b[36membed\u001b[39m: \u001b[32mWordEmbeddingsModel\u001b[39m = WORD_EMBEDDINGS_MODEL_2f4ad586e8a2\n",
"\u001b[36mtagger_pos\u001b[39m: \u001b[32mPerceptronModel\u001b[39m = POS_29fd848601e6\n",
"\u001b[36mner_crf\u001b[39m: \u001b[32mNerCrfModel\u001b[39m = NER_cb957d2304b3\n",
"\u001b[36mrecursivePipeline\u001b[39m: \u001b[32mRecursivePipeline\u001b[39m = RECURSIVE_PIPELINE_0853188169e5"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import com.johnsnowlabs.nlp.annotator._\n",
"import com.johnsnowlabs.nlp.base._\n",
"\n",
"val documentAssembler = new DocumentAssembler().\n",
" setInputCol(\"text\").\n",
" setOutputCol(\"document\")\n",
"\n",
"val tokenizer = new Tokenizer().\n",
" setInputCols(Array(\"document\")).\n",
" setOutputCol(\"token\")\n",
"\n",
"val embed = WordEmbeddingsModel.pretrained(\"glove_100d\"). //100 dimensions, I guess\n",
" setInputCols(\"document\", \"token\").\n",
" setOutputCol(\"embed_100\")\n",
"\n",
"val tagger_pos = PerceptronModel.pretrained(\"pos_anc\", lang=\"en\").\n",
" setInputCols(Array(\"token\", \"document\")).\n",
" setOutputCol(\"POS\")\n",
"\n",
"val ner_crf = NerCrfModel.pretrained(\"ner_crf\", lang = \"en\").\n",
" setInputCols(Array(\"document\", \"token\", \"POS\", \"embed_100\")).\n",
" setOutputCol(\"NER\")\n",
"\n",
"val recursivePipeline = new RecursivePipeline()\n",
" .setStages(Array(documentAssembler, tokenizer, embed, tagger_pos, ner_crf))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark.sql.functions._\n",
"\n",
"\u001b[39m\n",
"\u001b[36mlearningPipeline\u001b[39m: \u001b[32morg\u001b[39m.\u001b[32mapache\u001b[39m.\u001b[32mspark\u001b[39m.\u001b[32mml\u001b[39m.\u001b[32mPipelineModel\u001b[39m = pipeline_092ba3ab09ce\n",
"\u001b[36mannotatorPipeline\u001b[39m: \u001b[32mLightPipeline\u001b[39m = com.johnsnowlabs.nlp.LightPipeline@920f364"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import org.apache.spark.sql.functions._\n",
"\n",
"val learningPipeline = recursivePipeline.fit(testData)\n",
"val annotatorPipeline = new LightPipeline(learningPipeline)\n",
"annotatorPipeline.transform(testData).createOrReplaceTempView(\"analysis\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
" <span style=\"float: left;\">show at cmd3.sc:11</span>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div class=\"progress\">\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
" 1 / 1\n",
" </div>\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+----------+---+\n",
"| input| word|POS|\n",
"+--------------------+----------+---+\n",
"|Reuters historica...| Reuters|NNP|\n",
"|Reuters historica...|historical| JJ|\n",
"|Reuters historica...| calendar| NN|\n",
"|Reuters historica...| -| -|\n",
"|Reuters historica...| September|NNP|\n",
"|Reuters historica...| 7| CD|\n",
"|Reuters historica...| .| .|\n",
"|Following are som...| Following|VBG|\n",
"|Following are som...| are|VBP|\n",
"|Following are som...| some| DT|\n",
"|Following are som...| of| IN|\n",
"|Following are som...| the| DT|\n",
"|Following are som...| major| JJ|\n",
"|Following are som...| events|NNS|\n",
"|Following are som...| to| TO|\n",
"|Following are som...| have| VB|\n",
"|Following are som...| occurred|VBN|\n",
"|Following are som...| on| IN|\n",
"|Following are som...| September|NNP|\n",
"|Following are som...| 7| CD|\n",
"+--------------------+----------+---+\n",
"only showing top 20 rows\n",
"\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
" <span style=\"float: left;\">show at cmd3.sc:12</span>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div class=\"progress\">\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
" 1 / 1\n",
" </div>\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+----------+-----+\n",
"| input| word| NER|\n",
"+--------------------+----------+-----+\n",
"|Reuters historica...| Reuters|I-ORG|\n",
"|Reuters historica...|historical| O|\n",
"|Reuters historica...| calendar| O|\n",
"|Reuters historica...| -| O|\n",
"|Reuters historica...| September| O|\n",
"|Reuters historica...| 7| O|\n",
"|Reuters historica...| .| O|\n",
"|Following are som...| Following| O|\n",
"|Following are som...| are| O|\n",
"|Following are som...| some| O|\n",
"|Following are som...| of| O|\n",
"|Following are som...| the| O|\n",
"|Following are som...| major| O|\n",
"|Following are som...| events| O|\n",
"|Following are som...| to| O|\n",
"|Following are som...| have| O|\n",
"|Following are som...| occurred| O|\n",
"|Following are som...| on| O|\n",
"|Following are som...| September| O|\n",
"|Following are som...| 7| O|\n",
"+--------------------+----------+-----+\n",
"only showing top 20 rows\n",
"\n"
]
},
{
"data": {
"text/plain": [
"\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark.sql.Row\n",
"\u001b[39m\n",
"defined \u001b[32mfunction\u001b[39m \u001b[36mextract\u001b[39m\n",
"\u001b[36mres3_2\u001b[39m: \u001b[32mexpressions\u001b[39m.\u001b[32mUserDefinedFunction\u001b[39m = \u001b[33mUserDefinedFunction\u001b[39m(\n",
" <function1>,\n",
" \u001b[33mArrayType\u001b[39m(\n",
" \u001b[33mList\u001b[39m(\n",
" \u001b[33mStructField\u001b[39m(\u001b[32m\"_1\"\u001b[39m, StringType, true, {}),\n",
" \u001b[33mStructField\u001b[39m(\u001b[32m\"_2\"\u001b[39m, StringType, true, {})\n",
" ),\n",
" true\n",
" ),\n",
" \u001b[32mNone\u001b[39m\n",
")"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import org.apache.spark.sql.Row\n",
"def extract(in: Seq[Row]) = in.map(x => x.getAs[Map[String, String]](4)(\"word\") -> x.getAs[String](3))\n",
"spark.udf.register(\"extract\", extract _)\n",
"\n",
"spark.sql(\"select text as input, explode(extract(POS)) as POS from analysis\").createOrReplaceTempView(\"POS_analysis_raw\")\n",
"spark.sql(\"select text as input, explode(extract(NER)) as NER from analysis\").createOrReplaceTempView(\"NER_analysis_raw\")\n",
"\n",
"spark.sql(\"select input, POS._1 as word, POS._2 as POS from POS_analysis_raw\").createOrReplaceTempView(\"POS_analysis\")\n",
"spark.sql(\"select input, NER._1 as word, NER._2 as NER from NER_analysis_raw\").createOrReplaceTempView(\"NER_analysis\")\n",
"\n",
"spark.sql(\"select * from POS_analysis\").show()\n",
"spark.sql(\"select * from NER_analysis\").show()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Hypothesis: I-ORG and I-LOC named entities are usually proper nouns (tagged as NNP)**"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ WARN] Lost task 3.0 in stage 15.0 (TID 41, localhost, executor driver): TaskKilled (Stage cancelled)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
" <span style=\"float: left;\">run at ThreadPoolExecutor.java:1149</span>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div class=\"progress\">\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
" 4 / 4\n",
" </div>\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ WARN] Lost task 2.0 in stage 15.0 (TID 40, localhost, executor driver): TaskKilled (Stage cancelled)\n",
"[ WARN] Lost task 1.0 in stage 15.0 (TID 39, localhost, executor driver): TaskKilled (Stage cancelled)\n",
"[ WARN] Lost task 0.0 in stage 15.0 (TID 38, localhost, executor driver): TaskKilled (Stage cancelled)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
" <span style=\"float: left;\">show at cmd7.sc:2</span>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div class=\"progress\">\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
" 4 / 4\n",
" </div>\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
" <span style=\"float: left;\">show at cmd7.sc:2</span>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div class=\"progress\">\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
" 1 / 1\n",
" </div>\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------+---+------+\n",
"| word|POS| NER|\n",
"+-----------+---+------+\n",
"| of| IN| I-LOC|\n",
"| between| IN| O|\n",
"| Quayle|NNP| I-PER|\n",
"| Congress|NNP| I-ORG|\n",
"| gendarmes|NNS| O|\n",
"| Belgian|NNP|I-MISC|\n",
"| :| :| O|\n",
"| 37.3| CD| O|\n",
"| extended|VBD| O|\n",
"| Labour|NNP|I-MISC|\n",
"| 275| CD| O|\n",
"| dignity| NN| O|\n",
"| department| NN| O|\n",
"| removing|VBG| O|\n",
"| prayerful| JJ| O|\n",
"|application| NN| O|\n",
"| VS|NNP| O|\n",
"| AGENT|NNP| O|\n",
"| jump| NN| O|\n",
"| Shulman|NNP| I-PER|\n",
"+-----------+---+------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"spark.sql(\"select distinct posa.word, POS, NER from POS_analysis as posa inner join NER_analysis as nera on posa.word = nera.word\").createOrReplaceTempView(\"relation\")\n",
"spark.sql(\"select * from relation\").show()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ WARN] Lost task 1.0 in stage 19.0 (TID 52, localhost, executor driver): TaskKilled (Stage cancelled)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
" <span style=\"float: left;\">run at ThreadPoolExecutor.java:1149</span>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div class=\"progress\">\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
" 4 / 4\n",
" </div>\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ WARN] Lost task 2.0 in stage 19.0 (TID 53, localhost, executor driver): TaskKilled (Stage cancelled)\n",
"[ WARN] Lost task 0.0 in stage 19.0 (TID 51, localhost, executor driver): TaskKilled (Stage cancelled)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
" <span style=\"float: left;\">show at cmd10.sc:1</span>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div class=\"progress\">\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
" 4 / 4\n",
" </div>\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
" <span style=\"float: left;\">show at cmd10.sc:1</span>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div class=\"progress\">\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
" 200 / 200\n",
" </div>\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
" <span style=\"float: left;\">show at cmd10.sc:1</span>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div class=\"progress\">\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
" 200 / 200\n",
" </div>\n",
" <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
"</div>\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---+------+----+\n",
"|POS| NER| cnt|\n",
"+---+------+----+\n",
"| NN| O|1213|\n",
"|NNP| I-PER| 990|\n",
"| JJ| O| 707|\n",
"| CD| O| 598|\n",
"|NNP| I-ORG| 590|\n",
"|NNS| O| 499|\n",
"|NNP| I-LOC| 400|\n",
"|NNP| O| 373|\n",
"|VBD| O| 274|\n",
"|VBN| O| 259|\n",
"| VB| O| 243|\n",
"|VBG| O| 214|\n",
"|NNP|I-MISC| 205|\n",
"| RB| O| 148|\n",
"| IN| O| 91|\n",
"|VBZ| O| 66|\n",
"|VBP| O| 48|\n",
"| JJ|I-MISC| 41|\n",
"|PRP| O| 24|\n",
"| DT| O| 24|\n",
"+---+------+----+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"spark.sql(\"select POS, NER, count(*) as cnt from relation group by POS, NER order by cnt desc\").show()\n",
"//TODO: calculate correlation matrix using MLlib"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Scala",
"language": "scala",
"name": "scala"
},
"language_info": {
"codemirror_mode": "text/x-scala",
"file_extension": ".scala",
"mimetype": "text/x-scala",
"name": "scala",
"nbconvert_exporter": "script",
"version": "2.11.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment