Created
December 5, 2016 14:23
-
-
Save AashishTiwari/3000f5cc592a4b05cab95cce52472597 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Visualizing Clusters of UP Vidhan Sabha Headlines Using Spark and Word2vec\n", | |
"\n", | |
"by Aashish K Tiwari" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Start Spark Session" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"spark_path = \"/root/dev/spark-2.0.0-bin-hadoop2.6\"\n", | |
"\n", | |
"# Easiest way to get Spark to work with Jupyter: https://github.com/minrk/findspark\n", | |
"import findspark\n", | |
"findspark.init(spark_path)\n", | |
"\n", | |
"import pyspark\n", | |
"from pyspark.sql import SparkSession\n", | |
"sc = pyspark.SparkContext(appName='vidhansabha_headlines')\n", | |
"spark = SparkSession(sc)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### List the Spark host URL" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Web URL: 10.51.239.241:4040\n" | |
] | |
} | |
], | |
"source": [ | |
"config = sc._conf.getAll()\n", | |
"print 'Web URL: ' + filter(lambda x: 'spark.driver.host' in x[0], config)[0][1] + ':4040'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from pyspark.sql.functions import regexp_replace\n", | |
"from pyspark.sql import Column\n", | |
"from pyspark.sql.types import *\n", | |
"\n", | |
"\n", | |
"# schema = StructType([\n", | |
"# StructField(\"headline_text\", StringType(), True),\n", | |
"# StructField(\"headline_keywords\", StringType(), True),\n", | |
"# StructField(\"headline_keypersons\", StringType(), True),\n", | |
"# StructField(\"book_year\", IntegerType(), True),\n", | |
"# StructField(\"book_session\", IntegerType(), True),\n", | |
"# StructField(\"book_volume\", IntegerType(), True),\n", | |
"# StructField(\"book_number\", IntegerType(), True),\n", | |
"# StructField(\"book_proceeding_date\", TimestampType(), True),\n", | |
" \n", | |
"# ])\n", | |
"\n", | |
"def read_csv(path):\n", | |
" \n", | |
" schema = StructType([\n", | |
" StructField(\"headline_text\", StringType(), True),\n", | |
" StructField(\"headline_keywords\", StringType(), True),\n", | |
" StructField(\"headline_keypersons\", StringType(), True),\n", | |
" StructField(\"book_year\", StringType(), True),\n", | |
" StructField(\"book_session\", StringType(), True),\n", | |
" StructField(\"book_volume\", StringType(), False ),\n", | |
" StructField(\"book_number\", StringType(), False ),\n", | |
" StructField(\"book_proceeding_date\", StringType(), True),\n", | |
" \n", | |
" ])\n", | |
" \n", | |
" return spark.read.csv(path ,schema = schema, header=True, inferSchema=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+--------------------+-----------+--------------------+\n", | |
"| headline_text|book_volume| headline_keypersons|\n", | |
"+--------------------+-----------+--------------------+\n", | |
"| पूर्व न्यायाधीश ...| 493|प्रदीप माथुर, श्र...|\n", | |
"| प्रदेश में जिला ...| 493|दलजीत सिंह, श्री;...|\n", | |
"|श्रमिकों के स्वास...| 493|मनीष असीजा, श्री;...|\n", | |
"|पूर्व न्यायाधीश क...| 493|मोहम्मद आजम खां, ...|\n", | |
"|जेलों में बन्दियो...| 493|दलवीर सिंह, श्री;...|\n", | |
"|प्रदेश में पंचायत...| 493|लोकेन्द्र सिंह, श...|\n", | |
"|प्रदेश में डीजल, ...| 493|धर्मपाल सिंह, श्र...|\n", | |
"|मेरठ सहित प्रदेश ...| 493|रविन्द्र भडाना, श...|\n", | |
"|मेडिकल कालेज में...| 493|धर्मपाल सिंह, डा....|\n", | |
"|प्रदेश में एम्स (...| 493|अखिलेश प्रताप सिं...|\n", | |
"|प्रदेश में बेरोजग...| 493|वीरपाल राठी, श्री...|\n", | |
"|प्रदेश में सड़क दु...| 493|अरुण कुमार, ड.;दु...|\n", | |
"|जनपद हाथरस के ब्ल...| 493|रामवीर उपाध्याय, ...|\n", | |
"|राष्ट्रीय खाद्य स...| 493|अनुग्रह नारायण सि...|\n", | |
"|प्रदेश में डीजल क...| 493|दववीर सिंह, श्री;...|\n", | |
"|जनपद-उन्नाव के ग्...| 493|अगयश राम सरन वर्म...|\n", | |
"|उ.प्र. में डेंटल ...| 493|अनुग्रह नारायण सि...|\n", | |
"|प्रदेश के ग्रामीण...| 493|ज्योत्सना श्रीवास...|\n", | |
"|वर्ष 2013-14 में ...| 493|सुरेश राणा, श्री;...|\n", | |
"|प्रदेश के किसानों...| 493|लोकेन्द्र सिंह, श...|\n", | |
"+--------------------+-----------+--------------------+\n", | |
"only showing top 20 rows\n", | |
"\n", | |
"7895\n", | |
"root\n", | |
" |-- headline_text: string (nullable = true)\n", | |
" |-- headline_keywords: string (nullable = true)\n", | |
" |-- headline_keypersons: string (nullable = true)\n", | |
" |-- book_year: string (nullable = true)\n", | |
" |-- book_session: string (nullable = true)\n", | |
" |-- book_volume: string (nullable = true)\n", | |
" |-- book_number: string (nullable = true)\n", | |
" |-- book_proceeding_date: string (nullable = true)\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"df = read_csv(\"headline_dump.csv\").na.drop()\n", | |
"\n", | |
"from pyspark.sql.functions import col, when\n", | |
"\n", | |
"# df.withColumn(\"book_volume\", when(col(\"book_volume\").isNull(), \"490\").otherwise(col(\"book_volume\")))\n", | |
"\n", | |
"df.select('headline_text', 'book_volume', 'headline_keypersons').show()\n", | |
"print df.count()\n", | |
"df.printSchema()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+--------------------+--------------------+\n", | |
"| headline_text| words|\n", | |
"+--------------------+--------------------+\n", | |
"| पूर्व न्यायाधीश ...|[पूर्व, न्यायाधीश...|\n", | |
"| प्रदेश में जिला ...|[प्रदेश, में, जिल...|\n", | |
"|श्रमिकों के स्वास...|[श्रमिकों, के, स्...|\n", | |
"|पूर्व न्यायाधीश क...|[पूर्व, न्यायाधीश...|\n", | |
"|जेलों में बन्दियो...|[जेलों, में, बन्द...|\n", | |
"|प्रदेश में पंचायत...|[प्रदेश, में, पंच...|\n", | |
"|प्रदेश में डीजल, ...|[प्रदेश, में, डीज...|\n", | |
"|मेरठ सहित प्रदेश ...|[मेरठ, सहित, प्रद...|\n", | |
"|मेडिकल कालेज में...|[मेडिकल, कालेज, म...|\n", | |
"|प्रदेश में एम्स (...|[प्रदेश, में, एम्...|\n", | |
"|प्रदेश में बेरोजग...|[प्रदेश, में, बेर...|\n", | |
"|प्रदेश में सड़क दु...|[प्रदेश, में, सड़क...|\n", | |
"|जनपद हाथरस के ब्ल...|[जनपद, हाथरस, के,...|\n", | |
"|राष्ट्रीय खाद्य स...|[राष्ट्रीय, खाद्य...|\n", | |
"|प्रदेश में डीजल क...|[प्रदेश, में, डीज...|\n", | |
"|जनपद-उन्नाव के ग्...|[जनपद-उन्नाव, के,...|\n", | |
"|उ.प्र. में डेंटल ...|[उ.प्र., में, डें...|\n", | |
"|प्रदेश के ग्रामीण...|[प्रदेश, के, ग्रा...|\n", | |
"|वर्ष 2013-14 में ...|[वर्ष, 2013-14, म...|\n", | |
"|प्रदेश के किसानों...|[प्रदेश, के, किसा...|\n", | |
"+--------------------+--------------------+\n", | |
"only showing top 20 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"from pyspark.ml.feature import Tokenizer, RegexTokenizer\n", | |
"\n", | |
"tokenizer = RegexTokenizer(inputCol=\"headline_text\", outputCol=\"words\")\n", | |
"df = tokenizer.transform(df)\n", | |
"df.select('headline_text','words').show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+--------------------+--------------------+\n", | |
"| words| vectors|\n", | |
"+--------------------+--------------------+\n", | |
"|[पूर्व, न्यायाधीश...|[-0.0038432309489...|\n", | |
"|[प्रदेश, में, जिल...|[-0.0556664528325...|\n", | |
"|[श्रमिकों, के, स्...|[-0.0375305324354...|\n", | |
"|[पूर्व, न्यायाधीश...|[0.02609115355880...|\n", | |
"|[जेलों, में, बन्द...|[0.09240809204056...|\n", | |
"|[प्रदेश, में, पंच...|[0.03344186266056...|\n", | |
"|[प्रदेश, में, डीज...|[0.05430450532141...|\n", | |
"|[मेरठ, सहित, प्रद...|[0.03163213781253...|\n", | |
"|[मेडिकल, कालेज, म...|[0.00842472057789...|\n", | |
"|[प्रदेश, में, एम्...|[0.03410257293802...|\n", | |
"|[प्रदेश, में, बेर...|[0.03977544771300...|\n", | |
"|[प्रदेश, में, सड़क...|[0.03273418352182...|\n", | |
"|[जनपद, हाथरस, के,...|[-0.0761301376915...|\n", | |
"|[राष्ट्रीय, खाद्य...|[-0.0305102400016...|\n", | |
"|[प्रदेश, में, डीज...|[0.06460548467934...|\n", | |
"|[जनपद-उन्नाव, के,...|[-0.0016552956550...|\n", | |
"|[उ.प्र., में, डें...|[-0.0043472779826...|\n", | |
"|[प्रदेश, के, ग्रा...|[-0.0287785890201...|\n", | |
"|[वर्ष, 2013-14, म...|[0.01353365811519...|\n", | |
"|[प्रदेश, के, किसा...|[0.06119039084296...|\n", | |
"+--------------------+--------------------+\n", | |
"only showing top 20 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"from pyspark.ml.feature import Word2Vec\n", | |
"\n", | |
"word2Vec = Word2Vec(vectorSize=50, seed=42, inputCol=\"words\", outputCol=\"vectors\")\n", | |
"model = word2Vec.fit(df)\n", | |
"df = model.transform(df)\n", | |
"\n", | |
"df.select('words', 'vectors').show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+-----------+------------------+\n", | |
"| word| similarity|\n", | |
"+-----------+------------------+\n", | |
"| संस्थान,|0.8815073904773912|\n", | |
"| पशु|0.8806854230418488|\n", | |
"| शहरों|0.8758613606833399|\n", | |
"|होम्योपैथिक|0.8730747734527519|\n", | |
"| कन्या|0.8724011771304662|\n", | |
"| समूह|0.8621759836192577|\n", | |
"| आयुर्वेदिक|0.8579545273912954|\n", | |
"| जाति,|0.8513603721154581|\n", | |
"| राजकीय|0.8513172266748842|\n", | |
"| घरेलू|0.8495306458208037|\n", | |
"| संग्राम|0.8492104503388931|\n", | |
"| महिला| 0.844929562891094|\n", | |
"| अनुसूचित| 0.8444687243218|\n", | |
"| औद्योगिक|0.8426168915769893|\n", | |
"| अधिकारी,|0.8399809919552093|\n", | |
"| क्षय|0.8376699402441339|\n", | |
"| अकादमी|0.8376214742423098|\n", | |
"| महमूदाबाद| 0.83503038886584|\n", | |
"| समेत|0.8338217883311242|\n", | |
"| ने| 0.831586335402941|\n", | |
"+-----------+------------------+\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"model.findSynonyms(\"किसान\", 20).show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"root\n", | |
" |-- headline_text: string (nullable = true)\n", | |
" |-- headline_keywords: string (nullable = true)\n", | |
" |-- headline_keypersons: string (nullable = true)\n", | |
" |-- book_year: string (nullable = true)\n", | |
" |-- book_session: string (nullable = true)\n", | |
" |-- book_volume: string (nullable = true)\n", | |
" |-- book_number: string (nullable = true)\n", | |
" |-- book_proceeding_date: string (nullable = true)\n", | |
" |-- words: array (nullable = true)\n", | |
" | |-- element: string (containsNull = true)\n", | |
" |-- vectors: vector (nullable = true)\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# Show the column layout, which now includes the \"words\" and \"vectors\" columns.\n", | |
"df.printSchema()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+-----------+-------+\n", | |
"|book_volume|indexed|\n", | |
"+-----------+-------+\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"+-----------+-------+\n", | |
"only showing top 20 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# from pyspark.ml.feature import StringIndexer, OneHotEncoder\n", | |
"\n", | |
"# stringIndexer = StringIndexer(inputCol=\"book_volume\", outputCol=\"indexed\")\n", | |
"# model = stringIndexer.fit(df)\n", | |
"# df = model.transform(df)\n", | |
"# df.select('book_volume', 'indexed').show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df_bk = df.withColumn(\"book_vol\", col('book_volume').cast('int'))\n", | |
"# df_bk = df.select(df.book_volume.cast('int').alias('book_vol'))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+-----------+-------+\n", | |
"|book_volume|indexed|\n", | |
"+-----------+-------+\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"| 493| 3.0|\n", | |
"+-----------+-------+\n", | |
"only showing top 20 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"from pyspark.ml.feature import StringIndexer, OneHotEncoder\n", | |
"\n", | |
"stringIndexer = StringIndexer(inputCol=\"book_volume\", outputCol=\"indexed\")\n", | |
"model = stringIndexer.fit(df)\n", | |
"df = model.transform(df)\n", | |
"df.select('book_volume', 'indexed').show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+-----------+--------------+\n", | |
"|book_volume| book_ohe|\n", | |
"+-----------+--------------+\n", | |
"| 493|(33,[3],[1.0])|\n", | |
"| 493|(33,[3],[1.0])|\n", | |
"| 493|(33,[3],[1.0])|\n", | |
"| 493|(33,[3],[1.0])|\n", | |
"| 493|(33,[3],[1.0])|\n", | |
"| 493|(33,[3],[1.0])|\n", | |
"| 493|(33,[3],[1.0])|\n", | |
"| 493|(33,[3],[1.0])|\n", | |
"| 493|(33,[3],[1.0])|\n", | |
"| 493|(33,[3],[1.0])|\n", | |
"| 493|(33,[3],[1.0])|\n", | |
"| 493|(33,[3],[1.0])|\n", | |
"| 493|(33,[3],[1.0])|\n", | |
"| 493|(33,[3],[1.0])|\n", | |
"| 493|(33,[3],[1.0])|\n", | |
"| 493|(33,[3],[1.0])|\n", | |
"| 493|(33,[3],[1.0])|\n", | |
"| 493|(33,[3],[1.0])|\n", | |
"| 493|(33,[3],[1.0])|\n", | |
"| 493|(33,[3],[1.0])|\n", | |
"+-----------+--------------+\n", | |
"only showing top 20 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"encoder = OneHotEncoder(inputCol=\"indexed\", outputCol=\"book_ohe\")\n", | |
"df = encoder.transform(df)\n", | |
"\n", | |
"df.select('book_volume', 'book_ohe').show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[u'493' u'2015' u'492' u' \\u0936\\u094d\\u0930\\u0940;\"' u'494' u'2' u'488'\n", | |
" u' \"\"\\u092a\\u094d\\u0930\\u0926\\u0947\\u0936 \\u0915\\u0947 \\u0938\\u094d\\u0925\\u093e\\u0928\\u0940\\u092f \\u0928\\u093f\\u0915\\u093e\\u092f\\u094b\\u0902 \\u0915\\u0947 \\u0932\\u0947\\u0916\\u093e \\u092a\\u0930\\u0940\\u0915\\u094d\\u0937\\u093e \\u092a\\u094d\\u0930\\u0924\\u093f\\u0935\\u0947\\u0926\\u0928\\u094b\\u0902 \\u0915\\u094b \\u091c\\u093e\\u0902\\u091a \\u0938\\u092e\\u094d\\u092c\\u0928\\u094d\\u0927\\u0940 \\u0938\\u092e\\u093f\\u0924\\u093f\"\" \\u0924\\u0925\\u093e \"\"\\u092a\\u0902\\u091a\\u093e\\u092f\\u0924\\u0940 \\u0930\\u093e\\u091c \\u0938\\u092e\\u093f\\u0924\\u093f\"\" \\u0914\\u0930 \\u092e\\u0902\\u0924\\u094d\\u0930\\u093f\\u092f\\u094b\\u0902 \\u0915\\u094b \\u092a\\u0930\\u093e\\u092e\\u0930\\u094d\\u0936 \\u0926\\u0947\\u0928\\u0947 \\u0935\\u093e\\u0932\\u0940 30 \\u0938\\u094d\\u0925\\u093e\\u092f\\u0940 \\u0938\\u092e\\u093f\\u0924\\u093f\\u092f\\u094b\\u0902 \\u092e\\u0947\\u0902 \\u0938\\u0926\\u0938\\u094d\\u092f\\u0924\\u093e \\u0915\\u0947 \\u0932\\u093f\\u092f\\u0947 \\u0935\\u093f\\u0927\\u093e\\u0928 \\u0938\\u092d\\u093e \\u0915\\u0947 \\u0938\\u0926\\u0938\\u094d\\u092f\\u094b\\u0902 \\u0915\\u093e \\u0928\\u093e\\u092e-\\u0928\\u093f\\u0930\\u094d\\u0926\\u0947\\u0936\\u093f\\u0924 \\u0915\\u0930\\u0928\\u0947 \\u0939\\u0947\\u0924\\u0941 \\u0936\\u094d\\u0930\\u0940 \\u0905\\u0927\\u094d\\u092f\\u0915\\u094d\\u0937 \\u0915\\u094b \\u092a\\u094d\\u0930\\u093e\\u0927\\u093f\\u0915\\u0943\\u0924 \\u0915\\u093f\\u092f\\u0947 \\u091c\\u093e\\u0928\\u0947 \\u0915\\u093e \\u092a\\u094d\\u0930\\u0938\\u094d\\u0924\\u093e\\u0935\"'\n", | |
" u'472' u'2010' u'1'\n", | |
" u' \\u0936\\u094d\\u0930\\u0940;\\u0907\\u0928\\u094d\\u0926\\u094d\\u0930\\u091c\\u0940\\u0924 \\u0938\\u0930\\u094b\\u091c'\n", | |
" u'489' u' \\u0936\\u094d\\u0930\\u0940;\\u092e\\u094b. \\u0905\\u092f\\u0942\\u092c'\n", | |
" u' \\u0936\\u094d\\u0930\\u0940;\\u0936\\u092e\\u0936\\u0947\\u0930 \\u092c\\u0939\\u093e\\u0926\\u0941\\u0930 \\u0909\\u0930\\u094d\\u092b \\u0936\\u0947\\u0930\\u0942 \\u092d\\u0948\\u092f\\u094d\\u092f\\u093e'\n", | |
" u' \\u0936\\u094d\\u0930\\u0940.\\u092e\\u094b. \\u0905\\u092f\\u0942\\u092c'\n", | |
" u' \\u0936\\u094d\\u0930\\u0940; \"' u'491' u'490'\n", | |
" u' \\u0936\\u094d\\u0930\\u0940;\\u0928\\u0940\\u0930\\u091c (\\u0915\\u0941\\u0936\\u0935\\u093e\\u0939\\u093e) \\u092e\\u094c\\u0930\\u094d\\u092f'\n", | |
" u' \"\"\\u092a\\u094d\\u0930\\u0926\\u0947\\u0936 \\u0915\\u0947 \\u0938\\u094d\\u0925\\u093e\\u0928\\u0940\\u092f \\u0928\\u093f\\u0915\\u093e\\u092f\\u094b\\u0902 \\u0915\\u0947 \\u0932\\u0947\\u0916\\u093e \\u092a\\u0930\\u0940\\u0915\\u094d\\u0937\\u093e \\u092a\\u094d\\u0930\\u0924\\u093f\\u0935\\u0947\\u0926\\u0928\\u094b\\u0902 \\u0915\\u0940 \\u091c\\u093e\\u0902\\u091a \\u0938\\u092e\\u094d\\u092c\\u0928\\u094d\\u0927\\u0940 \\u0938\\u092e\\u093f\\u0924\\u093f\"\" \\u0924\\u0925\\u093e \"\"\\u092a\\u0902\\u091a\\u093e\\u092f\\u0924\\u0940 \\u0930\\u093e\\u091c \\u0938\\u092e\\u093f\\u0924\\u093f\"\" \\u0914\\u0930 \\u092e\\u0902\\u0924\\u094d\\u0930\\u093f\\u092f\\u094b\\u0902 \\u0915\\u094b \\u092a\\u0930\\u093e\\u092e\\u0930\\u094d\\u0936 \\u0926\\u0947\\u0928\\u0947 \\u0935\\u093e\\u0932\\u0940 30 \\u0938\\u094d\\u0925\\u093e\\u092f\\u0940 \\u0938\\u092e\\u093f\\u0924\\u093f\\u092f\\u094b\\u0902 \\u092e\\u0947\\u0902 \\u0935\\u093f\\u0927\\u093e\\u0928 \\u0938\\u092d\\u093e \\u0915\\u0947 \\u0938\\u0926\\u0938\\u094d\\u092f\\u094b\\u0902 \\u0915\\u093e \\u0928\\u093e\\u092e-\\u0928\\u093f\\u0930\\u094d\\u0926\\u0947\\u0936 \\u0939\\u0947\\u0924\\u0941 \\u0936\\u094d\\u0930\\u0940 \\u0905\\u0927\\u094d\\u092f\\u0915\\u094d\\u0937 \\u0915\\u094b \\u092a\\u094d\\u0930\\u093e\\u0927\\u093f\\u0915\\u0943\\u0924 \\u0915\\u093f\\u092f\\u0947 \\u091c\\u093e\\u0928\\u0947 \\u0915\\u093e \\u092a\\u094d\\u0930\\u0938\\u094d\\u0924\\u093e\\u0935\"'\n", | |
" u' \\u0936\\u094d\\u0930\\u0940;\\u092e\\u0928\\u0940\\u0937 \\u0905\\u0938\\u0940\\u091c\\u093e'\n", | |
" u'3'\n", | |
" u' \\u0936\\u094d\\u0930\\u0940;\\u091c\\u093e\\u0939\\u093f\\u0926 \\u092c\\u0947\\u0917'\n", | |
" u'2014'\n", | |
" u' \\u0936\\u094d\\u0930\\u0940;\\u0930\\u093e\\u091c\\u0947\\u0936 \\u0924\\u094d\\u0930\\u093f\\u092a\\u093e\\u0920\\u0940'\n", | |
" u' \\u0936\\u094d\\u0930\\u0940;\\u0938\\u094d\\u0935\\u093e\\u092e\\u0940 \\u092a\\u094d\\u0930\\u0938\\u093e\\u0926 \\u092e\\u094c\\u0930\\u094d\\u092f'\n", | |
" u' \\u0936\\u094d\\u0930\\u0940\\u092e\\u0924\\u0940;\\u0930\\u094b\\u0936\\u0928 \\u0932\\u093e\\u0932 \\u0935\\u0930\\u094d\\u092e\\u093e'\n", | |
" u' \\u0936\\u094d\\u0930\\u0940;\\u0907\\u0928\\u094d\\u0926\\u094d\\u0930\\u093e\\u0923\\u0940 \\u0926\\u0947\\u0935\\u0940'\n", | |
" u' \\u0936\\u094d\\u0930\\u0940;\\u0905\\u091c\\u092f \\u0915\\u0941\\u092e\\u093e\\u0930'\n", | |
" u' \\u0936\\u094d\\u0930\\u0940;\\u0926\\u0932\\u0935\\u0940\\u0930 \\u0938\\u093f\\u0902\\u0939'\n", | |
" u' \\u0936\\u094d\\u0930\\u0940;\\u0938\\u0924\\u094d\\u092f\\u0935\\u0940\\u0930 \\u092e\\u0941\\u0928\\u094d\\u0928\\u093e'\n", | |
" u' \\u0936\\u094d\\u0930\\u0940;\\u0930\\u093e\\u092e\\u0917\\u094b\\u0935\\u093f\\u0928\\u094d\\u0926 \\u091a\\u094c\\u0927\\u0930\\u0940'\n", | |
" u' \\u0936\\u094d\\u0930\\u0940;\\u092a\\u094d\\u0930\\u0926\\u0940\\u092a \\u091a\\u094c\\u0927\\u0930\\u0940']\n" | |
] | |
} | |
], | |
"source": [ | |
"import pandas as pd\n", | |
"print df.toPandas()['book_volume'].unique()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+--------------------+\n", | |
"| merged_vectors|\n", | |
"+--------------------+\n", | |
"|(83,[3,33,34,35,3...|\n", | |
"|(83,[3,33,34,35,3...|\n", | |
"|(83,[3,33,34,35,3...|\n", | |
"|(83,[3,33,34,35,3...|\n", | |
"|(83,[3,33,34,35,3...|\n", | |
"|(83,[3,33,34,35,3...|\n", | |
"|(83,[3,33,34,35,3...|\n", | |
"|(83,[3,33,34,35,3...|\n", | |
"|(83,[3,33,34,35,3...|\n", | |
"|(83,[3,33,34,35,3...|\n", | |
"|(83,[3,33,34,35,3...|\n", | |
"|(83,[3,33,34,35,3...|\n", | |
"|(83,[3,33,34,35,3...|\n", | |
"|(83,[3,33,34,35,3...|\n", | |
"|(83,[3,33,34,35,3...|\n", | |
"|(83,[3,33,34,35,3...|\n", | |
"|(83,[3,33,34,35,3...|\n", | |
"|(83,[3,33,34,35,3...|\n", | |
"|(83,[3,33,34,35,3...|\n", | |
"|(83,[3,33,34,35,3...|\n", | |
"+--------------------+\n", | |
"only showing top 20 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"from pyspark.ml.feature import VectorAssembler\n", | |
"\n", | |
"model = VectorAssembler(inputCols=['book_ohe', 'vectors'], outputCol=\"merged_vectors\")\n", | |
"df = model.transform(df)\n", | |
"\n", | |
"df.select('merged_vectors').show()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### To CSV, for passing into R" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"root\n", | |
" |-- headline_text: string (nullable = true)\n", | |
" |-- headline_keywords: string (nullable = true)\n", | |
" |-- headline_keypersons: string (nullable = true)\n", | |
" |-- book_year: string (nullable = true)\n", | |
" |-- book_session: string (nullable = true)\n", | |
" |-- book_volume: string (nullable = true)\n", | |
" |-- book_number: string (nullable = true)\n", | |
" |-- book_proceeding_date: string (nullable = true)\n", | |
" |-- words: array (nullable = true)\n", | |
" | |-- element: string (containsNull = true)\n", | |
" |-- vectors: vector (nullable = true)\n", | |
" |-- indexed: double (nullable = true)\n", | |
" |-- book_ohe: vector (nullable = true)\n", | |
" |-- merged_vectors: vector (nullable = true)\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# List the columns available before choosing which ones to export.\n", | |
"df.printSchema()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"\n", | |
"(df.select('headline_text', 'headline_keywords', 'headline_keypersons', 'book_volume', 'merged_vectors')\n", | |
" .toPandas()\n", | |
" .to_csv('up_headlines_book_volume_53D.csv', index=False, encoding=\"utf-8\")\n", | |
" )" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# import pandas as pd\n", | |
"# # df['headline_text'] = df['headline_text'].map(lambda x: x.encode('unicode-escape').decode('utf-8'))\n", | |
"# # df['headline_keywords'] = df['headline_keywords'].map(lambda x: x.encode('unicode-escape').decode('utf-8'))\n", | |
"# # df['headline_keypersons'] = df['headline_keypersons'].map(lambda x: x.encode('unicode-escape').decode('utf-8'))\n", | |
"\n", | |
"# # df.select('headline_text', 'headline_keywords', 'headline_keypersons', 'book_year', 'book_session', 'book_volume', 'book_number', 'book_proceeding_date', 'vectors').toPandas()\n", | |
"# df.write.csv('up_headlines50D.csv')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"anaconda-cloud": {}, | |
"kernelspec": { | |
"display_name": "Python [default]", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.12" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment