Skip to content

Instantly share code, notes, and snippets.

@AashishTiwari
Created December 5, 2016 14:23
Show Gist options
  • Save AashishTiwari/3000f5cc592a4b05cab95cce52472597 to your computer and use it in GitHub Desktop.
Save AashishTiwari/3000f5cc592a4b05cab95cce52472597 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Visualizing Clusters of UP Vidhan Sabha Headlines Using Spark and Word2Vec\n",
"\n",
"by Aashish K Tiwari"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Start Spark Session"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Spark install location; SPARK_HOME, when set, overrides the original hard-coded default.\n",
"spark_path = os.environ.get(\"SPARK_HOME\", \"/root/dev/spark-2.0.0-bin-hadoop2.6\")\n",
"\n",
"# Easiest way to get Spark to work with Jupyter: https://github.com/minrk/findspark\n",
"import findspark\n",
"findspark.init(spark_path)\n",
"\n",
"import pyspark\n",
"from pyspark.sql import SparkSession\n",
"\n",
"# getOrCreate() reuses an already-running context/session, so re-running this cell\n",
"# no longer fails with \"Cannot run multiple SparkContexts at once\".\n",
"sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName('vidhansabha_headlines'))\n",
"spark = SparkSession.builder.getOrCreate()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Print the Spark web UI URL"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Web URL: 10.51.239.241:4040\n"
]
}
],
"source": [
"config = sc._conf.getAll()\n",
"# next() over a generator works on Python 2 and 3; filter(...)[0] only works on Python 2.\n",
"driver_host = next(value for key, value in config if 'spark.driver.host' in key)\n",
"# 4040 is Spark's default UI port; Spark picks another port if 4040 is already taken.\n",
"print('Web URL: ' + driver_host + ':4040')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from pyspark.sql.functions import regexp_replace\n",
"from pyspark.sql import Column\n",
"from pyspark.sql.types import *\n",
"\n",
"\n",
"# schema = StructType([\n",
"# StructField(\"headline_text\", StringType(), True),\n",
"# StructField(\"headline_keywords\", StringType(), True),\n",
"# StructField(\"headline_keypersons\", StringType(), True),\n",
"# StructField(\"book_year\", IntegerType(), True),\n",
"# StructField(\"book_session\", IntegerType(), True),\n",
"# StructField(\"book_volume\", IntegerType(), True),\n",
"# StructField(\"book_number\", IntegerType(), True),\n",
"# StructField(\"book_proceeding_date\", TimestampType(), True),\n",
" \n",
"# ])\n",
"\n",
"def read_csv(path):\n",
"    \"\"\"Read the headlines CSV at `path` into a Spark DataFrame of string columns.\n",
"\n",
"    The first line of the file is treated as a header and the eight expected\n",
"    columns are declared explicitly, so no schema-inference pass is requested.\n",
"\n",
"    Quoted fields are parsed with doubled-quote escaping (two consecutive\n",
"    double-quote characters inside a quoted field) instead of Spark's default\n",
"    backslash escape. With the default, fields that contain quotes and commas\n",
"    spill into the following columns (names and quoted text showed up in\n",
"    book_volume).\n",
"\n",
"    Uses the notebook-level `spark` session.\n",
"    \"\"\"\n",
"    column_names = [\n",
"        \"headline_text\",\n",
"        \"headline_keywords\",\n",
"        \"headline_keypersons\",\n",
"        \"book_year\",\n",
"        \"book_session\",\n",
"        \"book_volume\",\n",
"        \"book_number\",\n",
"        \"book_proceeding_date\",\n",
"    ]\n",
"    # Every column is declared nullable: printSchema() reported nullable = true\n",
"    # for all of them even when some were declared non-nullable here.\n",
"    schema = StructType([StructField(name, StringType(), True) for name in column_names])\n",
"\n",
"    # inferSchema is omitted because it has no effect once a schema is supplied.\n",
"    return spark.read.csv(path, schema=schema, header=True, quote='\"', escape='\"')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+-----------+--------------------+\n",
"| headline_text|book_volume| headline_keypersons|\n",
"+--------------------+-----------+--------------------+\n",
"| पूर्व न्यायाधीश ...| 493|प्रदीप माथुर, श्र...|\n",
"| प्रदेश में जिला ...| 493|दलजीत सिंह, श्री;...|\n",
"|श्रमिकों के स्वास...| 493|मनीष असीजा, श्री;...|\n",
"|पूर्व न्यायाधीश क...| 493|मोहम्मद आजम खां, ...|\n",
"|जेलों में बन्दियो...| 493|दलवीर सिंह, श्री;...|\n",
"|प्रदेश में पंचायत...| 493|लोकेन्द्र सिंह, श...|\n",
"|प्रदेश में डीजल, ...| 493|धर्मपाल सिंह, श्र...|\n",
"|मेरठ सहित प्रदेश ...| 493|रविन्द्र भडाना, श...|\n",
"|मेडिकल कालेज में...| 493|धर्मपाल सिंह, डा....|\n",
"|प्रदेश में एम्स (...| 493|अखिलेश प्रताप सिं...|\n",
"|प्रदेश में बेरोजग...| 493|वीरपाल राठी, श्री...|\n",
"|प्रदेश में सड़क दु...| 493|अरुण कुमार, ड.;दु...|\n",
"|जनपद हाथरस के ब्ल...| 493|रामवीर उपाध्याय, ...|\n",
"|राष्ट्रीय खाद्य स...| 493|अनुग्रह नारायण सि...|\n",
"|प्रदेश में डीजल क...| 493|दववीर सिंह, श्री;...|\n",
"|जनपद-उन्नाव के ग्...| 493|अगयश राम सरन वर्म...|\n",
"|उ.प्र. में डेंटल ...| 493|अनुग्रह नारायण सि...|\n",
"|प्रदेश के ग्रामीण...| 493|ज्योत्सना श्रीवास...|\n",
"|वर्ष 2013-14 में ...| 493|सुरेश राणा, श्री;...|\n",
"|प्रदेश के किसानों...| 493|लोकेन्द्र सिंह, श...|\n",
"+--------------------+-----------+--------------------+\n",
"only showing top 20 rows\n",
"\n",
"7895\n",
"root\n",
" |-- headline_text: string (nullable = true)\n",
" |-- headline_keywords: string (nullable = true)\n",
" |-- headline_keypersons: string (nullable = true)\n",
" |-- book_year: string (nullable = true)\n",
" |-- book_session: string (nullable = true)\n",
" |-- book_volume: string (nullable = true)\n",
" |-- book_number: string (nullable = true)\n",
" |-- book_proceeding_date: string (nullable = true)\n",
"\n"
]
}
],
"source": [
"# Load the dump and discard every row that has a null in any column.\n",
"df = read_csv(\"headline_dump.csv\").dropna()\n",
"\n",
"from pyspark.sql.functions import col, when\n",
"\n",
"# df.withColumn(\"book_volume\", when(col(\"book_volume\").isNull(), \"490\").otherwise(col(\"book_volume\")))\n",
"\n",
"# Quick sanity check: sample rows, row count and schema.\n",
"preview_columns = ['headline_text', 'book_volume', 'headline_keypersons']\n",
"df.select(*preview_columns).show()\n",
"print(df.count())\n",
"df.printSchema()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+--------------------+\n",
"| headline_text| words|\n",
"+--------------------+--------------------+\n",
"| पूर्व न्यायाधीश ...|[पूर्व, न्यायाधीश...|\n",
"| प्रदेश में जिला ...|[प्रदेश, में, जिल...|\n",
"|श्रमिकों के स्वास...|[श्रमिकों, के, स्...|\n",
"|पूर्व न्यायाधीश क...|[पूर्व, न्यायाधीश...|\n",
"|जेलों में बन्दियो...|[जेलों, में, बन्द...|\n",
"|प्रदेश में पंचायत...|[प्रदेश, में, पंच...|\n",
"|प्रदेश में डीजल, ...|[प्रदेश, में, डीज...|\n",
"|मेरठ सहित प्रदेश ...|[मेरठ, सहित, प्रद...|\n",
"|मेडिकल कालेज में...|[मेडिकल, कालेज, म...|\n",
"|प्रदेश में एम्स (...|[प्रदेश, में, एम्...|\n",
"|प्रदेश में बेरोजग...|[प्रदेश, में, बेर...|\n",
"|प्रदेश में सड़क दु...|[प्रदेश, में, सड़क...|\n",
"|जनपद हाथरस के ब्ल...|[जनपद, हाथरस, के,...|\n",
"|राष्ट्रीय खाद्य स...|[राष्ट्रीय, खाद्य...|\n",
"|प्रदेश में डीजल क...|[प्रदेश, में, डीज...|\n",
"|जनपद-उन्नाव के ग्...|[जनपद-उन्नाव, के,...|\n",
"|उ.प्र. में डेंटल ...|[उ.प्र., में, डें...|\n",
"|प्रदेश के ग्रामीण...|[प्रदेश, के, ग्रा...|\n",
"|वर्ष 2013-14 में ...|[वर्ष, 2013-14, म...|\n",
"|प्रदेश के किसानों...|[प्रदेश, के, किसा...|\n",
"+--------------------+--------------------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"from pyspark.ml.feature import Tokenizer, RegexTokenizer\n",
"\n",
"# Split on whitespace and on , ; ( ) so punctuation does not stay attached to tokens.\n",
"# The default pattern splits on whitespace only, which leaves tokens with trailing commas\n",
"# in the vocabulary. Hyphens and periods are kept because they occur inside words/abbreviations.\n",
"tokenizer = RegexTokenizer(inputCol=\"headline_text\", outputCol=\"words\", pattern=r'[\\s,;()]+')\n",
"# drop() is a no-op when the column is absent, so this cell can be re-run without an\n",
"# \"output column already exists\" failure.\n",
"df = tokenizer.transform(df.drop('words'))\n",
"df.select('headline_text','words').show()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+--------------------+\n",
"| words| vectors|\n",
"+--------------------+--------------------+\n",
"|[पूर्व, न्यायाधीश...|[-0.0038432309489...|\n",
"|[प्रदेश, में, जिल...|[-0.0556664528325...|\n",
"|[श्रमिकों, के, स्...|[-0.0375305324354...|\n",
"|[पूर्व, न्यायाधीश...|[0.02609115355880...|\n",
"|[जेलों, में, बन्द...|[0.09240809204056...|\n",
"|[प्रदेश, में, पंच...|[0.03344186266056...|\n",
"|[प्रदेश, में, डीज...|[0.05430450532141...|\n",
"|[मेरठ, सहित, प्रद...|[0.03163213781253...|\n",
"|[मेडिकल, कालेज, म...|[0.00842472057789...|\n",
"|[प्रदेश, में, एम्...|[0.03410257293802...|\n",
"|[प्रदेश, में, बेर...|[0.03977544771300...|\n",
"|[प्रदेश, में, सड़क...|[0.03273418352182...|\n",
"|[जनपद, हाथरस, के,...|[-0.0761301376915...|\n",
"|[राष्ट्रीय, खाद्य...|[-0.0305102400016...|\n",
"|[प्रदेश, में, डीज...|[0.06460548467934...|\n",
"|[जनपद-उन्नाव, के,...|[-0.0016552956550...|\n",
"|[उ.प्र., में, डें...|[-0.0043472779826...|\n",
"|[प्रदेश, के, ग्रा...|[-0.0287785890201...|\n",
"|[वर्ष, 2013-14, म...|[0.01353365811519...|\n",
"|[प्रदेश, के, किसा...|[0.06119039084296...|\n",
"+--------------------+--------------------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"from pyspark.ml.feature import Word2Vec\n",
"\n",
"# 50-dimensional embeddings with a fixed seed for reproducibility.\n",
"word2Vec = Word2Vec(vectorSize=50, seed=42, inputCol=\"words\", outputCol=\"vectors\")\n",
"\n",
"# Remove any 'vectors' column left over from a previous run of this cell; fit() and\n",
"# transform() both reject an input that already has the output column.\n",
"# drop() is a no-op when the column is absent.\n",
"df_words = df.drop('vectors')\n",
"model = word2Vec.fit(df_words)\n",
"df = model.transform(df_words)\n",
"\n",
"df.select('words', 'vectors').show()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------+------------------+\n",
"| word| similarity|\n",
"+-----------+------------------+\n",
"| संस्थान,|0.8815073904773912|\n",
"| पशु|0.8806854230418488|\n",
"| शहरों|0.8758613606833399|\n",
"|होम्योपैथिक|0.8730747734527519|\n",
"| कन्या|0.8724011771304662|\n",
"| समूह|0.8621759836192577|\n",
"| आयुर्वेदिक|0.8579545273912954|\n",
"| जाति,|0.8513603721154581|\n",
"| राजकीय|0.8513172266748842|\n",
"| घरेलू|0.8495306458208037|\n",
"| संग्राम|0.8492104503388931|\n",
"| महिला| 0.844929562891094|\n",
"| अनुसूचित| 0.8444687243218|\n",
"| औद्योगिक|0.8426168915769893|\n",
"| अधिकारी,|0.8399809919552093|\n",
"| क्षय|0.8376699402441339|\n",
"| अकादमी|0.8376214742423098|\n",
"| महमूदाबाद| 0.83503038886584|\n",
"| समेत|0.8338217883311242|\n",
"| ने| 0.831586335402941|\n",
"+-----------+------------------+\n",
"\n"
]
}
],
"source": [
"model.findSynonyms(\"किसान\", 20).show()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- headline_text: string (nullable = true)\n",
" |-- headline_keywords: string (nullable = true)\n",
" |-- headline_keypersons: string (nullable = true)\n",
" |-- book_year: string (nullable = true)\n",
" |-- book_session: string (nullable = true)\n",
" |-- book_volume: string (nullable = true)\n",
" |-- book_number: string (nullable = true)\n",
" |-- book_proceeding_date: string (nullable = true)\n",
" |-- words: array (nullable = true)\n",
" | |-- element: string (containsNull = true)\n",
" |-- vectors: vector (nullable = true)\n",
"\n"
]
}
],
"source": [
"df.printSchema()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------+-------+\n",
"|book_volume|indexed|\n",
"+-----------+-------+\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"+-----------+-------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"# from pyspark.ml.feature import StringIndexer, OneHotEncoder\n",
"\n",
"# stringIndexer = StringIndexer(inputCol=\"book_volume\", outputCol=\"indexed\")\n",
"# model = stringIndexer.fit(df)\n",
"# df = model.transform(df)\n",
"# df.select('book_volume', 'indexed').show()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Integer copy of book_volume in a separate DataFrame; df itself is left unchanged.\n",
"book_volume_as_int = df['book_volume'].cast('int')\n",
"df_bk = df.withColumn(\"book_vol\", book_volume_as_int)\n",
"# df_bk = df.select(df.book_volume.cast('int').alias('book_vol'))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------+-------+\n",
"|book_volume|indexed|\n",
"+-----------+-------+\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"| 493| 3.0|\n",
"+-----------+-------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"from pyspark.ml.feature import StringIndexer, OneHotEncoder\n",
"\n",
"stringIndexer = StringIndexer(inputCol=\"book_volume\", outputCol=\"indexed\")\n",
"\n",
"# Remove any 'indexed' column from a previous run; fit() and transform() both reject an\n",
"# input that already has the output column. drop() is a no-op when the column is absent.\n",
"df_unindexed = df.drop('indexed')\n",
"# The fitted indexer gets its own name so the Word2Vec `model` is not overwritten.\n",
"volume_indexer_model = stringIndexer.fit(df_unindexed)\n",
"df = volume_indexer_model.transform(df_unindexed)\n",
"df.select('book_volume', 'indexed').show()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------+--------------+\n",
"|book_volume| book_ohe|\n",
"+-----------+--------------+\n",
"| 493|(33,[3],[1.0])|\n",
"| 493|(33,[3],[1.0])|\n",
"| 493|(33,[3],[1.0])|\n",
"| 493|(33,[3],[1.0])|\n",
"| 493|(33,[3],[1.0])|\n",
"| 493|(33,[3],[1.0])|\n",
"| 493|(33,[3],[1.0])|\n",
"| 493|(33,[3],[1.0])|\n",
"| 493|(33,[3],[1.0])|\n",
"| 493|(33,[3],[1.0])|\n",
"| 493|(33,[3],[1.0])|\n",
"| 493|(33,[3],[1.0])|\n",
"| 493|(33,[3],[1.0])|\n",
"| 493|(33,[3],[1.0])|\n",
"| 493|(33,[3],[1.0])|\n",
"| 493|(33,[3],[1.0])|\n",
"| 493|(33,[3],[1.0])|\n",
"| 493|(33,[3],[1.0])|\n",
"| 493|(33,[3],[1.0])|\n",
"| 493|(33,[3],[1.0])|\n",
"+-----------+--------------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"# One-hot encode the numeric volume index into a sparse vector column.\n",
"encoder = OneHotEncoder(inputCol=\"indexed\", outputCol=\"book_ohe\")\n",
"# drop() is a no-op when the column is absent; it lets this cell be re-run without an\n",
"# \"output column already exists\" failure.\n",
"df = encoder.transform(df.drop('book_ohe'))\n",
"\n",
"df.select('book_volume', 'book_ohe').show()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[u'493' u'2015' u'492' u' \\u0936\\u094d\\u0930\\u0940;\"' u'494' u'2' u'488'\n",
" u' \"\"\\u092a\\u094d\\u0930\\u0926\\u0947\\u0936 \\u0915\\u0947 \\u0938\\u094d\\u0925\\u093e\\u0928\\u0940\\u092f \\u0928\\u093f\\u0915\\u093e\\u092f\\u094b\\u0902 \\u0915\\u0947 \\u0932\\u0947\\u0916\\u093e \\u092a\\u0930\\u0940\\u0915\\u094d\\u0937\\u093e \\u092a\\u094d\\u0930\\u0924\\u093f\\u0935\\u0947\\u0926\\u0928\\u094b\\u0902 \\u0915\\u094b \\u091c\\u093e\\u0902\\u091a \\u0938\\u092e\\u094d\\u092c\\u0928\\u094d\\u0927\\u0940 \\u0938\\u092e\\u093f\\u0924\\u093f\"\" \\u0924\\u0925\\u093e \"\"\\u092a\\u0902\\u091a\\u093e\\u092f\\u0924\\u0940 \\u0930\\u093e\\u091c \\u0938\\u092e\\u093f\\u0924\\u093f\"\" \\u0914\\u0930 \\u092e\\u0902\\u0924\\u094d\\u0930\\u093f\\u092f\\u094b\\u0902 \\u0915\\u094b \\u092a\\u0930\\u093e\\u092e\\u0930\\u094d\\u0936 \\u0926\\u0947\\u0928\\u0947 \\u0935\\u093e\\u0932\\u0940 30 \\u0938\\u094d\\u0925\\u093e\\u092f\\u0940 \\u0938\\u092e\\u093f\\u0924\\u093f\\u092f\\u094b\\u0902 \\u092e\\u0947\\u0902 \\u0938\\u0926\\u0938\\u094d\\u092f\\u0924\\u093e \\u0915\\u0947 \\u0932\\u093f\\u092f\\u0947 \\u0935\\u093f\\u0927\\u093e\\u0928 \\u0938\\u092d\\u093e \\u0915\\u0947 \\u0938\\u0926\\u0938\\u094d\\u092f\\u094b\\u0902 \\u0915\\u093e \\u0928\\u093e\\u092e-\\u0928\\u093f\\u0930\\u094d\\u0926\\u0947\\u0936\\u093f\\u0924 \\u0915\\u0930\\u0928\\u0947 \\u0939\\u0947\\u0924\\u0941 \\u0936\\u094d\\u0930\\u0940 \\u0905\\u0927\\u094d\\u092f\\u0915\\u094d\\u0937 \\u0915\\u094b \\u092a\\u094d\\u0930\\u093e\\u0927\\u093f\\u0915\\u0943\\u0924 \\u0915\\u093f\\u092f\\u0947 \\u091c\\u093e\\u0928\\u0947 \\u0915\\u093e \\u092a\\u094d\\u0930\\u0938\\u094d\\u0924\\u093e\\u0935\"'\n",
" u'472' u'2010' u'1'\n",
" u' \\u0936\\u094d\\u0930\\u0940;\\u0907\\u0928\\u094d\\u0926\\u094d\\u0930\\u091c\\u0940\\u0924 \\u0938\\u0930\\u094b\\u091c'\n",
" u'489' u' \\u0936\\u094d\\u0930\\u0940;\\u092e\\u094b. \\u0905\\u092f\\u0942\\u092c'\n",
" u' \\u0936\\u094d\\u0930\\u0940;\\u0936\\u092e\\u0936\\u0947\\u0930 \\u092c\\u0939\\u093e\\u0926\\u0941\\u0930 \\u0909\\u0930\\u094d\\u092b \\u0936\\u0947\\u0930\\u0942 \\u092d\\u0948\\u092f\\u094d\\u092f\\u093e'\n",
" u' \\u0936\\u094d\\u0930\\u0940.\\u092e\\u094b. \\u0905\\u092f\\u0942\\u092c'\n",
" u' \\u0936\\u094d\\u0930\\u0940; \"' u'491' u'490'\n",
" u' \\u0936\\u094d\\u0930\\u0940;\\u0928\\u0940\\u0930\\u091c (\\u0915\\u0941\\u0936\\u0935\\u093e\\u0939\\u093e) \\u092e\\u094c\\u0930\\u094d\\u092f'\n",
" u' \"\"\\u092a\\u094d\\u0930\\u0926\\u0947\\u0936 \\u0915\\u0947 \\u0938\\u094d\\u0925\\u093e\\u0928\\u0940\\u092f \\u0928\\u093f\\u0915\\u093e\\u092f\\u094b\\u0902 \\u0915\\u0947 \\u0932\\u0947\\u0916\\u093e \\u092a\\u0930\\u0940\\u0915\\u094d\\u0937\\u093e \\u092a\\u094d\\u0930\\u0924\\u093f\\u0935\\u0947\\u0926\\u0928\\u094b\\u0902 \\u0915\\u0940 \\u091c\\u093e\\u0902\\u091a \\u0938\\u092e\\u094d\\u092c\\u0928\\u094d\\u0927\\u0940 \\u0938\\u092e\\u093f\\u0924\\u093f\"\" \\u0924\\u0925\\u093e \"\"\\u092a\\u0902\\u091a\\u093e\\u092f\\u0924\\u0940 \\u0930\\u093e\\u091c \\u0938\\u092e\\u093f\\u0924\\u093f\"\" \\u0914\\u0930 \\u092e\\u0902\\u0924\\u094d\\u0930\\u093f\\u092f\\u094b\\u0902 \\u0915\\u094b \\u092a\\u0930\\u093e\\u092e\\u0930\\u094d\\u0936 \\u0926\\u0947\\u0928\\u0947 \\u0935\\u093e\\u0932\\u0940 30 \\u0938\\u094d\\u0925\\u093e\\u092f\\u0940 \\u0938\\u092e\\u093f\\u0924\\u093f\\u092f\\u094b\\u0902 \\u092e\\u0947\\u0902 \\u0935\\u093f\\u0927\\u093e\\u0928 \\u0938\\u092d\\u093e \\u0915\\u0947 \\u0938\\u0926\\u0938\\u094d\\u092f\\u094b\\u0902 \\u0915\\u093e \\u0928\\u093e\\u092e-\\u0928\\u093f\\u0930\\u094d\\u0926\\u0947\\u0936 \\u0939\\u0947\\u0924\\u0941 \\u0936\\u094d\\u0930\\u0940 \\u0905\\u0927\\u094d\\u092f\\u0915\\u094d\\u0937 \\u0915\\u094b \\u092a\\u094d\\u0930\\u093e\\u0927\\u093f\\u0915\\u0943\\u0924 \\u0915\\u093f\\u092f\\u0947 \\u091c\\u093e\\u0928\\u0947 \\u0915\\u093e \\u092a\\u094d\\u0930\\u0938\\u094d\\u0924\\u093e\\u0935\"'\n",
" u' \\u0936\\u094d\\u0930\\u0940;\\u092e\\u0928\\u0940\\u0937 \\u0905\\u0938\\u0940\\u091c\\u093e'\n",
" u'3'\n",
" u' \\u0936\\u094d\\u0930\\u0940;\\u091c\\u093e\\u0939\\u093f\\u0926 \\u092c\\u0947\\u0917'\n",
" u'2014'\n",
" u' \\u0936\\u094d\\u0930\\u0940;\\u0930\\u093e\\u091c\\u0947\\u0936 \\u0924\\u094d\\u0930\\u093f\\u092a\\u093e\\u0920\\u0940'\n",
" u' \\u0936\\u094d\\u0930\\u0940;\\u0938\\u094d\\u0935\\u093e\\u092e\\u0940 \\u092a\\u094d\\u0930\\u0938\\u093e\\u0926 \\u092e\\u094c\\u0930\\u094d\\u092f'\n",
" u' \\u0936\\u094d\\u0930\\u0940\\u092e\\u0924\\u0940;\\u0930\\u094b\\u0936\\u0928 \\u0932\\u093e\\u0932 \\u0935\\u0930\\u094d\\u092e\\u093e'\n",
" u' \\u0936\\u094d\\u0930\\u0940;\\u0907\\u0928\\u094d\\u0926\\u094d\\u0930\\u093e\\u0923\\u0940 \\u0926\\u0947\\u0935\\u0940'\n",
" u' \\u0936\\u094d\\u0930\\u0940;\\u0905\\u091c\\u092f \\u0915\\u0941\\u092e\\u093e\\u0930'\n",
" u' \\u0936\\u094d\\u0930\\u0940;\\u0926\\u0932\\u0935\\u0940\\u0930 \\u0938\\u093f\\u0902\\u0939'\n",
" u' \\u0936\\u094d\\u0930\\u0940;\\u0938\\u0924\\u094d\\u092f\\u0935\\u0940\\u0930 \\u092e\\u0941\\u0928\\u094d\\u0928\\u093e'\n",
" u' \\u0936\\u094d\\u0930\\u0940;\\u0930\\u093e\\u092e\\u0917\\u094b\\u0935\\u093f\\u0928\\u094d\\u0926 \\u091a\\u094c\\u0927\\u0930\\u0940'\n",
" u' \\u0936\\u094d\\u0930\\u0940;\\u092a\\u094d\\u0930\\u0926\\u0940\\u092a \\u091a\\u094c\\u0927\\u0930\\u0940']\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Project to the single needed column before collecting, so token arrays and vectors\n",
"# are not shipped to the driver just to list the distinct book_volume values.\n",
"print(df.select('book_volume').toPandas()['book_volume'].unique())"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+\n",
"| merged_vectors|\n",
"+--------------------+\n",
"|(83,[3,33,34,35,3...|\n",
"|(83,[3,33,34,35,3...|\n",
"|(83,[3,33,34,35,3...|\n",
"|(83,[3,33,34,35,3...|\n",
"|(83,[3,33,34,35,3...|\n",
"|(83,[3,33,34,35,3...|\n",
"|(83,[3,33,34,35,3...|\n",
"|(83,[3,33,34,35,3...|\n",
"|(83,[3,33,34,35,3...|\n",
"|(83,[3,33,34,35,3...|\n",
"|(83,[3,33,34,35,3...|\n",
"|(83,[3,33,34,35,3...|\n",
"|(83,[3,33,34,35,3...|\n",
"|(83,[3,33,34,35,3...|\n",
"|(83,[3,33,34,35,3...|\n",
"|(83,[3,33,34,35,3...|\n",
"|(83,[3,33,34,35,3...|\n",
"|(83,[3,33,34,35,3...|\n",
"|(83,[3,33,34,35,3...|\n",
"|(83,[3,33,34,35,3...|\n",
"+--------------------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"from pyspark.ml.feature import VectorAssembler\n",
"\n",
"# Concatenate the one-hot volume vector and the Word2Vec vector into one feature vector.\n",
"# Named `assembler` (not `model`) because it is a plain transformer, not a fitted model,\n",
"# and so that the name `model` is not rebound yet again.\n",
"assembler = VectorAssembler(inputCols=['book_ohe', 'vectors'], outputCol=\"merged_vectors\")\n",
"# drop() is a no-op when the column is absent; it lets this cell be re-run safely.\n",
"df = assembler.transform(df.drop('merged_vectors'))\n",
"\n",
"df.select('merged_vectors').show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Export to CSV for use in R"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- headline_text: string (nullable = true)\n",
" |-- headline_keywords: string (nullable = true)\n",
" |-- headline_keypersons: string (nullable = true)\n",
" |-- book_year: string (nullable = true)\n",
" |-- book_session: string (nullable = true)\n",
" |-- book_volume: string (nullable = true)\n",
" |-- book_number: string (nullable = true)\n",
" |-- book_proceeding_date: string (nullable = true)\n",
" |-- words: array (nullable = true)\n",
" | |-- element: string (containsNull = true)\n",
" |-- vectors: vector (nullable = true)\n",
" |-- indexed: double (nullable = true)\n",
" |-- book_ohe: vector (nullable = true)\n",
" |-- merged_vectors: vector (nullable = true)\n",
"\n"
]
}
],
"source": [
"df.printSchema()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Collect the text columns plus the merged feature vector to the driver and write one CSV.\n",
"# NOTE(review): the file name says 53D while merged_vectors is displayed as 83-dimensional - confirm.\n",
"export_columns = ['headline_text', 'headline_keywords', 'headline_keypersons', 'book_volume', 'merged_vectors']\n",
"export_frame = df.select(*export_columns).toPandas()\n",
"export_frame.to_csv('up_headlines_book_volume_53D.csv', index=False, encoding=\"utf-8\")"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# import pandas as pd\n",
"# # df['headline_text'] = df['headline_text'].map(lambda x: x.encode('unicode-escape').decode('utf-8'))\n",
"# # df['headline_keywords'] = df['headline_keywords'].map(lambda x: x.encode('unicode-escape').decode('utf-8'))\n",
"# # df['headline_keypersons'] = df['headline_keypersons'].map(lambda x: x.encode('unicode-escape').decode('utf-8'))\n",
"\n",
"# # df.select('headline_text', 'headline_keywords', 'headline_keypersons', 'book_year', 'book_session', 'book_volume', 'book_number', 'book_proceeding_date', 'vectors').toPandas()\n",
"# df.write.csv('up_headlines50D.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment