Skip to content

Instantly share code, notes, and snippets.

@friso
Created November 5, 2015 06:01
Show Gist options
  • Save friso/90baaf094b107586c982 to your computer and use it in GitHub Desktop.
Save friso/90baaf094b107586c982 to your computer and use it in GitHub Desktop.
Divolte Data Spark Notebook demo
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Spark Context and Version"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"u'1.5.1'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sc.version"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from pyspark.sql import SQLContext"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create Spark DataFrame from Avro data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"ctx = SQLContext(sc)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data = ctx.read.format('com.databricks.spark.avro').load('./*.avro')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## What's in there?"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{'city': None,\n",
" 'clientTimestamp': 1440502338366,\n",
" 'continent': u'Europe',\n",
" 'country': u'Netherlands',\n",
" 'countryCode': u'NL',\n",
" 'detectedCorruption': False,\n",
" 'detectedDuplicate': False,\n",
" 'domain': u'education.dev',\n",
" 'eventType': u'pageView',\n",
" 'firstInSession': False,\n",
" 'googleClickId': None,\n",
" 'lat': 52.3667,\n",
" 'location': u'http://education.dev/#?p=1',\n",
" 'lon': 4.9,\n",
" 'mostSpecificSubdivision': None,\n",
" 'pageViewId': u'0:NuUhzVp1wvN0F4Vk9o9TvZuPi_QNTMEB',\n",
" 'partyId': u'0:id63gaay:vnk~wWlKLO0busj0etAGB8RNN9m_OLFC',\n",
" 'path': u'/',\n",
" 'referer': u'http://education.dev/scrum/certified-scrum-master/17-9-2015/register',\n",
" 'refererDomain': u'education.dev',\n",
" 'refererPath': u'/scrum/certified-scrum-master/17-9-2015/register',\n",
" 'remoteHost': u'92.111.226.194',\n",
" 'screenPixelHeight': 1173,\n",
" 'screenPixelWidth': 1920,\n",
" 'sessionId': u'0:idr9h8lk:tQr1fKFsc06nd2NBWfqk8r3eO1OJ6fZ7',\n",
" 'timeZone': u'Europe/Amsterdam',\n",
" 'timestamp': 1440502338554,\n",
" 'trafficSource': None,\n",
" 'userAgentDeviceCategory': u'Personal computer',\n",
" 'userAgentFamily': u'Chrome',\n",
" 'userAgentName': u'Chrome',\n",
" 'userAgentOsFamily': u'OS X',\n",
" 'userAgentOsVendor': u'Apple Computer, Inc.',\n",
" 'userAgentOsVersion': u'10.10.5',\n",
" 'userAgentString': u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',\n",
" 'userAgentType': u'Browser',\n",
" 'userAgentVendor': u'Google Inc.',\n",
" 'userAgentVersion': u'44.0.2403.157',\n",
" 'viewportPixelHeight': 1078,\n",
" 'viewportPixelWidth': 1879}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"record, = data.take(1)\n",
"record.asDict()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data = data.repartition(8)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"80008"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.cache()\n",
"data.count()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"80008"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.count()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Basic Aggregates"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------------+-----+\n",
"| userAgentFamily|count|\n",
"+------------------+-----+\n",
"| unknown| 44|\n",
"| Opera Mini| 41|\n",
"| Mobile Firefox| 85|\n",
"| IE| 5535|\n",
"|BlackBerry Browser| 71|\n",
"| Googlebot| 63|\n",
"| IceWeasel| 69|\n",
"| Android Browser| 246|\n",
"| Opera Mobile| 10|\n",
"| Nokia Web Browser| 1|\n",
"| Lunascape| 1|\n",
"| IE Mobile| 182|\n",
"| Mozilla| 172|\n",
"| Maxthon| 27|\n",
"| Pale Moon| 44|\n",
"| Iron| 9|\n",
"| UC Browser| 6|\n",
"| Baiduspider| 62|\n",
"| Sogou Explorer| 1|\n",
"| Chrome Mobile| 3338|\n",
"+------------------+-----+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"data.groupby('userAgentFamily').count().show()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from datetime import datetime\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Count by day of week"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x11098dc10>"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAX8AAAEKCAYAAAD6q1UVAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAHGpJREFUeJzt3X+U3XWd3/HnC7LBAAFk0ZhAQmJ3qGRFVxFCq8BVVppa\nTajHQnJqmpVgj6ag9qjbBFsZz+EIqBVZWzinlR8J1WgUFxPLZonI7bK6MP5AjASWRBtIAokUWSKL\n0QRe/eN+Eq7DZObOnTu5M/m+HufMyee+v5/v9/v+zoH3/cznfu/3I9tERES1HNbtBCIi4uBL8Y+I\nqKAU/4iICkrxj4iooBT/iIgKSvGPiKigQYu/pJsk7ZS0oSl2pqQ+SfdL+oGkM5q2LZe0SdLDks5v\nip8uaUPZdl1T/AhJXyvxeyWd3OkLjIiIlxpq5H8zMLdf7DPAf7H9BuCT5TWSZgMXAbPLPtdLUtnn\nBmCJ7R6gR9K+Yy4Bnirxa4FrRng9ERHRgkGLv+17gKf7hZ8Aji3t44DtpT0fWGV7j+0twGZgjqSp\nwGTbfaXfSuCC0p4HrCjt24Dz2ryOiIgYhglt7LMM+FtJn6Px5vHPSnwacG9Tv23AicCe0t5ne4lT\n/t0KYHuvpGckHW/7V23kFRERLWqn+N8IfMj2X0r6N8BNwNs7m9bvk5RnUEREtMG2Boq3c7fPmbb/\nsrS/AZxZ2tuB6U39TqIx4t9e2v3j+/aZASBpAnDsgUb9tkft54orrhjV44/2T/JP/lXMPfkP/TOY\ndor/ZknnlvbbgEdKew2wQNJESbOAHqDP9g5gl6Q55QPgRcC3mvZZXNrvAe5qI5+IiBimQad9JK0C\nzgVOkLSVxt09/x7475KOAH5TXmN7o6TVwEZgL7DUL771LAVuASYBd9heV+I3ArdK2gQ8BSzo4LVF\nRMQBDFr8bS88wKY5B+j/aeDTA8R/BJw2QPy3wIVDpzm6arVat1MYkeTfXeM5//GcOyT/kdBQ80Jj\ngSSPhzwjIsYSSfgAH/i2c7dPRMSoevH7odGq4Q6QU/wjYkzKX/uta+fNMg92i4iooBT/iIgKSvGP\niKigFP+IiApK8Y+IqKAU/4gYFySN+s9YNHPmTL773e92/Li51TMixpHRvP1zbBb/8kWtjh83I/+I\niGHYunUr7373u3nlK1/JCSecwGWXXYZtrrzySmbOnMmUKVNYvHgxu3btAqBerzN9+vTfO0bzaL63\nt5cLL7yQxYsXc8wxx/Da176WH/3oRwAsWrSIxx57jHe9611MnjyZz33ucx27jhT/iIgWPf/887zz\nne9k1qxZPProozz++OMsWLCAm2++mRUrVlCv1/nFL37Bs88+y6WXXnrA4/SfYlq7di0LFy7kmWee\nYd68efv3vfXWW5kxYwbf/va3+fWvf83HPvaxjl1Lin9ERIv6+vp44okn+OxnP8ukSZOYOHEib37z\nm/nyl7/MRz/6UWbOnMlRRx3FVVddxVe/+lVeeOGFlo579tlnM3fuXCTx3ve+lwceeGCUryTFPyKi\nZVu3buXkk0/msMN+v3Q+8cQTnHzyyftfz5gxg71797Jz586WjjtlypT97SOPPJLdu3e3/MbRrhT/\niIgWTZ8+nccee4znn3/+9+LTpk1jy5Yt+18/9thjTJgwgSlTpnDUUUfx3HPP7d/2/PPP8+STT7Z8\nztG6CynFPyKiRXPmzGHq1KksW7aM5557jt27d/O9732PhQsXcu2117JlyxaeffZZLr/8chYsWMBh\nhx3GKaecwu7du7njjjvYs2cPV155Jb/97W9bPueUKVP4+c9/3vFrGbT4S7pJ0k5JG/rFL5P0kKSf\nSbqmKb5c0iZJD0s6vyl+uqQNZdt1TfEjJH2txO+VdDIREQekUfwZ2mGHHcbatWvZvHkzM2bMYPr0\n6Xz961/n4osvZtGiRZxzzjm8+tWv5sgjj+SLX/wiAMceeyzXX389l1xyCSeddBJHH3307939M9B3\nDJpfL1++nCuvvJKXv/zlfP7znx/Wb2swgy7mIuls4Flgpe3TSuytwOXAO2zvkfQK209Kmg18BTgD\nOBH4DtBj25L6gEtt90m6A/gL2+skLQVea3uppIuAf237JUs5Vm0xl4PxZZMq/T5j/Bmte9sPVQf6\nfQ22mMugI3/b9wBP9wt/ELjK9p7SZ9/k1Xxgle09trcAm4E5kqYCk233lX4rgQtKex6worRvA84b\nLJ9q8Sj+RETVtTPn3wOcU6Zp6pLeVOLTgG1N/bbR+Augf3x7iVP+3Qpgey/wjKTj28gpIiKGoZ3H\nO0wAXm77LElnAKuBV3c2rZfq7e3d367VauN+4eaIiE6r1+vU6/WW+rZT/LcB3wSw/QNJL0g6gcaI\nvvk7zCeVvttLu3+csm0G8LikCcCxtn810Embi39ERLxU/4Hxpz71qQP2bWfa53bgbQCSTgEm2v5/\nwBpggaSJkmbRmB7qs70D2CVpjhqfZC4CvlWOtQZYXNrvAe5qI5+IiBimQUf+klYB5wJ/KGkr8Eng\nJuCmcvvn74B/B2B7o6TVwEZgL7C06RadpcAtwCTgDtvrSvxG4FZJm4CngJfc6RMREZ036K2eY0U1\nb/Uc3UfXVun3GePPWH22/lg23Fs98zz/iBhzMjgZfSn+0XH5klrE2HdIFv8Un7GgeisuRYwnh2Tx\nb0jxiYg4kDzVMyKiglL8IyIqKMU/IqKCUvwjIiooxT8iooJS/CMiKijFPyKiglL8IyIqKMU/IqKC\nUvwjIiooxT8iooJS/CMiKmjQ4i/pJkk7y6pd/bd9tKzfe3xTbLmkTZIelnR+U/x0SRvKtuua4kdI\n+lqJ3yvp5E5dWEREHNhQI/+bgbn9g5KmA28HHm2KzQYuAmaXfa7Xi89WvgFYYrsH6JG075hLgKdK\n/FrgmhFcS0RHSBr1n4huG7T4274HeHqATZ8H/rxfbD6wyvYe21uAzcAcSVOBybb7Sr+VwAWlPQ9Y\nUdq3AecN+woiRoVH8Sei+4Y95y9pPrDN9k/7bZoGbGt6vQ04cYD49hKn/LsVwPZe4JnmaaSIiBgd\nw1rMRdKRwOU0pnz2hzua0QH09vbub9dqNWq12sE4bUTEuFGv16nX6y311VDLEUqaCay1fZqk04Dv\nAM+VzSfRGMnPAd4HYPvqst864AoanwvcbfvUEl8InGP7g6VPr+17JU0AnrD9igFy8HCWTWzMqY7u\nSl6juYxj8h/yDMk/ogWSsD3gAH1Y0z62N9ieYnuW7Vk0pnPeaHsnsAZYIGmipFlAD9BnewewS9Kc\n8gHwIuBb5ZBrgMWl/R7grmFfXUREDNtQt3quAr4PnCJpq6T39euyf/hieyOwGtgI/BWwtGm4vhT4\nErAJ2Gx7XYnfCPyhpE3AR4BlI7yeiIhowZDTPmNBpn06fobkP9jRx3n+Eft0bNonIiIODSn+EREV\nlOIfEVFBKf4RERWU4h8RUUEp/hERFZTiHxFRQSn+EREVlOIfEVFBKf4RERWU4h8RUUEp/hERFZTi\nHxFRQSn+EREVlOIfEVFBKf4RERU01EpeN0naKWlDU+yzkh6S9ICkb0o6tmnbckmbJD0s6fym+OmS\nNpRt1zXFj5D0tRK/V9LJnb7AiIh4qaFG/jcDc/vF7gT+2PbrgUeA5QCSZgMXAbPLPteXNXsBbgCW\n2O4BeiTtO+YS4KkSvxa4ZoTXExERLRi0+Nu+B3i6X2y97RfKy/uAk0p7PrDK9h7bW4DNwBxJU4HJ\ntvtKv5XABaU9D1hR2rcB543gWiIiokUjnfO/GLijtKcB25q2bQNOHCC+vcQp/24FsL0XeEbS8SPM\nKSIihjCh3R0lfQL4ne2vdDCfA+rt7d3frtVq1Gq1g3HaiIhxo16vU6/XW+or24N3kGYCa22f1hT7\nM+D9wHm2d5fYMgDbV5fX64ArgEeBu22fWuILgXNsf7D06bV9r6QJwBO2XzFADh4qz379gdb7D58Y\nTj7DPnryH+oMyT+iBZKwrYG2DXvap3xY+3Fg/r7CX6wBFkiaKGkW0AP02d4B7JI0p3wAvAj4VtM+\ni0v7PcBdw80nIiKGb9BpH0mrgHOBEyRtpTGSXw5MBNaXm3n+zvZS2xslrQY2AnuBpU3D9aXALcAk\n4A7b60r8RuBWSZuAp4AFnby4iIgY2JDTPmNBpn06fobkP9jRx3n+EfsMNu3T9ge+ETH2vPjVmtGT\nN65DQ4p/xCFndP9qiUNDnu0TEVFBKf4RERWU4h8RUUEp/hERFZTiHxFRQSn+EREVlOIfEVFBKf4R\nERWU4h8RUUEp/hERFZTiHxFRQSn+EREVlOIfEVFBgxZ/STdJ2ilpQ1PseEnrJT0i6U5JxzVtWy5p\nk6SHJZ3fFD9d0oay7bqm+BGSvlbi90o6udMXGBERLzXUyP9mYG6/2DJgve1TaCy7uAxA0mzgImB2\n2ed6vfhw8RuAJbZ7gJ6yFCTAEuCpEr8WuGaE1xMRES0YtPjbvgd4ul94HrCitFcAF5T2fGCV7T22\ntwCbgTmSpgKTbfeVfiub9mk+1m3AeW1eR0REDEM7c/5TbO8s7Z3AlNKeBmxr6rcNOHGA+PYSp/y7\nFcD2XuAZSce3kVNERAzDiD7wLQvrZk23iIhxpp1lHHdKepXtHWVK55clvh2Y3tTvJBoj/u2l3T++\nb58ZwOOSJgDH2v7VQCft7e3d367VatRqtTZSj4g4dNXrder1ekt9NdRizJJmAmttn1Zef4bGh7TX\nSFoGHGd7WfnA9yvAmTSmc74D/JFtS7oP+BDQB/xv4C9sr5O0FDjN9gclLQAusL1ggBw8nEWjG58z\nj+46pqO5iHXyH/IMyf9ARx7HuUfnScL2gAsvDzryl7QKOBc4QdJW4JPA1cBqSUuALcCFALY3SloN\nbAT2AkubKvZS4BZgEnCH7XUlfiNwq6RNwFPASwp/RER03pAj/7EgI/+OnyH5D3b0cZz/eM49Om+w\nkX++4RsRUUEp/hERFZTiHxFRQSn+EREVlOIfEVFBKf4RERWU4h8RUUEp/hERFZTiHxFRQSn+EREV\nlOIfEVFBKf4RERWU4h8RUUEp/hERFZTiHxFRQSn+EREV1Hbxl7Rc0oOSNkj6iqQjJB0vab2kRyTd\nKem4fv03SXpY0vlN8dPLMTZJum6kFxQREUNrq/iXdX3fD7yxrO17OI0lGJcB622fAtxVXlPW970I\nmA3MBa5XY8khgBuAJbZ7gB5Jc9u+moiIaEm7I/9dwB7gSEkTgCOBx4F5wIrSZwVwQWnPB1bZ3mN7\nC7AZmCNpKjDZdl/pt7Jpn4iIGCVtFX/bvwL+K/AYjaL/D7bXA1Ns7yzddgJTSnsasK3pENuAEweI\nby/xiIgYRRPa2UnSPwE+AswEngG+Lum9zX1sW1LHVnru7e3d367VatRqtU4dOiLikFCv16nX6y31\nlT38+izpIuDtti8prxcBZwFvA95qe0eZ0rnb9mskLQOwfXXpvw64Ani09Dm1xBcC59r+QL/zeTh5\nNj5O6Nj7zkBnoJ3fW8tHT/5DnSH5H+jI4zj36DxJ2NZA29qd838YOEvSpPLB7Z8CG4G1wOLSZzFw\ne2mvARZImihpFtAD9NneAeySNKccZ1HTPhERMUramvax/YCklcAPgReAHwP/A5gMrJa0BNgCXFj6\nb5S0msYbxF5gadNQfilwCzAJuMP2uravJiIiWtLWtM/Blmmfjp8h+Q929HGc/3jOPTpvNKZ9IiJi\nHEvxj4iooBT/iIgKSvGPiKigFP+IiApK8Y+IqKAU/4iICkrxj4iooBT/iIgKSvGPiKigFP+IiApK\n8Y+IqKAU/4iICkrxj4iooBT/iIgKSvGPiKigtou/pOMkfUPSQ5I2lqUYj5e0XtIjku6UdFxT/+WS\nNkl6WNL5TfHTJW0o264b6QVFRMTQRjLyv47GsounAq+jsa7vMmC97VOAu8prJM0GLgJmA3OB68ua\nvQA3AEts9wA9kuaOIKeIiGhBW8Vf0rHA2bZvArC91/YzwDxgRem2ArigtOcDq2zvsb0F2AzMkTQV\nmGy7r/Rb2bRPRESMknZH/rOAJyXdLOnHkv6npKOAKbZ3lj47gSmlPQ3Y1rT/NuDEAeLbSzwiIkbR\nhBHs90bgUts/kPQFyhTPPrYtqWMrPff29u5v12o1arVapw4dEXFIqNfr1Ov1lvrKHn59lvQq4O9s\nzyqv3wIsB14NvNX2jjKlc7ft10haBmD76tJ/HXAF8Gjpc2qJLwTOtf2BfufzcPJsfJzQsfedgc5A\nO7+3lo+e/Ic6Q/I/0JHHce7ReZKwrYG2tTXtY3sHsFXSKSX0p8CDwFpgcYktBm4v7TXAAkkTJc0C\neoC+cpxd5U4hAYua9omIiFHS7rQPwGXAlyVNBH4OvA84HFgtaQmwBbgQwPZGSauBjcBeYGnTUH4p\ncAswicbdQ+tGkFNERLSgrWmfgy3TPh0/Q/If7OjjOP/xnHt0XsenfSIiYnxL8Y+IqKAU/4iICkrx\nj4iooBT/iIgKSvGPiKigFP+IiApK8Y+IqKAU/4iICkrxj4iooBT/iIgKSvGPiKigFP+IiApK8Y+I\nqKAU/4iICkrxj4iooBEVf0mHS7pf0try+nhJ6yU9IulOScc19V0uaZOkhyWd3xQ/XdKGsu26keQT\nERGtGenI/8M0lmbct7TPMmC97VOAu8prJM0GLgJmA3OB68uavQA3AEts9wA9kuaOMKeIiBhC28Vf\n0knAO4AvAfsK+TxgRWmvAC4o7fnAKtt7bG8BNgNzJE0FJtvuK/1WNu0TERGjZCQj/2uBjwMvNMWm\n2N5Z2juBKaU9DdjW1G8bcOIA8e0lHhERo2hCOztJeifwS9v3S6oN1Me2JXVspefe3t797VqtRq02\n4GkjIiqrXq9Tr9db6it7+PVZ0qeBRcBe4GXAMcA3gTOAmu0dZUrnbtuvkbQMwPbVZf91wBXAo6XP\nqSW+EDjX9gf6nc/DybPxcULH3ncGOgPt/N5aPnryH+oMyf9ARx7HuUfnScK2BtrW1rSP7cttT7c9\nC1gAfNf2ImANsLh0WwzcXtprgAWSJkqaBfQAfbZ3ALskzSkfAC9q2iciIkZJW9M+A9g3FLgaWC1p\nCbAFuBDA9kZJq2ncGbQXWNo0lF8K3AJMAu6wva5DOUVExAG0Ne1zsGXap+NnSP6DHX0c5z+ecz8U\nvHgH++gZbi080LRPp0b+EREBjPabb6fk8Q4RERWU4h8RUUGZ9omIMWOszZkfylL8I2KMGR9z5uNd\npn0iIiooxT8iooJS/CMiKijFPyKiglL8IyIqKMU/IqKCUvwjIiooxT8iooJS/CMiKijFPyKiglL8\nIyIqqK3iL2m6pLslPSjpZ5I+VOLHS1ov6RFJd0o6rmmf5ZI2SXpY0vlN8dMlbSjbrhv5JUVExFDa\nHfnvAf6j7T8GzgL+g6RTgWXAetunAHeV10iaDVwEzAbmAtfrxcf33QAssd0D9Eia2/bVRERES9pd\nwH2H7Z+U9rPAQ8CJwDxgRem2ArigtOcDq2zvsb0F2AzMkTQVmGy7r/Rb2bRPRESMkhHP+UuaCbwB\nuA+YYntn2bQTmFLa04BtTbtto/Fm0T++vcQjImIUjeh5/pKOBm4DPmz7180LMdi2pI49mLu3t3d/\nu1arUavVOnXoiIhDQr1ep16vt9RX7a5qI+kPgG8Df2X7CyX2MFCzvaNM6dxt+zWSlgHYvrr0Wwdc\nATxa+pxa4guBc21/oN+5PNwV60d7QYjRXA0o+Q95huR/oCOP49wh+bdwhmHlLwnbA65g0+7dPgJu\nBDbuK/zFGmBxaS8Gbm+KL5A0UdIsoAfos70D2CVpTjnmoqZ9IiJilLQ18pf0FuBvgJ/y4tvccqAP\nWA3MALYAF9r+h7LP5cDFwF4a00R/XeKnA7cAk4A7bH9ogPNl5N/ZMyT/wY4+jvMfz7lD8m/hDB0b\n+bc97XMwpfh3/AzJf7Cjj+P8x3PukPxbOEN3p30iImJ8S/GPiKigFP+IiApK8Y+IqKAU/4iICkrx\nj4iooBT/iIgKSvGPiKigFP+IiApK8Y+IqKAU/4iICkrxj4iooBT/iIgKSvGPiKigFP+IiAoaE8Vf\n0lxJD0vaJOk/dTufiIhDXdeLv6TDgf8GzAVmAwslnXpws6gf3NN1XL3bCYxQvdsJjFC92wmMQL3b\nCYxQvdsJjFC9a2fuevEHzgQ2295iew/wVWD+wU2hfnBP13H1bicwQvVuJzBC9W4nMAL1bicwQvVu\nJzBC9a6deSwU/xOBrU2vt5VYRESMkrFQ/Mf+IsIREYeYri/gLuksoNf23PJ6OfCC7Wua+uQNIiKi\nDQdawH0sFP8JwN8D5wGPA33AQtsPdTWxiIhD2IRuJ2B7r6RLgb8GDgduTOGPiBhdXR/5R0TEwdf1\nkf/BVr5DMJ8X7yjaBqzJXxsHR/n9TwPus/1sU3yu7XXdy6w1kt4C/Mr2Rkk14E3A/bbv6m5m1SLp\nbBq3iW+wfWe38xlK+WzzIdvPSDoSWAa8EXgQ+LTtZw52TmPhbp+Dpnx7eFV5eV/5OQxYVT5oHrck\nva/bOQxF0oeA24HLgAclXdC0+aruZNU6SVcBnwNWSPoMcDUwCbhC0se7mlwbJK3sdg6tktTX1H4/\n8EXgaBq/+/Hw/+5NwD+W9nXAMTT++/kNcHM3EqrUtI+kTcDs8mWy5vhEYKPtP+pOZiMnaavt6d3O\nYzCSfgacZftZSTOBbwD/y/YXJN1v+w1dTXAIkjYCrwMmAjuBk8pIbhKNv2Re19UEByFpLY3bqpvv\n/Hgb8F3Atud1JbEWNf/3IemHwL+0/aSko2j87l/b3QwHJ+kh26eW9o9tv7Fp2wO2X3+wc6ratM/z\nNKZ7tvSLTyvbxjRJGwbZ/MqDlkj7tG+qx/aWMm1ym6ST+f2iNFb9zvZeYK+kn+/7U932byS90OXc\nhnISsBH4EvACjd/3m2j8JTMeHC7peBp5H277SQDb/yhpb3dTa8mDki62fRPwgKQzbP9A0inA77qR\nUNWK/0eA70jazIvfKp4O9ACXdi2r1r2SxjOQnh5g2/cPci7t+KWkP7H9E4DyF8A7gRtpjKjHut9K\nOtL2czTmawGQdByNgjqWvQn4MPAJ4OO275e02/b/6XJerToG+FFpW9JU209ImtzNpIbhEuA6Sf8Z\neBL4vqRtNOrQJd1IqFLTPrD/QXJn0vgLwMB24IdlRDemSboJuNn2PQNsW2V7YRfSapmk6cAe2zv6\nxQW82fbfdiez1kh6me3dA8RPAKbaHuwvszFB0knAtcAvgXljfapwKOXD0ym2/2+3c2mFpGOBWTQG\n3tv6/79wUHOpWvGPCCh/cf1z25d3O5fojhT/iIgKqtStnhER0ZDiHxFRQSn+EREVlOIfMQySeiV9\ntNt5RIxUin/E8OQOiTgkpPhHDEHSJyT9vaR7gH9aYpdI6pP0E0nfkDRJ0mRJvyhrVCDpmPL68K5e\nQMQAUvwjBiHpdOAi4PXAO4AzaIz+v2n7TNt/AjwELLH9axorcv+rsvsC4DbbY/7RIVE9Kf4Rgzub\nRqHfXYr7GhrPlzlN0j2Sfgr8W2B26f8lYN8TVv+MLj2xMWIoKf4Rg+v/JMx9bgaWlid5forGo52x\n/X1gZnlo3eG2Nx6sRCOGI8U/YnB/A1wg6WXlIWLvKvHJwA5JfwC8t98+K4Ev03iGe8SYlMc7RAxB\n0uXAYhoPQ3sU+DHwHPDnNJ7QeB9wtO2LS/9XAb8AXmV7V1eSjhhCin9Eh0l6D/Au24u7nUvEgVTt\nef4Ro0rSF4F/QePOoIgxKyP/iIgKyge+EREVlOIfEVFBKf4RERWU4h8RUUEp/hERFfT/AQhPAuAM\n/XxXAAAAAElFTkSuQmCC\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x11098d5d0>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"pd.DataFrame(\n",
" data.rdd.map(lambda r: (datetime.fromtimestamp(r.timestamp / 1000.0).weekday(), 1))\n",
" .reduceByKey(lambda x,y: x+y).collect(),\n",
" columns=['day', 'count']\n",
").plot(kind='bar', x='day', y='count')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Top 10 longest session lengths with frequency of occurrence"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(578, 1),\n",
" (532, 1),\n",
" (510, 1),\n",
" (407, 1),\n",
" (329, 1),\n",
" (314, 1),\n",
" (304, 1),\n",
" (239, 1),\n",
" (222, 1),\n",
" (219, 1)]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(data.rdd.map(lambda r: (r.sessionId, 1))\n",
" .reduceByKey(lambda x,y: x+y)\n",
" .map(lambda (id,c): (c, 1))\n",
" .reduceByKey(lambda x,y: x+y)\n",
" .sortByKey(ascending=False)\n",
" .take(10))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Top ten most frequent session lengths"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(1, 16199),\n",
" (2, 4807),\n",
" (3, 2655),\n",
" (4, 1504),\n",
" (5, 986),\n",
" (6, 603),\n",
" (7, 419),\n",
" (8, 346),\n",
" (9, 244),\n",
" (10, 206)]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(data.rdd.map(lambda r: (r.sessionId, 1))\n",
" .reduceByKey(lambda x,y: x+y)\n",
" .map(lambda (id,c): (c, 1))\n",
" .reduceByKey(lambda x,y: x+y)\n",
" .sortBy(lambda (id,c): c, ascending=False)\n",
" .take(10))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Simple K-Means clustering on sessions based on pages visited"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from urlparse import urlparse\n",
"from pyspark.ml import Pipeline\n",
"from pyspark.ml.clustering import KMeans\n",
"from pyspark.ml.feature import HashingTF, Tokenizer"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"df = ctx.createDataFrame(\n",
" data.rdd.sortBy(lambda r: r.timestamp)\n",
" .map(lambda r: (r.sessionId, urlparse(r.location)))\n",
" .filter(lambda (sid, pr): pr.netloc == 'www.godatadriven.com')\n",
" .map(lambda (sid, pr): (sid, pr.path))\n",
" .groupByKey()\n",
" .map(lambda (sid, paths): (sid, [p for p in paths])),\n",
" ('session_id', 'paths')\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Grouped by session id, concatenated list of pages visited"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+--------------------+\n",
"| session_id| paths|\n",
"+--------------------+--------------------+\n",
"|0:iefl8bmx:w2vb3G...|[/, /careers.html...|\n",
"|0:if4ypszb:DvqmnF...| [/, /team.html, /]|\n",
"|0:idralz1d:uGSUi9...| [/]|\n",
"|0:if88r6qj:FHYJ6w...|[/, /careers.html...|\n",
"|0:idxczsuy:is9qkQ...| [/training.html]|\n",
"|0:ieb6cemo:N2ifkd...| [/]|\n",
"|0:ie9pbckc:9k9IEJ...|[/careers.html, /...|\n",
"|0:iezbegy0:L8a3mk...| [/]|\n",
"|0:iejltv53:KxaIuK...| [/]|\n",
"|0:ieo79s7l:cGSKS4...| [/customers.html]|\n",
"|0:idsz88ak:rA1zJA...|[/, /careers.html...|\n",
"|0:iecbe2f6:1kE0Am...| [/]|\n",
"|0:if6oa2aw:ADNp1~...| [/team.html]|\n",
"|0:iev1yko7:KbjaEP...| [/]|\n",
"|0:iewfrk11:PtlxgM...| [/, /why.html]|\n",
"|0:ieoen8gh:e0ohSG...| [/training.html]|\n",
"|0:ie9zvrlw:lmw0oa...|[/why.html, /cust...|\n",
"|0:if48nrrq:EJfEta...|[/, /customers.html]|\n",
"|0:idszkjl6:AubMGt...|[/job-data-scient...|\n",
"|0:if1chb7s:s9dYv9...|[/careers.html, /...|\n",
"+--------------------+--------------------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"df.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Clustering\n",
"- Turn sessions into 'documents' containing the visited pages as 'words'\n",
"- Represent sessions as term frequencies (using hasing term freq vectorizer)\n",
"- Run Spark ML KMeans"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"pipeline = Pipeline(stages=[\n",
" HashingTF(inputCol=\"paths\", outputCol=\"features\"),\n",
" KMeans(featuresCol='features', predictionCol='cluster', k=5)\n",
" ])"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"model = pipeline.fit(df)"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"result = model.transform(df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Resulting DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+--------------------+--------------------+-------+\n",
"| session_id| paths| features|cluster|\n",
"+--------------------+--------------------+--------------------+-------+\n",
"|0:iefl8bmx:w2vb3G...|[/, /careers.html...|(262144,[47,45811...| 2|\n",
"|0:if4ypszb:DvqmnF...| [/, /team.html, /]|(262144,[47,17748...| 2|\n",
"|0:idralz1d:uGSUi9...| [/]| (262144,[47],[1.0])| 2|\n",
"|0:if88r6qj:FHYJ6w...|[/, /careers.html...|(262144,[47,16421...| 2|\n",
"|0:idxczsuy:is9qkQ...| [/training.html]|(262144,[177264],...| 0|\n",
"|0:ieb6cemo:N2ifkd...| [/]| (262144,[47],[1.0])| 2|\n",
"|0:ie9pbckc:9k9IEJ...|[/careers.html, /...|(262144,[45811,71...| 0|\n",
"|0:iezbegy0:L8a3mk...| [/]| (262144,[47],[1.0])| 2|\n",
"|0:iejltv53:KxaIuK...| [/]| (262144,[47],[1.0])| 2|\n",
"|0:ieo79s7l:cGSKS4...| [/customers.html]|(262144,[45971],[...| 0|\n",
"|0:idsz88ak:rA1zJA...|[/, /careers.html...|(262144,[47,45811...| 2|\n",
"|0:iecbe2f6:1kE0Am...| [/]| (262144,[47],[1.0])| 2|\n",
"|0:if6oa2aw:ADNp1~...| [/team.html]|(262144,[177485],...| 0|\n",
"|0:iev1yko7:KbjaEP...| [/]| (262144,[47],[1.0])| 2|\n",
"|0:iewfrk11:PtlxgM...| [/, /why.html]|(262144,[47,25888...| 2|\n",
"|0:ieoen8gh:e0ohSG...| [/training.html]|(262144,[177264],...| 0|\n",
"|0:ie9zvrlw:lmw0oa...|[/why.html, /cust...|(262144,[45971,25...| 0|\n",
"|0:if48nrrq:EJfEta...|[/, /customers.html]|(262144,[47,45971...| 2|\n",
"|0:idszkjl6:AubMGt...|[/job-data-scient...|(262144,[16421],[...| 0|\n",
"|0:if1chb7s:s9dYv9...|[/careers.html, /...|(262144,[47,45811...| 2|\n",
"+--------------------+--------------------+--------------------+-------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"result.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cluster sizes"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x110c78d90>"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXgAAAEKCAYAAAAYd05sAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAGR5JREFUeJzt3X+QlVed5/H3h7CMgBiSzaTlZyCTpgwadzUKmdWYm8xI\nUVMRKGuHQFUYNj9myqGSOFUxK8Ra6T8yBjVrzDiLVbsKEte0hY7DgIMY8uO6GXXomQxm0A4TOk4H\nmoQ2s8YQNqINfPePe4Cba//i9r19L6c/r6ounuc8v8596uFzzz33uc9RRGBmZvkZ1+gKmJlZfTjg\nzcwy5YA3M8uUA97MLFMOeDOzTDngzcwyNWjAS9okqVfSvrKyBZI6JO2V9A+S3lu2bJ2kA5L2S1pU\nVn61pH1p2UP1eSlmZlZuqBb8ZmBxRdlngP8WEe8CPpnmkTQfuAmYn7bZKElpmy8Ct0VEK9AqqXKf\nZmZWY4MGfEQ8BbxSUfwScGGangocTtNLgfaI6IuIbqALWChpGjAlIjrSeg8Dy2pQdzMzG8T4KrZZ\nC/ydpAcovUH8biqfDvx92Xo9wAygL02fdjiVm5lZHVUT8F8G7oqIv5b0h8Am4IO1qIwkPzfBzKwK\nEaHKsmruolkQEX+dpr8JLEjTh4FZZevNpNRyP5ymy8sPM4CIaOjf+vXrG16HZvnzufC58Lk4P87F\nQKoJ+C5J16XpG4Dn0vR2YIWkCZLmAq1AR0QcAY5KWpi+dF0FbKviuGZmdg4G7aKR1A5cB1wi6RCl\nu2b+BPgfkn4L+GWaJyI6JW0FOoETwJo4+9ayBvgKMBHYGRG76vBazMyszKABHxErB1i0cID1PwV8\nqp/yp4Grzrl2DVAoFBpdhabhc3GWz8VZPhdnNfu50GD9N6NNUjRTfczMzgeSiH6+ZK3mLhozs5o4\n+1tIG65zaQQ74M2sofypffjO9Q3RDxszM8uUA97MLFMOeDOzTDngzcwy5YA3M8uUA97Mmoakuv81\nqzlz5vDEE0/UdJ++TdLMmkw9b5ts3oBPP1aq6T7dgrfsjUar8HxvPdpvOnToEB/+8Ie59NJLueSS\nS7jzzjuJCO677z7mzJlDS0sLq1ev5ujRowAUi0VmzZr1hn2Ut8rb2tpYvnw5q1ev5i1veQvveMc7\nePrppwFYtWoVBw8e5EMf+hBTpkzhgQceqMlrcMDbGBFN8Gfni5MnT3LjjTcyd+5cXnjhBV588UVW\nrFjB5s2b2bJlC8VikZ/+9KccO3aMO+64Y8D9VL6p79ixg5UrV/Lqq6+yZMmSM9t+9atfZfbs2Xz7\n29/mtdde42Mf+1hNXocD3sysQkdHBy+99BKf/exnmThxIhMmTOB973sfX/va17j77ruZM2cOkydP\n5v777+frX/86p06dGtZ+r732WhYvXowkbr75Zp555pm6vg4HvJlZhUOHDnHZZZcxbtwbI/Kll17i\nsssuOzM/e/ZsTpw4QW9v77D229LScmZ60qRJHD9+fNhvDtVwwJuZVZg1axYHDx7k5MmTbyifPn06\n3d3dZ+YPHjzI+PHjaWlpYfLkybz++utnlp08eZKXX3552Mesx3c0DngzswoLFy5k2rRprF27ltdf\nf53jx4/z/e9/n5UrV/Lggw/S3d3NsWPHuPfee1mxYgXjxo1j3rx5HD9+nJ07d9LX18d9993Hr371\nq2Efs6Wlheeff76mr2PQgJe0SVKvpH0V5XdKelbSjyV9uqx8naQDkvZLWlRWfrWkfWnZQzV9BWaW\nGdXxb3jGjRvHjh076OrqYvbs2cyaNYtvfOMb3HrrraxatYoPfOADXH755UyaNIkvfOELAFx44YVs\n3LiR22+/nZkzZ/LmN7/5DXfV9HcnVfn8unXruO+++7jooov43Oc+N+y6DmbQAT8kXQscAx6OiKtS\n2fXAvcAfRESfpN+OiJclzQceAd4LzAAeA1ojIiR1AHdERIekncBf9Ddsnwf8sHoo/Sdqhuuq9vc5\nn+/qce93zgY6XwMN+DFoCz4ingJeqSj+U+D+iOhL65zuZFoKtEdEX0R0A13AQknTgCkR0ZHWexhY\nNvyXZGZm1aimD74V+ICkv5dUlPSeVD4d6Clbr4dSS76y/HAqNzOzOqrmUQXjgYsi4hpJ7wW2ApfX\nqkJtbW1npguFQtMPamtmNtqKxSLFYnHI9YYcdFvSHGBHWR/8d4ANEfG9NN8FXAPcDhARG1L5LmA9\n8ALwZERcmcpXAtdFxEf6OZb74K3m3AffvNwHf25q2gc/gG3ADWmn84AJEfFvwHZghaQJkuZS6srp\niIgjwFFJC1X6n7Yq7cPMzOpo0C4aSe3AdcC/l3QI+CSwCdiUbp38NfBHABHRKWkr0AmcANaUNcfX\nAF8BJgI7+7uDxszMamvILprR5C4aqwd30TQvP2Hz3J1LF42fB29mDeM3vPryowrMzDLlgDczy5QD\n3swsUw54M7NMOeDNzDLlgDczy5QD3swsUw54M7NMOeDNzDLlgDczy5QD3swsUw54M7NMOeDNzDLl\ngDczy5QD3swsU4MGvKRNknrT6E2Vy+6WdErSxWVl6yQdkLRf0qKy8qsl7UvLHqrtSzAzs/4M1YLf\nDCyuLJQ0C/ggpQG1T5fNB24C5qdtNurscC1fBG6LiFagVdJv7NPMzGpr0ICPiKeAV/pZ9Dngv1aU\nLQXaI6IvIrqBLmChpGnAlIjoSOs9DCwbUa3NzGxI59wHL2kp0BMR/1yxaDrQUzbfA8zop/xwKjcz\nszo6pzFZJU0C7qXUPXOmuJYVamtrOzNdKBQoFAq13L2Z2XmvWCxSLBaHXE9DDXoraQ6wIyKuknQV\n8Bjwelo8k1KLfCFwC0BEbEjb7QLWU+qnfzIirkzlK4HrIuIj/RwrPAiv1Vrpq6BmuK7kQaatLiQR\nEb/R2D6nLpqI2BcRLRExNyLmUup6eXdE9ALbgRWSJkiaC7QCHRFxBDgqaWH60nUVsG3Er8jMzAY1\n1G2S7cAPgHmSDkm6pWKVM82RiOgEtgKdwHeANWXN8TXAl4ADQFdE7KpR/c3MbABDdtGMJnfRWD24\ni8ZyV5MuGjMzO3844M3MMuWANzPLlAPezCxTDngzs0w54M3MMuWANzPLlAPezCxTDngzs0w54M3M\nMuWANzPLlAPezCxTDngzs0w54M3MMuWANzPLlAPezCxTQ43otElSr6R9ZWWflfSspGckfUvShWXL\n1kk6IGm/pEVl5VdL2peWPVSfl2JmZuWGasFvBhZXlD0KvD0i/gPwHLAOQNJ84CZgftpmYxqDFeCL\nwG0R0Qq0Sqrcp5mZ1digAR8RTwGvVJTtjohTaXYPMDNNLwXaI6IvIrqBLmChpGnAlIjoSOs9DCyr\nUf3NzGwAI+2DvxXYmaanAz1ly3qAGf2UH07lZmZWR+Or3VDSJ4BfR8QjNawPbW1tZ6YLhQKFQqGW\nuzczO+8Vi0WKxeKQ62moUd4lzQF2RMRVZWX/Bfhj4Pci4ngqWwsQERvS/C5gPfAC8GREXJnKVwLX\nRcRH+jlWeNR5q7XSV0HNcF0JX99WD5KICFWWn3MXTfqC9B5g6elwT7YDKyRNkDQXaAU6IuIIcFTS\nwvSl6ypgW1WvwszMhm3QLhpJ7cB1wCWSDlFqka8DJgC7000yP4yINRHRKWkr0AmcANaUNcfXAF8B\nJgI7I2JXPV6MmZmdNWQXzWhyF43Vg7toLHc166IxM7PzgwPezCxTDngzs0w54M3MMuWANzPLlAPe\nzCxTDngzs0w54M3MMuWANzPLlAPezCxTDngzs0w54M3MMuWANzPLlAPezCxTDngzs0wNGvCSNknq\nlbSvrOxiSbslPSfpUUlTy5atk3RA0n5Ji8rKr5a0Ly17qD4vxczMyg3Vgt8MLK4oWwvsjoh5wONp\nHknzgZuA+WmbjWmIPoAvArdFRCvQmob9MzOzOho04CPiKeCViuIlwJY0vQVYlqaXAu0R0RcR3UAX\nsFDSNGBKRHSk9R4u28bMzOqkmj74lojoTdO9QEuang70lK3XA8zop/xwKjczszoa0ZesaQBVDzJp\nZtaExlexTa+kt0bEkdT98rNUfhiYVbbeTEot98Npurz88EA7b2trOzNdKBQoFApVVNHMLF/FYpFi\nsTjkehpqlHdJc4AdEXFVmv8M8H8j4tOS1gJTI2Jt+pL1EWABpS6Yx4ArIiIk7QHuAjqAvwX+IiJ2\n9XOs8KjzVmul7/qb4boSvr6tHiQREaosH7QFL6kduA64RNIh4JPABmCrpNuAbmA5QER0StoKdAIn\ngDVlab0G+AowEdjZX7ibmVltDdmCH01uwVs9uAVvuRuoBe9fspqZZcoBb2aWKQe8mVmmHPBmZply\nwJuZZcoBb2aWKQe8mVmmHPBmZplywJuZZaqah401tbNjjDSWf7FoZo2WXcCXNDpcm+NNxszGNnfR\nmJllygFvZpYpB7yZWaYc8GZmmXLAm5llquqAl7RO0k8k7ZP0iKTfknSxpN2SnpP0qKSpFesfkLRf\n0qLaVN/MzAZSVcCncVr/GHh3Gqv1AmAFsBbYHRHzgMfTPGm81puA+cBiYKMkf3owM6ujakP2KNAH\nTJI0HpgEvAgsAbakdbYAy9L0UqA9IvoiohvoojQ4t5mZ1UlVAR8RPwf+O3CQUrD/IiJ2Ay0R0ZtW\n6wVa0vR0oKdsFz3AjKpqbGZmw1LVL1kl/Q7wZ8Ac4FXgG5JuLl8nIkLSYD8p7XdZW1vbmelCoUCh\nUKimimZm2SoWixSLxSHXUzXPTJF0E/DBiLg9za8CrgFuAK6PiCOSpgFPRsTbJK0FiIgNaf1dwPqI\n2FOx3xjpM1xKz6Jp/KMK/Cya5tEc1wT4urB6kURE/MYzUqrtg98PXCNpokr/e34f6AR2AKvTOquB\nbWl6O7BC0gRJc4FWoKPKY5uZ2TBU1UUTEc9Iehj4R+AU8E/A/wSmAFsl3QZ0A8vT+p2StlJ6EzgB\nrBlxU93MzAZVVRdNvbiLxuqhOa4J8HVh9VLrLhozM2tyDngzs0w54M3MMuWANzPLlAPezCxTDngz\ns0w54M3MMuWANzPLlAPezCxTDngzs0w54M3MMuWANzPLlAPezCxTDngzs0w54M3MMuWANzPLVNUB\nL2mqpG9KelZSp6SFki6WtFvSc5IelTS1bP11kg5I2i9pUW2qb2ZmAxlJC/4hYGdEXAm8k9I4rWuB\n3RExD3g8zSNpPnATMB9YDGyU5E8PZmZ1VFXISroQuDYiNgFExImIeBVYAmxJq20BlqXppUB7RPRF\nRDfQBSwYScXNzGxw1bai5wIvS9os6Z8k/S9Jk4GWiOhN6/QCLWl6OtBTtn0PMKPKY5uZ2TCMH8F2\n7wbuiIh/kPR5UnfMaRERkgYbYbjfZW1tbWemC4UChUKhyiqameWpWCxSLBaHXE/VjPIu6a3ADyNi\nbpp/P7AOuBy4PiKOSJoGPBkRb5O0FiAiNqT1dwHrI2JPxX5jpKPOS2KA945RJEb6Oqx2muOaAF8X\nVi+SiAhVllfVRRMRR4BDkualot8HfgLsAFanstXAtjS9HVghaYKkuUAr0FHNsc3MbHiq7aIBuBP4\nmqQJwPPALcAFwFZJtwHdwHKAiOiUtBXoBE4Aa0bcVDczs0FV1UVTL+6isXpojmsCfF1YvdS0i8bM\nzJqfA97MLFMOeDOzTDngzcwy5YA3M8uUA97MLFMOeDOzTDngzcwy5YA3M8uUA97MLFMOeDOzTDng\nzcwy5YA3M8uUA97MLFMOeDOzTDngzcwyNaKAl3SBpL2SdqT5iyXtlvScpEclTS1bd52kA5L2S1o0\n0oqbmdngRtqC/yilYfhOD1OzFtgdEfOAx9M8kuYDNwHzgcXARkn+9GBmVkdVh6ykmcAfAF8CTg8V\ntQTYkqa3AMvS9FKgPSL6IqIb6AIWVHtsMzMb2kha0Q8C9wCnyspaIqI3TfcCLWl6OtBTtl4PMGME\nxzYzsyGMr2YjSTcCP4uIvZIK/a0TESFpsBGG+13W1tZ2ZrpQKFAo9Lt7M7Mxq1gsUiwWh1xP1Yzy\nLulTwCrgBPAm4C3At4D3AoWIOCJpGvBkRLxN0lqAiNiQtt8FrI+IPRX7jZGOOi+JAd47RpEY6euw\n2mmOawJ8XVi9SCIiVFleVRdNRNwbEbMiYi6wAngiIlYB24HVabXVwLY0vR1YIWmCpLlAK9BRzbHN\nzGx4quqi6cfpZskGYKuk24BuYDlARHRK2krpjpsTwJoRN9XNzGxQVXXR1Iu7aKwemuOaAF8XVi81\n7aIxM7Pm54A3M8uUA97MLFMOeDOzTDngzcwy5YA3M8uUA97MLFMOeDOzTDngzcwy5YA3M8uUA97M\nLFMOeDOzTNXqaZJmdh4oPXitOfjBa/XngDcbc5ohWJvnjSZn7qIxM8uUA97MLFNVBbykWZKelPQT\nST+WdFcqv1jSbknPSXpU0tSybdZJOiBpv6RFtXoBZmbWv2oH3X4r8NaI+JGkNwNPA8uAW4B/i4jP\nSPo4cFFErJU0H3iE0qDcM4DHgHkRcapivx7RyWquOa4JaIbrwuciT7UedPtIRPwoTR8DnqUU3EuA\nLWm1LZRCH2Ap0B4RfRHRDXQBC6o5tpmZDc+I++AlzQHeBewBWiKiNy3qBVrS9HSgp2yzHkpvCGZm\nVicjuk0ydc/8FfDRiHit/B7biAhJg30G63dZW1vbmelCoUChUBhJFc3MslMsFikWi0OuV1UfPICk\nfwd8G/hORHw+le0HChFxRNI04MmIeJuktQARsSGttwtYHxF7KvbpPvha1qJJftTS6HPRHNcENMN1\n4XORp5r2wat0lXwZ6Dwd7sl2YHWaXg1sKytfIWmCpLlAK9BRzbHtXEWD/8ysUaq9i+b9wP8B/pmz\n/4vXUQrtrcBsoBtYHhG/SNvcC9wKnKDUpfPdfvbrFnwta+FzUapBU5wH8Lko1/hzkZOBWvBVd9HU\ngwO+xrXwuSjVoCnOA/hclGv8uchJTbtozMys+Tngzcwy5YA3M8uUA97MLFMOeDOzTDngzcwy5YA3\nM8uUA97MLFMOeDOzTDngzcwy5YA3M8uUA97MLFMOeDOzTDngzcwy5YA3M8vUqAa8pMWS9ks6IOnj\no3lsM7OxZtQCXtIFwF8Ci4H5wEpJV47W8Yev2OgKNJFioyvQRIqNrkATKTa6Ak1jOANfN9JotuAX\nAF0R0R0RfcDXgaWjePxhKja6Ak2k2OgKNJFioyvQRIqNrkBNSBrx3/XXX1+T/dTLaAb8DOBQ2XxP\nKjMza5CRDiq/vgb7qJ/RDHgPwGhmNopGbdBtSdcAbRGxOM2vA05FxKfL1vGbgJlZFfobdHs0A348\n8C/A7wEvAh3Ayoh4dlQqYGY2xowfrQNFxAlJdwDfBS4AvuxwNzOrn1FrwZuZ2egatRZ8M0r34S/l\n7N08PcB2f7IY29J1MR3YExHHysoXR8SuxtVs9El6P/DziOiUVADeA+yNiMcbWzMbjjH7qIL0S9r2\nNLsn/Y0D2tMXwAZIuqXRdRhNku4CtgF3Aj+RtKxs8f2NqVVjSLofeADYIukzwAZgIrBe0j0NrVwT\nkPRwo+swlDHbRSPpADA//eiqvHwC0BkRVzSmZs1F0qGImNXoeowWST8GromIY5LmAN8E/ndEfF7S\n3oh4V0MrOIokdQLvBCYAvcDMiHhV0kRKn27e2dAKjiJJOyjd6l1+p8oNwBNARMSShlRsCGO5i+Yk\npa6Z7ory6WnZmCFp3yCLLx21ijQHne6WiYju1C3xV5Iu443/uceCX0fECeCEpOcj4lWAiPilpFMN\nrttomwl0Al8CTlG6Ft5D6RNO0xrLAf9nwGOSujj7C9tZQCtwR8Nq1RiXUnpG0Cv9LPvBKNel0X4m\n6T9GxI8AUkv+RuDLlFqzY8mvJE2KiNeBd58ulDSVUsiNJe8BPgp8ArgnIvZKOh4R32twvQY1Zrto\n4MwD0BZQaskHcBj4x9RqGTMkbQI2R8RT/Sxrj4iVDahWQ0iaBfRFxJGKcgHvi4i/a0zNRp+kN0XE\n8X7KLwGmRcRgn/yyJGkm8CDwM2BJs3dfjumANzOrRvpU958i4t5G12UwDngzs0yN2dskzcxy54A3\nM8uUA97MLFMOeBuTJLVJuruK7S6U9Kf1qJNZrTngbayq9u6Ci4A157KBkiqPZ1Y1B7yNCZL+SNIz\nkn5U+QwRSUVJV6fpSyT9a5p+u6Q9kvam7a6g9DyW30lln07r3SOpI+2/LZXNkfQvkrYA+yj9EtJs\nVI3lX7LaGCHp7ZR+gfi7EfFzSRcBd3G2FT/Q4JgfAR6KiEfSgDXjgY8Dbz/9TBpJi4ArImKBpHHA\n30i6ltKvo68AVkVERz1fn9lAHPA2FtwAbI2InwNExCvD7DH5AfCJ9OvFb0VEVz9dLYuARZL2pvnJ\nlIL9EPCCw90ayV00NhZUPgWw0gnO/l9405mNItqBDwG/BHZKun6A7e+PiHelv3kRsTmV/78R1tts\nRBzwNhY8AfyhpIsBTv/L2dDvpvQwKYD/fHojSZdHxL9GxBeAvwGuAo4CU8r2/V3gVkmT0zYzJP12\nvV6I2blwF41lL41G9OfA9ySdBPZSCvXT/e4PAFsl/Qnwt2XlyyXdDPQBLwF/HhG/kPT99IjlnRHx\n8TQC1A9T781rwM0M3K9vNmr8LBozs0y5i8bMLFMOeDOzTDngzcwy5YA3M8uUA97MLFMOeDOzTDng\nzcwy9f8BI2Fo7KMyIYEAAAAASUVORK5CYII=\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x110c782d0>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"(result.groupBy('cluster')\n",
" .count()\n",
" .toPandas()\n",
" .plot(kind='bar', x='cluster', y='count'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## View some clusters"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def cluster_sample(n):\n",
" return [x.asDict() for x in result.filter('cluster = %d' % n).select('paths').take(10)]"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[{'paths': [u'/bigdatasurveythankyou.html']},\n",
" {'paths': [u'/bigdatasurveythankyou.html']},\n",
" {'paths': [u'/bigdatasurveythankyou.html']},\n",
" {'paths': [u'/bigdatasurveythankyou.html']},\n",
" {'paths': [u'/bigdatasurveythankyou.html']},\n",
" {'paths': [u'/bigdatasurveythankyou.html']},\n",
" {'paths': [u'/bigdatasurveythankyou.html']},\n",
" {'paths': [u'/bigdatasurveythankyou.html']},\n",
" {'paths': [u'/bigdatasurveythankyou.html']},\n",
" {'paths': [u'/bigdatasurveythankyou.html']}]"
]
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cluster_sample(1)"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[{'paths': [u'/',\n",
" u'/careers.html',\n",
" u'/job-big-data-devops-engineer.html',\n",
" u'/job-big-data-software-engineer.html',\n",
" u'/careers.html']},\n",
" {'paths': [u'/', u'/team.html', u'/']},\n",
" {'paths': [u'/']},\n",
" {'paths': [u'/',\n",
" u'/careers.html',\n",
" u'/',\n",
" u'/careers.html',\n",
" u'/job-data-scientist.html',\n",
" u'/team.html']},\n",
" {'paths': [u'/']},\n",
" {'paths': [u'/']},\n",
" {'paths': [u'/']},\n",
" {'paths': [u'/',\n",
" u'/careers.html',\n",
" u'/',\n",
" u'/careers.html',\n",
" u'/job-big-data-software-engineer.html']},\n",
" {'paths': [u'/']},\n",
" {'paths': [u'/']}]"
]
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cluster_sample(2)"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[{'paths': [u'/',\n",
" u'/why.html',\n",
" u'/accelerator.html',\n",
" u'/customers.html',\n",
" u'/training.html',\n",
" u'/events.html',\n",
" u'/team.html',\n",
" u'/index.html',\n",
" u'/accelerator.html',\n",
" u'/index.html',\n",
" u'/',\n",
" u'/careers.html',\n",
" u'/']},\n",
" {'paths': [u'/',\n",
" u'/team.html',\n",
" u'/careers.html',\n",
" u'/index.html',\n",
" u'/why.html',\n",
" u'/accelerator.html',\n",
" u'/team.html',\n",
" u'/events.html']},\n",
" {'paths': [u'/',\n",
" u'/team.html',\n",
" u'/why.html',\n",
" u'/accelerator.html',\n",
" u'/customers.html',\n",
" u'/training.html',\n",
" u'/customers.html',\n",
" u'/index.html',\n",
" u'/why.html']},\n",
" {'paths': [u'/',\n",
" u'/why.html',\n",
" u'/accelerator.html',\n",
" u'/team.html',\n",
" u'/events.html',\n",
" u'/training.html',\n",
" u'/customers.html']},\n",
" {'paths': [u'/',\n",
" u'/why.html',\n",
" u'/accelerator.html',\n",
" u'/customers.html',\n",
" u'/why.html',\n",
" u'/index.html']},\n",
" {'paths': [u'/',\n",
" u'/accelerator.html',\n",
" u'/why.html',\n",
" u'/team.html',\n",
" u'/careers.html']},\n",
" {'paths': [u'/',\n",
" u'/team.html',\n",
" u'/why.html',\n",
" u'/team.html',\n",
" u'/customers.html']},\n",
" {'paths': [u'/',\n",
" u'/team.html',\n",
" u'/events.html',\n",
" u'/training.html',\n",
" u'/training.html',\n",
" u'/customers.html',\n",
" u'/accelerator.html',\n",
" u'/customers.html',\n",
" u'/accelerator.html',\n",
" u'/why.html',\n",
" u'/index.html']},\n",
" {'paths': [u'/',\n",
" u'/',\n",
" u'/',\n",
" u'/why.html',\n",
" u'/accelerator.html',\n",
" u'/customers.html',\n",
" u'/training.html',\n",
" u'/team.html']},\n",
" {'paths': [u'/',\n",
" u'/why.html',\n",
" u'/team.html',\n",
" u'/customers.html',\n",
" u'/training.html',\n",
" u'/training.html',\n",
" u'/events.html',\n",
" u'/team.html',\n",
" u'/careers.html',\n",
" u'/job-data-scientist.html']}]"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cluster_sample(3)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment