Last active
          February 23, 2021 04:45 
        
      - 
      
- 
        Save gbraccialli/9f1a8be687050a4278758e7944ce0225 to your computer and use it in GitHub Desktop. 
    clickstream
  
        
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "spark version = 3.0.1\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "import pyspark\n", | |
| "from pyspark.sql import SparkSession\n", | |
| "from pyspark.sql import functions as F\n", | |
| "from pyspark.sql.types import *\n", | |
| "from pyspark.sql.functions import udf\n", | |
| "from IPython.display import HTML\n", | |
| "import seaborn as sns\n", | |
| "\n", | |
| "print(f\"spark version = {pyspark.__version__}\")\n", | |
| "spark = (SparkSession\n", | |
| " .builder\n", | |
| " .master(\"local[*]\")\n", | |
| " .appName(\"jupyter\")\n", | |
| " .config(\"spark.driver.memory\", \"10g\")\n", | |
| " .getOrCreate()\n", | |
| " )" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "events = [\n", | |
| " {\"visitor\": 1, \"timestamp\": '2020-01-01 00:00:00', \"page\": 'A'},\n", | |
| " {\"visitor\": 1, \"timestamp\": '2020-01-01 01:00:00', \"page\": 'B'},\n", | |
| " {\"visitor\": 1, \"timestamp\": '2020-01-01 02:00:00', \"page\": 'C'},\n", | |
| " \n", | |
| " {\"visitor\": 2, \"timestamp\": '2020-01-01 00:00:00', \"page\": 'Z'},\n", | |
| " {\"visitor\": 2, \"timestamp\": '2020-01-01 01:00:00', \"page\": 'B'},\n", | |
| " {\"visitor\": 2, \"timestamp\": '2020-01-01 02:00:00', \"page\": 'F'},\n", | |
| " \n", | |
| " {\"visitor\": 3, \"timestamp\": '2020-01-01 00:00:00', \"page\": 'K'},\n", | |
| " {\"visitor\": 3, \"timestamp\": '2020-01-01 01:00:00', \"page\": 'B'},\n", | |
| " {\"visitor\": 3, \"timestamp\": '2020-01-01 02:00:00', \"page\": 'L'},\n", | |
| " \n", | |
| " {\"visitor\": 4, \"timestamp\": '2020-01-01 00:00:00', \"page\": 'A'},\n", | |
| " {\"visitor\": 4, \"timestamp\": '2020-01-01 01:00:00', \"page\": 'B'},\n", | |
| " \n", | |
| " {\"visitor\": 5, \"timestamp\": '2020-01-01 00:00:00', \"page\": 'A'},\n", | |
| " {\"visitor\": 5, \"timestamp\": '2020-01-01 01:00:00', \"page\": 'B'},\n", | |
| " \n", | |
| "\n", | |
| " {\"visitor\": 6, \"timestamp\": '2020-01-01 01:00:00', \"page\": 'C'},\n", | |
| " {\"visitor\": 6, \"timestamp\": '2020-01-01 00:00:00', \"page\": 'B'},\n", | |
| "\n", | |
| " {\"visitor\": 7, \"timestamp\": '2020-01-01 01:00:00', \"page\": 'C'},\n", | |
| " {\"visitor\": 7, \"timestamp\": '2020-01-01 00:00:00', \"page\": 'B'},\n", | |
| " \n", | |
| " \n", | |
| " \n", | |
| "]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "/Users/guilherme_braccialli/anaconda3/envs/pyspark/lib/python3.7/site-packages/pyspark/sql/session.py:381: UserWarning: inferring schema from dict is deprecated,please use pyspark.sql.Row instead\n", | |
| " warnings.warn(\"inferring schema from dict is deprecated,\"\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "df_events = spark.createDataFrame(events)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>page</th>\n", | |
| " <th>timestamp</th>\n", | |
| " <th>visitor</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>A</td>\n", | |
| " <td>2020-01-01 00:00:00</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>B</td>\n", | |
| " <td>2020-01-01 01:00:00</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>C</td>\n", | |
| " <td>2020-01-01 02:00:00</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>Z</td>\n", | |
| " <td>2020-01-01 00:00:00</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>B</td>\n", | |
| " <td>2020-01-01 01:00:00</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>F</td>\n", | |
| " <td>2020-01-01 02:00:00</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td>K</td>\n", | |
| " <td>2020-01-01 00:00:00</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7</th>\n", | |
| " <td>B</td>\n", | |
| " <td>2020-01-01 01:00:00</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>8</th>\n", | |
| " <td>L</td>\n", | |
| " <td>2020-01-01 02:00:00</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>9</th>\n", | |
| " <td>A</td>\n", | |
| " <td>2020-01-01 00:00:00</td>\n", | |
| " <td>4</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>10</th>\n", | |
| " <td>B</td>\n", | |
| " <td>2020-01-01 01:00:00</td>\n", | |
| " <td>4</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>11</th>\n", | |
| " <td>A</td>\n", | |
| " <td>2020-01-01 00:00:00</td>\n", | |
| " <td>5</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>12</th>\n", | |
| " <td>B</td>\n", | |
| " <td>2020-01-01 01:00:00</td>\n", | |
| " <td>5</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>13</th>\n", | |
| " <td>C</td>\n", | |
| " <td>2020-01-01 01:00:00</td>\n", | |
| " <td>6</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>14</th>\n", | |
| " <td>B</td>\n", | |
| " <td>2020-01-01 00:00:00</td>\n", | |
| " <td>6</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>15</th>\n", | |
| " <td>C</td>\n", | |
| " <td>2020-01-01 01:00:00</td>\n", | |
| " <td>7</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>16</th>\n", | |
| " <td>B</td>\n", | |
| " <td>2020-01-01 00:00:00</td>\n", | |
| " <td>7</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " page timestamp visitor\n", | |
| "0 A 2020-01-01 00:00:00 1\n", | |
| "1 B 2020-01-01 01:00:00 1\n", | |
| "2 C 2020-01-01 02:00:00 1\n", | |
| "3 Z 2020-01-01 00:00:00 2\n", | |
| "4 B 2020-01-01 01:00:00 2\n", | |
| "5 F 2020-01-01 02:00:00 2\n", | |
| "6 K 2020-01-01 00:00:00 3\n", | |
| "7 B 2020-01-01 01:00:00 3\n", | |
| "8 L 2020-01-01 02:00:00 3\n", | |
| "9 A 2020-01-01 00:00:00 4\n", | |
| "10 B 2020-01-01 01:00:00 4\n", | |
| "11 A 2020-01-01 00:00:00 5\n", | |
| "12 B 2020-01-01 01:00:00 5\n", | |
| "13 C 2020-01-01 01:00:00 6\n", | |
| "14 B 2020-01-01 00:00:00 6\n", | |
| "15 C 2020-01-01 01:00:00 7\n", | |
| "16 B 2020-01-01 00:00:00 7" | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df_events.toPandas()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df_sequence_per_user = (\n", | |
| " df_events\n", | |
| " .sort(\"timestamp\")\n", | |
| " .groupby(\"visitor\")\n", | |
| " .agg(\n", | |
| " F.collect_list(F.col(\"page\")).alias(\"sequence\")\n", | |
| " )\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>visitor</th>\n", | |
| " <th>sequence</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>7</td>\n", | |
| " <td>[B, C]</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>6</td>\n", | |
| " <td>[B, C]</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>5</td>\n", | |
| " <td>[A, B]</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>1</td>\n", | |
| " <td>[A, B, C]</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>3</td>\n", | |
| " <td>[K, B, L]</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>2</td>\n", | |
| " <td>[Z, B, F]</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td>4</td>\n", | |
| " <td>[A, B]</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " visitor sequence\n", | |
| "0 7 [B, C]\n", | |
| "1 6 [B, C]\n", | |
| "2 5 [A, B]\n", | |
| "3 1 [A, B, C]\n", | |
| "4 3 [K, B, L]\n", | |
| "5 2 [Z, B, F]\n", | |
| "6 4 [A, B]" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df_sequence_per_user.toPandas()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df_count_per_sequence = (\n", | |
| " df_sequence_per_user\n", | |
| " .groupby(\"sequence\")\n", | |
| " .count()\n", | |
| " .sort(F.desc(\"count\"))\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>sequence</th>\n", | |
| " <th>count</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>[B, C]</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>[A, B]</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>[K, B, L]</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>[A, B, C]</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>[Z, B, F]</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " sequence count\n", | |
| "0 [B, C] 2\n", | |
| "1 [A, B] 2\n", | |
| "2 [K, B, L] 1\n", | |
| "3 [A, B, C] 1\n", | |
| "4 [Z, B, F] 1" | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df_count_per_sequence.toPandas()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def print_json_sankey(df, column):\n", | |
| " @udf(returnType=ArrayType(StringType()))\n", | |
| " def array_index(arr):\n", | |
| " return [f\"{count}|{item}\" for count, item in enumerate(arr, start=1)]\n", | |
| " #add index, so same step can be showed as multiple times in the sequence with same color\n", | |
| " df_sankey = (\n", | |
| " df\n", | |
| " .withColumn(\"path_index\", array_index(F.col(column)))\n", | |
| " .cache()\n", | |
| " )\n", | |
| " #get color pallet from seaborn\n", | |
| " pal = sns.color_palette(n_colors=500)\n", | |
| " colors = pal.as_hex()\n", | |
| " \n", | |
| " #find unique nodes colors\n", | |
| " nodes_colors_aux = df.select(F.explode(column)).distinct().toPandas().to_dict('records')\n", | |
| " nodes_colors = {}\n", | |
| " for count, item in enumerate(sorted([r['col'] for r in nodes_colors_aux])):\n", | |
| " nodes_colors[item] = colors[count % len(colors)]\n", | |
| " nodes_colors['end'] = 'grey'\n", | |
| " \n", | |
| " #deal with different node ids, keeping same color\n", | |
| " nodes_aux = df_sankey.select(F.explode(\"path_index\")).distinct().toPandas().to_dict('records')\n", | |
| " nodes_aux2 = {}\n", | |
| " for count, item in enumerate(sorted([r['col'] for r in nodes_aux])):\n", | |
| " nodes_aux2[item] = count\n", | |
| " nodes_aux2['end|end'] = len(nodes_aux2)\n", | |
| " nodes = []\n", | |
| " for node, id in sorted(nodes_aux2.items()):\n", | |
| " nodes.append({\"name\" : node.split(\"|\")[1], \"id\" : node, \"color\" : nodes_colors[node.split(\"|\")[1]]})\n", | |
| " \n", | |
| " #for each step in the sequence, print a link for s3 sankey\n", | |
| " links_aux = df_sankey.select(\"path_index\", \"count\").toPandas().to_dict('records')\n", | |
| " links = []\n", | |
| " for l in links_aux:\n", | |
| " journey = l['path_index'].append('end|end')\n", | |
| " journey_len = len(l['path_index'])\n", | |
| " for i in range(journey_len -1):\n", | |
| " links.append({\"source\" : nodes_aux2[l['path_index'][i]], \"value\" : l['count'], \"target\" : nodes_aux2[l['path_index'][i+1]]})\n", | |
| " \n", | |
| " print({\"nodes\": nodes, \"links\": links})\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "{'nodes': [{'name': 'A', 'id': '1|A', 'color': '#1f77b4'}, {'name': 'B', 'id': '1|B', 'color': '#ff7f0e'}, {'name': 'K', 'id': '1|K', 'color': '#9467bd'}, {'name': 'Z', 'id': '1|Z', 'color': '#e377c2'}, {'name': 'B', 'id': '2|B', 'color': '#ff7f0e'}, {'name': 'C', 'id': '2|C', 'color': '#2ca02c'}, {'name': 'C', 'id': '3|C', 'color': '#2ca02c'}, {'name': 'F', 'id': '3|F', 'color': '#d62728'}, {'name': 'L', 'id': '3|L', 'color': '#8c564b'}, {'name': 'end', 'id': 'end|end', 'color': 'grey'}], 'links': [{'source': 1, 'value': 2, 'target': 5}, {'source': 5, 'value': 2, 'target': 9}, {'source': 0, 'value': 2, 'target': 4}, {'source': 4, 'value': 2, 'target': 9}, {'source': 2, 'value': 1, 'target': 4}, {'source': 4, 'value': 1, 'target': 8}, {'source': 8, 'value': 1, 'target': 9}, {'source': 0, 'value': 1, 'target': 4}, {'source': 4, 'value': 1, 'target': 6}, {'source': 6, 'value': 1, 'target': 9}, {'source': 3, 'value': 1, 'target': 4}, {'source': 4, 'value': 1, 'target': 7}, {'source': 7, 'value': 1, 'target': 9}]}\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print_json_sankey(df_count_per_sequence, \"sequence\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "#open d3-sankey.html and replace contents of data and colors as above" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 66, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "with open (\"d3-sankey.html\", \"r\") as html_file:\n", | |
| " html=html_file.read()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 69, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<iframe id=iframe1 width=1000 height=900 src='./d3-sankey.html'></iframe>" | |
| ], | |
| "text/plain": [ | |
| "<IPython.core.display.HTML object>" | |
| ] | |
| }, | |
| "execution_count": 69, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "HTML(\"\"\"<iframe id=iframe1 width=1000 height=900 src='./d3-sankey.html'></iframe>\"\"\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df_list_per_user = (\n", | |
| " df_events\n", | |
| " .groupby(\"visitor\")\n", | |
| " .agg(\n", | |
| " F.collect_set(F.col(\"page\")).alias(\"list\")\n", | |
| " )\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>visitor</th>\n", | |
| " <th>list</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>7</td>\n", | |
| " <td>[C, B]</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>6</td>\n", | |
| " <td>[C, B]</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>5</td>\n", | |
| " <td>[B, A]</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>1</td>\n", | |
| " <td>[C, B, A]</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>3</td>\n", | |
| " <td>[K, B, L]</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>2</td>\n", | |
| " <td>[F, Z, B]</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td>4</td>\n", | |
| " <td>[B, A]</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " visitor list\n", | |
| "0 7 [C, B]\n", | |
| "1 6 [C, B]\n", | |
| "2 5 [B, A]\n", | |
| "3 1 [C, B, A]\n", | |
| "4 3 [K, B, L]\n", | |
| "5 2 [F, Z, B]\n", | |
| "6 4 [B, A]" | |
| ] | |
| }, | |
| "execution_count": 13, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df_list_per_user.toPandas()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "@udf(returnType=ArrayType(ArrayType(StringType())))\n", | |
| "\n", | |
| "def arrray_combination(list_all):\n", | |
| " list_out = []\n", | |
| " for i in range(0, len(list_all)):\n", | |
| " for j in range(i+1, len(list_all)):\n", | |
| " list_out.append([list_all[i], list_all[j]])\n", | |
| " return list_out\n", | |
| "\n", | |
| "\n", | |
| "@udf(\"array<struct<_1: string, _2: string>>\")\n", | |
| "def arrray_combination2(x):\n", | |
| " from itertools import combinations\n", | |
| " pairs = [pair for pair in combinations(x, 2)] \n", | |
| " return pairs" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df_combination = (\n", | |
| " df_list_per_user\n", | |
| " .withColumn(\"combination_pairs\", arrray_combination2(F.sort_array(F.col(\"list\"))))\n", | |
| " .select(\n", | |
| " F.col(\"visitor\"),\n", | |
| " F.explode(\"combination_pairs\").alias(\"combination_exploded\")\n", | |
| " )\n", | |
| " .select(\"visitor\", F.expr(\"combination_exploded._1\").alias(\"part_1\"), F.col(\"combination_exploded._2\").alias(\"part_2\"))\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>visitor</th>\n", | |
| " <th>part_1</th>\n", | |
| " <th>part_2</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>7</td>\n", | |
| " <td>B</td>\n", | |
| " <td>C</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>6</td>\n", | |
| " <td>B</td>\n", | |
| " <td>C</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>5</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>1</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>1</td>\n", | |
| " <td>A</td>\n", | |
| " <td>C</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>1</td>\n", | |
| " <td>B</td>\n", | |
| " <td>C</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td>3</td>\n", | |
| " <td>B</td>\n", | |
| " <td>K</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7</th>\n", | |
| " <td>3</td>\n", | |
| " <td>B</td>\n", | |
| " <td>L</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>8</th>\n", | |
| " <td>3</td>\n", | |
| " <td>K</td>\n", | |
| " <td>L</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>9</th>\n", | |
| " <td>2</td>\n", | |
| " <td>B</td>\n", | |
| " <td>F</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>10</th>\n", | |
| " <td>2</td>\n", | |
| " <td>B</td>\n", | |
| " <td>Z</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>11</th>\n", | |
| " <td>2</td>\n", | |
| " <td>F</td>\n", | |
| " <td>Z</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>12</th>\n", | |
| " <td>4</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " visitor part_1 part_2\n", | |
| "0 7 B C\n", | |
| "1 6 B C\n", | |
| "2 5 A B\n", | |
| "3 1 A B\n", | |
| "4 1 A C\n", | |
| "5 1 B C\n", | |
| "6 3 B K\n", | |
| "7 3 B L\n", | |
| "8 3 K L\n", | |
| "9 2 B F\n", | |
| "10 2 B Z\n", | |
| "11 2 F Z\n", | |
| "12 4 A B" | |
| ] | |
| }, | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df_combination.toPandas()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df_top_pairs = (\n", | |
| " df_combination\n", | |
| " .groupby(\"part_1\", \"part_2\")\n", | |
| " .count()\n", | |
| " .sort(F.desc(\"count\"))\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 18, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>part_1</th>\n", | |
| " <th>part_2</th>\n", | |
| " <th>count</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>B</td>\n", | |
| " <td>C</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>B</td>\n", | |
| " <td>Z</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>B</td>\n", | |
| " <td>L</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>B</td>\n", | |
| " <td>F</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>A</td>\n", | |
| " <td>C</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td>B</td>\n", | |
| " <td>K</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7</th>\n", | |
| " <td>K</td>\n", | |
| " <td>L</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>8</th>\n", | |
| " <td>F</td>\n", | |
| " <td>Z</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " part_1 part_2 count\n", | |
| "0 B C 3\n", | |
| "1 A B 3\n", | |
| "2 B Z 1\n", | |
| "3 B L 1\n", | |
| "4 B F 1\n", | |
| "5 A C 1\n", | |
| "6 B K 1\n", | |
| "7 K L 1\n", | |
| "8 F Z 1" | |
| ] | |
| }, | |
| "execution_count": 18, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df_top_pairs.toPandas()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 19, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df_pairs_chord_viz = (\n", | |
| " df_top_pairs\n", | |
| " .union(\n", | |
| " df_top_pairs.select(F.col(\"part_2\").alias(\"part_1\"), F.col(\"part_1\").alias(\"part_2\"), F.col(\"count\"))\n", | |
| " )\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 20, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "dict_pairs_pandas = df_pairs_chord_viz.toPandas().to_dict('records')\n", | |
| "dict_nodes = df_pairs_chord_viz.select(\"part_1\").distinct().toPandas().to_dict('records')\n", | |
| "pairs = [[record['part_1'], record['part_2'], record['count']] for record in dict_pairs_pandas]\n", | |
| "pal = sns.color_palette(n_colors=50)\n", | |
| "colors = pal.as_hex()\n", | |
| "node_colors = {}\n", | |
| "i = 0\n", | |
| "for record in dict_nodes:\n", | |
| " node_colors[record['part_1']] = colors[i]\n", | |
| " i+=1" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 21, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "var data = [['B', 'C', 3], ['A', 'B', 3], ['B', 'Z', 1], ['B', 'L', 1], ['B', 'F', 1], ['A', 'C', 1], ['B', 'K', 1], ['K', 'L', 1], ['F', 'Z', 1], ['C', 'B', 3], ['B', 'A', 3], ['Z', 'B', 1], ['L', 'B', 1], ['F', 'B', 1], ['C', 'A', 1], ['K', 'B', 1], ['L', 'K', 1], ['Z', 'F', 1]]\n", | |
| "var colors = {'K': '#1f77b4', 'F': '#ff7f0e', 'B': '#2ca02c', 'L': '#d62728', 'C': '#9467bd', 'Z': '#8c564b', 'A': '#e377c2'}\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print(f\"var data = {pairs}\")\n", | |
| "print(f\"var colors = {node_colors}\")\n", | |
| "#open d3-chord.html and replace contents of data and colors as above" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 57, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<iframe width=1000 height=900 src='./d3-chord.html'></iframe>" | |
| ], | |
| "text/plain": [ | |
| "<IPython.core.display.HTML object>" | |
| ] | |
| }, | |
| "execution_count": 57, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "HTML(\"<iframe width=1000 height=900 src='./d3-chord.html'></iframe>\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.7.9" | |
| }, | |
| "toc": { | |
| "base_numbering": 1, | |
| "nav_menu": {}, | |
| "number_sections": true, | |
| "sideBar": true, | |
| "skip_h1_title": false, | |
| "title_cell": "Table of Contents", | |
| "title_sidebar": "Contents", | |
| "toc_cell": false, | |
| "toc_position": {}, | |
| "toc_section_display": true, | |
| "toc_window_display": false | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 4 | |
| } | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | <!DOCTYPE html> | |
| <meta charset="utf-8"> | |
| <style> | |
| body { | |
| font: 10px sans-serif; | |
| } | |
| svg text{ | |
| fill:grey; | |
| font-size:11px; | |
| } | |
| svg .values text{ | |
| pointer-events:none; | |
| stroke-width: 0.5px; | |
| } | |
| .groups:hover{ | |
| cursor:pointer; | |
| font-weight:bold; | |
| } | |
| </style> | |
| <title> | |
| Chord | |
| </title> | |
| <body> | |
| <script src="https://d3js.org/d3.v4.min.js"></script> | |
| <script src="http://vizjs.org/viz.v1.1.0.min.js"></script> | |
| <script> | |
| var data = [] | |
| var colors = {} | |
| var sortOrder =[] | |
| var data = [['B', 'C', 3], ['A', 'B', 3], ['B', 'Z', 1], ['B', 'L', 1], ['B', 'F', 1], ['A', 'C', 1], ['B', 'K', 1], ['K', 'L', 1], ['F', 'Z', 1], ['C', 'B', 3], ['B', 'A', 3], ['Z', 'B', 1], ['L', 'B', 1], ['F', 'B', 1], ['C', 'A', 1], ['K', 'B', 1], ['L', 'K', 1], ['Z', 'F', 1]] | |
| var colors = {'K': '#1f77b4', 'F': '#ff7f0e', 'B': '#2ca02c', 'L': '#d62728', 'C': '#9467bd', 'Z': '#8c564b', 'A': '#e377c2'} | |
| function sort(a,b){ return d3.ascending(sortOrder.indexOf(a),sortOrder.indexOf(b)); } | |
| var ch = viz.ch().data(data) | |
| .padding(.01) | |
| .sort(sort) | |
| .innerRadius(430) | |
| .outerRadius(450) | |
| .duration(1000) | |
| .chordOpacity(0.3) | |
| .labelPadding(.03) | |
| .fill(function(d){ return colors[d];}); | |
| var width=1200, height=1100; | |
| var svg = d3.select("body").append("svg").attr("height",height).attr("width",width); | |
| svg.append("g").attr("transform", "translate(600,550)").call(ch); | |
| // adjust height of frame in bl.ocks.org | |
| d3.select(self.frameElement).style("height", height+"px").style("width", width+"px"); | |
| </script> | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | <!DOCTYPE html> | |
| <html> | |
| <head> | |
| <meta charset="utf-8"> | |
| <title>Sankey</title> | |
| <script src="http://d3js.org/d3.v3.min.js"></script> | |
| <script src="http://cdn.rawgit.com/newrelic-forks/d3-plugins-sankey/master/sankey.js"></script> | |
| <script src="http://cdn.rawgit.com/misoproject/d3.chart/master/d3.chart.min.js"></script> | |
| <script src="http://cdn.rawgit.com/q-m/d3.chart.sankey/master/d3.chart.sankey.min.js"></script> | |
| <style> | |
| body { | |
| padding: 30px; | |
| min-width: 600px; | |
| max-width: 1200px; | |
| margin: auto; | |
| } | |
| #chart { | |
| height: 700px; | |
| font: 8px sans-serif; | |
| } | |
| .node rect { | |
| fill-opacity: .9; | |
| shape-rendering: crispEdges; | |
| stroke-width: 0; | |
| } | |
| .node text { | |
| text-shadow: 0 1px 0 #fff; | |
| } | |
| .link { | |
| fill: none; | |
| stroke: #000; | |
| stroke-opacity: .2; | |
| } | |
| </style> | |
| </head> | |
| <body> | |
| <div id="chart"></div> | |
| <script> | |
| //d3.json("http://cdn.rawgit.com/q-m/d3.chart.sankey/master/example/data/product.json", function(error, json) { | |
| json = {} | |
| json = {'nodes': [{'name': 'A', 'id': '1|A', 'color': '#1f77b4'}, {'name': 'B', 'id': '1|B', 'color': '#ff7f0e'}, {'name': 'K', 'id': '1|K', 'color': '#9467bd'}, {'name': 'Z', 'id': '1|Z', 'color': '#e377c2'}, {'name': 'B', 'id': '2|B', 'color': '#ff7f0e'}, {'name': 'C', 'id': '2|C', 'color': '#2ca02c'}, {'name': 'C', 'id': '3|C', 'color': '#2ca02c'}, {'name': 'F', 'id': '3|F', 'color': '#d62728'}, {'name': 'L', 'id': '3|L', 'color': '#8c564b'}, {'name': 'end', 'id': 'end|end', 'color': 'grey'}], 'links': [{'source': 1, 'value': 2, 'target': 5}, {'source': 5, 'value': 2, 'target': 9}, {'source': 0, 'value': 2, 'target': 4}, {'source': 4, 'value': 2, 'target': 9}, {'source': 2, 'value': 1, 'target': 4}, {'source': 4, 'value': 1, 'target': 8}, {'source': 8, 'value': 1, 'target': 9}, {'source': 0, 'value': 1, 'target': 4}, {'source': 4, 'value': 1, 'target': 6}, {'source': 6, 'value': 1, 'target': 9}, {'source': 3, 'value': 1, 'target': 4}, {'source': 4, 'value': 1, 'target': 7}, {'source': 7, 'value': 1, 'target': 9}]} | |
| var chart = d3.select("#chart").append("svg").chart("Sankey.Path"); | |
| chart | |
| .name(label) | |
| .colorNodes(function(name, node) { | |
| return color(node, 1) || colors.fallback; | |
| }) | |
| .colorLinks(function(link) { | |
| return color(link.source, 4) || color(link.target, 1) || colors.fallback; | |
| }) | |
| .nodeWidth(15) | |
| .nodePadding(10) | |
| .spread(true) | |
| .iterations(0) | |
| .draw(json); | |
| function label(node) { | |
| return node.name.replace(/\s*\(.*?\)$/, ''); | |
| } | |
| function color(node, depth) { | |
| if (node.color) { | |
| return node.color; | |
| } else { | |
| return 'grey'; | |
| } | |
| } | |
| //}); | |
| </script> | |
| </body> | |
| </html> | 
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment