Last active
February 23, 2021 04:45
-
-
Save gbraccialli/9f1a8be687050a4278758e7944ce0225 to your computer and use it in GitHub Desktop.
clickstream
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"spark version = 3.0.1\n" | |
] | |
} | |
], | |
"source": [ | |
"import pyspark\n", | |
"from pyspark.sql import SparkSession\n", | |
"from pyspark.sql import functions as F\n", | |
"from pyspark.sql.types import *\n", | |
"from pyspark.sql.functions import udf\n", | |
"from IPython.display import HTML\n", | |
"import seaborn as sns\n", | |
"\n", | |
"print(f\"spark version = {pyspark.__version__}\")\n", | |
"spark = (SparkSession\n", | |
" .builder\n", | |
" .master(\"local[*]\")\n", | |
" .appName(\"jupyter\")\n", | |
" .config(\"spark.driver.memory\", \"10g\")\n", | |
" .getOrCreate()\n", | |
" )" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"events = [\n", | |
" {\"visitor\": 1, \"timestamp\": '2020-01-01 00:00:00', \"page\": 'A'},\n", | |
" {\"visitor\": 1, \"timestamp\": '2020-01-01 01:00:00', \"page\": 'B'},\n", | |
" {\"visitor\": 1, \"timestamp\": '2020-01-01 02:00:00', \"page\": 'C'},\n", | |
" \n", | |
" {\"visitor\": 2, \"timestamp\": '2020-01-01 00:00:00', \"page\": 'Z'},\n", | |
" {\"visitor\": 2, \"timestamp\": '2020-01-01 01:00:00', \"page\": 'B'},\n", | |
" {\"visitor\": 2, \"timestamp\": '2020-01-01 02:00:00', \"page\": 'F'},\n", | |
" \n", | |
" {\"visitor\": 3, \"timestamp\": '2020-01-01 00:00:00', \"page\": 'K'},\n", | |
" {\"visitor\": 3, \"timestamp\": '2020-01-01 01:00:00', \"page\": 'B'},\n", | |
" {\"visitor\": 3, \"timestamp\": '2020-01-01 02:00:00', \"page\": 'L'},\n", | |
" \n", | |
" {\"visitor\": 4, \"timestamp\": '2020-01-01 00:00:00', \"page\": 'A'},\n", | |
" {\"visitor\": 4, \"timestamp\": '2020-01-01 01:00:00', \"page\": 'B'},\n", | |
" \n", | |
" {\"visitor\": 5, \"timestamp\": '2020-01-01 00:00:00', \"page\": 'A'},\n", | |
" {\"visitor\": 5, \"timestamp\": '2020-01-01 01:00:00', \"page\": 'B'},\n", | |
" \n", | |
"\n", | |
" {\"visitor\": 6, \"timestamp\": '2020-01-01 01:00:00', \"page\": 'C'},\n", | |
" {\"visitor\": 6, \"timestamp\": '2020-01-01 00:00:00', \"page\": 'B'},\n", | |
"\n", | |
" {\"visitor\": 7, \"timestamp\": '2020-01-01 01:00:00', \"page\": 'C'},\n", | |
" {\"visitor\": 7, \"timestamp\": '2020-01-01 00:00:00', \"page\": 'B'},\n", | |
" \n", | |
" \n", | |
" \n", | |
"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/Users/guilherme_braccialli/anaconda3/envs/pyspark/lib/python3.7/site-packages/pyspark/sql/session.py:381: UserWarning: inferring schema from dict is deprecated,please use pyspark.sql.Row instead\n", | |
" warnings.warn(\"inferring schema from dict is deprecated,\"\n" | |
] | |
} | |
], | |
"source": [ | |
"df_events = spark.createDataFrame(events)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>page</th>\n", | |
" <th>timestamp</th>\n", | |
" <th>visitor</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>A</td>\n", | |
" <td>2020-01-01 00:00:00</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>B</td>\n", | |
" <td>2020-01-01 01:00:00</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>C</td>\n", | |
" <td>2020-01-01 02:00:00</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>Z</td>\n", | |
" <td>2020-01-01 00:00:00</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>B</td>\n", | |
" <td>2020-01-01 01:00:00</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>F</td>\n", | |
" <td>2020-01-01 02:00:00</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>K</td>\n", | |
" <td>2020-01-01 00:00:00</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>B</td>\n", | |
" <td>2020-01-01 01:00:00</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>L</td>\n", | |
" <td>2020-01-01 02:00:00</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>A</td>\n", | |
" <td>2020-01-01 00:00:00</td>\n", | |
" <td>4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>B</td>\n", | |
" <td>2020-01-01 01:00:00</td>\n", | |
" <td>4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>A</td>\n", | |
" <td>2020-01-01 00:00:00</td>\n", | |
" <td>5</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>B</td>\n", | |
" <td>2020-01-01 01:00:00</td>\n", | |
" <td>5</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>C</td>\n", | |
" <td>2020-01-01 01:00:00</td>\n", | |
" <td>6</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>B</td>\n", | |
" <td>2020-01-01 00:00:00</td>\n", | |
" <td>6</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td>C</td>\n", | |
" <td>2020-01-01 01:00:00</td>\n", | |
" <td>7</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td>B</td>\n", | |
" <td>2020-01-01 00:00:00</td>\n", | |
" <td>7</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" page timestamp visitor\n", | |
"0 A 2020-01-01 00:00:00 1\n", | |
"1 B 2020-01-01 01:00:00 1\n", | |
"2 C 2020-01-01 02:00:00 1\n", | |
"3 Z 2020-01-01 00:00:00 2\n", | |
"4 B 2020-01-01 01:00:00 2\n", | |
"5 F 2020-01-01 02:00:00 2\n", | |
"6 K 2020-01-01 00:00:00 3\n", | |
"7 B 2020-01-01 01:00:00 3\n", | |
"8 L 2020-01-01 02:00:00 3\n", | |
"9 A 2020-01-01 00:00:00 4\n", | |
"10 B 2020-01-01 01:00:00 4\n", | |
"11 A 2020-01-01 00:00:00 5\n", | |
"12 B 2020-01-01 01:00:00 5\n", | |
"13 C 2020-01-01 01:00:00 6\n", | |
"14 B 2020-01-01 00:00:00 6\n", | |
"15 C 2020-01-01 01:00:00 7\n", | |
"16 B 2020-01-01 00:00:00 7" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_events.toPandas()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_sequence_per_user = (\n", | |
" df_events\n", | |
" .sort(\"timestamp\")\n", | |
" .groupby(\"visitor\")\n", | |
" .agg(\n", | |
" F.collect_list(F.col(\"page\")).alias(\"sequence\")\n", | |
" )\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>visitor</th>\n", | |
" <th>sequence</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>7</td>\n", | |
" <td>[B, C]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>6</td>\n", | |
" <td>[B, C]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>5</td>\n", | |
" <td>[A, B]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1</td>\n", | |
" <td>[A, B, C]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>3</td>\n", | |
" <td>[K, B, L]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>2</td>\n", | |
" <td>[Z, B, F]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>4</td>\n", | |
" <td>[A, B]</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" visitor sequence\n", | |
"0 7 [B, C]\n", | |
"1 6 [B, C]\n", | |
"2 5 [A, B]\n", | |
"3 1 [A, B, C]\n", | |
"4 3 [K, B, L]\n", | |
"5 2 [Z, B, F]\n", | |
"6 4 [A, B]" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_sequence_per_user.toPandas()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_count_per_sequence = (\n", | |
" df_sequence_per_user\n", | |
" .groupby(\"sequence\")\n", | |
" .count()\n", | |
" .sort(F.desc(\"count\"))\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>sequence</th>\n", | |
" <th>count</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>[B, C]</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>[A, B]</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>[K, B, L]</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>[A, B, C]</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>[Z, B, F]</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" sequence count\n", | |
"0 [B, C] 2\n", | |
"1 [A, B] 2\n", | |
"2 [K, B, L] 1\n", | |
"3 [A, B, C] 1\n", | |
"4 [Z, B, F] 1" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_count_per_sequence.toPandas()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def print_json_sankey(df, column):\n", | |
" @udf(returnType=ArrayType(StringType()))\n", | |
" def array_index(arr):\n", | |
" return [f\"{count}|{item}\" for count, item in enumerate(arr, start=1)]\n", | |
" #add index, so same step can be showed as multiple times in the sequence with same color\n", | |
" df_sankey = (\n", | |
" df\n", | |
" .withColumn(\"path_index\", array_index(F.col(column)))\n", | |
" .cache()\n", | |
" )\n", | |
" #get color pallet from seaborn\n", | |
" pal = sns.color_palette(n_colors=500)\n", | |
" colors = pal.as_hex()\n", | |
" \n", | |
" #find unique nodes colors\n", | |
" nodes_colors_aux = df.select(F.explode(column)).distinct().toPandas().to_dict('records')\n", | |
" nodes_colors = {}\n", | |
" for count, item in enumerate(sorted([r['col'] for r in nodes_colors_aux])):\n", | |
" nodes_colors[item] = colors[count % len(colors)]\n", | |
" nodes_colors['end'] = 'grey'\n", | |
" \n", | |
" #deal with different node ids, keeping same color\n", | |
" nodes_aux = df_sankey.select(F.explode(\"path_index\")).distinct().toPandas().to_dict('records')\n", | |
" nodes_aux2 = {}\n", | |
" for count, item in enumerate(sorted([r['col'] for r in nodes_aux])):\n", | |
" nodes_aux2[item] = count\n", | |
" nodes_aux2['end|end'] = len(nodes_aux2)\n", | |
" nodes = []\n", | |
" for node, id in sorted(nodes_aux2.items()):\n", | |
" nodes.append({\"name\" : node.split(\"|\")[1], \"id\" : node, \"color\" : nodes_colors[node.split(\"|\")[1]]})\n", | |
" \n", | |
" #for each step in the sequence, print a link for s3 sankey\n", | |
" links_aux = df_sankey.select(\"path_index\", \"count\").toPandas().to_dict('records')\n", | |
" links = []\n", | |
" for l in links_aux:\n", | |
" journey = l['path_index'].append('end|end')\n", | |
" journey_len = len(l['path_index'])\n", | |
" for i in range(journey_len -1):\n", | |
" links.append({\"source\" : nodes_aux2[l['path_index'][i]], \"value\" : l['count'], \"target\" : nodes_aux2[l['path_index'][i+1]]})\n", | |
" \n", | |
" print({\"nodes\": nodes, \"links\": links})\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{'nodes': [{'name': 'A', 'id': '1|A', 'color': '#1f77b4'}, {'name': 'B', 'id': '1|B', 'color': '#ff7f0e'}, {'name': 'K', 'id': '1|K', 'color': '#9467bd'}, {'name': 'Z', 'id': '1|Z', 'color': '#e377c2'}, {'name': 'B', 'id': '2|B', 'color': '#ff7f0e'}, {'name': 'C', 'id': '2|C', 'color': '#2ca02c'}, {'name': 'C', 'id': '3|C', 'color': '#2ca02c'}, {'name': 'F', 'id': '3|F', 'color': '#d62728'}, {'name': 'L', 'id': '3|L', 'color': '#8c564b'}, {'name': 'end', 'id': 'end|end', 'color': 'grey'}], 'links': [{'source': 1, 'value': 2, 'target': 5}, {'source': 5, 'value': 2, 'target': 9}, {'source': 0, 'value': 2, 'target': 4}, {'source': 4, 'value': 2, 'target': 9}, {'source': 2, 'value': 1, 'target': 4}, {'source': 4, 'value': 1, 'target': 8}, {'source': 8, 'value': 1, 'target': 9}, {'source': 0, 'value': 1, 'target': 4}, {'source': 4, 'value': 1, 'target': 6}, {'source': 6, 'value': 1, 'target': 9}, {'source': 3, 'value': 1, 'target': 4}, {'source': 4, 'value': 1, 'target': 7}, {'source': 7, 'value': 1, 'target': 9}]}\n" | |
] | |
} | |
], | |
"source": [ | |
"print_json_sankey(df_count_per_sequence, \"sequence\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#open d3-sankey.html and replace contents of data and colors as above" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 66, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"with open (\"d3-sankey.html\", \"r\") as html_file:\n", | |
" html=html_file.read()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 69, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<iframe id=iframe1 width=1000 height=900 src='./d3-sankey.html'></iframe>" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"execution_count": 69, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"HTML(\"\"\"<iframe id=iframe1 width=1000 height=900 src='./d3-sankey.html'></iframe>\"\"\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_list_per_user = (\n", | |
" df_events\n", | |
" .groupby(\"visitor\")\n", | |
" .agg(\n", | |
" F.collect_set(F.col(\"page\")).alias(\"list\")\n", | |
" )\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>visitor</th>\n", | |
" <th>list</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>7</td>\n", | |
" <td>[C, B]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>6</td>\n", | |
" <td>[C, B]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>5</td>\n", | |
" <td>[B, A]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1</td>\n", | |
" <td>[C, B, A]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>3</td>\n", | |
" <td>[K, B, L]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>2</td>\n", | |
" <td>[F, Z, B]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>4</td>\n", | |
" <td>[B, A]</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" visitor list\n", | |
"0 7 [C, B]\n", | |
"1 6 [C, B]\n", | |
"2 5 [B, A]\n", | |
"3 1 [C, B, A]\n", | |
"4 3 [K, B, L]\n", | |
"5 2 [F, Z, B]\n", | |
"6 4 [B, A]" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_list_per_user.toPandas()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"@udf(returnType=ArrayType(ArrayType(StringType())))\n", | |
"\n", | |
"def arrray_combination(list_all):\n", | |
" list_out = []\n", | |
" for i in range(0, len(list_all)):\n", | |
" for j in range(i+1, len(list_all)):\n", | |
" list_out.append([list_all[i], list_all[j]])\n", | |
" return list_out\n", | |
"\n", | |
"\n", | |
"@udf(\"array<struct<_1: string, _2: string>>\")\n", | |
"def arrray_combination2(x):\n", | |
" from itertools import combinations\n", | |
" pairs = [pair for pair in combinations(x, 2)] \n", | |
" return pairs" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_combination = (\n", | |
" df_list_per_user\n", | |
" .withColumn(\"combination_pairs\", arrray_combination2(F.sort_array(F.col(\"list\"))))\n", | |
" .select(\n", | |
" F.col(\"visitor\"),\n", | |
" F.explode(\"combination_pairs\").alias(\"combination_exploded\")\n", | |
" )\n", | |
" .select(\"visitor\", F.expr(\"combination_exploded._1\").alias(\"part_1\"), F.col(\"combination_exploded._2\").alias(\"part_2\"))\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>visitor</th>\n", | |
" <th>part_1</th>\n", | |
" <th>part_2</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>7</td>\n", | |
" <td>B</td>\n", | |
" <td>C</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>6</td>\n", | |
" <td>B</td>\n", | |
" <td>C</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>5</td>\n", | |
" <td>A</td>\n", | |
" <td>B</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1</td>\n", | |
" <td>A</td>\n", | |
" <td>B</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>1</td>\n", | |
" <td>A</td>\n", | |
" <td>C</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>1</td>\n", | |
" <td>B</td>\n", | |
" <td>C</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>3</td>\n", | |
" <td>B</td>\n", | |
" <td>K</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>3</td>\n", | |
" <td>B</td>\n", | |
" <td>L</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>3</td>\n", | |
" <td>K</td>\n", | |
" <td>L</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>2</td>\n", | |
" <td>B</td>\n", | |
" <td>F</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>2</td>\n", | |
" <td>B</td>\n", | |
" <td>Z</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>2</td>\n", | |
" <td>F</td>\n", | |
" <td>Z</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>4</td>\n", | |
" <td>A</td>\n", | |
" <td>B</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" visitor part_1 part_2\n", | |
"0 7 B C\n", | |
"1 6 B C\n", | |
"2 5 A B\n", | |
"3 1 A B\n", | |
"4 1 A C\n", | |
"5 1 B C\n", | |
"6 3 B K\n", | |
"7 3 B L\n", | |
"8 3 K L\n", | |
"9 2 B F\n", | |
"10 2 B Z\n", | |
"11 2 F Z\n", | |
"12 4 A B" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_combination.toPandas()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_top_pairs = (\n", | |
" df_combination\n", | |
" .groupby(\"part_1\", \"part_2\")\n", | |
" .count()\n", | |
" .sort(F.desc(\"count\"))\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>part_1</th>\n", | |
" <th>part_2</th>\n", | |
" <th>count</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>B</td>\n", | |
" <td>C</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>A</td>\n", | |
" <td>B</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>B</td>\n", | |
" <td>Z</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>B</td>\n", | |
" <td>L</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>B</td>\n", | |
" <td>F</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>A</td>\n", | |
" <td>C</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>B</td>\n", | |
" <td>K</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>K</td>\n", | |
" <td>L</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>F</td>\n", | |
" <td>Z</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" part_1 part_2 count\n", | |
"0 B C 3\n", | |
"1 A B 3\n", | |
"2 B Z 1\n", | |
"3 B L 1\n", | |
"4 B F 1\n", | |
"5 A C 1\n", | |
"6 B K 1\n", | |
"7 K L 1\n", | |
"8 F Z 1" | |
] | |
}, | |
"execution_count": 18, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_top_pairs.toPandas()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_pairs_chord_viz = (\n", | |
" df_top_pairs\n", | |
" .union(\n", | |
" df_top_pairs.select(F.col(\"part_2\").alias(\"part_1\"), F.col(\"part_1\").alias(\"part_2\"), F.col(\"count\"))\n", | |
" )\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"dict_pairs_pandas = df_pairs_chord_viz.toPandas().to_dict('records')\n", | |
"dict_nodes = df_pairs_chord_viz.select(\"part_1\").distinct().toPandas().to_dict('records')\n", | |
"pairs = [[record['part_1'], record['part_2'], record['count']] for record in dict_pairs_pandas]\n", | |
"pal = sns.color_palette(n_colors=50)\n", | |
"colors = pal.as_hex()\n", | |
"node_colors = {}\n", | |
"i = 0\n", | |
"for record in dict_nodes:\n", | |
" node_colors[record['part_1']] = colors[i]\n", | |
" i+=1" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"var data = [['B', 'C', 3], ['A', 'B', 3], ['B', 'Z', 1], ['B', 'L', 1], ['B', 'F', 1], ['A', 'C', 1], ['B', 'K', 1], ['K', 'L', 1], ['F', 'Z', 1], ['C', 'B', 3], ['B', 'A', 3], ['Z', 'B', 1], ['L', 'B', 1], ['F', 'B', 1], ['C', 'A', 1], ['K', 'B', 1], ['L', 'K', 1], ['Z', 'F', 1]]\n", | |
"var colors = {'K': '#1f77b4', 'F': '#ff7f0e', 'B': '#2ca02c', 'L': '#d62728', 'C': '#9467bd', 'Z': '#8c564b', 'A': '#e377c2'}\n" | |
] | |
} | |
], | |
"source": [ | |
"print(f\"var data = {pairs}\")\n", | |
"print(f\"var colors = {node_colors}\")\n", | |
"#open d3-chord.html and replace contents of data and colors as above" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 57, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<iframe width=1000 height=900 src='./d3-chord.html'></iframe>" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"execution_count": 57, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"HTML(\"<iframe width=1000 height=900 src='./d3-chord.html'></iframe>\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.9" | |
}, | |
"toc": { | |
"base_numbering": 1, | |
"nav_menu": {}, | |
"number_sections": true, | |
"sideBar": true, | |
"skip_h1_title": false, | |
"title_cell": "Table of Contents", | |
"title_sidebar": "Contents", | |
"toc_cell": false, | |
"toc_position": {}, | |
"toc_section_display": true, | |
"toc_window_display": false | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html> | |
<meta charset="utf-8"> | |
<style> | |
body { | |
font: 10px sans-serif; | |
} | |
svg text{ | |
fill:grey; | |
font-size:11px; | |
} | |
svg .values text{ | |
pointer-events:none; | |
stroke-width: 0.5px; | |
} | |
.groups:hover{ | |
cursor:pointer; | |
font-weight:bold; | |
} | |
</style> | |
<title> | |
Chord | |
</title> | |
<body> | |
<script src="https://d3js.org/d3.v4.min.js"></script> | |
<script src="http://vizjs.org/viz.v1.1.0.min.js"></script> | |
<script> | |
var data = [] | |
var colors = {} | |
var sortOrder =[] | |
var data = [['B', 'C', 3], ['A', 'B', 3], ['B', 'Z', 1], ['B', 'L', 1], ['B', 'F', 1], ['A', 'C', 1], ['B', 'K', 1], ['K', 'L', 1], ['F', 'Z', 1], ['C', 'B', 3], ['B', 'A', 3], ['Z', 'B', 1], ['L', 'B', 1], ['F', 'B', 1], ['C', 'A', 1], ['K', 'B', 1], ['L', 'K', 1], ['Z', 'F', 1]] | |
var colors = {'K': '#1f77b4', 'F': '#ff7f0e', 'B': '#2ca02c', 'L': '#d62728', 'C': '#9467bd', 'Z': '#8c564b', 'A': '#e377c2'} | |
function sort(a,b){ return d3.ascending(sortOrder.indexOf(a),sortOrder.indexOf(b)); } | |
var ch = viz.ch().data(data) | |
.padding(.01) | |
.sort(sort) | |
.innerRadius(430) | |
.outerRadius(450) | |
.duration(1000) | |
.chordOpacity(0.3) | |
.labelPadding(.03) | |
.fill(function(d){ return colors[d];}); | |
var width=1200, height=1100; | |
var svg = d3.select("body").append("svg").attr("height",height).attr("width",width); | |
svg.append("g").attr("transform", "translate(600,550)").call(ch); | |
// adjust height of frame in bl.ocks.org | |
d3.select(self.frameElement).style("height", height+"px").style("width", width+"px"); | |
</script> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html> | |
<html> | |
<head> | |
<meta charset="utf-8"> | |
<title>Sankey</title> | |
<script src="http://d3js.org/d3.v3.min.js"></script> | |
<script src="http://cdn.rawgit.com/newrelic-forks/d3-plugins-sankey/master/sankey.js"></script> | |
<script src="http://cdn.rawgit.com/misoproject/d3.chart/master/d3.chart.min.js"></script> | |
<script src="http://cdn.rawgit.com/q-m/d3.chart.sankey/master/d3.chart.sankey.min.js"></script> | |
<style> | |
body { | |
padding: 30px; | |
min-width: 600px; | |
max-width: 1200px; | |
margin: auto; | |
} | |
#chart { | |
height: 700px; | |
font: 8px sans-serif; | |
} | |
.node rect { | |
fill-opacity: .9; | |
shape-rendering: crispEdges; | |
stroke-width: 0; | |
} | |
.node text { | |
text-shadow: 0 1px 0 #fff; | |
} | |
.link { | |
fill: none; | |
stroke: #000; | |
stroke-opacity: .2; | |
} | |
</style> | |
</head> | |
<body> | |
<div id="chart"></div> | |
<script> | |
//d3.json("http://cdn.rawgit.com/q-m/d3.chart.sankey/master/example/data/product.json", function(error, json) { | |
json = {} | |
json = {'nodes': [{'name': 'A', 'id': '1|A', 'color': '#1f77b4'}, {'name': 'B', 'id': '1|B', 'color': '#ff7f0e'}, {'name': 'K', 'id': '1|K', 'color': '#9467bd'}, {'name': 'Z', 'id': '1|Z', 'color': '#e377c2'}, {'name': 'B', 'id': '2|B', 'color': '#ff7f0e'}, {'name': 'C', 'id': '2|C', 'color': '#2ca02c'}, {'name': 'C', 'id': '3|C', 'color': '#2ca02c'}, {'name': 'F', 'id': '3|F', 'color': '#d62728'}, {'name': 'L', 'id': '3|L', 'color': '#8c564b'}, {'name': 'end', 'id': 'end|end', 'color': 'grey'}], 'links': [{'source': 1, 'value': 2, 'target': 5}, {'source': 5, 'value': 2, 'target': 9}, {'source': 0, 'value': 2, 'target': 4}, {'source': 4, 'value': 2, 'target': 9}, {'source': 2, 'value': 1, 'target': 4}, {'source': 4, 'value': 1, 'target': 8}, {'source': 8, 'value': 1, 'target': 9}, {'source': 0, 'value': 1, 'target': 4}, {'source': 4, 'value': 1, 'target': 6}, {'source': 6, 'value': 1, 'target': 9}, {'source': 3, 'value': 1, 'target': 4}, {'source': 4, 'value': 1, 'target': 7}, {'source': 7, 'value': 1, 'target': 9}]} | |
var chart = d3.select("#chart").append("svg").chart("Sankey.Path"); | |
chart | |
.name(label) | |
.colorNodes(function(name, node) { | |
return color(node, 1) || colors.fallback; | |
}) | |
.colorLinks(function(link) { | |
return color(link.source, 4) || color(link.target, 1) || colors.fallback; | |
}) | |
.nodeWidth(15) | |
.nodePadding(10) | |
.spread(true) | |
.iterations(0) | |
.draw(json); | |
function label(node) { | |
return node.name.replace(/\s*\(.*?\)$/, ''); | |
} | |
function color(node, depth) { | |
if (node.color) { | |
return node.color; | |
} else { | |
return 'grey'; | |
} | |
} | |
//}); | |
</script> | |
</body> | |
</html> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment