Skip to content

Instantly share code, notes, and snippets.

@gbraccialli
Last active February 23, 2021 04:45
Show Gist options
  • Save gbraccialli/9f1a8be687050a4278758e7944ce0225 to your computer and use it in GitHub Desktop.
Save gbraccialli/9f1a8be687050a4278758e7944ce0225 to your computer and use it in GitHub Desktop.
clickstream
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"spark version = 3.0.1\n"
]
}
],
"source": [
"import pyspark\n",
"from pyspark.sql import SparkSession\n",
"from pyspark.sql import functions as F\n",
"from pyspark.sql.types import *\n",
"from pyspark.sql.functions import udf\n",
"from IPython.display import HTML\n",
"import seaborn as sns\n",
"\n",
"print(f\"spark version = {pyspark.__version__}\")\n",
"spark = (SparkSession\n",
" .builder\n",
" .master(\"local[*]\")\n",
" .appName(\"jupyter\")\n",
" .config(\"spark.driver.memory\", \"10g\")\n",
" .getOrCreate()\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"events = [\n",
" {\"visitor\": 1, \"timestamp\": '2020-01-01 00:00:00', \"page\": 'A'},\n",
" {\"visitor\": 1, \"timestamp\": '2020-01-01 01:00:00', \"page\": 'B'},\n",
" {\"visitor\": 1, \"timestamp\": '2020-01-01 02:00:00', \"page\": 'C'},\n",
" \n",
" {\"visitor\": 2, \"timestamp\": '2020-01-01 00:00:00', \"page\": 'Z'},\n",
" {\"visitor\": 2, \"timestamp\": '2020-01-01 01:00:00', \"page\": 'B'},\n",
" {\"visitor\": 2, \"timestamp\": '2020-01-01 02:00:00', \"page\": 'F'},\n",
" \n",
" {\"visitor\": 3, \"timestamp\": '2020-01-01 00:00:00', \"page\": 'K'},\n",
" {\"visitor\": 3, \"timestamp\": '2020-01-01 01:00:00', \"page\": 'B'},\n",
" {\"visitor\": 3, \"timestamp\": '2020-01-01 02:00:00', \"page\": 'L'},\n",
" \n",
" {\"visitor\": 4, \"timestamp\": '2020-01-01 00:00:00', \"page\": 'A'},\n",
" {\"visitor\": 4, \"timestamp\": '2020-01-01 01:00:00', \"page\": 'B'},\n",
" \n",
" {\"visitor\": 5, \"timestamp\": '2020-01-01 00:00:00', \"page\": 'A'},\n",
" {\"visitor\": 5, \"timestamp\": '2020-01-01 01:00:00', \"page\": 'B'},\n",
" \n",
"\n",
" {\"visitor\": 6, \"timestamp\": '2020-01-01 01:00:00', \"page\": 'C'},\n",
" {\"visitor\": 6, \"timestamp\": '2020-01-01 00:00:00', \"page\": 'B'},\n",
"\n",
" {\"visitor\": 7, \"timestamp\": '2020-01-01 01:00:00', \"page\": 'C'},\n",
" {\"visitor\": 7, \"timestamp\": '2020-01-01 00:00:00', \"page\": 'B'},\n",
" \n",
" \n",
" \n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/guilherme_braccialli/anaconda3/envs/pyspark/lib/python3.7/site-packages/pyspark/sql/session.py:381: UserWarning: inferring schema from dict is deprecated,please use pyspark.sql.Row instead\n",
" warnings.warn(\"inferring schema from dict is deprecated,\"\n"
]
}
],
"source": [
"df_events = spark.createDataFrame(events)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>page</th>\n",
" <th>timestamp</th>\n",
" <th>visitor</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>A</td>\n",
" <td>2020-01-01 00:00:00</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>B</td>\n",
" <td>2020-01-01 01:00:00</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>C</td>\n",
" <td>2020-01-01 02:00:00</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Z</td>\n",
" <td>2020-01-01 00:00:00</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>B</td>\n",
" <td>2020-01-01 01:00:00</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>F</td>\n",
" <td>2020-01-01 02:00:00</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>K</td>\n",
" <td>2020-01-01 00:00:00</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>B</td>\n",
" <td>2020-01-01 01:00:00</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>L</td>\n",
" <td>2020-01-01 02:00:00</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>A</td>\n",
" <td>2020-01-01 00:00:00</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>B</td>\n",
" <td>2020-01-01 01:00:00</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>A</td>\n",
" <td>2020-01-01 00:00:00</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>B</td>\n",
" <td>2020-01-01 01:00:00</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>C</td>\n",
" <td>2020-01-01 01:00:00</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>B</td>\n",
" <td>2020-01-01 00:00:00</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>C</td>\n",
" <td>2020-01-01 01:00:00</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>B</td>\n",
" <td>2020-01-01 00:00:00</td>\n",
" <td>7</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" page timestamp visitor\n",
"0 A 2020-01-01 00:00:00 1\n",
"1 B 2020-01-01 01:00:00 1\n",
"2 C 2020-01-01 02:00:00 1\n",
"3 Z 2020-01-01 00:00:00 2\n",
"4 B 2020-01-01 01:00:00 2\n",
"5 F 2020-01-01 02:00:00 2\n",
"6 K 2020-01-01 00:00:00 3\n",
"7 B 2020-01-01 01:00:00 3\n",
"8 L 2020-01-01 02:00:00 3\n",
"9 A 2020-01-01 00:00:00 4\n",
"10 B 2020-01-01 01:00:00 4\n",
"11 A 2020-01-01 00:00:00 5\n",
"12 B 2020-01-01 01:00:00 5\n",
"13 C 2020-01-01 01:00:00 6\n",
"14 B 2020-01-01 00:00:00 6\n",
"15 C 2020-01-01 01:00:00 7\n",
"16 B 2020-01-01 00:00:00 7"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_events.toPandas()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"df_sequence_per_user = (\n",
" df_events\n",
" .sort(\"timestamp\")\n",
" .groupby(\"visitor\")\n",
" .agg(\n",
" F.collect_list(F.col(\"page\")).alias(\"sequence\")\n",
" )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>visitor</th>\n",
" <th>sequence</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>7</td>\n",
" <td>[B, C]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>6</td>\n",
" <td>[B, C]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5</td>\n",
" <td>[A, B]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>[A, B, C]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3</td>\n",
" <td>[K, B, L]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>2</td>\n",
" <td>[Z, B, F]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>4</td>\n",
" <td>[A, B]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" visitor sequence\n",
"0 7 [B, C]\n",
"1 6 [B, C]\n",
"2 5 [A, B]\n",
"3 1 [A, B, C]\n",
"4 3 [K, B, L]\n",
"5 2 [Z, B, F]\n",
"6 4 [A, B]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_sequence_per_user.toPandas()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"df_count_per_sequence = (\n",
" df_sequence_per_user\n",
" .groupby(\"sequence\")\n",
" .count()\n",
" .sort(F.desc(\"count\"))\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sequence</th>\n",
" <th>count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>[B, C]</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>[A, B]</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>[K, B, L]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>[A, B, C]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>[Z, B, F]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sequence count\n",
"0 [B, C] 2\n",
"1 [A, B] 2\n",
"2 [K, B, L] 1\n",
"3 [A, B, C] 1\n",
"4 [Z, B, F] 1"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_count_per_sequence.toPandas()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def print_json_sankey(df, column):\n",
" @udf(returnType=ArrayType(StringType()))\n",
" def array_index(arr):\n",
" return [f\"{count}|{item}\" for count, item in enumerate(arr, start=1)]\n",
" #add index, so same step can be showed as multiple times in the sequence with same color\n",
" df_sankey = (\n",
" df\n",
" .withColumn(\"path_index\", array_index(F.col(column)))\n",
" .cache()\n",
" )\n",
" #get color pallet from seaborn\n",
" pal = sns.color_palette(n_colors=500)\n",
" colors = pal.as_hex()\n",
" \n",
" #find unique nodes colors\n",
" nodes_colors_aux = df.select(F.explode(column)).distinct().toPandas().to_dict('records')\n",
" nodes_colors = {}\n",
" for count, item in enumerate(sorted([r['col'] for r in nodes_colors_aux])):\n",
" nodes_colors[item] = colors[count % len(colors)]\n",
" nodes_colors['end'] = 'grey'\n",
" \n",
" #deal with different node ids, keeping same color\n",
" nodes_aux = df_sankey.select(F.explode(\"path_index\")).distinct().toPandas().to_dict('records')\n",
" nodes_aux2 = {}\n",
" for count, item in enumerate(sorted([r['col'] for r in nodes_aux])):\n",
" nodes_aux2[item] = count\n",
" nodes_aux2['end|end'] = len(nodes_aux2)\n",
" nodes = []\n",
" for node, id in sorted(nodes_aux2.items()):\n",
" nodes.append({\"name\" : node.split(\"|\")[1], \"id\" : node, \"color\" : nodes_colors[node.split(\"|\")[1]]})\n",
" \n",
" #for each step in the sequence, print a link for s3 sankey\n",
" links_aux = df_sankey.select(\"path_index\", \"count\").toPandas().to_dict('records')\n",
" links = []\n",
" for l in links_aux:\n",
" journey = l['path_index'].append('end|end')\n",
" journey_len = len(l['path_index'])\n",
" for i in range(journey_len -1):\n",
" links.append({\"source\" : nodes_aux2[l['path_index'][i]], \"value\" : l['count'], \"target\" : nodes_aux2[l['path_index'][i+1]]})\n",
" \n",
" print({\"nodes\": nodes, \"links\": links})\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'nodes': [{'name': 'A', 'id': '1|A', 'color': '#1f77b4'}, {'name': 'B', 'id': '1|B', 'color': '#ff7f0e'}, {'name': 'K', 'id': '1|K', 'color': '#9467bd'}, {'name': 'Z', 'id': '1|Z', 'color': '#e377c2'}, {'name': 'B', 'id': '2|B', 'color': '#ff7f0e'}, {'name': 'C', 'id': '2|C', 'color': '#2ca02c'}, {'name': 'C', 'id': '3|C', 'color': '#2ca02c'}, {'name': 'F', 'id': '3|F', 'color': '#d62728'}, {'name': 'L', 'id': '3|L', 'color': '#8c564b'}, {'name': 'end', 'id': 'end|end', 'color': 'grey'}], 'links': [{'source': 1, 'value': 2, 'target': 5}, {'source': 5, 'value': 2, 'target': 9}, {'source': 0, 'value': 2, 'target': 4}, {'source': 4, 'value': 2, 'target': 9}, {'source': 2, 'value': 1, 'target': 4}, {'source': 4, 'value': 1, 'target': 8}, {'source': 8, 'value': 1, 'target': 9}, {'source': 0, 'value': 1, 'target': 4}, {'source': 4, 'value': 1, 'target': 6}, {'source': 6, 'value': 1, 'target': 9}, {'source': 3, 'value': 1, 'target': 4}, {'source': 4, 'value': 1, 'target': 7}, {'source': 7, 'value': 1, 'target': 9}]}\n"
]
}
],
"source": [
"print_json_sankey(df_count_per_sequence, \"sequence\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"#open d3-sankey.html and replace contents of data and colors as above"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
"with open (\"d3-sankey.html\", \"r\") as html_file:\n",
" html=html_file.read()"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<iframe id=iframe1 width=1000 height=900 src='./d3-sankey.html'></iframe>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"HTML(\"\"\"<iframe id=iframe1 width=1000 height=900 src='./d3-sankey.html'></iframe>\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"df_list_per_user = (\n",
" df_events\n",
" .groupby(\"visitor\")\n",
" .agg(\n",
" F.collect_set(F.col(\"page\")).alias(\"list\")\n",
" )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>visitor</th>\n",
" <th>list</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>7</td>\n",
" <td>[C, B]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>6</td>\n",
" <td>[C, B]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5</td>\n",
" <td>[B, A]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>[C, B, A]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3</td>\n",
" <td>[K, B, L]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>2</td>\n",
" <td>[F, Z, B]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>4</td>\n",
" <td>[B, A]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" visitor list\n",
"0 7 [C, B]\n",
"1 6 [C, B]\n",
"2 5 [B, A]\n",
"3 1 [C, B, A]\n",
"4 3 [K, B, L]\n",
"5 2 [F, Z, B]\n",
"6 4 [B, A]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_list_per_user.toPandas()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"@udf(returnType=ArrayType(ArrayType(StringType())))\n",
"\n",
"def arrray_combination(list_all):\n",
" list_out = []\n",
" for i in range(0, len(list_all)):\n",
" for j in range(i+1, len(list_all)):\n",
" list_out.append([list_all[i], list_all[j]])\n",
" return list_out\n",
"\n",
"\n",
"@udf(\"array<struct<_1: string, _2: string>>\")\n",
"def arrray_combination2(x):\n",
" from itertools import combinations\n",
" pairs = [pair for pair in combinations(x, 2)] \n",
" return pairs"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"df_combination = (\n",
" df_list_per_user\n",
" .withColumn(\"combination_pairs\", arrray_combination2(F.sort_array(F.col(\"list\"))))\n",
" .select(\n",
" F.col(\"visitor\"),\n",
" F.explode(\"combination_pairs\").alias(\"combination_exploded\")\n",
" )\n",
" .select(\"visitor\", F.expr(\"combination_exploded._1\").alias(\"part_1\"), F.col(\"combination_exploded._2\").alias(\"part_2\"))\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>visitor</th>\n",
" <th>part_1</th>\n",
" <th>part_2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>7</td>\n",
" <td>B</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>6</td>\n",
" <td>B</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5</td>\n",
" <td>A</td>\n",
" <td>B</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>A</td>\n",
" <td>B</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>A</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1</td>\n",
" <td>B</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>3</td>\n",
" <td>B</td>\n",
" <td>K</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>3</td>\n",
" <td>B</td>\n",
" <td>L</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>3</td>\n",
" <td>K</td>\n",
" <td>L</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>2</td>\n",
" <td>B</td>\n",
" <td>F</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>2</td>\n",
" <td>B</td>\n",
" <td>Z</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>2</td>\n",
" <td>F</td>\n",
" <td>Z</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>4</td>\n",
" <td>A</td>\n",
" <td>B</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" visitor part_1 part_2\n",
"0 7 B C\n",
"1 6 B C\n",
"2 5 A B\n",
"3 1 A B\n",
"4 1 A C\n",
"5 1 B C\n",
"6 3 B K\n",
"7 3 B L\n",
"8 3 K L\n",
"9 2 B F\n",
"10 2 B Z\n",
"11 2 F Z\n",
"12 4 A B"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_combination.toPandas()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"df_top_pairs = (\n",
" df_combination\n",
" .groupby(\"part_1\", \"part_2\")\n",
" .count()\n",
" .sort(F.desc(\"count\"))\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>part_1</th>\n",
" <th>part_2</th>\n",
" <th>count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>B</td>\n",
" <td>C</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>A</td>\n",
" <td>B</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>B</td>\n",
" <td>Z</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>B</td>\n",
" <td>L</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>B</td>\n",
" <td>F</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>A</td>\n",
" <td>C</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>B</td>\n",
" <td>K</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>K</td>\n",
" <td>L</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>F</td>\n",
" <td>Z</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" part_1 part_2 count\n",
"0 B C 3\n",
"1 A B 3\n",
"2 B Z 1\n",
"3 B L 1\n",
"4 B F 1\n",
"5 A C 1\n",
"6 B K 1\n",
"7 K L 1\n",
"8 F Z 1"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_top_pairs.toPandas()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"df_pairs_chord_viz = (\n",
" df_top_pairs\n",
" .union(\n",
" df_top_pairs.select(F.col(\"part_2\").alias(\"part_1\"), F.col(\"part_1\").alias(\"part_2\"), F.col(\"count\"))\n",
" )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"dict_pairs_pandas = df_pairs_chord_viz.toPandas().to_dict('records')\n",
"dict_nodes = df_pairs_chord_viz.select(\"part_1\").distinct().toPandas().to_dict('records')\n",
"pairs = [[record['part_1'], record['part_2'], record['count']] for record in dict_pairs_pandas]\n",
"pal = sns.color_palette(n_colors=50)\n",
"colors = pal.as_hex()\n",
"node_colors = {}\n",
"i = 0\n",
"for record in dict_nodes:\n",
" node_colors[record['part_1']] = colors[i]\n",
" i+=1"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"var data = [['B', 'C', 3], ['A', 'B', 3], ['B', 'Z', 1], ['B', 'L', 1], ['B', 'F', 1], ['A', 'C', 1], ['B', 'K', 1], ['K', 'L', 1], ['F', 'Z', 1], ['C', 'B', 3], ['B', 'A', 3], ['Z', 'B', 1], ['L', 'B', 1], ['F', 'B', 1], ['C', 'A', 1], ['K', 'B', 1], ['L', 'K', 1], ['Z', 'F', 1]]\n",
"var colors = {'K': '#1f77b4', 'F': '#ff7f0e', 'B': '#2ca02c', 'L': '#d62728', 'C': '#9467bd', 'Z': '#8c564b', 'A': '#e377c2'}\n"
]
}
],
"source": [
"print(f\"var data = {pairs}\")\n",
"print(f\"var colors = {node_colors}\")\n",
"#open d3-chord.html and replace contents of data and colors as above"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<iframe width=1000 height=900 src='./d3-chord.html'></iframe>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"HTML(\"<iframe width=1000 height=900 src='./d3-chord.html'></iframe>\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.9"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}
<!DOCTYPE html>
<meta charset="utf-8">
<style>
body {
font: 10px sans-serif;
}
svg text{
fill:grey;
font-size:11px;
}
svg .values text{
pointer-events:none;
stroke-width: 0.5px;
}
.groups:hover{
cursor:pointer;
font-weight:bold;
}
</style>
<title>
Chord
</title>
<body>
<script src="https://d3js.org/d3.v4.min.js"></script>
<script src="http://vizjs.org/viz.v1.1.0.min.js"></script>
<script>
var data = []
var colors = {}
var sortOrder =[]
var data = [['B', 'C', 3], ['A', 'B', 3], ['B', 'Z', 1], ['B', 'L', 1], ['B', 'F', 1], ['A', 'C', 1], ['B', 'K', 1], ['K', 'L', 1], ['F', 'Z', 1], ['C', 'B', 3], ['B', 'A', 3], ['Z', 'B', 1], ['L', 'B', 1], ['F', 'B', 1], ['C', 'A', 1], ['K', 'B', 1], ['L', 'K', 1], ['Z', 'F', 1]]
var colors = {'K': '#1f77b4', 'F': '#ff7f0e', 'B': '#2ca02c', 'L': '#d62728', 'C': '#9467bd', 'Z': '#8c564b', 'A': '#e377c2'}
function sort(a,b){ return d3.ascending(sortOrder.indexOf(a),sortOrder.indexOf(b)); }
var ch = viz.ch().data(data)
.padding(.01)
.sort(sort)
.innerRadius(430)
.outerRadius(450)
.duration(1000)
.chordOpacity(0.3)
.labelPadding(.03)
.fill(function(d){ return colors[d];});
var width=1200, height=1100;
var svg = d3.select("body").append("svg").attr("height",height).attr("width",width);
svg.append("g").attr("transform", "translate(600,550)").call(ch);
// adjust height of frame in bl.ocks.org
d3.select(self.frameElement).style("height", height+"px").style("width", width+"px");
</script>
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Sankey</title>
<script src="http://d3js.org/d3.v3.min.js"></script>
<script src="http://cdn.rawgit.com/newrelic-forks/d3-plugins-sankey/master/sankey.js"></script>
<script src="http://cdn.rawgit.com/misoproject/d3.chart/master/d3.chart.min.js"></script>
<script src="http://cdn.rawgit.com/q-m/d3.chart.sankey/master/d3.chart.sankey.min.js"></script>
<style>
body {
padding: 30px;
min-width: 600px;
max-width: 1200px;
margin: auto;
}
#chart {
height: 700px;
font: 8px sans-serif;
}
.node rect {
fill-opacity: .9;
shape-rendering: crispEdges;
stroke-width: 0;
}
.node text {
text-shadow: 0 1px 0 #fff;
}
.link {
fill: none;
stroke: #000;
stroke-opacity: .2;
}
</style>
</head>
<body>
<div id="chart"></div>
<script>
//d3.json("http://cdn.rawgit.com/q-m/d3.chart.sankey/master/example/data/product.json", function(error, json) {
json = {}
json = {'nodes': [{'name': 'A', 'id': '1|A', 'color': '#1f77b4'}, {'name': 'B', 'id': '1|B', 'color': '#ff7f0e'}, {'name': 'K', 'id': '1|K', 'color': '#9467bd'}, {'name': 'Z', 'id': '1|Z', 'color': '#e377c2'}, {'name': 'B', 'id': '2|B', 'color': '#ff7f0e'}, {'name': 'C', 'id': '2|C', 'color': '#2ca02c'}, {'name': 'C', 'id': '3|C', 'color': '#2ca02c'}, {'name': 'F', 'id': '3|F', 'color': '#d62728'}, {'name': 'L', 'id': '3|L', 'color': '#8c564b'}, {'name': 'end', 'id': 'end|end', 'color': 'grey'}], 'links': [{'source': 1, 'value': 2, 'target': 5}, {'source': 5, 'value': 2, 'target': 9}, {'source': 0, 'value': 2, 'target': 4}, {'source': 4, 'value': 2, 'target': 9}, {'source': 2, 'value': 1, 'target': 4}, {'source': 4, 'value': 1, 'target': 8}, {'source': 8, 'value': 1, 'target': 9}, {'source': 0, 'value': 1, 'target': 4}, {'source': 4, 'value': 1, 'target': 6}, {'source': 6, 'value': 1, 'target': 9}, {'source': 3, 'value': 1, 'target': 4}, {'source': 4, 'value': 1, 'target': 7}, {'source': 7, 'value': 1, 'target': 9}]}
var chart = d3.select("#chart").append("svg").chart("Sankey.Path");
chart
.name(label)
.colorNodes(function(name, node) {
return color(node, 1) || colors.fallback;
})
.colorLinks(function(link) {
return color(link.source, 4) || color(link.target, 1) || colors.fallback;
})
.nodeWidth(15)
.nodePadding(10)
.spread(true)
.iterations(0)
.draw(json);
function label(node) {
return node.name.replace(/\s*\(.*?\)$/, '');
}
function color(node, depth) {
if (node.color) {
return node.color;
} else {
return 'grey';
}
}
//});
</script>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment