Skip to content

Instantly share code, notes, and snippets.

@igorbrigadir
Created February 26, 2019 18:23
Show Gist options
  • Save igorbrigadir/80638589973fbb172c9b8abcf1ce0d71 to your computer and use it in GitHub Desktop.
Save igorbrigadir/80638589973fbb172c9b8abcf1ce0d71 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2019-02-26T18:21:41.310151Z",
"start_time": "2019-02-26T18:21:40.905070Z"
}
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"\n",
"import zstandard as zstd\n",
"from io import TextIOWrapper\n",
"\n",
"from collections import defaultdict"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2019-02-26T18:21:41.330110Z",
"start_time": "2019-02-26T18:21:41.311603Z"
}
},
"outputs": [],
"source": [
"def melt_snowflake(snowflake_id):\n",
"    \"\"\"Split a tweet (snowflake) ID into its bit-packed components.\n",
"\n",
"    Reading from the low bits upward: 12 bits of sequence number, 5 bits of\n",
"    worker ID, 5 bits of datacenter ID; the bits above those are added to\n",
"    the offset 1288834974657 to give a millisecond timestamp.\n",
"\n",
"    Returns the tuple\n",
"    (timestamp_ms, datacenter_id, worker_id, sequence_id, machine_id).\n",
"    \"\"\"\n",
"    sequence_id = snowflake_id & 0xFFF\n",
"    upper_bits = snowflake_id >> 12\n",
"    worker_id = upper_bits & 0x1F\n",
"    datacenter_id = (upper_bits >> 5) & 0x1F\n",
"    # datacenter and worker bits read together as a single 10-bit field\n",
"    machine_id = upper_bits & 0x3FF\n",
"    timestamp_ms = (upper_bits >> 10) + 1288834974657\n",
"    return (timestamp_ms, datacenter_id, worker_id, sequence_id, machine_id)\n",
"\n",
"def plt_counter(c):\n",
"    \"\"\"Draw a bar chart of the counts held in a mapping, one bar per key.\n",
"\n",
"    c: mapping of label -> count, such as one of the defaultdict tallies.\n",
"    Bars are drawn in ascending key order with the tick label under the\n",
"    middle of each bar. An empty mapping draws nothing and returns None.\n",
"    \"\"\"\n",
"    if not c:\n",
"        # zip(*c.items()) has nothing to unpack for an empty mapping\n",
"        return\n",
"    # sort by key so the x axis is ordered instead of following insertion order\n",
"    labels, values = zip(*sorted(c.items()))\n",
"    indexes = np.arange(len(labels))\n",
"    width = 0.95\n",
"    plt.figure(figsize=(15,5))\n",
"    # centre the bars explicitly so the tick positions below line up with them\n",
"    plt.bar(indexes, values, width, align='center')\n",
"    plt.xticks(indexes, labels)\n",
"    plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2019-02-26T18:21:41.346300Z",
"start_time": "2019-02-26T18:21:41.333560Z"
}
},
"outputs": [],
"source": [
"# Module-level tallies, each keyed by a component value.\n",
"# defaultdict is faster than Counter()\n",
"datacenters = defaultdict(int)\n",
"workers = defaultdict(int)\n",
"sequences = defaultdict(int)\n",
"machines = defaultdict(int)\n",
"# a one-key mapping ('ids') holding the number of IDs counted so far\n",
"total_tweets = defaultdict(int)\n",
"\n",
"def count_components(tweet_id):\n",
"    \"\"\"Add one tweet ID's snowflake components to the module-level tallies.\"\"\"\n",
"    # the timestamp component is not tallied\n",
"    _, datacenter_id, worker_id, sequence_id, machine_id = melt_snowflake(tweet_id)\n",
"    datacenters[datacenter_id] += 1\n",
"    workers[worker_id] += 1\n",
"    sequences[sequence_id] += 1\n",
"    machines[machine_id] += 1\n",
"    total_tweets['ids'] += 1"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2019-02-26T18:22:50.768668Z",
"start_time": "2019-02-26T18:21:41.348518Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processed 43956390 tweets\n"
]
}
],
"source": [
"path = 'TREC2015-tweetids.txt.zst'\n",
"\n",
"# Empty the tallies first so that re-running this cell does not double count.\n",
"for tally in (datacenters, workers, sequences, machines, total_tweets):\n",
"    tally.clear()\n",
"\n",
"# Stream-decompress the file and tally one tweet ID per line.\n",
"with open(path, 'rb') as fh:\n",
"    decompressor = zstd.ZstdDecompressor()\n",
"    with decompressor.stream_reader(fh, read_size=64192) as reader:\n",
"        with TextIOWrapper(reader, encoding='UTF-8', newline='\\n') as line_reader:\n",
"            for line in line_reader:\n",
"                # skip blank lines, which int() would reject\n",
"                if line.strip():\n",
"                    count_components(int(line))\n",
"\n",
"print(\"Processed\", total_tweets['ids'], \"tweets\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2019-02-26T18:22:50.778351Z",
"start_time": "2019-02-26T18:22:50.770998Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"[320,\n",
" 321,\n",
" 323,\n",
" 326,\n",
" 327,\n",
" 329,\n",
" 331,\n",
" 332,\n",
" 336,\n",
" 337,\n",
" 353,\n",
" 354,\n",
" 357,\n",
" 359,\n",
" 360,\n",
" 362,\n",
" 363,\n",
" 364,\n",
" 367,\n",
" 368]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# All Machine IDs in TREC Tweets\n",
"sorted(machines.keys())"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2019-02-26T18:22:51.306941Z",
"start_time": "2019-02-26T18:22:50.781374Z"
}
},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x7f7f0c6f2780>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt_counter(sequences)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2019-02-26T18:22:51.316456Z",
"start_time": "2019-02-26T18:22:51.309764Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"defaultdict(int,\n",
" {0: 34006666,\n",
" 1: 7208821,\n",
" 2: 1161513,\n",
" 3: 163420,\n",
" 4: 999521,\n",
" 5: 311015,\n",
" 6: 60669,\n",
" 7: 9671,\n",
" 8: 23058,\n",
" 9: 8681,\n",
" 10: 1964,\n",
" 11: 396,\n",
" 12: 512,\n",
" 13: 235,\n",
" 14: 69,\n",
" 15: 28,\n",
" 16: 31,\n",
" 17: 16,\n",
" 18: 14,\n",
" 19: 10,\n",
" 20: 9,\n",
" 21: 5,\n",
" 22: 4,\n",
" 23: 3,\n",
" 24: 3,\n",
" 25: 3,\n",
" 26: 1,\n",
" 27: 4,\n",
" 28: 2,\n",
" 29: 2,\n",
" 31: 3,\n",
" 32: 1,\n",
" 33: 1,\n",
" 34: 2,\n",
" 35: 1,\n",
" 36: 2,\n",
" 37: 3,\n",
" 38: 1,\n",
" 39: 2,\n",
" 40: 1,\n",
" 41: 1,\n",
" 42: 1,\n",
" 43: 1,\n",
" 45: 1,\n",
" 46: 1,\n",
" 47: 1,\n",
" 48: 3,\n",
" 50: 3,\n",
" 51: 2,\n",
" 52: 2,\n",
" 54: 1,\n",
" 55: 1,\n",
" 56: 1,\n",
" 57: 1,\n",
" 58: 1,\n",
" 59: 1,\n",
" 60: 1,\n",
" 63: 1,\n",
" 80: 1,\n",
" 87: 1,\n",
" 88: 1})"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sequences"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2019-02-26T18:22:51.332695Z",
"start_time": "2019-02-26T18:22:51.319399Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"defaultdict(int, {10: 23029374, 11: 20927016})"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"datacenters"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"ExecuteTime": {
"end_time": "2019-02-26T18:22:51.546211Z",
"start_time": "2019-02-26T18:22:51.335245Z"
}
},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x7f7f0989f048>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt_counter(workers)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2019-02-26T18:22:51.787088Z",
"start_time": "2019-02-26T18:22:51.548732Z"
}
},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x7f7f0989f5f8>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt_counter(machines)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"ExecuteTime": {
"end_time": "2019-02-26T18:22:51.793579Z",
"start_time": "2019-02-26T18:22:51.789315Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"defaultdict(int,\n",
" {320: 2302480,\n",
" 321: 2302821,\n",
" 323: 2302907,\n",
" 326: 2302221,\n",
" 327: 2307031,\n",
" 329: 2302184,\n",
" 331: 2303465,\n",
" 332: 2303599,\n",
" 336: 2300150,\n",
" 337: 2302516,\n",
" 353: 2092357,\n",
" 354: 2092798,\n",
" 357: 2095881,\n",
" 359: 2092744,\n",
" 360: 2092296,\n",
" 362: 2092223,\n",
" 363: 2096692,\n",
" 364: 2090417,\n",
" 367: 2087621,\n",
" 368: 2093987})"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"machines"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment