Created
February 26, 2019 18:23
-
-
Save igorbrigadir/80638589973fbb172c9b8abcf1ce0d71 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2019-02-26T18:21:41.310151Z", | |
| "start_time": "2019-02-26T18:21:40.905070Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "%matplotlib inline\n", | |
| "import numpy as np\n", | |
| "import pandas as pd\n", | |
| "import matplotlib.pyplot as plt\n", | |
| "\n", | |
| "import zstandard as zstd\n", | |
| "from io import TextIOWrapper\n", | |
| "\n", | |
| "from collections import defaultdict" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2019-02-26T18:21:41.330110Z", | |
| "start_time": "2019-02-26T18:21:41.311603Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def melt_snowflake(snowflake_id):\n", | |
| "    \"\"\"Split a tweet id into its Snowflake fields.\n", | |
| "\n", | |
| "    Returns the tuple\n", | |
| "    (timestamp_ms, datacenter_id, worker_id, sequence_id, machine_id).\n", | |
| "\n", | |
| "    Bit layout, lowest bits first: 12-bit sequence, 5-bit worker, 5-bit\n", | |
| "    datacenter; everything above bit 22 is a millisecond count that is\n", | |
| "    added to the constant 1288834974657 (presumably Twitter's Snowflake\n", | |
| "    epoch in Unix milliseconds). machine_id is the 10-bit\n", | |
| "    datacenter + worker field read as a single number.\n", | |
| "    \"\"\"\n", | |
| "    sequence_id = snowflake_id & 0xFFF\n", | |
| "    machine_id = (snowflake_id >> 12) & 0x3FF\n", | |
| "    # the machine field is datacenter (high 5 bits) followed by worker (low 5 bits)\n", | |
| "    worker_id = machine_id & 0x1F\n", | |
| "    datacenter_id = machine_id >> 5\n", | |
| "    timestamp_ms = 1288834974657 + (snowflake_id >> 22)\n", | |
| "    return (timestamp_ms, datacenter_id, worker_id, sequence_id, machine_id)\n", | |
| "\n", | |
| "def plt_counter(c):\n", | |
| "    \"\"\"Draw a bar chart of a {label: count} mapping, one bar per key.\n", | |
| "\n", | |
| "    Bars appear in the mapping's iteration order and each tick label is\n", | |
| "    centred under its bar. An empty mapping no longer fails while the\n", | |
| "    labels and values are being split apart.\n", | |
| "    \"\"\"\n", | |
| "    # list() instead of zip(*c.items()), which cannot be unpacked when c is empty\n", | |
| "    labels = list(c.keys())\n", | |
| "    values = list(c.values())\n", | |
| "    indexes = np.arange(len(labels))\n", | |
| "    width = 0.95\n", | |
| "    plt.figure(figsize=(15,5))\n", | |
| "    # pin align='center' so ticks placed at `indexes` sit under the bar centres\n", | |
| "    # regardless of the installed matplotlib's default alignment\n", | |
| "    plt.bar(indexes, values, width, align='center')\n", | |
| "    plt.xticks(indexes, labels)\n", | |
| "    plt.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2019-02-26T18:21:41.346300Z", | |
| "start_time": "2019-02-26T18:21:41.333560Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Tallies keyed by component value; defaultdict(int) starts every key at 0\n", | |
| "# (and is faster than Counter() here).\n", | |
| "datacenters = defaultdict(int)\n", | |
| "workers = defaultdict(int)\n", | |
| "sequences = defaultdict(int)\n", | |
| "machines = defaultdict(int)\n", | |
| "total_tweets = defaultdict(int)\n", | |
| "\n", | |
| "def count_components(tweet_id):\n", | |
| "    \"\"\"Tally one tweet id into the module-level component counters.\"\"\"\n", | |
| "    _timestamp_ms, datacenter_id, worker_id, sequence_id, machine_id = melt_snowflake(tweet_id)\n", | |
| "    total_tweets['ids'] += 1\n", | |
| "    machines[machine_id] += 1\n", | |
| "    sequences[sequence_id] += 1\n", | |
| "    workers[worker_id] += 1\n", | |
| "    datacenters[datacenter_id] += 1" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2019-02-26T18:22:50.768668Z", | |
| "start_time": "2019-02-26T18:21:41.348518Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Processed 43956390 tweets\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "path = 'TREC2015-tweetids.txt.zst'\n", | |
| "\n", | |
| "# Stream-decompress the id list and tally it one line at a time.\n", | |
| "with open(path, 'rb') as fh:\n", | |
| "    decompressor = zstd.ZstdDecompressor()\n", | |
| "    with decompressor.stream_reader(fh, read_size=64192) as reader:\n", | |
| "        # line_buffering only affects writes, so it is not passed to this read-only wrapper\n", | |
| "        with TextIOWrapper(reader, encoding='UTF-8', newline='\\n') as line_reader:\n", | |
| "            for line in line_reader:\n", | |
| "                line = line.strip()\n", | |
| "                # skip blank lines (e.g. a trailing empty line) instead of crashing in int()\n", | |
| "                if line:\n", | |
| "                    count_components(int(line))\n", | |
| "\n", | |
| "print(\"Processed\", total_tweets['ids'], \"tweets\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2019-02-26T18:22:50.778351Z", | |
| "start_time": "2019-02-26T18:22:50.770998Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[320,\n", | |
| " 321,\n", | |
| " 323,\n", | |
| " 326,\n", | |
| " 327,\n", | |
| " 329,\n", | |
| " 331,\n", | |
| " 332,\n", | |
| " 336,\n", | |
| " 337,\n", | |
| " 353,\n", | |
| " 354,\n", | |
| " 357,\n", | |
| " 359,\n", | |
| " 360,\n", | |
| " 362,\n", | |
| " 363,\n", | |
| " 364,\n", | |
| " 367,\n", | |
| " 368]" | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# All Machine IDs in TREC Tweets\n", | |
| "sorted(machines.keys())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2019-02-26T18:22:51.306941Z", | |
| "start_time": "2019-02-26T18:22:50.781374Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "image/png": "\n", | |
| "text/plain": [ | |
| "<matplotlib.figure.Figure at 0x7f7f0c6f2780>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "plt_counter(sequences)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2019-02-26T18:22:51.316456Z", | |
| "start_time": "2019-02-26T18:22:51.309764Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "defaultdict(int,\n", | |
| " {0: 34006666,\n", | |
| " 1: 7208821,\n", | |
| " 2: 1161513,\n", | |
| " 3: 163420,\n", | |
| " 4: 999521,\n", | |
| " 5: 311015,\n", | |
| " 6: 60669,\n", | |
| " 7: 9671,\n", | |
| " 8: 23058,\n", | |
| " 9: 8681,\n", | |
| " 10: 1964,\n", | |
| " 11: 396,\n", | |
| " 12: 512,\n", | |
| " 13: 235,\n", | |
| " 14: 69,\n", | |
| " 15: 28,\n", | |
| " 16: 31,\n", | |
| " 17: 16,\n", | |
| " 18: 14,\n", | |
| " 19: 10,\n", | |
| " 20: 9,\n", | |
| " 21: 5,\n", | |
| " 22: 4,\n", | |
| " 23: 3,\n", | |
| " 24: 3,\n", | |
| " 25: 3,\n", | |
| " 26: 1,\n", | |
| " 27: 4,\n", | |
| " 28: 2,\n", | |
| " 29: 2,\n", | |
| " 31: 3,\n", | |
| " 32: 1,\n", | |
| " 33: 1,\n", | |
| " 34: 2,\n", | |
| " 35: 1,\n", | |
| " 36: 2,\n", | |
| " 37: 3,\n", | |
| " 38: 1,\n", | |
| " 39: 2,\n", | |
| " 40: 1,\n", | |
| " 41: 1,\n", | |
| " 42: 1,\n", | |
| " 43: 1,\n", | |
| " 45: 1,\n", | |
| " 46: 1,\n", | |
| " 47: 1,\n", | |
| " 48: 3,\n", | |
| " 50: 3,\n", | |
| " 51: 2,\n", | |
| " 52: 2,\n", | |
| " 54: 1,\n", | |
| " 55: 1,\n", | |
| " 56: 1,\n", | |
| " 57: 1,\n", | |
| " 58: 1,\n", | |
| " 59: 1,\n", | |
| " 60: 1,\n", | |
| " 63: 1,\n", | |
| " 80: 1,\n", | |
| " 87: 1,\n", | |
| " 88: 1})" | |
| ] | |
| }, | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "sequences" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2019-02-26T18:22:51.332695Z", | |
| "start_time": "2019-02-26T18:22:51.319399Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "defaultdict(int, {10: 23029374, 11: 20927016})" | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "datacenters" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2019-02-26T18:22:51.546211Z", | |
| "start_time": "2019-02-26T18:22:51.335245Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "image/png": "\n", | |
| "text/plain": [ | |
| "<matplotlib.figure.Figure at 0x7f7f0989f048>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "plt_counter(workers)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2019-02-26T18:22:51.787088Z", | |
| "start_time": "2019-02-26T18:22:51.548732Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "image/png": "\n", | |
| "text/plain": [ | |
| "<matplotlib.figure.Figure at 0x7f7f0989f5f8>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "plt_counter(machines)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2019-02-26T18:22:51.793579Z", | |
| "start_time": "2019-02-26T18:22:51.789315Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "defaultdict(int,\n", | |
| " {320: 2302480,\n", | |
| " 321: 2302821,\n", | |
| " 323: 2302907,\n", | |
| " 326: 2302221,\n", | |
| " 327: 2307031,\n", | |
| " 329: 2302184,\n", | |
| " 331: 2303465,\n", | |
| " 332: 2303599,\n", | |
| " 336: 2300150,\n", | |
| " 337: 2302516,\n", | |
| " 353: 2092357,\n", | |
| " 354: 2092798,\n", | |
| " 357: 2095881,\n", | |
| " 359: 2092744,\n", | |
| " 360: 2092296,\n", | |
| " 362: 2092223,\n", | |
| " 363: 2096692,\n", | |
| " 364: 2090417,\n", | |
| " 367: 2087621,\n", | |
| " 368: 2093987})" | |
| ] | |
| }, | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "machines" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.6.4" | |
| }, | |
| "varInspector": { | |
| "cols": { | |
| "lenName": 16, | |
| "lenType": 16, | |
| "lenVar": 40 | |
| }, | |
| "kernels_config": { | |
| "python": { | |
| "delete_cmd_postfix": "", | |
| "delete_cmd_prefix": "del ", | |
| "library": "var_list.py", | |
| "varRefreshCmd": "print(var_dic_list())" | |
| }, | |
| "r": { | |
| "delete_cmd_postfix": ") ", | |
| "delete_cmd_prefix": "rm(", | |
| "library": "var_list.r", | |
| "varRefreshCmd": "cat(var_dic_list()) " | |
| } | |
| }, | |
| "types_to_exclude": [ | |
| "module", | |
| "function", | |
| "builtin_function_or_method", | |
| "instance", | |
| "_Feature" | |
| ], | |
| "window_display": false | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment