Last active
January 6, 2024 00:46
-
-
Save jerryan999/79bb451d13c3b4e37009a642454cb464 to your computer and use it in GitHub Desktop.
Given a tag name, this script crawls the posts related to that tag in the Medium archive
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 118, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import pymongo\n", | |
| "import pandas as pd\n", | |
| "import numpy as np \n", | |
| "import seaborn as sns\n", | |
| "sns.set(style=\"darkgrid\")\n", | |
| "\n", | |
| "import matplotlib.pyplot as plt\n", | |
| "%matplotlib inline " | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "client = pymongo.MongoClient()\n", | |
| "db = client['medium']" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 315, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "tag_interest = \"Data Science\"" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 316, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "users = [doc for doc in db.User.find()]\n", | |
| "posts = [doc for doc in db.Post.find({\"detectedLanguage\":\"en\",\n", | |
| " 'virtuals.tags.name': tag_interest})]\n", | |
| "collections = [doc for doc in db.Collection.find()]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Preprocess" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 317, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df_posts = pd.DataFrame(posts)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 318, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# promote selected fields to top-level columns\n", | |
| "# taken from the virtuals field\n", | |
| "for f in [\"wordCount\",\"readingTime\",\"subtitle\",\"totalClapCount\",\"recommends\",\n", | |
| " \"responsesCreatedCount\",\"sectionCount\",\"socialRecommendsCount\"]:\n", | |
| " df_posts[f] = df_posts.virtuals.apply(lambda x:x[f])\n", | |
| "\n", | |
| "# from users \n", | |
| "lookup_users = {u['userId']:u for u in users}\n", | |
| "lookup_collections = {c['id']:c for c in collections}\n", | |
| "\n", | |
| "df_posts['username'] = df_posts.creatorId.apply(lambda x:lookup_users[x]['username'])\n", | |
| "df_posts['user_bio'] = df_posts.creatorId.apply(lambda x:lookup_users[x]['bio'])\n", | |
| "df_posts['isWriterProgramEnrolled'] = df_posts.creatorId.apply(lambda x:lookup_users[x]['isWriterProgramEnrolled'])\n", | |
| "df_posts['user_type'] = df_posts.creatorId.apply(lambda x:lookup_users[x]['type'])\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Exploratory Data Analysis" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Top Post" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "- What are the top posts based on *totalClapCount*?\n", | |
| "- Are ten claps good enough?\n", | |
| "- What are the average word count and average reading time?" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 319, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>title</th>\n", | |
| " <th>slug</th>\n", | |
| " <th>uniqueSlug</th>\n", | |
| " <th>subtitle</th>\n", | |
| " <th>wordCount</th>\n", | |
| " <th>readingTime</th>\n", | |
| " <th>totalClapCount</th>\n", | |
| " <th>username</th>\n", | |
| " <th>user_bio</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>6859</th>\n", | |
| " <td>Artificial Intelligence — The Revolution Hasn’...</td>\n", | |
| " <td>artificial-intelligence-the-revolution-hasnt-h...</td>\n", | |
| " <td>artificial-intelligence-the-revolution-hasnt-h...</td>\n", | |
| " <td>Artificial Intelligence (AI) is the mantra of ...</td>\n", | |
| " <td>3989</td>\n", | |
| " <td>15.252830</td>\n", | |
| " <td>50806</td>\n", | |
| " <td>mijordan3</td>\n", | |
| " <td>Michael I. Jordan is a Professor in the Depart...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5962</th>\n", | |
| " <td>Why so many data scientists are leaving their ...</td>\n", | |
| " <td>why-so-many-data-scientists-are-leaving-their-...</td>\n", | |
| " <td>why-so-many-data-scientists-are-leaving-their-...</td>\n", | |
| " <td>Frustrations of the data scientist!</td>\n", | |
| " <td>1689</td>\n", | |
| " <td>7.206918</td>\n", | |
| " <td>47057</td>\n", | |
| " <td>jonnybrooks04</td>\n", | |
| " <td>Data scientist at Deliveroo, public speaker, s...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>9426</th>\n", | |
| " <td>What exactly can you do with Python? Here are ...</td>\n", | |
| " <td>what-can-you-do-with-python-the-3-main-applica...</td>\n", | |
| " <td>what-can-you-do-with-python-the-3-main-applica...</td>\n", | |
| " <td>The most common applications of Python are: we...</td>\n", | |
| " <td>2310</td>\n", | |
| " <td>9.666981</td>\n", | |
| " <td>42562</td>\n", | |
| " <td>ykdojo</td>\n", | |
| " <td>YouTuber at CS Dojo / Podcaster at Towards Dat...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7926</th>\n", | |
| " <td>How to build your own Neural Network from scra...</td>\n", | |
| " <td>how-to-build-your-own-neural-network-from-scra...</td>\n", | |
| " <td>how-to-build-your-own-neural-network-from-scra...</td>\n", | |
| " <td>A beginner’s guide to understanding the inner ...</td>\n", | |
| " <td>1247</td>\n", | |
| " <td>6.055660</td>\n", | |
| " <td>41018</td>\n", | |
| " <td>jamesloyys</td>\n", | |
| " <td>M.Sc. in CS (Machine Learning) @ Georgia Tech</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2738</th>\n", | |
| " <td>30 Amazing Machine Learning Projects for the P...</td>\n", | |
| " <td>30-amazing-machine-learning-projects-for-the-p...</td>\n", | |
| " <td>30-amazing-machine-learning-projects-for-the-p...</td>\n", | |
| " <td>For the past year, we’ve compared nearly 8,800...</td>\n", | |
| " <td>895</td>\n", | |
| " <td>5.927358</td>\n", | |
| " <td>24039</td>\n", | |
| " <td>Mybridge</td>\n", | |
| " <td>We rank articles for professionals</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " title \\\n", | |
| "6859 Artificial Intelligence — The Revolution Hasn’... \n", | |
| "5962 Why so many data scientists are leaving their ... \n", | |
| "9426 What exactly can you do with Python? Here are ... \n", | |
| "7926 How to build your own Neural Network from scra... \n", | |
| "2738 30 Amazing Machine Learning Projects for the P... \n", | |
| "\n", | |
| " slug \\\n", | |
| "6859 artificial-intelligence-the-revolution-hasnt-h... \n", | |
| "5962 why-so-many-data-scientists-are-leaving-their-... \n", | |
| "9426 what-can-you-do-with-python-the-3-main-applica... \n", | |
| "7926 how-to-build-your-own-neural-network-from-scra... \n", | |
| "2738 30-amazing-machine-learning-projects-for-the-p... \n", | |
| "\n", | |
| " uniqueSlug \\\n", | |
| "6859 artificial-intelligence-the-revolution-hasnt-h... \n", | |
| "5962 why-so-many-data-scientists-are-leaving-their-... \n", | |
| "9426 what-can-you-do-with-python-the-3-main-applica... \n", | |
| "7926 how-to-build-your-own-neural-network-from-scra... \n", | |
| "2738 30-amazing-machine-learning-projects-for-the-p... \n", | |
| "\n", | |
| " subtitle wordCount \\\n", | |
| "6859 Artificial Intelligence (AI) is the mantra of ... 3989 \n", | |
| "5962 Frustrations of the data scientist! 1689 \n", | |
| "9426 The most common applications of Python are: we... 2310 \n", | |
| "7926 A beginner’s guide to understanding the inner ... 1247 \n", | |
| "2738 For the past year, we’ve compared nearly 8,800... 895 \n", | |
| "\n", | |
| " readingTime totalClapCount username \\\n", | |
| "6859 15.252830 50806 mijordan3 \n", | |
| "5962 7.206918 47057 jonnybrooks04 \n", | |
| "9426 9.666981 42562 ykdojo \n", | |
| "7926 6.055660 41018 jamesloyys \n", | |
| "2738 5.927358 24039 Mybridge \n", | |
| "\n", | |
| " user_bio \n", | |
| "6859 Michael I. Jordan is a Professor in the Depart... \n", | |
| "5962 Data scientist at Deliveroo, public speaker, s... \n", | |
| "9426 YouTuber at CS Dojo / Podcaster at Towards Dat... \n", | |
| "7926 M.Sc. in CS (Machine Learning) @ Georgia Tech \n", | |
| "2738 We rank articles for professionals " | |
| ] | |
| }, | |
| "execution_count": 319, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "top_posts = df_posts[['title','slug','uniqueSlug','subtitle','wordCount',\n", | |
| " 'readingTime','totalClapCount','username','user_bio']].sort_values(['totalClapCount'],\n", | |
| " ascending=False)\n", | |
| "top_posts.head(n=5)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 320, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def assign_clapbin(claps):\n", | |
| " if claps < 10:\n", | |
| " return \"[0-9]\"\n", | |
| " elif claps < 100:\n", | |
| " return \"[10-99]\"\n", | |
| " else:\n", | |
| " return \"[100-210k]\"\n", | |
| "\n", | |
| "top_posts['ClapBin'] = top_posts.totalClapCount.apply(assign_clapbin)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 335, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# only keep posts with at least 1500 claps\n", | |
| "top_posts[top_posts['totalClapCount']>=1500][['title','subtitle','slug','uniqueSlug','wordCount','readingTime','totalClapCount','username','user_bio']].to_csv(\"posts.csv\",encoding='utf-8')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 321, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "count 57346.000000\n", | |
| "mean 167.025233\n", | |
| "std 772.268616\n", | |
| "min 0.000000\n", | |
| "25% 0.000000\n", | |
| "50% 15.000000\n", | |
| "75% 104.000000\n", | |
| "90% 341.500000\n", | |
| "95% 651.000000\n", | |
| "97% 999.300000\n", | |
| "97.5% 1149.375000\n", | |
| "99% 2493.100000\n", | |
| "max 50806.000000\n", | |
| "Name: totalClapCount, dtype: float64" | |
| ] | |
| }, | |
| "execution_count": 321, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "top_posts.totalClapCount.describe([0.25,0.5,0.75,0.9,0.95,0.97, 0.975, 0.99])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 322, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "<matplotlib.axes._subplots.AxesSubplot at 0x119afcda0>" | |
| ] | |
| }, | |
| "execution_count": 322, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| }, | |
| { | |
| "data": { | |
| "image/png": "\n", | |
| "text/plain": [ | |
| "<matplotlib.figure.Figure at 0x118188c18>" | |
| ] | |
| }, | |
| "metadata": { | |
| "needs_background": "light" | |
| }, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "sns.countplot(top_posts['ClapBin'])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 323, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[0-9] 0.458846\n", | |
| "[10-99] 0.280490\n", | |
| "[100-210k] 0.260663\n", | |
| "Name: ClapBin, dtype: float64" | |
| ] | |
| }, | |
| "execution_count": 323, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "top_posts.ClapBin.value_counts(normalize=True)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "If your post receives 10 claps, congratulations! It is ahead of about 45.9 percent of posts (those with 0-9 claps).\n", | |
| "\n", | |
| "Ten claps is close to the median (15 claps); 100 claps deserves a bottle of champagne (ahead of about 74% of posts).\n", | |
| "\n", | |
| "If your post receives more than 1000 claps, it is doing better than 97 percent of posts.\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 324, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "<seaborn.axisgrid.FacetGrid at 0x1183d8a20>" | |
| ] | |
| }, | |
| "execution_count": 324, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| }, | |
| { | |
| "data": { | |
| "image/png": "\n", | |
| "text/plain": [ | |
| "<matplotlib.figure.Figure at 0x118411828>" | |
| ] | |
| }, | |
| "metadata": { | |
| "needs_background": "light" | |
| }, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "sns.catplot(x='ClapBin',y=\"wordCount\",data=top_posts, kind='violin')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 325, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "<seaborn.axisgrid.FacetGrid at 0x11404d438>" | |
| ] | |
| }, | |
| "execution_count": 325, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| }, | |
| { | |
| "data": { | |
| "image/png": "\n", | |
| "text/plain": [ | |
| "<matplotlib.figure.Figure at 0x114049908>" | |
| ] | |
| }, | |
| "metadata": { | |
| "needs_background": "light" | |
| }, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "sns.catplot(x='ClapBin',y=\"readingTime\",data=top_posts, kind='violin')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "The average word count and average reading time are about 1500 words and 5 minutes, respectively.\n", | |
| "\n", | |
| "There is not much of a relationship between word count and totalClapCount." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Tag" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 326, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "tags = []\n", | |
| "for p in posts:\n", | |
| " if \"virtuals\" in p and 'tags' in p['virtuals']:\n", | |
| " for t in p['virtuals']['tags']:\n", | |
| " tags.append(t)\n", | |
| " \n", | |
| "df_tags = pd.DataFrame(tags)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 327, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# dedup\n", | |
| "df_tags.drop_duplicates(subset=['slug','name'],inplace=True)\n", | |
| "\n", | |
| "# drop useless column\n", | |
| "df_tags.drop(['metadata','type'],axis=1,inplace=True)\n", | |
| "\n", | |
| "# sort\n", | |
| "df_tags.sort_values(['postCount'],ascending=False,inplace=True)\n", | |
| "\n", | |
| "df_tags.reset_index(inplace=True,drop=True)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 328, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>slug</th>\n", | |
| " <th>name</th>\n", | |
| " <th>postCount</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>startup</td>\n", | |
| " <td>Startup</td>\n", | |
| " <td>391142</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>blockchain</td>\n", | |
| " <td>Blockchain</td>\n", | |
| " <td>364815</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>life</td>\n", | |
| " <td>Life</td>\n", | |
| " <td>348365</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>life-lessons</td>\n", | |
| " <td>Life Lessons</td>\n", | |
| " <td>314796</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>politics</td>\n", | |
| " <td>Politics</td>\n", | |
| " <td>302991</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>poetry</td>\n", | |
| " <td>Poetry</td>\n", | |
| " <td>299349</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td>seo</td>\n", | |
| " <td>SEO</td>\n", | |
| " <td>285150</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7</th>\n", | |
| " <td>travel</td>\n", | |
| " <td>Travel</td>\n", | |
| " <td>275250</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>8</th>\n", | |
| " <td>entrepreneurship</td>\n", | |
| " <td>Entrepreneurship</td>\n", | |
| " <td>268533</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>9</th>\n", | |
| " <td>education</td>\n", | |
| " <td>Education</td>\n", | |
| " <td>257122</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " slug name postCount\n", | |
| "0 startup Startup 391142\n", | |
| "1 blockchain Blockchain 364815\n", | |
| "2 life Life 348365\n", | |
| "3 life-lessons Life Lessons 314796\n", | |
| "4 politics Politics 302991\n", | |
| "5 poetry Poetry 299349\n", | |
| "6 seo SEO 285150\n", | |
| "7 travel Travel 275250\n", | |
| "8 entrepreneurship Entrepreneurship 268533\n", | |
| "9 education Education 257122" | |
| ] | |
| }, | |
| "execution_count": 328, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df_tags.head(n=10)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 337, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df_tags[df_tags.postCount>1000].to_csv(\"tags.csv\",encoding='utf-8')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Topics" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 338, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "topics = []\n", | |
| "for p in posts:\n", | |
| " if \"virtuals\" in p and 'topics' in p['virtuals']:\n", | |
| " for t in p['virtuals']['topics']:\n", | |
| " topics.append(t)\n", | |
| " \n", | |
| "df_topics = pd.DataFrame(topics)\n", | |
| "\n", | |
| "df_topics.drop_duplicates(subset=['slug','topicId'],inplace=True)\n", | |
| "\n", | |
| "df_topics.sort_values(['createdAt'],inplace=True)\n", | |
| "\n", | |
| "df_topics.reset_index(drop=True,inplace=True)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 339, | |
| "metadata": { | |
| "scrolled": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>topicId</th>\n", | |
| " <th>slug</th>\n", | |
| " <th>createdAt</th>\n", | |
| " <th>deletedAt</th>\n", | |
| " <th>image</th>\n", | |
| " <th>name</th>\n", | |
| " <th>description</th>\n", | |
| " <th>relatedTopics</th>\n", | |
| " <th>visibility</th>\n", | |
| " <th>relatedTags</th>\n", | |
| " <th>relatedTopicIds</th>\n", | |
| " <th>type</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>90</th>\n", | |
| " <td>ca00a0701472</td>\n", | |
| " <td>outdoors</td>\n", | |
| " <td>1563396390225</td>\n", | |
| " <td>0</td>\n", | |
| " <td>{'id': '1*[email protected]', 'or...</td>\n", | |
| " <td>Outdoors</td>\n", | |
| " <td>Into the great wide open.</td>\n", | |
| " <td>[]</td>\n", | |
| " <td>1</td>\n", | |
| " <td>[]</td>\n", | |
| " <td>[]</td>\n", | |
| " <td>Topic</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>91</th>\n", | |
| " <td>34e2a09fdb28</td>\n", | |
| " <td>fitness</td>\n", | |
| " <td>1563398313265</td>\n", | |
| " <td>0</td>\n", | |
| " <td>{'id': '1*[email protected]', 'or...</td>\n", | |
| " <td>Fitness</td>\n", | |
| " <td>No pain no gains.</td>\n", | |
| " <td>[]</td>\n", | |
| " <td>1</td>\n", | |
| " <td>[]</td>\n", | |
| " <td>[]</td>\n", | |
| " <td>Topic</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>92</th>\n", | |
| " <td>3be109cfd3be</td>\n", | |
| " <td>biotech</td>\n", | |
| " <td>1563820846786</td>\n", | |
| " <td>0</td>\n", | |
| " <td>{'id': '1*[email protected]', 'or...</td>\n", | |
| " <td>Biotech</td>\n", | |
| " <td>Genetically predisposed.</td>\n", | |
| " <td>[]</td>\n", | |
| " <td>1</td>\n", | |
| " <td>[]</td>\n", | |
| " <td>[]</td>\n", | |
| " <td>Topic</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>93</th>\n", | |
| " <td>e6d44cf5196e</td>\n", | |
| " <td>makers</td>\n", | |
| " <td>1563822557220</td>\n", | |
| " <td>0</td>\n", | |
| " <td>{'id': '1*[email protected]', 'or...</td>\n", | |
| " <td>Makers</td>\n", | |
| " <td>For those who do.</td>\n", | |
| " <td>[]</td>\n", | |
| " <td>1</td>\n", | |
| " <td>[]</td>\n", | |
| " <td>[]</td>\n", | |
| " <td>Topic</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>94</th>\n", | |
| " <td>6158eb913466</td>\n", | |
| " <td>coronavirus</td>\n", | |
| " <td>1583259171963</td>\n", | |
| " <td>0</td>\n", | |
| " <td>{'id': '1*[email protected]', 'or...</td>\n", | |
| " <td>Coronavirus</td>\n", | |
| " <td>The latest news about the 2020 coronavirus and...</td>\n", | |
| " <td>[]</td>\n", | |
| " <td>1</td>\n", | |
| " <td>[]</td>\n", | |
| " <td>[]</td>\n", | |
| " <td>Topic</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " topicId slug createdAt deletedAt \\\n", | |
| "90 ca00a0701472 outdoors 1563396390225 0 \n", | |
| "91 34e2a09fdb28 fitness 1563398313265 0 \n", | |
| "92 3be109cfd3be biotech 1563820846786 0 \n", | |
| "93 e6d44cf5196e makers 1563822557220 0 \n", | |
| "94 6158eb913466 coronavirus 1583259171963 0 \n", | |
| "\n", | |
| " image name \\\n", | |
| "90 {'id': '1*[email protected]', 'or... Outdoors \n", | |
| "91 {'id': '1*[email protected]', 'or... Fitness \n", | |
| "92 {'id': '1*[email protected]', 'or... Biotech \n", | |
| "93 {'id': '1*[email protected]', 'or... Makers \n", | |
| "94 {'id': '1*[email protected]', 'or... Coronavirus \n", | |
| "\n", | |
| " description relatedTopics \\\n", | |
| "90 Into the great wide open. [] \n", | |
| "91 No pain no gains. [] \n", | |
| "92 Genetically predisposed. [] \n", | |
| "93 For those who do. [] \n", | |
| "94 The latest news about the 2020 coronavirus and... [] \n", | |
| "\n", | |
| " visibility relatedTags relatedTopicIds type \n", | |
| "90 1 [] [] Topic \n", | |
| "91 1 [] [] Topic \n", | |
| "92 1 [] [] Topic \n", | |
| "93 1 [] [] Topic \n", | |
| "94 1 [] [] Topic " | |
| ] | |
| }, | |
| "execution_count": 339, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df_topics.tail()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 340, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df_topics[['name','description']].to_csv(\"topics.csv\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Writer" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "- Top 30 writers by number of posts for the tag of interest" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 341, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "top_writers = df_posts.groupby('creatorId').size().nlargest(30)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 342, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df_writer = pd.DataFrame(top_writers,columns=['count'])\n", | |
| "df_writer.reset_index(inplace=True)\n", | |
| "\n", | |
| "df_writer['username'] = df_writer.creatorId.apply(lambda x:lookup_users[x]['username'])\n", | |
| "df_writer['bio'] = df_writer.creatorId.apply(lambda x:lookup_users[x]['bio'])\n", | |
| "df_writer['twitterScreenName'] = df_writer.creatorId.apply(lambda x:lookup_users[x]['twitterScreenName'])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 343, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>creatorId</th>\n", | |
| " <th>count</th>\n", | |
| " <th>username</th>\n", | |
| " <th>bio</th>\n", | |
| " <th>twitterScreenName</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>2b9d62538208</td>\n", | |
| " <td>647</td>\n", | |
| " <td>ODSC</td>\n", | |
| " <td>Our passion is bringing thousands of the best ...</td>\n", | |
| " <td></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>46674a2c9422</td>\n", | |
| " <td>246</td>\n", | |
| " <td>jrodthoughts</td>\n", | |
| " <td>Chief Scientist, Managing Partner at Invector ...</td>\n", | |
| " <td>jrdothoughts</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>d8b99ba6ec83</td>\n", | |
| " <td>143</td>\n", | |
| " <td>rinu.gour123</td>\n", | |
| " <td>Data Science Enthusiast | Research writer | Bl...</td>\n", | |
| " <td></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>880781a85c2</td>\n", | |
| " <td>125</td>\n", | |
| " <td>NYUDataScience</td>\n", | |
| " <td>Official account of the Center for Data Scienc...</td>\n", | |
| " <td>NYUDataScience</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>aff72a0c1243</td>\n", | |
| " <td>116</td>\n", | |
| " <td>sh.tsang</td>\n", | |
| " <td>PhD, Researcher. I share what I've learnt and ...</td>\n", | |
| " <td></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>6264ceea3dd4</td>\n", | |
| " <td>115</td>\n", | |
| " <td>Corsairs</td>\n", | |
| " <td>Articles that engage, educate, and entertain t...</td>\n", | |
| " <td>CorsairsIn</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td>e2f299e30cb9</td>\n", | |
| " <td>103</td>\n", | |
| " <td>williamkoehrsen</td>\n", | |
| " <td>Data Scientist at Cortex Intel, Data Science C...</td>\n", | |
| " <td>koehrsen_will</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7</th>\n", | |
| " <td>1335786e6357</td>\n", | |
| " <td>94</td>\n", | |
| " <td>YvesMulkers</td>\n", | |
| " <td>BI And Data Architect enjoying Family, Social ...</td>\n", | |
| " <td>YvesMulkers</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>8</th>\n", | |
| " <td>41338000425f</td>\n", | |
| " <td>93</td>\n", | |
| " <td>ibelmopan</td>\n", | |
| " <td>ML and NLP Research Scientist | Ph.D. | Twitte...</td>\n", | |
| " <td>omarsar0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>9</th>\n", | |
| " <td>e8cce06956c9</td>\n", | |
| " <td>91</td>\n", | |
| " <td>rahul_agarwal</td>\n", | |
| " <td>Bridging the gap between Data Science and Intu...</td>\n", | |
| " <td>MLWhiz</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>10</th>\n", | |
| " <td>4acc091611c9</td>\n", | |
| " <td>90</td>\n", | |
| " <td>Cambridge_Spark</td>\n", | |
| " <td>Data Science Specialists</td>\n", | |
| " <td>CambridgeSpark</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>11</th>\n", | |
| " <td>bd51f1a63813</td>\n", | |
| " <td>90</td>\n", | |
| " <td>jonathan_hui</td>\n", | |
| " <td>Deep Learning</td>\n", | |
| " <td></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>12</th>\n", | |
| " <td>e960cfb4e73c</td>\n", | |
| " <td>84</td>\n", | |
| " <td>365datascience</td>\n", | |
| " <td>https://365datascience.com is an educational c...</td>\n", | |
| " <td></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>13</th>\n", | |
| " <td>41cd8f154e82</td>\n", | |
| " <td>83</td>\n", | |
| " <td>SeattleDataGuy</td>\n", | |
| " <td>#Data #Engineer, Strategy Development Consulta...</td>\n", | |
| " <td>SeattleDataGuy</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>14</th>\n", | |
| " <td>825fa70f9e24</td>\n", | |
| " <td>83</td>\n", | |
| " <td>vimarshk</td>\n", | |
| " <td>Engineering Manager | Editor/Founder of Acing AI</td>\n", | |
| " <td>VimarshApi</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>15</th>\n", | |
| " <td>60b579a69a7a</td>\n", | |
| " <td>82</td>\n", | |
| " <td>analyticbridge</td>\n", | |
| " <td>Data science pioneer, founder, entrepreneur, i...</td>\n", | |
| " <td></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>16</th>\n", | |
| " <td>88be7c24b7fd</td>\n", | |
| " <td>81</td>\n", | |
| " <td>nilimeshhalder</td>\n", | |
| " <td></td>\n", | |
| " <td>SETScholarsInfo</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>17</th>\n", | |
| " <td>d9b237bc89f0</td>\n", | |
| " <td>81</td>\n", | |
| " <td>farhadmalik</td>\n", | |
| " <td>My personal blog, aiming to explain complex ma...</td>\n", | |
| " <td></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>18</th>\n", | |
| " <td>47a07cc4eb4e</td>\n", | |
| " <td>80</td>\n", | |
| " <td>magnimind</td>\n", | |
| " <td>Let’s change the world by acquiring AI and Mac...</td>\n", | |
| " <td>MagnimindA</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>19</th>\n", | |
| " <td>1bfa80768afa</td>\n", | |
| " <td>79</td>\n", | |
| " <td>kanaugust</td>\n", | |
| " <td>CEO / Founder at Exploratory(https://explorato...</td>\n", | |
| " <td>KanAugust</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>20</th>\n", | |
| " <td>88e3e673aa8b</td>\n", | |
| " <td>79</td>\n", | |
| " <td>pchojecki</td>\n", | |
| " <td>AI entrepreneur with a PhD in mathematics, For...</td>\n", | |
| " <td>prz_chojecki</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>21</th>\n", | |
| " <td>ba547bff904f</td>\n", | |
| " <td>78</td>\n", | |
| " <td>makcedward</td>\n", | |
| " <td>Focus in Natural Language Processing, Data Sci...</td>\n", | |
| " <td></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>22</th>\n", | |
| " <td>afde3432bf28</td>\n", | |
| " <td>76</td>\n", | |
| " <td>posey</td>\n", | |
| " <td>Founder @ Spawner.ai. Formerly AI @ P&G. 👨🏻💻 ...</td>\n", | |
| " <td>PoseysThumbs</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>23</th>\n", | |
| " <td>e8ec6fa4d7d4</td>\n", | |
| " <td>73</td>\n", | |
| " <td>faviovazquez</td>\n", | |
| " <td>Data scientist, physicist and computer enginee...</td>\n", | |
| " <td>FavioVaz</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>24</th>\n", | |
| " <td>3a025d440e6b</td>\n", | |
| " <td>68</td>\n", | |
| " <td>benjaminobi</td>\n", | |
| " <td>Physicist, Data Science Educator, Writer. Inte...</td>\n", | |
| " <td></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>25</th>\n", | |
| " <td>d2ef7f5ede53</td>\n", | |
| " <td>66</td>\n", | |
| " <td>jhsu98</td>\n", | |
| " <td>I love to share my experiences learning to cod...</td>\n", | |
| " <td>jhsu98</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>26</th>\n", | |
| " <td>370952daf49</td>\n", | |
| " <td>65</td>\n", | |
| " <td>OpexAnalytics</td>\n", | |
| " <td>Author of The Opex Analytics Blog.</td>\n", | |
| " <td></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>27</th>\n", | |
| " <td>ddc8f44ec90</td>\n", | |
| " <td>65</td>\n", | |
| " <td>anebellyliza45</td>\n", | |
| " <td></td>\n", | |
| " <td></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>28</th>\n", | |
| " <td>2fccb851bb5e</td>\n", | |
| " <td>64</td>\n", | |
| " <td>kozyrkov</td>\n", | |
| " <td>Head of Decision Intelligence, Google. ❤️ Stat...</td>\n", | |
| " <td>quaesita</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>29</th>\n", | |
| " <td>e2af5c8737ec</td>\n", | |
| " <td>63</td>\n", | |
| " <td>george.seif94</td>\n", | |
| " <td>Certified Nerd</td>\n", | |
| " <td>GeorgeSeif94</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " creatorId count username \\\n", | |
| "0 2b9d62538208 647 ODSC \n", | |
| "1 46674a2c9422 246 jrodthoughts \n", | |
| "2 d8b99ba6ec83 143 rinu.gour123 \n", | |
| "3 880781a85c2 125 NYUDataScience \n", | |
| "4 aff72a0c1243 116 sh.tsang \n", | |
| "5 6264ceea3dd4 115 Corsairs \n", | |
| "6 e2f299e30cb9 103 williamkoehrsen \n", | |
| "7 1335786e6357 94 YvesMulkers \n", | |
| "8 41338000425f 93 ibelmopan \n", | |
| "9 e8cce06956c9 91 rahul_agarwal \n", | |
| "10 4acc091611c9 90 Cambridge_Spark \n", | |
| "11 bd51f1a63813 90 jonathan_hui \n", | |
| "12 e960cfb4e73c 84 365datascience \n", | |
| "13 41cd8f154e82 83 SeattleDataGuy \n", | |
| "14 825fa70f9e24 83 vimarshk \n", | |
| "15 60b579a69a7a 82 analyticbridge \n", | |
| "16 88be7c24b7fd 81 nilimeshhalder \n", | |
| "17 d9b237bc89f0 81 farhadmalik \n", | |
| "18 47a07cc4eb4e 80 magnimind \n", | |
| "19 1bfa80768afa 79 kanaugust \n", | |
| "20 88e3e673aa8b 79 pchojecki \n", | |
| "21 ba547bff904f 78 makcedward \n", | |
| "22 afde3432bf28 76 posey \n", | |
| "23 e8ec6fa4d7d4 73 faviovazquez \n", | |
| "24 3a025d440e6b 68 benjaminobi \n", | |
| "25 d2ef7f5ede53 66 jhsu98 \n", | |
| "26 370952daf49 65 OpexAnalytics \n", | |
| "27 ddc8f44ec90 65 anebellyliza45 \n", | |
| "28 2fccb851bb5e 64 kozyrkov \n", | |
| "29 e2af5c8737ec 63 george.seif94 \n", | |
| "\n", | |
| " bio twitterScreenName \n", | |
| "0 Our passion is bringing thousands of the best ... \n", | |
| "1 Chief Scientist, Managing Partner at Invector ... jrdothoughts \n", | |
| "2 Data Science Enthusiast | Research writer | Bl... \n", | |
| "3 Official account of the Center for Data Scienc... NYUDataScience \n", | |
| "4 PhD, Researcher. I share what I've learnt and ... \n", | |
| "5 Articles that engage, educate, and entertain t... CorsairsIn \n", | |
| "6 Data Scientist at Cortex Intel, Data Science C... koehrsen_will \n", | |
| "7 BI And Data Architect enjoying Family, Social ... YvesMulkers \n", | |
| "8 ML and NLP Research Scientist | Ph.D. | Twitte... omarsar0 \n", | |
| "9 Bridging the gap between Data Science and Intu... MLWhiz \n", | |
| "10 Data Science Specialists CambridgeSpark \n", | |
| "11 Deep Learning \n", | |
| "12 https://365datascience.com is an educational c... \n", | |
| "13 #Data #Engineer, Strategy Development Consulta... SeattleDataGuy \n", | |
| "14 Engineering Manager | Editor/Founder of Acing AI VimarshApi \n", | |
| "15 Data science pioneer, founder, entrepreneur, i... \n", | |
| "16 SETScholarsInfo \n", | |
| "17 My personal blog, aiming to explain complex ma... \n", | |
| "18 Let’s change the world by acquiring AI and Mac... MagnimindA \n", | |
| "19 CEO / Founder at Exploratory(https://explorato... KanAugust \n", | |
| "20 AI entrepreneur with a PhD in mathematics, For... prz_chojecki \n", | |
| "21 Focus in Natural Language Processing, Data Sci... \n", | |
| "22 Founder @ Spawner.ai. Formerly AI @ P&G. 👨🏻💻 ... PoseysThumbs \n", | |
| "23 Data scientist, physicist and computer enginee... FavioVaz \n", | |
| "24 Physicist, Data Science Educator, Writer. Inte... \n", | |
| "25 I love to share my experiences learning to cod... jhsu98 \n", | |
| "26 Author of The Opex Analytics Blog. \n", | |
| "27 \n", | |
| "28 Head of Decision Intelligence, Google. ❤️ Stat... quaesita \n", | |
| "29 Certified Nerd GeorgeSeif94 " | |
| ] | |
| }, | |
| "execution_count": 343, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df_writer" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 344, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df_writer.to_csv(\"writer.csv\",encoding='utf-8')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Collections" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 345, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "top_collections = df_posts.groupby('homeCollectionId').size().nlargest(30)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 346, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df_collections = pd.DataFrame(top_collections,columns=['count'])\n", | |
| "df_collections.reset_index(inplace=True)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Some posts have no collection; we ignore them." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 347, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df_collections = df_collections[df_collections.homeCollectionId!='']" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 348, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>homeCollectionId</th>\n", | |
| " <th>count</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>7f60cf5620c9</td>\n", | |
| " <td>11406</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>7219b4dc6c4c</td>\n", | |
| " <td>1632</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>32881626c9c9</td>\n", | |
| " <td>1028</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>f5af2b715248</td>\n", | |
| " <td>617</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>3a8144eabfe3</td>\n", | |
| " <td>342</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td>d0b105d10f0a</td>\n", | |
| " <td>269</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7</th>\n", | |
| " <td>5e5bef33608a</td>\n", | |
| " <td>196</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>8</th>\n", | |
| " <td>7b837cf1fd73</td>\n", | |
| " <td>193</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>9</th>\n", | |
| " <td>98111c9905da</td>\n", | |
| " <td>187</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>10</th>\n", | |
| " <td>336d898217ee</td>\n", | |
| " <td>165</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>11</th>\n", | |
| " <td>d5e885e906a7</td>\n", | |
| " <td>100</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>12</th>\n", | |
| " <td>31f4f88d6548</td>\n", | |
| " <td>98</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>13</th>\n", | |
| " <td>356ca48206e6</td>\n", | |
| " <td>96</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>14</th>\n", | |
| " <td>5517fd7b58a6</td>\n", | |
| " <td>94</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>15</th>\n", | |
| " <td>6ea408ec434d</td>\n", | |
| " <td>92</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>16</th>\n", | |
| " <td>cb942d4b5d89</td>\n", | |
| " <td>90</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>17</th>\n", | |
| " <td>f0db56adb08d</td>\n", | |
| " <td>86</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>18</th>\n", | |
| " <td>a2487db7984a</td>\n", | |
| " <td>84</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>19</th>\n", | |
| " <td>f3225cc85e15</td>\n", | |
| " <td>76</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>20</th>\n", | |
| " <td>4689c8214177</td>\n", | |
| " <td>75</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>21</th>\n", | |
| " <td>fc78dab2b103</td>\n", | |
| " <td>71</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>22</th>\n", | |
| " <td>721b17443fd5</td>\n", | |
| " <td>69</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>23</th>\n", | |
| " <td>4c5221789b3</td>\n", | |
| " <td>67</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>24</th>\n", | |
| " <td>2a678b52fc4f</td>\n", | |
| " <td>64</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>25</th>\n", | |
| " <td>680eee12c50d</td>\n", | |
| " <td>62</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>26</th>\n", | |
| " <td>2d7ba3077a44</td>\n", | |
| " <td>60</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>27</th>\n", | |
| " <td>d02e65779d7b</td>\n", | |
| " <td>53</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>28</th>\n", | |
| " <td>e8dd4fd2bda0</td>\n", | |
| " <td>52</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>29</th>\n", | |
| " <td>c5f05be4e189</td>\n", | |
| " <td>48</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " homeCollectionId count\n", | |
| "1 7f60cf5620c9 11406\n", | |
| "2 7219b4dc6c4c 1632\n", | |
| "3 32881626c9c9 1028\n", | |
| "4 f5af2b715248 617\n", | |
| "5 3a8144eabfe3 342\n", | |
| "6 d0b105d10f0a 269\n", | |
| "7 5e5bef33608a 196\n", | |
| "8 7b837cf1fd73 193\n", | |
| "9 98111c9905da 187\n", | |
| "10 336d898217ee 165\n", | |
| "11 d5e885e906a7 100\n", | |
| "12 31f4f88d6548 98\n", | |
| "13 356ca48206e6 96\n", | |
| "14 5517fd7b58a6 94\n", | |
| "15 6ea408ec434d 92\n", | |
| "16 cb942d4b5d89 90\n", | |
| "17 f0db56adb08d 86\n", | |
| "18 a2487db7984a 84\n", | |
| "19 f3225cc85e15 76\n", | |
| "20 4689c8214177 75\n", | |
| "21 fc78dab2b103 71\n", | |
| "22 721b17443fd5 69\n", | |
| "23 4c5221789b3 67\n", | |
| "24 2a678b52fc4f 64\n", | |
| "25 680eee12c50d 62\n", | |
| "26 2d7ba3077a44 60\n", | |
| "27 d02e65779d7b 53\n", | |
| "28 e8dd4fd2bda0 52\n", | |
| "29 c5f05be4e189 48" | |
| ] | |
| }, | |
| "execution_count": 348, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df_collections" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 349, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df_collections['name'] = df_collections.homeCollectionId.apply(lambda x:lookup_collections[x]['name'])\n", | |
| "df_collections['description'] = df_collections.homeCollectionId.apply(lambda x:lookup_collections[x]['description'])\n", | |
| "df_collections['domain'] = df_collections.homeCollectionId.apply(lambda x:lookup_collections[x].get('domain'))\n", | |
| "df_collections['subscriberCount'] = df_collections.homeCollectionId.apply(lambda x:lookup_collections[x].get('subscriberCount'))\n", | |
| "df_collections['tagline'] = df_collections.homeCollectionId.apply(lambda x:lookup_collections[x].get('tagline'))\n", | |
| "df_collections['tags'] = df_collections.homeCollectionId.apply(lambda x:lookup_collections[x].get('tags'))\n", | |
| "df_collections['slug'] = df_collections.homeCollectionId.apply(lambda x:lookup_collections[x].get('slug'))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 350, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>homeCollectionId</th>\n", | |
| " <th>count</th>\n", | |
| " <th>name</th>\n", | |
| " <th>description</th>\n", | |
| " <th>domain</th>\n", | |
| " <th>subscriberCount</th>\n", | |
| " <th>tagline</th>\n", | |
| " <th>tags</th>\n", | |
| " <th>slug</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>7f60cf5620c9</td>\n", | |
| " <td>11406</td>\n", | |
| " <td>Towards Data Science</td>\n", | |
| " <td>A Medium publication sharing concepts, ideas, ...</td>\n", | |
| " <td>towardsdatascience.com</td>\n", | |
| " <td>370008</td>\n", | |
| " <td>A Medium publication sharing concepts, ideas, ...</td>\n", | |
| " <td>[DATA SCIENCE, MACHINE LEARNING, ARTIFICIAL IN...</td>\n", | |
| " <td>towards-data-science</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>7219b4dc6c4c</td>\n", | |
| " <td>1632</td>\n", | |
| " <td>Analytics Vidhya</td>\n", | |
| " <td>Analytics Vidhya is a community of Analytics a...</td>\n", | |
| " <td>None</td>\n", | |
| " <td>20817</td>\n", | |
| " <td>Analytics Vidhya is a community of Analytics a...</td>\n", | |
| " <td>[MACHINE LEARNING, ARTIFICIAL INTELLIGENCE, DE...</td>\n", | |
| " <td>analytics-vidhya</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>32881626c9c9</td>\n", | |
| " <td>1028</td>\n", | |
| " <td>Data Driven Investor</td>\n", | |
| " <td>from confusion to clarity, not insanity</td>\n", | |
| " <td>None</td>\n", | |
| " <td>27911</td>\n", | |
| " <td>from confusion to clarity, not insanity</td>\n", | |
| " <td>[TECHNOLOGY, ARTIFICIAL INTELLIGENCE, BLOCKCHA...</td>\n", | |
| " <td>datadriveninvestor</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>f5af2b715248</td>\n", | |
| " <td>617</td>\n", | |
| " <td>The Startup</td>\n", | |
| " <td>Medium's largest active publication, followed ...</td>\n", | |
| " <td>None</td>\n", | |
| " <td>603226</td>\n", | |
| " <td>Medium's largest active publication, followed ...</td>\n", | |
| " <td>[STARTUP, TECH, ENTREPRENEURSHIP, DESIGN, LIFE]</td>\n", | |
| " <td>swlh</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>3a8144eabfe3</td>\n", | |
| " <td>342</td>\n", | |
| " <td>HackerNoon.com</td>\n", | |
| " <td>how hackers start their afternoons.</td>\n", | |
| " <td>None</td>\n", | |
| " <td>480793</td>\n", | |
| " <td>how hackers start their afternoons.</td>\n", | |
| " <td>[HACKING, PROGRAMMING, TECH, HACKER, TECHNOLOGY]</td>\n", | |
| " <td>hackernoon</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td>d0b105d10f0a</td>\n", | |
| " <td>269</td>\n", | |
| " <td>Better Programming</td>\n", | |
| " <td>Advice for programmers.</td>\n", | |
| " <td>None</td>\n", | |
| " <td>108299</td>\n", | |
| " <td>Advice for programmers.</td>\n", | |
| " <td>[SOFTWARE DEVELOPMENT, ENGINEERING, REACT, JAV...</td>\n", | |
| " <td>better-programming</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7</th>\n", | |
| " <td>5e5bef33608a</td>\n", | |
| " <td>196</td>\n", | |
| " <td>Becoming Human: Artificial Intelligence Magazine</td>\n", | |
| " <td>Latest News, Info and Tutorials on Artificial ...</td>\n", | |
| " <td>becominghuman.ai</td>\n", | |
| " <td>31390</td>\n", | |
| " <td>Latest News, Info and Tutorials on Artificial ...</td>\n", | |
| " <td>[ARTIFICIAL INTELLIGENCE, DEEP LEARNING, MACHI...</td>\n", | |
| " <td>becoming-human</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>8</th>\n", | |
| " <td>7b837cf1fd73</td>\n", | |
| " <td>193</td>\n", | |
| " <td>Noteworthy - The Journal Blog</td>\n", | |
| " <td>The Official Journal Blog</td>\n", | |
| " <td>blog.usejournal.com</td>\n", | |
| " <td>65543</td>\n", | |
| " <td>The Official Journal Blog</td>\n", | |
| " <td>[STARTUP, PRODUCTIVITY, ENTREPRENEURSHIP, TECH...</td>\n", | |
| " <td>did-you-know-the-journal-blog</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>9</th>\n", | |
| " <td>98111c9905da</td>\n", | |
| " <td>187</td>\n", | |
| " <td>Towards AI</td>\n", | |
| " <td>Towards AI, is the world’s fastest-growing AI ...</td>\n", | |
| " <td>None</td>\n", | |
| " <td>6566</td>\n", | |
| " <td>Towards AI, is the world’s fastest-growing AI ...</td>\n", | |
| " <td>[ARTIFICIAL INTELLIGENCE, MACHINE LEARNING, DE...</td>\n", | |
| " <td>towards-artificial-intelligence</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>10</th>\n", | |
| " <td>336d898217ee</td>\n", | |
| " <td>165</td>\n", | |
| " <td>freeCodeCamp.org</td>\n", | |
| " <td>This is no longer updated. Go to https://freec...</td>\n", | |
| " <td>None</td>\n", | |
| " <td>608699</td>\n", | |
| " <td>This is no longer updated.</td>\n", | |
| " <td>[TECHNOLOGY, DESIGN, TECH, STARTUP, PRODUCTIVITY]</td>\n", | |
| " <td>free-code-camp</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>11</th>\n", | |
| " <td>d5e885e906a7</td>\n", | |
| " <td>100</td>\n", | |
| " <td>DataSeries</td>\n", | |
| " <td>Connecting data leaders and curating their tho...</td>\n", | |
| " <td>None</td>\n", | |
| " <td>3217</td>\n", | |
| " <td>Connecting data leaders and curating their tho...</td>\n", | |
| " <td>[STARTUP, DATA SCIENCE, ARTIFICIAL INTELLIGENC...</td>\n", | |
| " <td>dataseries</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>12</th>\n", | |
| " <td>31f4f88d6548</td>\n", | |
| " <td>98</td>\n", | |
| " <td>Center for Data Science</td>\n", | |
| " <td>This is the official research blog of the NYU ...</td>\n", | |
| " <td>None</td>\n", | |
| " <td>1820</td>\n", | |
| " <td>This is the official research blog of the NYU ...</td>\n", | |
| " <td>[DATA SCIENCE, DATA MINING, TECHNOLOGY, ARTIFI...</td>\n", | |
| " <td>center-for-data-science</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>13</th>\n", | |
| " <td>356ca48206e6</td>\n", | |
| " <td>96</td>\n", | |
| " <td>Nightingale</td>\n", | |
| " <td>The Journal of the Data Visualization Society</td>\n", | |
| " <td>None</td>\n", | |
| " <td>8496</td>\n", | |
| " <td>The Journal of the Data Visualization Society</td>\n", | |
| " <td>[DATA SCIENCE, DATA VISUALIZATION, DESIGN, PRO...</td>\n", | |
| " <td>nightingale</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>14</th>\n", | |
| " <td>5517fd7b58a6</td>\n", | |
| " <td>94</td>\n", | |
| " <td>Level Up Coding</td>\n", | |
| " <td>Coding tutorials and news. The developer homep...</td>\n", | |
| " <td>levelup.gitconnected.com</td>\n", | |
| " <td>30293</td>\n", | |
| " <td>Coding tutorials and news.</td>\n", | |
| " <td>[PROGRAMMING, WEB DEVELOPMENT, JAVASCRIPT, PYT...</td>\n", | |
| " <td>gitconnected</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>15</th>\n", | |
| " <td>6ea408ec434d</td>\n", | |
| " <td>92</td>\n", | |
| " <td>learn data science</td>\n", | |
| " <td>Unpacking Data Science One Step At A Time</td>\n", | |
| " <td>blog.exploratory.io</td>\n", | |
| " <td>6197</td>\n", | |
| " <td>Unpacking Data Science One Step At A Time</td>\n", | |
| " <td>[DATA SCIENCE, R PROGRAMMING, DATA VISUALIZATI...</td>\n", | |
| " <td>learn-dplyr</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>16</th>\n", | |
| " <td>cb942d4b5d89</td>\n", | |
| " <td>90</td>\n", | |
| " <td>Good Audience</td>\n", | |
| " <td>The front page of Deep Tech. Don't miss the la...</td>\n", | |
| " <td>blog.goodaudience.com</td>\n", | |
| " <td>17483</td>\n", | |
| " <td>The front page of Deep Tech.</td>\n", | |
| " <td>[CRYPTOCURRENCY, BLOCKCHAIN, ARTIFICIAL INTELL...</td>\n", | |
| " <td>good-audience</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>17</th>\n", | |
| " <td>f0db56adb08d</td>\n", | |
| " <td>86</td>\n", | |
| " <td>dair.ai</td>\n", | |
| " <td>Democratizing Artificial Intelligence Research...</td>\n", | |
| " <td>None</td>\n", | |
| " <td>3400</td>\n", | |
| " <td>Democratizing Artificial Intelligence Research...</td>\n", | |
| " <td>[MACHINE LEARNING, ARTIFICIAL INTELLIGENCE, RE...</td>\n", | |
| " <td>dair-ai</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>18</th>\n", | |
| " <td>a2487db7984a</td>\n", | |
| " <td>84</td>\n", | |
| " <td>Cambridge Spark</td>\n", | |
| " <td>Data Science Tutorials, Webinars and Resources...</td>\n", | |
| " <td>blog.cambridgespark.com</td>\n", | |
| " <td>1023</td>\n", | |
| " <td>Data Science Tutorials, Webinars and Resources...</td>\n", | |
| " <td>[DATA SCIENCE, MACHINE LEARNING, PYTHON]</td>\n", | |
| " <td>cambridgespark</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>19</th>\n", | |
| " <td>f3225cc85e15</td>\n", | |
| " <td>76</td>\n", | |
| " <td>Acing AI</td>\n", | |
| " <td>Acing AI provides analysis of AI companies and...</td>\n", | |
| " <td>None</td>\n", | |
| " <td>3498</td>\n", | |
| " <td>Acing AI provides analysis of AI companies and...</td>\n", | |
| " <td>[ARTIFICIAL INTELLIGENCE, DATA SCIENCE, MACHIN...</td>\n", | |
| " <td>acing-ai</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>20</th>\n", | |
| " <td>4689c8214177</td>\n", | |
| " <td>75</td>\n", | |
| " <td>FinTechExplained</td>\n", | |
| " <td>This blog aims to bridge the gap between techn...</td>\n", | |
| " <td>None</td>\n", | |
| " <td>4577</td>\n", | |
| " <td>This blog aims to bridge the gap between techn...</td>\n", | |
| " <td>[FINANCE, TECHNOLOGY, DATA SCIENCE, FINTECH, M...</td>\n", | |
| " <td>fintechexplained</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>21</th>\n", | |
| " <td>fc78dab2b103</td>\n", | |
| " <td>71</td>\n", | |
| " <td>Budding Data Scientists</td>\n", | |
| " <td>A pilot data science hackathon for high school...</td>\n", | |
| " <td>None</td>\n", | |
| " <td>17</td>\n", | |
| " <td>A pilot data science hackathon for high school...</td>\n", | |
| " <td>[DATA SCIENCE, EDUCATION, HACKATHONS, SOCIAL C...</td>\n", | |
| " <td>budding-data-scientists</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>22</th>\n", | |
| " <td>721b17443fd5</td>\n", | |
| " <td>69</td>\n", | |
| " <td>Coinmonks</td>\n", | |
| " <td>Coinmonks is a non-profit Crypto educational p...</td>\n", | |
| " <td>None</td>\n", | |
| " <td>19309</td>\n", | |
| " <td>Coinmonks is a non-profit Crypto educational p...</td>\n", | |
| " <td>[BITCOIN, TECHNOLOGY, CRYPTOCURRENCY, BLOCKCHA...</td>\n", | |
| " <td>coinmonks</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>23</th>\n", | |
| " <td>4c5221789b3</td>\n", | |
| " <td>67</td>\n", | |
| " <td>Openbridge</td>\n", | |
| " <td>All things data, big and small</td>\n", | |
| " <td>blog.openbridge.com</td>\n", | |
| " <td>842</td>\n", | |
| " <td>All things data, big and small</td>\n", | |
| " <td>[DATA SCIENCE, DATA, ANALYTICS, TECHNOLOGY, BU...</td>\n", | |
| " <td>openbridge</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>24</th>\n", | |
| " <td>2a678b52fc4f</td>\n", | |
| " <td>64</td>\n", | |
| " <td>The Opex Analytics Blog</td>\n", | |
| " <td>Solving Complex Business Problems with Human a...</td>\n", | |
| " <td>None</td>\n", | |
| " <td>517</td>\n", | |
| " <td>Solving Complex Business Problems with Human a...</td>\n", | |
| " <td>[DATA SCIENCE, OPTIMIZATION, AI, MACHINE LEARN...</td>\n", | |
| " <td>opex-analytics</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>25</th>\n", | |
| " <td>680eee12c50d</td>\n", | |
| " <td>62</td>\n", | |
| " <td>Heartbeat</td>\n", | |
| " <td>Exploring the intersection of mobile developme...</td>\n", | |
| " <td>heartbeat.fritz.ai</td>\n", | |
| " <td>7540</td>\n", | |
| " <td>Exploring the intersection of mobile developme...</td>\n", | |
| " <td>[ARTIFICIAL INTELLIGENCE, MACHINE LEARNING, DE...</td>\n", | |
| " <td>fritzheartbeat</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>26</th>\n", | |
| " <td>2d7ba3077a44</td>\n", | |
| " <td>60</td>\n", | |
| " <td>RAPIDS AI</td>\n", | |
| " <td>RAPIDS is a suite of software libraries for ex...</td>\n", | |
| " <td>None</td>\n", | |
| " <td>1423</td>\n", | |
| " <td>RAPIDS is a suite of software libraries for ex...</td>\n", | |
| " <td>[DATA SCIENCE, BIG DATA ANALYTICS, MACHINE LEA...</td>\n", | |
| " <td>rapids-ai</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>27</th>\n", | |
| " <td>d02e65779d7b</td>\n", | |
| " <td>53</td>\n", | |
| " <td>Insight Fellows Program</td>\n", | |
| " <td>Insight Fellows Program - Your bridge to a thr...</td>\n", | |
| " <td>blog.insightdatascience.com</td>\n", | |
| " <td>13908</td>\n", | |
| " <td>Insight Fellows Program - Your bridge to a thr...</td>\n", | |
| " <td>[EDUCATION, DATA SCIENCE, DATA ENGINEERING, AI...</td>\n", | |
| " <td>insight-data</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>28</th>\n", | |
| " <td>e8dd4fd2bda0</td>\n", | |
| " <td>52</td>\n", | |
| " <td>The Circular Theory</td>\n", | |
| " <td>Conservation of the circle is the core dynamic...</td>\n", | |
| " <td>None</td>\n", | |
| " <td>154</td>\n", | |
| " <td>Conservation of the circle is the core dynamic...</td>\n", | |
| " <td>[DEEP LEARNING, MACHINE LEARNING, QUANTUM COMP...</td>\n", | |
| " <td>the-circular-theory</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>29</th>\n", | |
| " <td>c5f05be4e189</td>\n", | |
| " <td>48</td>\n", | |
| " <td>365 Data Science</td>\n", | |
| " <td>365 Data Science is an educational platform</td>\n", | |
| " <td>None</td>\n", | |
| " <td>32</td>\n", | |
| " <td>365 Data Science is an educational platform</td>\n", | |
| " <td>[DATA SCIENCE, BUSINESS INTELLIGENCE, DATA SCI...</td>\n", | |
| " <td>365datascience</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " homeCollectionId count name \\\n", | |
| "1 7f60cf5620c9 11406 Towards Data Science \n", | |
| "2 7219b4dc6c4c 1632 Analytics Vidhya \n", | |
| "3 32881626c9c9 1028 Data Driven Investor \n", | |
| "4 f5af2b715248 617 The Startup \n", | |
| "5 3a8144eabfe3 342 HackerNoon.com \n", | |
| "6 d0b105d10f0a 269 Better Programming \n", | |
| "7 5e5bef33608a 196 Becoming Human: Artificial Intelligence Magazine \n", | |
| "8 7b837cf1fd73 193 Noteworthy - The Journal Blog \n", | |
| "9 98111c9905da 187 Towards AI \n", | |
| "10 336d898217ee 165 freeCodeCamp.org \n", | |
| "11 d5e885e906a7 100 DataSeries \n", | |
| "12 31f4f88d6548 98 Center for Data Science \n", | |
| "13 356ca48206e6 96 Nightingale \n", | |
| "14 5517fd7b58a6 94 Level Up Coding \n", | |
| "15 6ea408ec434d 92 learn data science \n", | |
| "16 cb942d4b5d89 90 Good Audience \n", | |
| "17 f0db56adb08d 86 dair.ai \n", | |
| "18 a2487db7984a 84 Cambridge Spark \n", | |
| "19 f3225cc85e15 76 Acing AI \n", | |
| "20 4689c8214177 75 FinTechExplained \n", | |
| "21 fc78dab2b103 71 Budding Data Scientists \n", | |
| "22 721b17443fd5 69 Coinmonks \n", | |
| "23 4c5221789b3 67 Openbridge \n", | |
| "24 2a678b52fc4f 64 The Opex Analytics Blog \n", | |
| "25 680eee12c50d 62 Heartbeat \n", | |
| "26 2d7ba3077a44 60 RAPIDS AI \n", | |
| "27 d02e65779d7b 53 Insight Fellows Program \n", | |
| "28 e8dd4fd2bda0 52 The Circular Theory \n", | |
| "29 c5f05be4e189 48 365 Data Science \n", | |
| "\n", | |
| " description \\\n", | |
| "1 A Medium publication sharing concepts, ideas, ... \n", | |
| "2 Analytics Vidhya is a community of Analytics a... \n", | |
| "3 from confusion to clarity, not insanity \n", | |
| "4 Medium's largest active publication, followed ... \n", | |
| "5 how hackers start their afternoons. \n", | |
| "6 Advice for programmers. \n", | |
| "7 Latest News, Info and Tutorials on Artificial ... \n", | |
| "8 The Official Journal Blog \n", | |
| "9 Towards AI, is the world’s fastest-growing AI ... \n", | |
| "10 This is no longer updated. Go to https://freec... \n", | |
| "11 Connecting data leaders and curating their tho... \n", | |
| "12 This is the official research blog of the NYU ... \n", | |
| "13 The Journal of the Data Visualization Society \n", | |
| "14 Coding tutorials and news. The developer homep... \n", | |
| "15 Unpacking Data Science One Step At A Time \n", | |
| "16 The front page of Deep Tech. Don't miss the la... \n", | |
| "17 Democratizing Artificial Intelligence Research... \n", | |
| "18 Data Science Tutorials, Webinars and Resources... \n", | |
| "19 Acing AI provides analysis of AI companies and... \n", | |
| "20 This blog aims to bridge the gap between techn... \n", | |
| "21 A pilot data science hackathon for high school... \n", | |
| "22 Coinmonks is a non-profit Crypto educational p... \n", | |
| "23 All things data, big and small \n", | |
| "24 Solving Complex Business Problems with Human a... \n", | |
| "25 Exploring the intersection of mobile developme... \n", | |
| "26 RAPIDS is a suite of software libraries for ex... \n", | |
| "27 Insight Fellows Program - Your bridge to a thr... \n", | |
| "28 Conservation of the circle is the core dynamic... \n", | |
| "29 365 Data Science is an educational platform \n", | |
| "\n", | |
| " domain subscriberCount \\\n", | |
| "1 towardsdatascience.com 370008 \n", | |
| "2 None 20817 \n", | |
| "3 None 27911 \n", | |
| "4 None 603226 \n", | |
| "5 None 480793 \n", | |
| "6 None 108299 \n", | |
| "7 becominghuman.ai 31390 \n", | |
| "8 blog.usejournal.com 65543 \n", | |
| "9 None 6566 \n", | |
| "10 None 608699 \n", | |
| "11 None 3217 \n", | |
| "12 None 1820 \n", | |
| "13 None 8496 \n", | |
| "14 levelup.gitconnected.com 30293 \n", | |
| "15 blog.exploratory.io 6197 \n", | |
| "16 blog.goodaudience.com 17483 \n", | |
| "17 None 3400 \n", | |
| "18 blog.cambridgespark.com 1023 \n", | |
| "19 None 3498 \n", | |
| "20 None 4577 \n", | |
| "21 None 17 \n", | |
| "22 None 19309 \n", | |
| "23 blog.openbridge.com 842 \n", | |
| "24 None 517 \n", | |
| "25 heartbeat.fritz.ai 7540 \n", | |
| "26 None 1423 \n", | |
| "27 blog.insightdatascience.com 13908 \n", | |
| "28 None 154 \n", | |
| "29 None 32 \n", | |
| "\n", | |
| " tagline \\\n", | |
| "1 A Medium publication sharing concepts, ideas, ... \n", | |
| "2 Analytics Vidhya is a community of Analytics a... \n", | |
| "3 from confusion to clarity, not insanity \n", | |
| "4 Medium's largest active publication, followed ... \n", | |
| "5 how hackers start their afternoons. \n", | |
| "6 Advice for programmers. \n", | |
| "7 Latest News, Info and Tutorials on Artificial ... \n", | |
| "8 The Official Journal Blog \n", | |
| "9 Towards AI, is the world’s fastest-growing AI ... \n", | |
| "10 This is no longer updated. \n", | |
| "11 Connecting data leaders and curating their tho... \n", | |
| "12 This is the official research blog of the NYU ... \n", | |
| "13 The Journal of the Data Visualization Society \n", | |
| "14 Coding tutorials and news. \n", | |
| "15 Unpacking Data Science One Step At A Time \n", | |
| "16 The front page of Deep Tech. \n", | |
| "17 Democratizing Artificial Intelligence Research... \n", | |
| "18 Data Science Tutorials, Webinars and Resources... \n", | |
| "19 Acing AI provides analysis of AI companies and... \n", | |
| "20 This blog aims to bridge the gap between techn... \n", | |
| "21 A pilot data science hackathon for high school... \n", | |
| "22 Coinmonks is a non-profit Crypto educational p... \n", | |
| "23 All things data, big and small \n", | |
| "24 Solving Complex Business Problems with Human a... \n", | |
| "25 Exploring the intersection of mobile developme... \n", | |
| "26 RAPIDS is a suite of software libraries for ex... \n", | |
| "27 Insight Fellows Program - Your bridge to a thr... \n", | |
| "28 Conservation of the circle is the core dynamic... \n", | |
| "29 365 Data Science is an educational platform \n", | |
| "\n", | |
| " tags \\\n", | |
| "1 [DATA SCIENCE, MACHINE LEARNING, ARTIFICIAL IN... \n", | |
| "2 [MACHINE LEARNING, ARTIFICIAL INTELLIGENCE, DE... \n", | |
| "3 [TECHNOLOGY, ARTIFICIAL INTELLIGENCE, BLOCKCHA... \n", | |
| "4 [STARTUP, TECH, ENTREPRENEURSHIP, DESIGN, LIFE] \n", | |
| "5 [HACKING, PROGRAMMING, TECH, HACKER, TECHNOLOGY] \n", | |
| "6 [SOFTWARE DEVELOPMENT, ENGINEERING, REACT, JAV... \n", | |
| "7 [ARTIFICIAL INTELLIGENCE, DEEP LEARNING, MACHI... \n", | |
| "8 [STARTUP, PRODUCTIVITY, ENTREPRENEURSHIP, TECH... \n", | |
| "9 [ARTIFICIAL INTELLIGENCE, MACHINE LEARNING, DE... \n", | |
| "10 [TECHNOLOGY, DESIGN, TECH, STARTUP, PRODUCTIVITY] \n", | |
| "11 [STARTUP, DATA SCIENCE, ARTIFICIAL INTELLIGENC... \n", | |
| "12 [DATA SCIENCE, DATA MINING, TECHNOLOGY, ARTIFI... \n", | |
| "13 [DATA SCIENCE, DATA VISUALIZATION, DESIGN, PRO... \n", | |
| "14 [PROGRAMMING, WEB DEVELOPMENT, JAVASCRIPT, PYT... \n", | |
| "15 [DATA SCIENCE, R PROGRAMMING, DATA VISUALIZATI... \n", | |
| "16 [CRYPTOCURRENCY, BLOCKCHAIN, ARTIFICIAL INTELL... \n", | |
| "17 [MACHINE LEARNING, ARTIFICIAL INTELLIGENCE, RE... \n", | |
| "18 [DATA SCIENCE, MACHINE LEARNING, PYTHON] \n", | |
| "19 [ARTIFICIAL INTELLIGENCE, DATA SCIENCE, MACHIN... \n", | |
| "20 [FINANCE, TECHNOLOGY, DATA SCIENCE, FINTECH, M... \n", | |
| "21 [DATA SCIENCE, EDUCATION, HACKATHONS, SOCIAL C... \n", | |
| "22 [BITCOIN, TECHNOLOGY, CRYPTOCURRENCY, BLOCKCHA... \n", | |
| "23 [DATA SCIENCE, DATA, ANALYTICS, TECHNOLOGY, BU... \n", | |
| "24 [DATA SCIENCE, OPTIMIZATION, AI, MACHINE LEARN... \n", | |
| "25 [ARTIFICIAL INTELLIGENCE, MACHINE LEARNING, DE... \n", | |
| "26 [DATA SCIENCE, BIG DATA ANALYTICS, MACHINE LEA... \n", | |
| "27 [EDUCATION, DATA SCIENCE, DATA ENGINEERING, AI... \n", | |
| "28 [DEEP LEARNING, MACHINE LEARNING, QUANTUM COMP... \n", | |
| "29 [DATA SCIENCE, BUSINESS INTELLIGENCE, DATA SCI... \n", | |
| "\n", | |
| " slug \n", | |
| "1 towards-data-science \n", | |
| "2 analytics-vidhya \n", | |
| "3 datadriveninvestor \n", | |
| "4 swlh \n", | |
| "5 hackernoon \n", | |
| "6 better-programming \n", | |
| "7 becoming-human \n", | |
| "8 did-you-know-the-journal-blog \n", | |
| "9 towards-artificial-intelligence \n", | |
| "10 free-code-camp \n", | |
| "11 dataseries \n", | |
| "12 center-for-data-science \n", | |
| "13 nightingale \n", | |
| "14 gitconnected \n", | |
| "15 learn-dplyr \n", | |
| "16 good-audience \n", | |
| "17 dair-ai \n", | |
| "18 cambridgespark \n", | |
| "19 acing-ai \n", | |
| "20 fintechexplained \n", | |
| "21 budding-data-scientists \n", | |
| "22 coinmonks \n", | |
| "23 openbridge \n", | |
| "24 opex-analytics \n", | |
| "25 fritzheartbeat \n", | |
| "26 rapids-ai \n", | |
| "27 insight-data \n", | |
| "28 the-circular-theory \n", | |
| "29 365datascience " | |
| ] | |
| }, | |
| "execution_count": 350, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df_collections" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 351, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df_collections.to_csv(\"collections.csv\",encoding='utf-8')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.6.0" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import requests | |
| import json | |
| from pymongo import MongoClient | |
| import datetime | |
# Connect to the MongoDB server on localhost and select the "medium" database.
mongo_client = MongoClient('localhost', 27017)
db = mongo_client.medium

# One Mongo collection per kind of Medium entity stored by this script:
#   Collection: medium collection
#   User:       medium user
#   Post:       medium post
col_collection = db.Collection
col_user = db.User
col_post = db.Post

# Create a unique index on each collection's identifier field.
for _col, _id_field in (
    (col_collection, 'id'),
    (col_user, 'userId'),
    (col_post, 'id'),
):
    _col.create_index(_id_field, unique=True)
# Guard prefix that precedes the JSON document in Medium's responses. It is
# 16 bytes long and has to be removed before the body can be parsed.
_MEDIUM_JSON_GUARD = b"])}while(1);</x>"


def get_article_archive(tag_slug, year, month, day, timeout=30):
    """Fetch one day of a Medium tag archive and return it as parsed JSON.

    Args:
        tag_slug: Tag slug as used in the archive URL, e.g. ``growth-hacking``.
        year: Year component of the URL, e.g. ``2018``.
        month: Month component of the URL, e.g. ``01``.
        day: Day component of the URL, e.g. ``01``.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the whole crawl.

    Returns:
        The decoded JSON document, or ``None`` when the HTTP request fails
        (connection error, timeout, error status) or the body is not valid
        JSON. Callers must check for ``None``.
    """
    url = "https://medium.com/tag/{tag_slug}/archive/{year}/{month}/{day}".format(
        tag_slug=tag_slug, year=year, month=month, day=day)
    try:
        response = requests.get(
            url=url,
            params={
                "count": "9",
                "ignore": ",,,",
            },
            headers={
                # NOTE(review): "br" responses can only be decoded when a
                # Brotli library is installed alongside requests - confirm.
                "Accept-Encoding": "gzip, deflate, br",
                "Upgrade-Insecure-Requests": "1",
                "Content-Type": "application/json",
                "Authority": "medium.com",
                "Sec-Fetch-Site": "same-origin",
                "Cache-Control": "no-cache",
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36",
                "Sec-Fetch-Mode": "navigate",
                "Sec-Fetch-User": "?1",
                "Pragma": "no-cache",
                "Accept": "application/json",  # ask for JSON rather than HTML
                "Accept-Language": "en",
            },
            timeout=timeout,
        )
        # Turn 4xx/5xx answers into a RequestException instead of trying to
        # parse an error page as JSON.
        response.raise_for_status()
    except requests.exceptions.RequestException:
        print('HTTP Request failed')
        return None

    body = response.content
    # Only strip the guard when it is really there; blindly dropping the first
    # 16 bytes would corrupt a response that comes back without it.
    if body.startswith(_MEDIUM_JSON_GUARD):
        body = body[len(_MEDIUM_JSON_GUARD):]
    try:
        return json.loads(body)
    except ValueError:
        # Covers json.JSONDecodeError and undecodable bytes.
        print('Response is not valid JSON')
        return None
if __name__ == '__main__':
    # Crawl every day from begin_date up to, but not including, end_date.
    begin_date = datetime.date(2018, 1, 4)
    end_date = datetime.date.today() - datetime.timedelta(days=2)
    tag_slugs = ["machine-learning"]

    # (key under payload.references, target Mongo collection, unique id field)
    reference_targets = (
        ('Collection', col_collection, 'id'),
        ('User', col_user, 'userId'),
        ('Post', col_post, 'id'),
    )

    # Crawl every configured tag, not just the first entry of the list.
    for tag_slug in tag_slugs:
        for i in range((end_date - begin_date).days):
            single_date = begin_date + datetime.timedelta(days=i)
            # Zero-padded strings, matching the YYYY/MM/DD archive URL layout.
            year = '{:04d}'.format(single_date.year)
            month = '{:02d}'.format(single_date.month)
            day = '{:02d}'.format(single_date.day)
            print(i, year, month, day)

            data = get_article_archive(tag_slug=tag_slug, year=year, month=month, day=day)
            # get_article_archive returns None when the request fails; skip
            # that day instead of crashing the whole crawl.
            if not isinstance(data, dict):
                print('No data for', tag_slug, year, month, day, '- skipping')
                continue

            # Tolerate responses without a payload or references section.
            references = (data.get('payload') or {}).get('references') or {}
            for ref_name, target, key_field in reference_targets:
                for doc in (references.get(ref_name) or {}).values():
                    # Upsert so that re-running the crawl updates existing
                    # documents instead of violating the unique index.
                    target.update_one({key_field: doc[key_field]}, {"$set": doc}, upsert=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment