Skip to content

Instantly share code, notes, and snippets.

@jerryan999
Last active January 6, 2024 00:46
Show Gist options
  • Save jerryan999/79bb451d13c3b4e37009a642454cb464 to your computer and use it in GitHub Desktop.
Save jerryan999/79bb451d13c3b4e37009a642454cb464 to your computer and use it in GitHub Desktop.
Given a tag name, this script crawls the posts related to that tag in the Medium archive.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
import requests
import json
from pymongo import MongoClient
import datetime
# Connect to the MongoDB server on localhost:27017 and select the "medium"
# database.
mongo_client = MongoClient('localhost', 27017)
db = mongo_client['medium']

# One MongoDB collection per kind of Medium entity stored by this script:
#   Collection: Medium collection
#   User:       Medium user
#   Post:       Medium post
col_collection = db['Collection']
col_user = db['User']
col_post = db['Post']

# Create the unique indexes. Each index is on the field the crawler uses as
# the upsert key for that collection, so a document is stored at most once
# per key value.
col_collection.create_index('id', unique=True)
col_user.create_index('userId', unique=True)
col_post.create_index('id', unique=True)
def get_article_archive(tag_slug, year, month, day, timeout=30):
    """Fetch one day of a Medium tag archive and return the decoded JSON.

    Args:
        tag_slug: Tag slug as it appears in the URL, e.g. "growth-hacking".
        year: Year component of the archive URL, e.g. "2018".
        month: Month component of the archive URL, e.g. "01".
        day: Day component of the archive URL, e.g. "01".
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the crawl
            forever.

    Returns:
        The decoded JSON document, or None when the request fails, the
        server answers with an HTTP error status, or the body is not valid
        JSON. A message is printed in each failure case.
    """
    url = "https://medium.com/tag/{tag_slug}/archive/{year}/{month}/{day}".format(
        tag_slug=tag_slug, year=year, month=month, day=day
    )
    try:
        response = requests.get(
            url=url,
            params={
                "count": "9",
                "ignore": ",,,",
            },
            headers={
                "Accept-Encoding": "gzip, deflate, br",
                "Upgrade-Insecure-Requests": "1",
                "Content-Type": "application/json",
                "Authority": "medium.com",
                "Sec-Fetch-Site": "same-origin",
                "Cache-Control": "no-cache",
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36",
                "Sec-Fetch-Mode": "navigate",
                "Sec-Fetch-User": "?1",
                "Pragma": "no-cache",
                # Ask for JSON so the response comes back in the format
                # this function parses.
                "Accept": "application/json",
                "Accept-Language": "en",
            },
            timeout=timeout,
        )
        # Turn 4xx/5xx answers into a RequestException instead of trying to
        # parse an error page as JSON.
        response.raise_for_status()
    except requests.exceptions.RequestException:
        print('HTTP Request failed')
        return None

    # The body carries a short non-JSON prefix before the JSON object
    # (presumably an anti-JSON-hijacking guard -- TODO confirm). Parsing from
    # the first "{" avoids depending on the exact length of that prefix.
    raw = response.content
    json_start = raw.find(b"{")
    try:
        return json.loads(raw[json_start:] if json_start >= 0 else raw)
    except ValueError:
        # json.JSONDecodeError is a subclass of ValueError.
        print('Response body was not valid JSON')
        return None
if __name__ == '__main__':
    # Crawl every day from begin_date up to (but not including) end_date.
    # NOTE(review): end_date itself is never fetched because range() is
    # half-open -- confirm whether that is intended.
    start_year, start_month, start_day = "2018", "01", "04"
    begin_date = datetime.date(int(start_year), int(start_month), int(start_day))
    end_date = datetime.date.today() - datetime.timedelta(days=2)
    tag_slugs = ["machine-learning"]

    # (key under payload.references, target collection, upsert key field)
    reference_targets = (
        ('Collection', col_collection, 'id'),
        ('User', col_user, 'userId'),
        ('Post', col_post, 'id'),
    )

    for tag_slug in tag_slugs:
        for i in range((end_date - begin_date).days):
            single_date = begin_date + datetime.timedelta(days=i)
            # Zero-padded components, as used in the archive URL.
            year = "{:04d}".format(single_date.year)
            month = "{:02d}".format(single_date.month)
            day = "{:02d}".format(single_date.day)
            print(i, year, month, day)

            data = get_article_archive(tag_slug=tag_slug, year=year, month=month, day=day)
            # get_article_archive returns None when the request fails; skip
            # that day instead of aborting the whole crawl.
            if not isinstance(data, dict):
                print('No data for {}-{}-{}, skipping'.format(year, month, day))
                continue

            # A day may lack the payload, the references, or any of the
            # individual reference groups.
            references = (data.get('payload') or {}).get('references') or {}
            for ref_key, target, id_field in reference_targets:
                for doc in (references.get(ref_key) or {}).values():
                    # Upsert keyed on the unique id so re-running the crawl
                    # updates existing documents rather than duplicating them.
                    target.update_one({id_field: doc[id_field]}, {"$set": doc}, upsert=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment