Last active
January 6, 2024 00:46
-
-
Save jerryan999/79bb451d13c3b4e37009a642454cb464 to your computer and use it in GitHub Desktop.
Given a tag name, this script crawls the posts related to that tag from the Medium archive.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import json | |
from pymongo import MongoClient | |
import datetime | |
# Connect to the MongoDB server on localhost and select the "medium" database.
mongo_client = MongoClient(host='localhost', port=27017)
db = mongo_client['medium']

# Collection handles, one per kind of Medium entity stored by this script:
#   Collection: medium collection
#   User: medium user
#   Post: medium post
col_collection = db['Collection']
col_user = db['User']
col_post = db['Post']

# Create the unique indexes on the field each collection is upserted by.
col_collection.create_index('id', unique=True)
col_user.create_index('userId', unique=True)
col_post.create_index('id', unique=True)
def get_article_archive(tag_slug, year, month, day, timeout=30):
    """Fetch one day of Medium's tag archive and return the decoded JSON.

    Args:
        tag_slug: tag identifier as it appears in Medium URLs,
            for example "growth-hacking".
        year: four-digit year string, for example "2018".
        month: two-digit month string, for example "01".
        day: two-digit day string, for example "01".
        timeout: seconds to wait for the server before giving up, so that a
            stalled connection cannot block the crawl indefinitely.

    Returns:
        The parsed JSON document, or None when the HTTP request fails or the
        response body cannot be decoded as JSON.
    """
    # The body is expected to start with this 16-byte non-JSON guard. It is
    # removed only when actually present, so a body without it is not damaged.
    json_guard = b"])}while(1);</x>"

    url = "https://medium.com/tag/{tag_slug}/archive/{year}/{month}/{day}".format(
        tag_slug=tag_slug, year=year, month=month, day=day)

    try:
        response = requests.get(
            url=url,
            params={
                "count": "9",
                "ignore": ",,,",
            },
            headers={
                "Accept-Encoding": "gzip, deflate, br",
                "Upgrade-Insecure-Requests": "1",
                "Content-Type": "application/json",
                "Authority": "medium.com",
                "Sec-Fetch-Site": "same-origin",
                "Cache-Control": "no-cache",
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36",
                "Sec-Fetch-Mode": "navigate",
                "Sec-Fetch-User": "?1",
                "Pragma": "no-cache",
                "Accept": "application/json",  # assure response data format
                "Accept-Language": "en",
            },
            timeout=timeout,
        )
    except requests.exceptions.RequestException:
        # Covers connection errors and timeouts alike.
        print('HTTP Request failed')
        return None

    body = response.content
    if body.startswith(json_guard):
        body = body[len(json_guard):]

    try:
        return json.loads(body)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass; an HTML error page
        # or an empty body ends up here instead of crashing the caller.
        print('Response was not valid JSON')
        return None
if __name__ == '__main__':
    # Crawl window: from the start date up to, but not including,
    # the day two days before today.
    start_year, start_month, start_day = "2018", "01", "04"
    begin_date = datetime.date(int(start_year), int(start_month), int(start_day))
    end_date = datetime.date.today() - datetime.timedelta(days=2)

    # Every tag listed here is crawled over the whole window.
    tag_slugs = ["machine-learning"]

    # (key under payload.references, target Mongo collection, unique key field)
    reference_targets = (
        ('Collection', col_collection, 'id'),
        ('User', col_user, 'userId'),
        ('Post', col_post, 'id'),
    )

    for tag_slug in tag_slugs:
        for i in range((end_date - begin_date).days):
            single_date = begin_date + datetime.timedelta(days=i)
            # isoformat() yields "YYYY-MM-DD", i.e. zero-padded string parts.
            year, month, day = single_date.isoformat().split('-')
            print(i, year, month, day)

            data = get_article_archive(tag_slug=tag_slug, year=year, month=month, day=day)
            if not isinstance(data, dict):
                # The fetch failed (None) or returned something unexpected;
                # skip this day instead of aborting the whole crawl.
                print('No usable data for', tag_slug, year, month, day, '- skipping')
                continue

            references = (data.get('payload') or {}).get('references') or {}
            for ref_name, target, key_field in reference_targets:
                # A missing or empty section simply contributes no documents.
                for doc in (references.get(ref_name) or {}).values():
                    # Upsert so that re-running the crawl refreshes existing
                    # documents rather than violating the unique index.
                    target.update_one(
                        {key_field: doc[key_field]},
                        {"$set": doc},
                        upsert=True,
                    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment