Created
July 6, 2020 11:51
-
-
Save DibyaranjanSathua/5c0147b8a1e0bb0f1c5ea380cbef3fad to your computer and use it in GitHub Desktop.
Scrapy Spider to scrape user information from Medium.com
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import json | |
import scrapy | |
from scraper4.items import Scraper4Item | |
class MediumSpider(scrapy.Spider):
    """Scrape name, bio, Twitter link and join date for every account a
    Medium user follows.

    Usage:  scrapy crawl medium -a user=@dibya

    Crawl flow: HTML /following page -> per-user profile pages, plus the
    JSON infinite-scroll API for the rest of the following list.
    """

    name = 'medium'
    allowed_domains = ['medium.com']

    def __init__(self, user=None, *args, **kwargs):
        # `user` defaults to None so a missing -a user= argument reaches the
        # helpful error message in start_requests() instead of raising a
        # TypeError here before the spider even starts.
        super(MediumSpider, self).__init__(*args, **kwargs)
        # Medium handle to crawl, e.g. "@dibya".
        self.user = user

    def start_requests(self):
        """Kick off the crawl at the user's /following page."""
        if self.user is None:
            raise Exception("Please provide the user handle as arguments. Like -a user=@dibya")
        # Join without a duplicate separator (the original produced
        # "https://www.medium.com//@user/following").
        user_following_url = f"https://www.medium.com/{self.user}/following"
        yield scrapy.Request(url=user_following_url, callback=self.parse)

    def parse(self, response):
        """Parse the initial HTML /following page, then hand over to the
        JSON infinite-scroll API for the remaining users."""
        initial_user_list = response.css(".streamItem .u-flex1")
        for user in initial_user_list:
            a_section = user.css(".ui-captionStrong a")
            user_profile_link = a_section.xpath("@href").get()
            if user_profile_link is not None:
                yield scrapy.Request(url=user_profile_link, callback=self.parse_user_profile)
        # Guard: a user who follows nobody produces no .streamItem elements,
        # and pop() on the empty SelectorList below would raise IndexError.
        if not initial_user_list:
            return
        # Get the user id of the last user on the page.  It is used as the
        # "to" query parameter of the infinite-scroll URL, e.g.
        # https://medium.com/_/api/users/3f7042fbd53d/profile/stream?limit=8&to=10c51c66e32e&source=following&page=2
        last_user_id = initial_user_list.pop().css(".ui-captionStrong a")\
            .xpath("@data-user-id").get()
        # span.followState.js-followState carries the id of the profile owner.
        current_user_id = response.css("div.js-headerButtons span.followState.js-followState::attr('data-user-id')").get()
        scroll_url = f"https://medium.com/_/api/users/{current_user_id}/profile/stream" \
                     f"?limit=8&to={last_user_id}&source=following&page=2"
        yield scrapy.Request(url=scroll_url, callback=self.parse_infinite_scroll_response)

    def parse_infinite_scroll_response(self, response):
        """Process one page of the JSON infinite-scroll API and, while more
        pages remain, request the next one."""
        # The API prefixes the body with junk characters that prevent the
        # JSON from being executed directly; skip everything before the
        # first "{".
        start_curly_braces = response.text.index("{")
        response_dict = json.loads(response.text[start_curly_braces:])
        # A missing "User" key in the references means we have reached the
        # end of the following list.
        if "User" in response_dict["payload"]["references"]:
            for user in response_dict["payload"]["references"]["User"].values():
                yield Scraper4Item(
                    name=user["name"],
                    bio=user["bio"],
                    twitter_link=f"https://twitter.com/{user['twitterScreenName']}"
                    if "twitterScreenName" in user else "",
                    created_date=user["mediumMemberAt"]
                )
        paging = response_dict["payload"]["paging"]
        url = paging["path"]
        query_parameters = paging["next"]
        # "to" disappears from paging["next"] on the last page — which may
        # still contain some users (fewer than `limit`) — so only request a
        # next page while "to" is present.
        if "to" in query_parameters:
            url = f"{url}?limit={query_parameters['limit']}&to={query_parameters['to']}&" \
                  f"source={query_parameters['source']}&page={query_parameters['page']}"
            yield scrapy.Request(url=url, callback=self.parse_infinite_scroll_response)

    def parse_user_profile(self, response):
        """Extract one followed user's details from their HTML profile page."""
        # The h1 element carries multiple classes, so an exact
        # //h1[@class='aw'] XPath match fails; a CSS class selector matches
        # any one of them.
        name = response.css("h1.aw::text").get()
        bio = response.css("p.eh.ei.ci")
        bio = bio.xpath("text()").get() if bio else ""
        # There are ~20 anchors matching a.cc.cd.bm, so the parent div is
        # used to narrow the match down to the Twitter link.
        # NOTE(review): these obfuscated class names (aw, eh.ei.ci, ep.z,
        # cc.cd.bm, eu) are generated by Medium's build and will break when
        # the site's CSS is regenerated — confirm against the live markup.
        twitter_link = response.css("div.ep.z a.cc.cd.bm")
        twitter_link = twitter_link.xpath("@href").get() if twitter_link else ""
        created_date = response.css("span.ci div.eu")
        created_date = created_date.xpath("text()").get() if created_date else ""
        yield Scraper4Item(
            name=name,
            bio=bio,
            twitter_link=twitter_link,
            created_date=created_date
        )

    # Backward-compatible alias for the original misspelled callback name.
    parse_user_profle = parse_user_profile
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment