@DibyaranjanSathua
Created July 6, 2020 11:51
Scrapy Spider to scrape user information from Medium.com
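
To run the spider (a sketch, assuming the file lives in a standard Scrapy project whose items module is scraper4.items, as imported below; the @dibya handle is only an example, echoing the error message in start_requests):

scrapy crawl medium -a user=@dibya -o medium_users.json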
# -*- coding: utf-8 -*-
import json

import scrapy

from scraper4.items import Scraper4Item


class MediumSpider(scrapy.Spider):
    name = 'medium'
    allowed_domains = ['medium.com']

    def __init__(self, user=None, *args, **kwargs):
        super(MediumSpider, self).__init__(*args, **kwargs)
        self.user = user

    def start_requests(self):
        url = 'https://medium.com'
        if self.user is None:
            raise ValueError("Please provide the user handle as an argument, e.g. -a user=@dibya")
        user_following_url = f"{url}/{self.user}/following"
        yield scrapy.Request(url=user_following_url, callback=self.parse)

    def parse(self, response):
        """ Default parse callback: handles the first page of the "following" list. """
        initial_user_list = response.css(".streamItem .u-flex1")
        for user in initial_user_list:
            a_section = user.css(".ui-captionStrong a")
            user_profile_link = a_section.xpath("@href").get()
            if user_profile_link is not None:
                yield scrapy.Request(url=user_profile_link, callback=self.parse_user_profile)
        # Get the user id of the last user in the list. It will be used as the
        # "to" query parameter of the infinite scroll URL, e.g.
        # https://medium.com/_/api/users/3f7042fbd53d/profile/stream?limit=8&to=10c51c66e32e&source=following&page=2
        last_user_id = initial_user_list.pop().css(".ui-captionStrong a") \
            .xpath("@data-user-id").get()
        # span.followState.js-followState carries the data-user-id of the profile owner.
        current_user_id = response.css(
            "div.js-headerButtons span.followState.js-followState::attr(data-user-id)"
        ).get()
        scroll_url = f"https://medium.com/_/api/users/{current_user_id}/profile/stream" \
                     f"?limit=8&to={last_user_id}&source=following&page=2"
        yield scrapy.Request(url=scroll_url, callback=self.parse_infinite_scroll_response)

    def parse_infinite_scroll_response(self, response):
        """ Process the JSON response of an infinite scroll request. """
        # Medium prefixes its JSON responses with characters meant to prevent
        # them from being executed directly (JSON hijacking), so skip ahead to
        # the first "{" before parsing.
        start_curly_braces = response.text.index("{")
        response_dict = json.loads(response.text[start_curly_braces:])
        # If "User" is missing from the references, we have reached the end of the list.
        if "User" in response_dict["payload"]["references"]:
            for user in response_dict["payload"]["references"]["User"].values():
                yield Scraper4Item(
                    name=user["name"],
                    bio=user["bio"],
                    twitter_link=f"https://twitter.com/{user['twitterScreenName']}"
                    if "twitterScreenName" in user else "",
                    created_date=user["mediumMemberAt"]
                )
            paging = response_dict["payload"]["paging"]
            url = paging["path"]
            query_parameters = paging["next"]
            # When we reach the end, "to" is missing from paging["next"].
            # The last page may still contain a few users (fewer than 8 here),
            # which is why the items above are yielded before this check.
            if "to" in query_parameters:
                url = f"{url}?limit={query_parameters['limit']}&to={query_parameters['to']}&" \
                      f"source={query_parameters['source']}&page={query_parameters['page']}"
                yield scrapy.Request(url=url, callback=self.parse_infinite_scroll_response)
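
    # For reference, the scroll endpoint's JSON payload has roughly this shape
    # (keys inferred from the accesses above; values are illustrative only):
    # {
    #   "payload": {
    #     "references": {"User": {"<user-id>": {"name": "...", "bio": "...",
    #                                           "twitterScreenName": "...",
    #                                           "mediumMemberAt": "..."}}},
    #     "paging": {"path": "...", "next": {"limit": 8, "to": "<user-id>",
    #                                        "source": "following", "page": 3}}
    #   }
    # }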

    def parse_user_profile(self, response):
        """ Extract user information from a profile page. """
        # An XPath like //h1[@class='aw'] will not match here because the h1
        # element carries multiple classes, so use a CSS selector instead.
        name = response.css("h1.aw::text").get()
        bio = response.css("p.eh.ei.ci")
        bio = bio.xpath("text()").get(default="") if bio else ""
        # There are about 20 <a> elements matching a.cc.cd.bm, so go through
        # the parent div element to filter out the correct one.
        twitter_link = response.css("div.ep.z a.cc.cd.bm")
        twitter_link = twitter_link.xpath("@href").get(default="") if twitter_link else ""
        created_date = response.css("span.ci div.eu")
        created_date = created_date.xpath("text()").get(default="") if created_date else ""
        yield Scraper4Item(
            name=name,
            bio=bio,
            twitter_link=twitter_link,
            created_date=created_date
        )
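
The Scraper4Item imported from scraper4.items is not included in this gist. A minimal sketch of what it presumably looks like, declaring only the four fields the spider actually populates:

# scraper4/items.py (hypothetical reconstruction)
import scrapy

class Scraper4Item(scrapy.Item):
    name = scrapy.Field()
    bio = scrapy.Field()
    twitter_link = scrapy.Field()
    created_date = scrapy.Field()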