Created
July 6, 2020 11:51
-
-
Save DibyaranjanSathua/5c0147b8a1e0bb0f1c5ea380cbef3fad to your computer and use it in GitHub Desktop.
Scrapy Spider to scrape user information from Medium.com
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import json | |
import scrapy | |
from scraper4.items import Scraper4Item | |
class MediumSpider(scrapy.Spider):
    """Scrape name, bio, Twitter link and join date for every account a
    Medium user follows.

    Usage:  scrapy crawl medium -a user=@dibya

    Crawl flow: HTML /following page -> per-user profile pages, plus the
    JSON infinite-scroll API for the rest of the following list.
    """

    name = 'medium'
    allowed_domains = ['medium.com']

    def __init__(self, user=None, *args, **kwargs):
        # `user` defaults to None so a missing -a user= argument reaches the
        # helpful error message in start_requests() instead of raising a
        # TypeError here before the spider even starts.
        super(MediumSpider, self).__init__(*args, **kwargs)
        # Medium handle to crawl, e.g. "@dibya".
        self.user = user

    def start_requests(self):
        """Kick off the crawl at the user's /following page."""
        if self.user is None:
            raise Exception("Please provide the user handle as arguments. Like -a user=@dibya")
        # Join without a duplicate separator (the original produced
        # "https://www.medium.com//@user/following").
        user_following_url = f"https://www.medium.com/{self.user}/following"
        yield scrapy.Request(url=user_following_url, callback=self.parse)

    def parse(self, response):
        """Parse the initial HTML /following page, then hand over to the
        JSON infinite-scroll API for the remaining users."""
        initial_user_list = response.css(".streamItem .u-flex1")
        for user in initial_user_list:
            a_section = user.css(".ui-captionStrong a")
            user_profile_link = a_section.xpath("@href").get()
            if user_profile_link is not None:
                yield scrapy.Request(url=user_profile_link, callback=self.parse_user_profile)
        # Guard: a user who follows nobody produces no .streamItem elements,
        # and pop() on the empty SelectorList below would raise IndexError.
        if not initial_user_list:
            return
        # Get the user id of the last user on the page.  It is used as the
        # "to" query parameter of the infinite-scroll URL, e.g.
        # https://medium.com/_/api/users/3f7042fbd53d/profile/stream?limit=8&to=10c51c66e32e&source=following&page=2
        last_user_id = initial_user_list.pop().css(".ui-captionStrong a")\
            .xpath("@data-user-id").get()
        # span.followState.js-followState carries the id of the profile owner.
        current_user_id = response.css("div.js-headerButtons span.followState.js-followState::attr('data-user-id')").get()
        scroll_url = f"https://medium.com/_/api/users/{current_user_id}/profile/stream" \
                     f"?limit=8&to={last_user_id}&source=following&page=2"
        yield scrapy.Request(url=scroll_url, callback=self.parse_infinite_scroll_response)

    def parse_infinite_scroll_response(self, response):
        """Process one page of the JSON infinite-scroll API and, while more
        pages remain, request the next one."""
        # The API prefixes the body with junk characters that prevent the
        # JSON from being executed directly; skip everything before the
        # first "{".
        start_curly_braces = response.text.index("{")
        response_dict = json.loads(response.text[start_curly_braces:])
        # A missing "User" key in the references means we have reached the
        # end of the following list.
        if "User" in response_dict["payload"]["references"]:
            for user in response_dict["payload"]["references"]["User"].values():
                yield Scraper4Item(
                    name=user["name"],
                    bio=user["bio"],
                    twitter_link=f"https://twitter.com/{user['twitterScreenName']}"
                    if "twitterScreenName" in user else "",
                    created_date=user["mediumMemberAt"]
                )
        paging = response_dict["payload"]["paging"]
        url = paging["path"]
        query_parameters = paging["next"]
        # "to" disappears from paging["next"] on the last page — which may
        # still contain some users (fewer than `limit`) — so only request a
        # next page while "to" is present.
        if "to" in query_parameters:
            url = f"{url}?limit={query_parameters['limit']}&to={query_parameters['to']}&" \
                  f"source={query_parameters['source']}&page={query_parameters['page']}"
            yield scrapy.Request(url=url, callback=self.parse_infinite_scroll_response)

    def parse_user_profile(self, response):
        """Extract one followed user's details from their HTML profile page."""
        # The h1 element carries multiple classes, so an exact
        # //h1[@class='aw'] XPath match fails; a CSS class selector matches
        # any one of them.
        name = response.css("h1.aw::text").get()
        bio = response.css("p.eh.ei.ci")
        bio = bio.xpath("text()").get() if bio else ""
        # There are ~20 anchors matching a.cc.cd.bm, so the parent div is
        # used to narrow the match down to the Twitter link.
        # NOTE(review): these obfuscated class names (aw, eh.ei.ci, ep.z,
        # cc.cd.bm, eu) are generated by Medium's build and will break when
        # the site's CSS is regenerated — confirm against the live markup.
        twitter_link = response.css("div.ep.z a.cc.cd.bm")
        twitter_link = twitter_link.xpath("@href").get() if twitter_link else ""
        created_date = response.css("span.ci div.eu")
        created_date = created_date.xpath("text()").get() if created_date else ""
        yield Scraper4Item(
            name=name,
            bio=bio,
            twitter_link=twitter_link,
            created_date=created_date
        )

    # Backward-compatible alias for the original misspelled callback name.
    parse_user_profle = parse_user_profile
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment