ygrenzinger · March 9, 2021 14:24
diff --git a/scoopit_scrap.py b/scoopit_scrap.py
 # coding=utf-8
 # This is a sample Python script.

 # Press ⌃R to execute it or replace it with your code.
 # Press Double ⇧ to search everywhere for classes, files, tool windows, actions, and settings.

 import scrapy
 from requests import get
 from scrapy import Selector
 import json


 # class BlogSpider(scrapy.Spider):
 #     name = 'blogspider'
 #     start_urls = ['https://www.zyte.com/blog/']
 #
 #     def parse(self, response):
 #         for title in response.css('.oxy-post-title'):
 #             yield {'title': title.css('::text').get()}
 #
 #         for next_page in response.css('a.next'):
 #             yield response.follow(next_page, self.parse)

 def retrieve_posts(topic_url, number, posts):
     """Scrape one page of a topic and append the posts found to ``posts``.

     The page is requested as ``topic_url + "?page=<number>"`` and every
     ``.postView`` element in the response is inspected.

     Args:
         topic_url: Base URL of the topic to scrape.
         number: Page number to request.
         posts: List that is mutated in place. One dict per post is appended,
             with the keys ``"title"`` and ``"url"`` and, when the post has a
             non-blank description, ``"description"``.
     """
     # Without a timeout a stalled server would block the scrape forever.
     response = get(topic_url + "?page=" + str(number), timeout=30)
     for post_elmt in Selector(text=response.text).css(".postView"):
         post_url = post_elmt.css(".postTitleView a::attr(href)").get()
         post_title = post_elmt.css(".postTitleView a::text").get()
         if post_url is None or post_title is None:
             # No linked title inside this element: nothing usable to record.
             continue
         post = {"title": post_title.strip(), "url": post_url}
         # Strip before testing so a whitespace-only text node does not end
         # up stored as an empty "description".
         post_description = (
             post_elmt.css(".post-description blockquote::text").get() or ""
         ).strip()
         if post_description:
             post["description"] = post_description
         posts.append(post)


 def max_page(topic_url):
     """Return the highest page number listed in the topic's pagination.

     Fetches ``topic_url`` and reads the ``data-page`` attribute of every
     link found under ``nav.pagination li``.

     Args:
         topic_url: URL of the topic whose pagination is inspected.

     Returns:
         The largest ``data-page`` value as an int, or 1 when the page has no
         pagination links at all.

     Raises:
         ValueError: If a ``data-page`` attribute is not an integer.
     """
     # Without a timeout a stalled server would block the scrape forever.
     html = get(topic_url, timeout=30).text
     page_numbers = Selector(text=html).css("nav.pagination li a::attr(data-page)").getall()
     # default=1: max() of an empty sequence raises ValueError, which used to
     # crash the script when no pagination links were present.
     return max((int(x) for x in page_numbers), default=1)


 def parse_topic(topic_url, file_name):
     """Scrape every page of a topic and write the posts to a JSON file.

     Args:
         topic_url: URL of the topic to scrape; passed to ``max_page`` and
             ``retrieve_posts``.
         file_name: Path of the JSON file to write (overwritten if present).
     """
     posts = []
     # Iterate over the real 1-based page numbers so the progress message
     # names the page actually being fetched (it used to print one less).
     for page in range(1, max_page(topic_url) + 1):
         print("retrieving page " + str(page))
         retrieve_posts(topic_url, page, posts)
     with open(file_name, 'w') as outfile:
         json.dump(posts, outfile, indent=2)


 if __name__ == "__main__":
     # Script entry point: scrape the hard-coded topic into a JSON file.
     parse_topic(
         topic_url="https://www.scoop.it/topic/software-craftmanship-and-development",
         file_name="software-engineering.json",
     )
	# coding=utf-8
	# This is a sample Python script.

	# Press ⌃R to execute it or replace it with your code.
	# Press Double ⇧ to search everywhere for classes, files, tool windows, actions, and settings.

	import scrapy
	from requests import get
	from scrapy import Selector
	import json


	# class BlogSpider(scrapy.Spider):
	# name = 'blogspider'
	# start_urls = ['https://www.zyte.com/blog/']
	#
	# def parse(self, response):
	# for title in response.css('.oxy-post-title'):
	# yield {'title': title.css('::text').get()}
	#
	# for next_page in response.css('a.next'):
	# yield response.follow(next_page, self.parse)

	def retrieve_posts(topic_url, number, posts):
	    """Scrape one page of a topic and append the posts found to ``posts``.

	    The page is requested as ``topic_url + "?page=<number>"`` and every
	    ``.postView`` element in the response is inspected.

	    Args:
	        topic_url: Base URL of the topic to scrape.
	        number: Page number to request.
	        posts: List that is mutated in place. One dict per post is appended,
	            with the keys ``"title"`` and ``"url"`` and, when the post has a
	            non-blank description, ``"description"``.
	    """
	    # Without a timeout a stalled server would block the scrape forever.
	    response = get(topic_url + "?page=" + str(number), timeout=30)
	    for post_elmt in Selector(text=response.text).css(".postView"):
	        post_url = post_elmt.css(".postTitleView a::attr(href)").get()
	        post_title = post_elmt.css(".postTitleView a::text").get()
	        if post_url is None or post_title is None:
	            # No linked title inside this element: nothing usable to record.
	            continue
	        post = {"title": post_title.strip(), "url": post_url}
	        # Strip before testing so a whitespace-only text node does not end
	        # up stored as an empty "description".
	        post_description = (
	            post_elmt.css(".post-description blockquote::text").get() or ""
	        ).strip()
	        if post_description:
	            post["description"] = post_description
	        posts.append(post)


	def max_page(topic_url):
	    """Return the highest page number listed in the topic's pagination.

	    Fetches ``topic_url`` and reads the ``data-page`` attribute of every
	    link found under ``nav.pagination li``.

	    Args:
	        topic_url: URL of the topic whose pagination is inspected.

	    Returns:
	        The largest ``data-page`` value as an int, or 1 when the page has no
	        pagination links at all.

	    Raises:
	        ValueError: If a ``data-page`` attribute is not an integer.
	    """
	    # Without a timeout a stalled server would block the scrape forever.
	    html = get(topic_url, timeout=30).text
	    page_numbers = Selector(text=html).css("nav.pagination li a::attr(data-page)").getall()
	    # default=1: max() of an empty sequence raises ValueError, which would
	    # crash the script when no pagination links are present.
	    return max((int(x) for x in page_numbers), default=1)


	def parse_topic(topic_url, file_name):
	    """Scrape every page of a topic and write the posts to a JSON file.

	    Args:
	        topic_url: URL of the topic to scrape; passed to ``max_page`` and
	            ``retrieve_posts``.
	        file_name: Path of the JSON file to write (overwritten if present).
	    """
	    posts = []
	    # Iterate over the real 1-based page numbers so the progress message
	    # names the page actually being fetched rather than one less.
	    for page in range(1, max_page(topic_url) + 1):
	        print("retrieving page " + str(page))
	        retrieve_posts(topic_url, page, posts)
	    with open(file_name, 'w') as outfile:
	        json.dump(posts, outfile, indent=2)


	if __name__ == '__main__':
	    # Script entry point: scrape the hard-coded topic into a JSON file.
	    parse_topic("https://www.scoop.it/topic/software-craftmanship-and-development", "software-engineering.json")