Created
March 9, 2021 14:24
-
-
Save ygrenzinger/7d5c6f18197eff8309ce0a7d12d25e4d to your computer and use it in GitHub Desktop.
Scraping scoop.it
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding=utf-8
# This is a sample Python script.
# Press ⌃R to execute it or replace it with your code.
# Press Double ⇧ to search everywhere for classes, files, tool windows, actions, and settings.
import scrapy | |
from requests import get | |
from scrapy import Selector | |
import json | |
# class BlogSpider(scrapy.Spider):
#     name = 'blogspider'
#     start_urls = ['https://www.zyte.com/blog/']
#
#     def parse(self, response):
#         for title in response.css('.oxy-post-title'):
#             yield {'title': title.css('::text').get()}
#
#         for next_page in response.css('a.next'):
#             yield response.follow(next_page, self.parse)
def retrieve_posts(topic_url, number, posts):
    """Fetch one page of a topic and append the posts found on it to ``posts``.

    Args:
        topic_url: Base URL of the topic, without a query string.
        number: Page number, sent as the ``page`` query parameter.
        posts: List mutated in place. One dict per post is appended, holding
            the ``"title"`` and ``"url"`` keys, plus ``"description"`` when
            the post has a description.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps one stalled connection from hanging the whole scrape.
    response = get(topic_url + "?page=" + str(number), timeout=30)
    # Fail loudly on 4xx/5xx rather than silently parsing an error page
    # and reporting "no posts".
    response.raise_for_status()

    for post_elmt in Selector(text=response.text).css(".postView"):
        post_url = post_elmt.css(".postTitleView a::attr(href)").get()
        post_title = post_elmt.css(".postTitleView a::text").get()
        # Elements without a link or a title are not usable posts; skip them.
        if post_url is None or post_title is None:
            continue

        post = {"title": post_title.strip(), "url": post_url}
        post_description = post_elmt.css(".post-description blockquote::text").get()
        if post_description:
            post["description"] = post_description.strip()
        posts.append(post)
def max_page(topic_url):
    """Return the highest page number listed in a topic's pagination.

    Reads the ``data-page`` attribute of every link inside ``nav.pagination``
    on the page at ``topic_url``.

    Args:
        topic_url: URL of the topic page to inspect.

    Returns:
        The largest numeric ``data-page`` value, or ``1`` when the page has
        no numeric pagination link, so that a topic without a pagination
        bar is still treated as having a single page.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps a stalled connection from hanging the script.
    response = get(topic_url, timeout=30)
    response.raise_for_status()

    raw_values = Selector(text=response.text).css(
        "nav.pagination li a::attr(data-page)"
    ).getall()
    # Ignore non-numeric attribute values instead of crashing in int().
    page_numbers = [int(value) for value in raw_values if value.strip().isdecimal()]
    # default=1 avoids ValueError from max() on an empty sequence.
    return max(page_numbers, default=1)
def parse_topic(topic_url, file_name):
    """Scrape every page of a topic and write the collected posts as JSON.

    Args:
        topic_url: Base URL of the topic to scrape.
        file_name: Path of the JSON file to write; it is overwritten if it
            already exists.
    """
    posts = []
    last_page = max_page(topic_url)
    # Pages are requested as 1..last_page; iterate over the real page
    # numbers so the progress message matches the page being fetched.
    for page_number in range(1, last_page + 1):
        print("retrieving page " + str(page_number))
        retrieve_posts(topic_url, page_number, posts)

    # Explicit encoding so the output does not depend on the platform default.
    with open(file_name, "w", encoding="utf-8") as outfile:
        json.dump(posts, outfile, indent=2)
if __name__ == "__main__":
    # Script entry point: scrape the topic below and store its posts as JSON
    # in the current working directory.
    parse_topic(
        topic_url="https://www.scoop.it/topic/software-craftmanship-and-development",
        file_name="software-engineering.json",
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment