Created
December 11, 2018 02:29
-
-
Save noahpryor/3d7543f3f77380957751592d1bb9936e to your computer and use it in GitHub Desktop.
Houston Chronicle recipe
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import string, re | |
from collections import OrderedDict | |
from calibre import strftime | |
from calibre.web.feeds.recipes import BasicNewsRecipe | |
from calibre.ebooks.BeautifulSoup import BeautifulSoup | |
# (feed title, site-relative path) pairs; each path is appended to base_url
# and its section page is scraped for headlines by the recipe's parse_index.
sections = [
    ('World', '/us-world/world/'),
    ('US', '/us-world/us/'),
    ('Texas', '/local/texas/'),
    ('Houston', '/local/houston/'),
    ('Opinion', '/opinion/editorials/'),
]

# Site root used to build section-page URLs and to resolve relative article links.
base_url = "https://www.houstonchronicle.com"
class HoustonChroniclePaid(BasicNewsRecipe):
    """Calibre recipe that builds an issue from HoustonChronicle.com section pages.

    Each entry of the module-level ``sections`` list becomes one feed; the
    feed's articles are the headline links found on that section's page.
    """

    title = 'Houston Chronicle'
    __author__ = 'Noah Pryor'
    description = 'Daily news from HoustonChronicle.com'
    timefmt = '%Y-%m-%d'
    # NOTE: no login is performed; only publicly reachable pages are fetched.
    needs_subscription = False
    # Keep only the headline and the article body from each downloaded page.
    keep_only_tags = [
        dict(name='h1', attrs={'itemprop': ['headline']}),
        dict(name='section', attrs={'class': ['body']}),
    ]
    no_stylesheets = True
    # extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'

    def parse_index(self):
        """Scrape every configured section page and return the feed list.

        Returns:
            A list of ``(section_title, articles)`` tuples, where ``articles``
            is a list of dicts with the keys ``title``, ``date``, ``url`` and
            ``description``.
        """
        feeds = []
        for section, path in sections:
            self.log('starting parse_index: ' + section)
            soup = self.index_to_soup(base_url + path)

            # A changed page layout must not abort the whole download: treat a
            # missing content container as "no headlines" for this section.
            content = soup.find("div", {'id': 'content'})
            if content is None:
                self.log('no content container found for section: ' + section)
                headlines = []
            else:
                headlines = content.findAll("h2", {'class': "headline"})

            articles = []
            for h2 in headlines:
                a = h2.find("a")
                # Skip headlines that are not links (or have an empty href).
                if a is None or not a.get('href'):
                    continue
                href = a['href']
                if href.startswith('//'):
                    # Protocol-relative link: only the scheme is missing.
                    url = 'https:' + href
                elif href.startswith(('http://', 'https://')):
                    # Already absolute; prefixing base_url would corrupt it.
                    url = href
                else:
                    url = base_url + href
                articles.append({
                    'title': self.tag_to_string(a),
                    'date': '-',
                    'url': url,
                    'description': '',
                })
            feeds.append((section, articles))
        self.log(feeds)
        return feeds
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment