Created
November 6, 2019 01:19
-
-
Save ggsalas/6dae989ac7db618ec027c52f1f2ccc9f to your computer and use it in GitHub Desktop.
Recipe for Calibre to get lacapitalmdp.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function

from calibre import strftime
from calibre.web.feeds.news import AutomaticNewsRecipe
class BasicUserRecipe1572999042(AutomaticNewsRecipe):
    """Calibre recipe that downloads the newspaper at lacapitalmdp.com.

    Articles are collected from the RSS feeds listed in ``feeds``. Each
    article page is reduced to its headline/summary, image and body
    containers, restyled with ``extra_css``, and post-processed to turn
    links into bold text and drop paragraphs that hold only a
    non-breaking space.
    """

    title = 'Diario La Capital de Mar del Plata'
    oldest_article = 1
    max_articles_per_feed = 35
    # The date placeholders are expanded by strftime when the class is defined.
    cover_url = strftime('http://www.lacapitalmdp.com/tapas/%Y/%m/%d/grande.jpg')
    ignore_duplicate_articles = {'title', 'url'}
    remove_empty_feeds = True
    publication_type = 'newspaper'
    use_embedded_content = False

    compress_news_images = True
    scale_news_images_to_device = True
    compress_news_images_max_size = 10  # kB
    # Calibre documents this option as a (width, height) tuple or None, so
    # the previous value of True was not valid. With None the target size is
    # left to scale_news_images_to_device above.
    scale_news_images = None
    handle_gzip = True

    # To get all the data (images)
    auto_cleanup = False

    keep_only_tags = [
        dict(attrs={'class': ['titulo_bajada', 'nota_imagen_container', 'nota_content']}),
    ]
    remove_tags = [
        dict(name=['meta', 'base', 'link', 'iframe', 'embed', 'object']),
        dict(attrs={'class': ['category_container']}),
    ]
    remove_attributes = ['style', 'font']
    no_stylesheets = True

    extra_css = """
        .nota_imagen_container {
            font-style: italic;
            font-size: .9em;
            margin-bottom: .5em;
        }
        .titulo_bajada h1 {
            line-height: 1em;
            margin: 0 0 .5em 0;
        }
        .titulo_bajada .bajada {
            font-size: 1em;
            line-height: 1em;
            color: #666666;
            margin: 0 0 1em 0;
        }
    """

    feeds = [
        ('Portada', 'https://www.lacapitalmdp.com/feed/'),
        ('Inter\xe9s General', 'https://www.lacapitalmdp.com/categorias/interes-general/feed/'),
        ('La Ciudad', 'https://www.lacapitalmdp.com/categorias/la-ciudad/feed/'),
        ('Provincia', 'https://www.lacapitalmdp.com/categorias/provincia/feed/'),
        ('El Pa\xeds', 'https://www.lacapitalmdp.com/categorias/el-pais/feed/'),
        ('El Mundo', 'https://www.lacapitalmdp.com/categorias/el-mundo/feed/'),
        ('Tecnolog\xeda', 'https://www.lacapitalmdp.com/categorias/tecnologia/feed/'),
        ('Gastronom\xeda', 'https://www.lacapitalmdp.com/categorias/gastronomia/feed/'),
        ('Arte y Espect\xe1culos', 'https://www.lacapitalmdp.com/categorias/espectaculos/feed/'),
        ('Salud', 'https://www.lacapitalmdp.com/categorias/salud/feed/'),
        ('Cartelera', 'http://cartelera.lacapitalmdp.com/feed/'),
        ('Deportes', 'https://www.lacapitalmdp.com/categorias/deportes/feed/'),
        ('Policiales', 'https://www.lacapitalmdp.com/categorias/policiales/feed/'),
    ]

    # Images on highlights view
    def populate_article_metadata(self, article, soup, first):
        """Register the article's first usable image as its TOC thumbnail.

        :param article: article object handed over by the recipe framework.
        :param soup: parsed HTML of the downloaded article page.
        :param first: true only for the first page of the article; other
            pages are ignored.
        """
        if not first or not hasattr(self, 'add_toc_thumbnail'):
            return
        # Only match <img> tags that actually carry a src attribute, so an
        # image without one cannot raise KeyError here.
        thumbnail = soup.find('img', src=True)
        if thumbnail is not None:
            self.add_toc_thumbnail(article, thumbnail['src'])

    def postprocess_html(self, soup, first_fetch):
        """Tidy a downloaded page and return the modified soup.

        Every ``<a>`` element is renamed to ``<strong>``, and every ``<p>``
        whose text is exactly one non-breaking space is removed.

        :param soup: parsed HTML of the downloaded page (modified in place).
        :param first_fetch: unused here; part of the hook's signature.
        :return: the same ``soup`` object.
        """
        for link in soup.findAll('a'):
            link.name = 'strong'

        non_break_space = u'\xa0'
        for filler in soup.findAll('p', string=non_break_space):
            filler.extract()
        return soup
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment