Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save ggsalas/6dae989ac7db618ec027c52f1f2ccc9f to your computer and use it in GitHub Desktop.
Save ggsalas/6dae989ac7db618ec027c52f1f2ccc9f to your computer and use it in GitHub Desktop.
Recipe for Calibre to get lacapitalmdp.com
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import AutomaticNewsRecipe
from calibre import strftime
class BasicUserRecipe1572999042(AutomaticNewsRecipe):
    """Calibre news recipe for the newspaper La Capital (Mar del Plata).

    Downloads the RSS feeds listed in ``feeds``, keeps only the headline,
    lead image and body containers of each article page, and compresses
    the images that are kept.
    """

    title = 'Diario La Capital de Mar del Plata'
    publication_type = 'newspaper'
    # The date pattern is expanded once, when this class body is evaluated.
    cover_url = strftime('http://www.lacapitalmdp.com/tapas/%Y/%m/%d/grande.jpg')

    oldest_article = 1
    max_articles_per_feed = 35
    ignore_duplicate_articles = {'title', 'url'}
    remove_empty_feeds = True
    use_embedded_content = False
    handle_gzip = True

    # Image handling.
    compress_news_images = True
    compress_news_images_max_size = 10  # kB
    scale_news_images_to_device = True
    # calibre expects None or a (width, height) tuple here. The previous
    # value ``True`` cannot be unpacked into dimensions, which makes image
    # rescaling fail whenever the output profile does not override it.
    # With None, scale_news_images_to_device alone decides the target size.
    scale_news_images = None

    # To get all the data (images), automatic cleanup is disabled and the
    # article content is selected explicitly below.
    auto_cleanup = False
    keep_only_tags = [
        dict(attrs={'class': ['titulo_bajada', 'nota_imagen_container', 'nota_content']}),
    ]
    remove_tags = [
        dict(name=['meta', 'base', 'link', 'iframe', 'embed', 'object']),
        dict(attrs={'class': ['category_container']}),
    ]
    remove_attributes = ['style', 'font']
    no_stylesheets = True
    # The CSS lines are kept flush-left so the string content is unchanged.
    extra_css = """
.nota_imagen_container {
font-style: italic;
font-size: .9em;
margin-bottom: .5em;
}
.titulo_bajada h1 {
line-height: 1em;
margin: 0 0 .5em 0;
}
.titulo_bajada .bajada {
font-size: 1em;
line-height: 1em;
color: #666666;
margin: 0 0 1em 0;
}
"""

    feeds = [
        ('Portada', 'https://www.lacapitalmdp.com/feed/'),
        ('Inter\xe9s General', 'https://www.lacapitalmdp.com/categorias/interes-general/feed/'),
        ('La Ciudad', 'https://www.lacapitalmdp.com/categorias/la-ciudad/feed/'),
        ('Provincia', 'https://www.lacapitalmdp.com/categorias/provincia/feed/'),
        ('El Pa\xeds', 'https://www.lacapitalmdp.com/categorias/el-pais/feed/'),
        ('El Mundo', 'https://www.lacapitalmdp.com/categorias/el-mundo/feed/'),
        ('Tecnolog\xeda', 'https://www.lacapitalmdp.com/categorias/tecnologia/feed/'),
        ('Gastronom\xeda', 'https://www.lacapitalmdp.com/categorias/gastronomia/feed/'),
        ('Arte y Espect\xe1culos', 'https://www.lacapitalmdp.com/categorias/espectaculos/feed/'),
        ('Salud', 'https://www.lacapitalmdp.com/categorias/salud/feed/'),
        ('Cartelera', 'http://cartelera.lacapitalmdp.com/feed/'),
        ('Deportes', 'https://www.lacapitalmdp.com/categorias/deportes/feed/'),
        ('Policiales', 'https://www.lacapitalmdp.com/categorias/policiales/feed/'),
    ]

    # Images on highlights view
    def populate_article_metadata(self, article, soup, first):
        """Register the first usable image of the page as the TOC thumbnail.

        Nothing happens unless ``first`` is true and this recipe object
        provides ``add_toc_thumbnail``.

        :param article: Article object the thumbnail is attached to.
        :param soup: Parsed HTML of the downloaded page.
        :param first: Flag passed in by calibre; only truthy calls are handled.
        """
        if not first or not hasattr(self, 'add_toc_thumbnail'):
            return
        # Only consider images that carry a ``src``; indexing ['src'] on an
        # <img> without one would raise KeyError.
        thumbnail = soup.find('img', src=True)
        if thumbnail is not None:
            self.add_toc_thumbnail(article, thumbnail['src'])

    def postprocess_html(self, soup, first_fetch):
        """Flatten links and drop spacer paragraphs from a downloaded page.

        Every ``<a>`` element is renamed to ``<strong>``, and every ``<p>``
        matched by the non-breaking-space filter is removed.

        :param soup: Parsed HTML of the downloaded page; modified in place.
        :param first_fetch: Unused here.
        :return: The same ``soup`` object.
        """
        for anchor in soup.findAll(['a']):
            anchor.name = 'strong'
        non_breaking_space = u'\xa0'
        for spacer in soup.findAll('p', string=non_breaking_space):
            spacer.extract()
        return soup
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment