la diaria frontpage title spider
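A Scrapy CrawlSpider that walks la diaria's PDF archive, runs each issue through a pdftohtml service, and extracts the frontpage headline by font size. Assuming the enclosing Scrapy project is the titulos package the imports refer to, a typical invocation would look something like:

    scrapy crawl compartido                 # today's issue (TODAY_ONLY)
    scrapy crawl compartido -a yesterday=1  # yesterday's issue instead
    scrapy check compartido                 # run the @url/@returns contracts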
# -*- coding: utf-8 -*-
import re
import operator
import cssutils
import logging
from datetime import date, datetime, timedelta

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import Request

from titulos import local_settings
from titulos.items import TituloItem, TituloLoader

today_only = getattr(local_settings, 'TODAY_ONLY', True)
try:
    specific_date_only = datetime.strptime(
        getattr(local_settings, 'SPECIFIC_DATE_ONLY'), '%Y-%m-%d').date()
except Exception:
    specific_date_only = None
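# The titulos.local_settings module is not part of this gist. A minimal
# sketch of what it could contain, inferred from the getattr() calls in
# this file (values are illustrative, not from the original project):
#
#     TODAY_ONLY = False                  # crawl the whole archive
#     SPECIFIC_DATE_ONLY = '2014-01-31'   # or target one issue by date
#     RULES = True                        # enable the crawl rules below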
class CompartidoSpider(CrawlSpider):
    name = "compartido"
    allowed_domains = ["compartido.ladiaria", "pdftohtml.ladiaria"]
    start_urls = (
        'http://compartido.ladiaria/Archivo/PublicacionesCompletas/',
        # 'http://compartido.ladiaria/Archivo/PublicacionesCompletas/2006/200604_ABRIL/',
        # 'http://compartido.ladiaria/Archivo/PublicacionesCompletas/2006/200606_JUNIO/',
        # 'http://compartido.ladiaria/Archivo/PublicacionesCompletas/2008/200803_MARZO/',
        # 'http://compartido.ladiaria/Archivo/PublicacionesCompletas/2014/2014_01ENERO/',
        # 'http://compartido.ladiaria/Archivo/PublicacionesCompletas/2015_04ABRIL/',
        # 'http://compartido.ladiaria/Archivo/PublicacionesCompletas/2015_08AGOSTO/',
    )

    # Comment out the rules, or set RULES=False, if the SPECIFIC_DATE_ONLY
    # pdf lives on the base path; narrow the allow pattern to a more precise
    # year/month path when you know where the target pdf is.
    if not today_only and getattr(local_settings, 'RULES', True):
        rules = (
            Rule(LinkExtractor(allow=('/20', )), callback='parse_internal'),
        )

    pdfdates = []
    # Issues before this date use the old frontpage design.
    new_style_date = date(2008, 3, 20)

    def __init__(self, yesterday=False, *args, **kwargs):
        super(CompartidoSpider, self).__init__(*args, **kwargs)
        # Pass -a yesterday=1 on the command line to scrape yesterday's pdf.
        self.yesterday = yesterday == u'1'
    def parse_internal(self, response):
        """
        @url http://compartido.ladiaria/Archivo/PublicacionesCompletas/2014/2014_01ENERO/
        @returns requests 22
        """
        dates, requests = self.find_pdf_urls(response)
        if not dates:
            # No pdfs at this level: follow every link one level deeper.
            le = LinkExtractor()
            for url in le.extract_links(response):
                requests.append(Request(url.url))
        else:
            self.pdfdates.extend(dates)
        return requests

    def parse_start_url(self, response):
        dates, requests = self.find_pdf_urls(response)
        self.pdfdates.extend(dates)
        return requests
    def find_pdf_urls(self, response):
        """
        Return a tuple with the dates of the pdfs found and the list of
        requests to process those pdfs.

        If today_only is set, only today's pdf is scraped (or yesterday's,
        or the SPECIFIC_DATE_ONLY one).
        """
        dates, requests = [], []
        le = LinkExtractor(
            deny_extensions=[], restrict_xpaths='//tr[position()>3]')
        for url in le.extract_links(response):
            # Archive filenames look like ladiaria_YYYYMMDD.pdf
            m = re.match(
                r'.*ladiaria_(20[012]\d)([01]\d)([0123]\d)\.pdf$', url.url)
            if m:
                pdfdate = date(*[int(x) for x in m.groups()])
                if not today_only or pdfdate == date.today() or (
                    self.yesterday and pdfdate == date.today() - timedelta(1)
                ) or (specific_date_only and pdfdate == specific_date_only):
                    # A HEAD request to the conversion service suffices,
                    # since parse_pages only needs the final URL.
                    request = Request(
                        'http://pdftohtml.ladiaria/?url=' + url.url,
                        callback=self.parse_pages, method='HEAD')
                    request.meta['pdfdate'] = pdfdate
                    requests.append(request)
                    dates.append(pdfdate)
        return dates, requests
    def parse_pages(self, response):
        """
        @url http://pdftohtml.ladiaria/?url=http://compartido.ladiaria/Archivo/PublicacionesCompletas/2014/2014_01ENERO/ladiaria_20140131.pdf
        @returns requests 1
        """
        # The service responds at .../salida.html; page 1 of the converted
        # pdf (the frontpage) is at .../salida-1.html.
        request = Request(
            re.sub(r'/salida\.html$', '/salida-1.html', response.url),
            callback=self.parse_frontpage)
        request.meta['pdfdate'] = response.meta.get('pdfdate')
        return request
    def parse_frontpage(self, response):
        """
        Parse the newspaper frontpage HTML: detect the font sizes declared
        in the stylesheet using cssutils, then scrape the title from the
        paragraphs set in the biggest font (or the second biggest, for the
        old style design).
        """
        sheet = cssutils.parseString(
            response.xpath("//style/text()").extract()[0])
        # Map each .ftNN class to its pixel font size.
        fontsizes = {}
        for rule in sheet:
            name = rule.selectorList.selectorText
            if name.startswith('.ft'):
                fontsizes[name] = int(
                    re.match(
                        r'(\d+)px', rule.style['font-size']).groups()[0])
        sorted_fontsizes = sorted(
            fontsizes.items(), key=operator.itemgetter(1))
        pdfdate = response.meta['pdfdate']
        oldstyle = pdfdate < self.new_style_date
        tl = TituloLoader(
            TituloItem(), response.css(
                'p[class="%s"]' %
                sorted_fontsizes[-2 if oldstyle else -1][0][1:]))
        tl.add_xpath('title', ".//text()")
        tl.add_value('pdfdate', pdfdate)
        # Flag the item when the chosen font size ties with the next size
        # down, since the title pick is then ambiguous.
        tl.add_value(
            'check',
            sorted_fontsizes[-2 if oldstyle else -1][1] ==
            sorted_fontsizes[-3 if oldstyle else -2][1])
        return tl.load_item()
    def close(self):
        """
        Check that a title was found for every weekday in the crawled range,
        and warn if one turned up on a weekend.
        """
        first_diaria = today_only and date.today() or specific_date_only or \
            date(2006, 3, 20)  # the paper's first issue
        for single_date in (
                first_diaria + timedelta(n) for n in range(
                    (date.today() - first_diaria).days + 1)):
            if single_date.isoweekday() < 6:
                if single_date not in self.pdfdates:
                    # "no issue for this day"
                    logging.error(
                        "No hay diaria " + single_date.strftime(u'%Y-%m-%d'))
            else:
                if single_date in self.pdfdates:
                    # "came out on a Saturday or Sunday"
                    logging.error(
                        "Salio sab o dom " + single_date.strftime(u'%Y-%m-%d'))