la diaria frontpage title spider
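A Scrapy CrawlSpider that walks la diaria's PDF archive, runs each issue through a pdftohtml service, and extracts the frontpage headline by font size. Assuming the enclosing Scrapy project is the titulos package the imports refer to, a typical invocation would look something like:

    scrapy crawl compartido                 # today's issue (TODAY_ONLY)
    scrapy crawl compartido -a yesterday=1  # yesterday's issue instead
    scrapy check compartido                 # run the @url/@returns contracts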
# -*- coding: utf-8 -*-
import re
import operator
import cssutils
import logging
from datetime import date, datetime, timedelta

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import Request

from titulos import local_settings
from titulos.items import TituloItem, TituloLoader

today_only = getattr(local_settings, 'TODAY_ONLY', True)
try:
    specific_date_only = datetime.strptime(
        getattr(local_settings, 'SPECIFIC_DATE_ONLY'), '%Y-%m-%d').date()
except Exception:
    specific_date_only = None
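# The titulos.local_settings module is not part of this gist. A minimal
# sketch of what it could contain, inferred from the getattr() calls in
# this file (values are illustrative, not from the original project):
#
#     TODAY_ONLY = False                  # crawl the whole archive
#     SPECIFIC_DATE_ONLY = '2014-01-31'   # or target one issue by date
#     RULES = True                        # enable the crawl rules below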
class CompartidoSpider(CrawlSpider):
    name = "compartido"
    allowed_domains = ["compartido.ladiaria", "pdftohtml.ladiaria"]
    start_urls = (
        'http://compartido.ladiaria/Archivo/PublicacionesCompletas/',
        # 'http://compartido.ladiaria/Archivo/PublicacionesCompletas/2006/200604_ABRIL/',
        # 'http://compartido.ladiaria/Archivo/PublicacionesCompletas/2006/200606_JUNIO/',
        # 'http://compartido.ladiaria/Archivo/PublicacionesCompletas/2008/200803_MARZO/',
        # 'http://compartido.ladiaria/Archivo/PublicacionesCompletas/2014/2014_01ENERO/',
        # 'http://compartido.ladiaria/Archivo/PublicacionesCompletas/2015_04ABRIL/',
        # 'http://compartido.ladiaria/Archivo/PublicacionesCompletas/2015_08AGOSTO/',
    )

    # Comment out the rules, or set RULES=False, if the SPECIFIC_DATE_ONLY
    # pdf lives on the base path; narrow the allow pattern to a more precise
    # year/month path when you know where the target pdf is.
    if not today_only and getattr(local_settings, 'RULES', True):
        rules = (
            Rule(LinkExtractor(allow=('/20', )), callback='parse_internal'),
        )

    pdfdates = []
    # Issues before this date use the old frontpage design.
    new_style_date = date(2008, 3, 20)

    def __init__(self, yesterday=False, *args, **kwargs):
        super(CompartidoSpider, self).__init__(*args, **kwargs)
        # Pass -a yesterday=1 on the command line to scrape yesterday's pdf.
        self.yesterday = yesterday == u'1'
    def parse_internal(self, response):
        """
        @url http://compartido.ladiaria/Archivo/PublicacionesCompletas/2014/2014_01ENERO/
        @returns requests 22
        """
        dates, requests = self.find_pdf_urls(response)
        if not dates:
            # No pdfs at this level: follow every link one level deeper.
            le = LinkExtractor()
            for url in le.extract_links(response):
                requests.append(Request(url.url))
        else:
            self.pdfdates.extend(dates)
        return requests

    def parse_start_url(self, response):
        dates, requests = self.find_pdf_urls(response)
        self.pdfdates.extend(dates)
        return requests
    def find_pdf_urls(self, response):
        """
        Return a tuple with the dates of the pdfs found and the list of
        requests to process those pdfs.

        If today_only is set, only today's pdf is scraped (or yesterday's,
        or the SPECIFIC_DATE_ONLY one).
        """
        dates, requests = [], []
        le = LinkExtractor(
            deny_extensions=[], restrict_xpaths='//tr[position()>3]')
        for url in le.extract_links(response):
            # Archive filenames look like ladiaria_YYYYMMDD.pdf
            m = re.match(
                r'.*ladiaria_(20[012]\d)([01]\d)([0123]\d)\.pdf$', url.url)
            if m:
                pdfdate = date(*[int(x) for x in m.groups()])
                if not today_only or pdfdate == date.today() or (
                    self.yesterday and pdfdate == date.today() - timedelta(1)
                ) or (specific_date_only and pdfdate == specific_date_only):
                    # A HEAD request to the conversion service suffices,
                    # since parse_pages only needs the final URL.
                    request = Request(
                        'http://pdftohtml.ladiaria/?url=' + url.url,
                        callback=self.parse_pages, method='HEAD')
                    request.meta['pdfdate'] = pdfdate
                    requests.append(request)
                    dates.append(pdfdate)
        return dates, requests
    def parse_pages(self, response):
        """
        @url http://pdftohtml.ladiaria/?url=http://compartido.ladiaria/Archivo/PublicacionesCompletas/2014/2014_01ENERO/ladiaria_20140131.pdf
        @returns requests 1
        """
        # The service responds at .../salida.html; page 1 of the converted
        # pdf (the frontpage) is at .../salida-1.html.
        request = Request(
            re.sub(r'/salida\.html$', '/salida-1.html', response.url),
            callback=self.parse_frontpage)
        request.meta['pdfdate'] = response.meta.get('pdfdate')
        return request
    def parse_frontpage(self, response):
        """
        Parse the newspaper frontpage HTML: detect the font sizes declared
        in the stylesheet using cssutils, then scrape the title from the
        paragraphs set in the biggest font (or the second biggest, for the
        old style design).
        """
        sheet = cssutils.parseString(
            response.xpath("//style/text()").extract()[0])
        # Map each .ftNN class to its pixel font size.
        fontsizes = {}
        for rule in sheet:
            name = rule.selectorList.selectorText
            if name.startswith('.ft'):
                fontsizes[name] = int(
                    re.match(
                        r'(\d+)px', rule.style['font-size']).groups()[0])
        sorted_fontsizes = sorted(
            fontsizes.items(), key=operator.itemgetter(1))
        pdfdate = response.meta['pdfdate']
        oldstyle = pdfdate < self.new_style_date
        tl = TituloLoader(
            TituloItem(), response.css(
                'p[class="%s"]' %
                sorted_fontsizes[-2 if oldstyle else -1][0][1:]))
        tl.add_xpath('title', ".//text()")
        tl.add_value('pdfdate', pdfdate)
        # Flag the item when the chosen font size ties with the next size
        # down, since the title pick is then ambiguous.
        tl.add_value(
            'check',
            sorted_fontsizes[-2 if oldstyle else -1][1] ==
            sorted_fontsizes[-3 if oldstyle else -2][1])
        return tl.load_item()
    def close(self):
        """
        Check that a title was found for every weekday in the crawled range,
        and warn if one turned up on a weekend.
        """
        first_diaria = today_only and date.today() or specific_date_only or \
            date(2006, 3, 20)  # the paper's first issue
        for single_date in (
                first_diaria + timedelta(n) for n in range(
                    (date.today() - first_diaria).days + 1)):
            if single_date.isoweekday() < 6:
                if single_date not in self.pdfdates:
                    # "no issue for this day"
                    logging.error(
                        "No hay diaria " + single_date.strftime(u'%Y-%m-%d'))
            else:
                if single_date in self.pdfdates:
                    # "came out on a Saturday or Sunday"
                    logging.error(
                        "Salio sab o dom " + single_date.strftime(u'%Y-%m-%d'))