Created
October 14, 2018 23:57
-
-
Save dasdachs/2118c71a73a83fea31a0d05474c63f6c to your computer and use it in GitHub Desktop.
Check built docs for meta tags with Scrapy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# To run it install Scrapy | |
# https://doc.scrapy.org/en/latest/intro/install.html | |
# And then do the following | |
# $ scrapy startproject [your_project_name] | |
# $ cd your_project_name | |
# $ scrapy genspider docs [docs_domain.tld] | |
# | |
# Erase the content of your_project_name/spiders/docs.py | |
# and replace it with this code and run | |
# | |
# $ scrapy crawl docs -o check_meta_tags.csv | |
# | |
# Now you have a CSV file with all the endpoints and their meta tags | |
import re | |
import scrapy | |
from scrapy.linkextractors import LinkExtractor | |
class DocsSpider(scrapy.Spider):
    """Crawl built docs and report the robots ``<meta>`` tag of each page.

    Every link found on the ``start_urls`` index pages is requested and
    handed to :meth:`check_meta`, which yields one item per page holding the
    docs version, the path below that version, and the raw robots meta tag
    (``None`` when the page has none).
    """

    # Captures the version segment that follows "en/" in a docs URL,
    # e.g. "v3.0.x" in ".../en/v3.0.x/genindex.html".
    VERSION_RE = re.compile(r'(?<=en/)[\w\d\.]+(?=/)')
    name = 'docs'
    allowed_domains = ['astro-docs.readthedocs.io']
    # The start_urls are hardcoded for the test I made.
    # Please replace them with your own or the official Astropy docs URLs.
    start_urls = [
        'https://astro-docs.readthedocs.io/en/latest/genindex.html',
        'https://astro-docs.readthedocs.io/en/v3.0.x/genindex.html',
        'https://astro-docs.readthedocs.io/en/v2.0.x_a/genindex.html',
    ]

    def parse(self, response):
        """Request every link on an index page, routing it to ``check_meta``."""
        for link in LinkExtractor().extract_links(response):
            yield scrapy.Request(link.url, callback=self.check_meta)

    def check_meta(self, response):
        """Yield the docs version, URI and robots meta tag for one page.

        Pages whose URL has no ``en/<version>/`` segment are skipped, since
        they cannot be attributed to a docs version.
        """
        match = self.VERSION_RE.search(response.url)
        if match is None:
            # Previously this raised AttributeError (None.group()) and the
            # page was lost with a traceback; skip it explicitly instead.
            self.logger.debug('No docs version in URL, skipping: %s', response.url)
            return

        version = match.group()
        meta = response.xpath('//meta[contains(@name, "robots")]').extract_first()
        # Slice at the end of the matched version rather than splitting on its
        # text: split() picks the wrong piece when the version string occurs
        # more than once in the URL (e.g. ".../en/latest/latest/index.html").
        uri = response.url[match.end():]
        yield {
            "docs_version": version,
            "uri": uri,
            "meta_tag": meta,
        }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment