Created
August 11, 2017 05:17
-
-
Save zimmicz/f69a5ce5d3cf3a220e171553c35e0391 to your computer and use it in GitHub Desktop.
PostgreSQL docs scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests | |
from bs4 import BeautifulSoup | |
# Address of the SQL command index page; the literal "VERSION" is a
# placeholder that gets replaced with a release number.
URL = "https://www.postgresql.org/docs/VERSION/static/sql-commands.html"

# Release numbers to scrape, in the order they are processed.
VERSIONS = [
    "10.0",
    "9.6", "9.5", "9.4", "9.3", "9.2", "9.1", "9.0",
    "8.4", "8.3", "8.2", "8.1", "8.0",
    "7.4", "7.3", "7.2", "7.1", "7.0",
]

# Scrape results keyed by release number.
CAPS = {}
def replace_version_in_url(new_version):
    """Return the module-level ``URL`` with ``VERSION`` filled in.

    Every occurrence of the literal text ``VERSION`` in ``URL`` is
    substituted with *new_version*.
    """
    # Cutting the template at the placeholder and gluing the pieces back
    # together with the version string substitutes every occurrence.
    pieces = URL.split("VERSION")
    return new_version.join(pieces)
def fetch_page(url, timeout=30):
    """Download *url* with an HTTP GET and return the body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.  Without
            a timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole scrape.

    Returns:
        The decoded response body.  The HTTP status is not checked, so the
        body of an error page is returned like any other.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout expires.
    """
    return requests.get(url, timeout=timeout).text
def make_soup(html):
    """Parse an HTML string with the ``lxml`` parser and return the tree."""
    parser_name = "lxml"
    tree = BeautifulSoup(html, parser_name)
    return tree
def get_links_from_soup(soup):
    """Collect the ``href`` of every anchor that targets a ``sql-*`` page.

    Args:
        soup: Parsed document.  Calling it with a tag name must return the
            matching elements, and each element must offer ``.get(key)``
            for attribute lookup.

    Returns:
        A list of the href values that start with ``"sql-"``, in the order
        the anchors were returned; duplicates are kept.
    """
    # Anchors without an href attribute produce None here and are filtered
    # out below, so no exception handling is needed for them.
    hrefs = (anchor.get("href") for anchor in soup("a"))
    return [href for href in hrefs if href and href.startswith("sql-")]
def get_word_count_from_soup(soup):
    """Return the number of characters in the page's first ``<pre>`` block.

    Despite the function name, characters are counted, not words: the
    result is the summed length of the whitespace-stripped text fragments
    inside the first ``<pre>`` element (presumably the command synopsis).

    Args:
        soup: Parsed document.  Calling it with a tag name must return the
            matching elements.

    Returns:
        The character count, or 0 when the page has no ``<pre>`` element.
    """
    pre_blocks = soup("pre")
    if not pre_blocks:
        # A page without a <pre> block has nothing to measure; report zero
        # instead of aborting the whole run with an IndexError.
        return 0
    return sum(len(fragment) for fragment in pre_blocks[0].stripped_strings)
def get_title_from_soup(soup):
    """Return the part of the page title after its last ``": "``.

    The text of the first ``<title>`` element is used; when it contains no
    ``": "`` separator the whole title is returned.
    """
    title_tag = soup("title")[0]
    full_title = title_tag.string
    # rpartition leaves everything after the final separator in the last
    # slot, or the whole string there when the separator is absent.
    _, _, tail = full_title.rpartition(": ")
    return tail
def get_data_for_version(links, version):
    """Fetch every linked command page of one release and measure it.

    Args:
        links: Page file names (hrefs) relative to the directory of the
            command index page.
        version: Release number used to build the index URL.

    Returns:
        A dict mapping each page's title (as returned by
        ``get_title_from_soup``) to its count from
        ``get_word_count_from_soup``.  Pages that share a title overwrite
        each other; the last one fetched wins.
    """
    # Everything before the last "/" of the index URL is the directory the
    # linked pages are resolved against.  It is the same for every link, so
    # it is computed once instead of on each iteration.
    base_url = replace_version_in_url(version).rsplit("/", 1)[0]
    data = {}
    for link in links:
        page = fetch_page(base_url + "/" + link)
        soup = make_soup(page)
        title = get_title_from_soup(soup)
        data[title] = get_word_count_from_soup(soup)
    return data
# Scrape every configured release and append one CSV row per command page.
for VERSION in VERSIONS:
    # Progress output.  The call form of print with a single argument
    # behaves the same on Python 2 and Python 3, whereas the statement form
    # is a syntax error on Python 3.
    print(VERSION)
    url = replace_version_in_url(VERSION)
    page = fetch_page(url)
    soup = make_soup(page)
    links = get_links_from_soup(soup)
    CAPS[VERSION] = get_data_for_version(links, VERSION)
    # The file is opened in append mode once per version, so a version's
    # rows are written as soon as that version is done; running the script
    # again adds to an existing data.csv rather than replacing it.
    with open("data.csv", "a") as f:
        for func, letter_count in CAPS[VERSION].items():
            print("%s %s" % (func, letter_count))
            # Row layout: version;title;count
            f.write(VERSION + ";" + func.strip() + ";" + str(letter_count) + "\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment