Follows links from a main page and downloads the content behind each of those links.
#!/usr/bin/env python3
#
# Follows links from a main page and downloads the content behind each
# link. The script was written for a specific site, so don't wonder why
# all selectors are hard coded.
#
from lxml import html
from lxml import etree
import requests
import sys

MAIN_PAGE = ''


def get_main_links():
    """Fetches the main page and returns all anchors found in its table."""
    req = requests.get(MAIN_PAGE, verify=False)
    page = html.document_fromstring(req.text)
    return page.cssselect('.field-item.even table a')


def clean_empty_tags(node):
    """
    Finds all <p> tags which contain only a non-breaking space. They come
    out broken and we won't need them anyway.
    """
    for empty in node.xpath("//p[.='\xa0']"):
        empty.getparent().remove(empty)


def writeout_html():
    """Downloads each linked page and writes its cleaned body to stdout."""
    for link in get_main_links():
        page = html.document_fromstring(
            requests.get(link.get('href'), verify=False).text)
        page_content = page.cssselect(
            'div.region-content div.field-name-body')[0]
        clean_empty_tags(page_content)
        sys.stdout.write(
            etree.tostring(
                page_content,
                encoding='utf-8',
                pretty_print=False).decode('utf-8')
        )


if __name__ == '__main__':
    writeout_html()
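
Since MAIN_PAGE is left empty in the gist, nothing is downloaded until a start URL is filled in. Below is a minimal usage sketch; the module name, the URL, and the urllib3.disable_warnings() call are assumptions for illustration, not part of the original script (the warning suppression is only there because verify=False makes requests emit an InsecureRequestWarning for every fetch).

# Minimal usage sketch -- module name and URL are placeholders, not from the gist.
import urllib3

import fetch_links  # hypothetical name for the gist script saved as fetch_links.py

urllib3.disable_warnings()  # silence the warnings triggered by verify=False
fetch_links.MAIN_PAGE = 'https://example.org/article-index'  # placeholder start page
fetch_links.writeout_html()  # prints the cleaned body of every linked page to stdout

Because the script writes straight to sys.stdout, redirecting the output (for example, python3 fetch_links.py > combined.html) collects the cleaned bodies of all linked pages into a single HTML file.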