Last active
May 25, 2017 19:06
-
-
Save parkj90/b8d878abd4091efc63f9fe99cbb36ac0 to your computer and use it in GitHub Desktop.
wikipedia challenge
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import requests | |
from lxml import html | |
import time | |
# Command-line handling: the first argument is the starting article URL and
# the optional second argument caps the number of page jumps (default 25).
if len(sys.argv) < 2:
    print("Usage: {} URL [page jump limit]".format(sys.argv[0]))
    # Use sys.exit() rather than the bare exit(): the latter is injected by
    # the `site` module for interactive sessions and is not guaranteed to
    # exist in every run (e.g. `python -S`). Exit status is unchanged.
    sys.exit()
link = sys.argv[1]
limit = int(sys.argv[2]) if len(sys.argv) > 2 else 25
def get_page_tree(page):
    """Parse a fetched page into its root tree element.

    `page` is the response object handed over by the caller (the result of
    `requests.get`); its raw body is read from `page.content` and parsed
    with `html.fromstring`.
    """
    markup = page.content
    return html.fromstring(markup)
def get_title(tree):
    """Return the article title shown in the page's first heading.

    Args:
        tree: Parsed page root; must support ``xpath``.

    Returns:
        The full text of the element with id ``firstHeading``, including
        text nested inside child markup.

    Raises:
        IndexError: If the page has no element with id ``firstHeading``.
    """
    heading = tree.xpath('//*[@id="firstHeading"]')[0]
    # `.text` only covers the text before the first child element, so a
    # heading whose title is wrapped in markup (e.g. an italicised title)
    # came back as None. text_content() gathers the whole subtree's text.
    return heading.text_content()
def _paren_depth(text, depth):
    """Return the parenthesis nesting depth after scanning `text`.

    `text` may be None (an element without text/tail). The depth never
    drops below zero, so a stray ')' cannot hide every later link.
    """
    for char in text or '':
        if char == '(':
            depth += 1
        elif char == ')' and depth > 0:
            depth -= 1
    return depth


def get_first_link(tree):
    """Return the URL of the first article link that is not in parentheses.

    Only `<a>` elements that are direct children of a content paragraph are
    considered, and only when their href starts with ``/wiki/`` (anchors
    without href, fragment links and absolute URLs cannot be joined onto
    the site prefix).

    Args:
        tree: Parsed page root; must support ``xpath``.

    Returns:
        The absolute ``https://en.wikipedia.org/...`` URL of the next page
        in the chain, or None when no paragraph contains a usable link.
    """
    # The second alternative covers pages whose paragraphs sit one level
    # deeper, inside a "mw-parser-output" wrapper div.
    # NOTE(review): wrapper class taken from current MediaWiki output -
    # confirm against a live page.
    paragraphs = tree.xpath(
        '//*[@id="mw-content-text"]/p'
        ' | //*[@id="mw-content-text"]'
        '/div[contains(@class, "mw-parser-output")]/p'
    )
    for paragraph in paragraphs:
        # Start each paragraph balanced and include its leading text, which
        # precedes the first child element and is not part of any child.
        depth = _paren_depth(paragraph.text, 0)
        for child in paragraph:
            # Comments and processing instructions have a non-string tag.
            is_element = isinstance(child.tag, str)
            if is_element and child.tag == 'a' and depth == 0:
                href = child.get('href', '')
                if href.startswith('/wiki/'):
                    return 'https://en.wikipedia.org' + href
            # Count parentheses in visible text only; serialized markup
            # would also count parentheses inside attribute values.
            if is_element:
                depth = _paren_depth(''.join(child.itertext()), depth)
            depth = _paren_depth(child.tail, depth)
    return None
# Follow the chain of first links starting at `link`, printing each page's
# URL and title, until the "Philosophy" article is reached, the chain hits a
# dead end, or `limit` pages have been visited.
for step in range(limit):
    # A timeout keeps a single stalled request from hanging the whole run.
    page = requests.get(link, timeout=10)
    tree = get_page_tree(page)
    title = get_title(tree)
    print("{}\n#{}: {}".format(link, step + 1, title))
    if title == 'Philosophy':
        break
    link = get_first_link(tree)
    if link is None:
        # get_first_link found nothing to follow; requesting None would
        # only raise inside requests, so stop with a clear message.
        print("No further link found; stopping.")
        break
    # Brief pause between consecutive requests.
    time.sleep(0.1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
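Suggestion for line 27 (the `return` in `get_first_link`): 'https://en.wikipedia.org' + str(e.attrib['href']).replace('(', '%28').replace(')', '%29')
Percent-encoding the parentheses in the href this way should solve the problem with parenthesized links.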