@wf34
Last active May 12, 2018 14:55
Web Scraping Wikipedia
import requests
import lxml.html as html


def get_page(url):
    # Fetch the raw HTML of the page; fail fast if the server is slow.
    return requests.get(url, timeout=1.0).text


def parse_page(song_url):
    page_source = get_page(song_url)
    assert isinstance(page_source, str)
    root = html.fromstring(page_source)

    # Positional XPath to the eleventh paragraph of the article body.
    TARGET_XPATH = '//*[@id="mw-content-text"]/div/p[11]'
    target_html = root.xpath(TARGET_XPATH)
    assert len(target_html) == 1
    target_html = target_html[0]

    # Collect the paragraph's own text plus the text and tails of its
    # child elements (links, formatting tags), then drop empty pieces.
    output = [target_html.text]
    for c in target_html.getchildren():
        output.extend([c.text, c.tail])
    return list(filter(lambda x: x, output))


target_url = 'https://en.wikipedia.org/wiki/Periodic_table'
print(' '.join(parse_page(target_url)))
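
The manual walk over text and tail attributes can be collapsed with lxml's text_content(), which concatenates the text of an element and all of its descendants. The sketch below is not part of the original gist; the function name parse_page_flat is hypothetical, and it assumes the same page fetch and XPath lookup as above.

def parse_page_flat(song_url):
    # Same fetch and positional XPath lookup as parse_page above.
    root = html.fromstring(get_page(song_url))
    nodes = root.xpath('//*[@id="mw-content-text"]/div/p[11]')
    assert len(nodes) == 1
    # text_content() flattens the paragraph, including the text inside
    # inline links, into a single string in document order.
    return nodes[0].text_content()

Calling print(parse_page_flat(target_url)) would print the paragraph as one string, without the extra spaces that ' '.join inserts around inline elements.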