Skip to content

Instantly share code, notes, and snippets.

@wf34
Last active May 12, 2018 15:52
Show Gist options
  • Save wf34/f68cc10f65c5219b793dc39911cd5411 to your computer and use it in GitHub Desktop.
Save wf34/f68cc10f65c5219b793dc39911cd5411 to your computer and use it in GitHub Desktop.
Web-Scraping with Headless Chrome
#!/usr/bin/env python3
import sys
import os
import subprocess
import lxml.html as html
import lxml.etree as etree
def get_page(url, *, browser='google-chrome-stable'):
    """Return the rendered DOM of *url* as dumped by headless Chrome.

    The browser is started directly from an argument list (no shell), so
    characters such as ``&``, ``;`` or ``?`` inside the URL reach the browser
    verbatim instead of being interpreted by ``/bin/sh``.

    Args:
        url: Address of the page to load.
        browser: Executable to launch; defaults to ``google-chrome-stable``.

    Returns:
        The browser's standard output decoded as text, with a single trailing
        newline removed (the same trimming ``subprocess.getoutput`` applies).
        Standard error is not captured, so browser diagnostics are not mixed
        into the returned markup. The exit status is not checked.

    Raises:
        OSError: If the executable cannot be started (for example
            ``FileNotFoundError`` when it is not installed).
    """
    command = [browser, '--headless', '--disable-gpu', '--dump-dom', url]
    completed = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        universal_newlines=True,  # decode stdout to str
    )
    dom = completed.stdout
    return dom[:-1] if dom.endswith('\n') else dom
def parse_chunk(node):
    """Flatten one element of the lyrics markup into plain text.

    The result is the concatenation, in document order, of the element's own
    text, then for every direct child its text, a newline if the child is a
    ``<br>``, and its tail; followed by a newline if *node* itself is a
    ``<br>``, and finally *node*'s tail. Missing (``None``) or empty pieces
    are skipped. Children are not descended into any further.

    Args:
        node: An ElementTree-style element (``.tag``, ``.text``, ``.tail``,
            iterable over its direct children).

    Returns:
        The collected text as a single string (possibly empty).

    Raises:
        AssertionError: If a direct child has a tag other than ``a``,
            ``dfp-ad`` or ``br``; the offending tag is the exception argument.
    """
    parts = [node.text]
    # Iterate the element directly: ``getchildren()`` is deprecated in lxml
    # and no longer exists on the standard library's ElementTree elements.
    for child in node:
        if child.tag == 'br':
            separator = '\n'
        elif child.tag in ('a', 'dfp-ad'):
            separator = None
        else:
            # Explicit raise instead of ``assert False`` so the check is not
            # stripped when Python runs with ``-O``.
            raise AssertionError(child.tag)
        parts.extend([child.text, separator, child.tail])
    if node.tag == 'br':
        parts.append('\n')
    parts.append(node.tail)
    # filter(None, ...) drops both None and empty strings before joining.
    return ''.join(filter(None, parts))
def parse_song(song_url):
    """Fetch a song page and return its lyrics as plain text.

    The page is rendered through ``get_page``, parsed with ``lxml.html``, and
    the single ``<p>`` matched by ``//div[@class='lyrics']/section/p`` is
    located. Each direct child element of that paragraph is flattened with
    ``parse_chunk``; the non-empty results are joined with single spaces.

    Only the paragraph's child elements are visited: text that sits in the
    paragraph before its first child element is not part of the result.

    Args:
        song_url: URL of the lyrics page to scrape.

    Returns:
        The extracted lyrics as one string.

    Raises:
        AssertionError: If the page does not contain exactly one lyrics
            paragraph, or if ``parse_chunk`` meets an unsupported tag.
    """
    page_source = get_page(song_url)
    assert isinstance(page_source, str)
    root = html.fromstring(page_source)
    LYRICS_XPATH = "//div[@class='lyrics']/section/p"
    lyrics_html = root.xpath(LYRICS_XPATH)
    if len(lyrics_html) != 1:
        # Explicit raise so the structural check also holds under ``python -O``;
        # AssertionError is kept because that is what the assert used to raise.
        raise AssertionError(
            'expected exactly one lyrics paragraph, found {}'.format(
                len(lyrics_html)))
    # Iterate the element directly; ``getchildren()`` is deprecated in lxml.
    chunks = [parse_chunk(child) for child in lyrics_html[0]]
    return ' '.join(chunk for chunk in chunks if chunk)
# Demo invocation: scrape one song page and print its lyrics to stdout.
# NOTE: this runs whenever the module is executed or imported, and it relies
# on get_page being able to launch the browser and reach the network.
cd = 'https://genius.com/The-mamas-and-the-papas-california-dreamin-lyrics'
print(parse_song(song_url=cd))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment