Hacky Worm Scraper
# Worm Scraper by Arc
# Writes one html file per arc.
# Use pandoc to convert into something nicer.
import re
from bs4 import BeautifulSoup
from urllib2 import urlopen

url = "http://parahumans.wordpress.com/category/stories-arcs-1-10/arc-1-gestation/1-01/"
#endUrl = "http://parahumans.wordpress.com/2013/11/19/interlude-end/"

arc = ""
arcN = 0
f = None

# Keep traversing until post with no "Next Chapter" is hit.
while True:
    html = urlopen(url.encode('utf-8').strip()).read()
    soup = BeautifulSoup(html)

    title = soup.find("h1", class_="entry-title")
    title.name = "h2"
    #title = soup.find("h1", class_="entry-title").string.wrap(soup.new_tag("b"))
    print title.text

    # Start a new output file whenever a new arc begins; interludes stay in the current arc's file.
    maybeArc = title.string.split()[0]
    if maybeArc != arc and maybeArc != "Interlude" and maybeArc != "Interlude:":
        arc = maybeArc
        arcN = arcN + 1
        if f is not None: f.close()
        f = open('Worm ' + str(arcN) + ' - ' + arc + '.html', 'a')

    content = soup.find("div", class_="entry-content")

    #nextChapter_tags = soup.find_all("a", title="Next Chapter")
    nextChapter_tags = soup.find_all("a", text=re.compile('.*Next.Chapter.*'))

    # Get next chapter url.
    if len(nextChapter_tags) > 0:
        url = nextChapter_tags[0]['href']

    # Remove all links (includes last/next chapter links).
    for tag in content.find_all("a", href=True):
        tag.decompose()

    # Remove "share this".
    content.find("div", id="jp-post-flair").decompose()

    f.write(title.prettify().encode('utf-8').strip() + '\n')
    f.write(content.prettify().encode('utf-8').strip() + '\n')
    f.write("<hr>")

    if len(nextChapter_tags) == 0:
        break

# Close the last arc's file.
if f is not None: f.close()
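The comments above suggest running the output through pandoc. Here is a minimal sketch of that step, assuming pandoc is installed and the per-arc files from the script sit in the working directory; the EPUB target and the title metadata are my choice, not part of the gist.

# Hypothetical post-processing step, not part of the original gist:
# feed the per-arc HTML files to pandoc and build a single EPUB.
import glob
import subprocess

# The scraper names files 'Worm <n> - <arc>.html'; sort them by arc number.
arc_files = sorted(glob.glob('Worm *.html'), key=lambda name: int(name.split()[1]))

# pandoc accepts multiple input files and concatenates them in order.
subprocess.check_call(['pandoc'] + arc_files + ['-M', 'title=Worm', '-o', 'worm.epub'])

The second script below does the same traversal but dumps every chapter into a single worm.html instead of one file per arc.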
# Worm Scraper
# Dumps into an html file.
# Use pandoc to convert into something nicer.
import re
from bs4 import BeautifulSoup
from urllib2 import urlopen

url = "http://parahumans.wordpress.com/category/stories-arcs-1-10/arc-1-gestation/1-01/"
#endUrl = "http://parahumans.wordpress.com/2013/11/19/interlude-end/"

f = open('worm.html', 'w')
f.write("<h1>Worm</h1>\n")

# Keep traversing until post with no "Next Chapter" is hit.
while True:
    html = urlopen(url.encode('utf-8').strip()).read()
    soup = BeautifulSoup(html)

    title = soup.find("h1", class_="entry-title")
    title.name = "h2"
    #title = soup.find("h1", class_="entry-title").string.wrap(soup.new_tag("b"))
    print title.text

    content = soup.find("div", class_="entry-content")

    #nextChapter_tags = soup.find_all("a", title="Next Chapter")
    nextChapter_tags = soup.find_all("a", text=re.compile('.*Next.Chapter.*'))

    # Get next chapter url.
    if len(nextChapter_tags) > 0:
        url = nextChapter_tags[0]['href']

    # Remove all links (includes last/next chapter links).
    for tag in content.find_all("a", href=True):
        tag.decompose()

    # Remove "share this".
    content.find("div", id="jp-post-flair").decompose()

    f.write(title.prettify().encode('utf-8').strip() + '\n')
    f.write(content.prettify().encode('utf-8').strip() + '\n')
    f.write("<hr>")

    if len(nextChapter_tags) == 0:
        break

f.close()
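For this single-file variant the pandoc step collapses to one call; again just a sketch, with the EPUB output format and title metadata as my additions.

# Hypothetical conversion of the single dump; not part of the original gist.
import subprocess

subprocess.check_call(['pandoc', 'worm.html', '-M', 'title=Worm', '-o', 'worm.epub'])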