scraping fanfiction.net
import requests
from bs4 import BeautifulSoup as bs
from tqdm import tqdm
import tables
import numpy as np
import traceback


class Metadata(tables.IsDescription):
    """
    Class to describe columns in the metadata table
    """
    page_title = tables.StringCol(512)
    title = tables.StringCol(512)
    description = tables.StringCol(2048)
    rating = tables.StringCol(64)
    language = tables.StringCol(128)
    chapters = tables.UInt16Col()
    chapter = tables.UInt16Col()
    words = tables.UInt64Col()
    reviews = tables.UInt32Col()
    favs = tables.UInt32Col()
    follows = tables.UInt32Col()
    updated = tables.StringCol(64)
    published = tables.StringCol(64)
    text_idx = tables.UInt64Col()
    genre = tables.StringCol(256)
    characters = tables.StringCol(512)
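    # `text_idx` links each metadata row to the row of the same index in the
    # separate `texts` VLArray created in the __main__ block below.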


def scrape_page(page, chapter=1):
    # gather page elements with metadata
    # small sub-element with hyphen-separated descriptors
    subheader = list(page.find(id="profile_top").find(class_='xgray xcontrast_txt').children)
    subhead_1 = [t.strip() for t in subheader[2].split('-')]
    subhead_2 = [t.strip() for t in subheader[4].split('-')]
    #############################
    # gather metadata -- idiosyncratic selection criteria for the page and its format.
    # after scraping several thousand pages, there are no obvious failures.
    metadata = {}
    # these are more or less always present, so no position correction is needed
    metadata["page_title"] = page.find("title").text.encode('ascii', errors="ignore")
    metadata["title"] = page.find(id="profile_top").find("b").text.encode('ascii', errors="ignore")
    metadata["description"] = page.find(id="profile_top").find('div', recursive=False).text.encode('ascii', errors="ignore")
    metadata["rating"] = subheader[1].text.encode('ascii', errors="ignore")
    metadata["language"] = subhead_1[1].encode('ascii', errors="ignore")
    metadata["genre"] = subhead_1[2].encode('ascii', errors="ignore")
    metadata["characters"] = subhead_1[3].encode('ascii', errors="ignore")
    metadata['chapter'] = int(chapter)
    # the following might be absent, so they require special checks
    try:
        metadata["chapters"] = [int(c.split(':')[1]) for c in subhead_1 if c.startswith("Chapters:")][0]
    except (IndexError, ValueError):
        metadata['chapters'] = 1
    metadata["words"] = [int(c.split(':')[1].replace(',', '')) for c in subhead_1 if c.startswith("Words:")][0]
    try:
        # the review count sits in the element after the "Reviews:" label
        reviews_item = [r for r in subheader if 'Reviews' in str(r)][0]
        reviews_idx = subheader.index(reviews_item)
        metadata["reviews"] = int(subheader[reviews_idx + 1].text.replace(',', ''))
    except (IndexError, ValueError, AttributeError):
        metadata["reviews"] = 0
    try:
        metadata["favs"] = [int(c.split(':')[1].replace(',', '')) for c in subhead_2 if c.startswith('Favs')][0]
    except (IndexError, ValueError):
        metadata['favs'] = 0
    try:
        metadata["follows"] = [int(c.split(':')[1].replace(',', '')) for c in subhead_2 if c.startswith('Follows')][0]
    except (IndexError, ValueError):
        metadata['follows'] = 0
    try:
        # the date sits in the element after the "Updated:" label
        updated_item = [r for r in subheader if 'Updated' in str(r)][0]
        updated_idx = subheader.index(updated_item)
        metadata["updated"] = subheader[updated_idx + 1].text.encode('ascii', errors="ignore")
    except (IndexError, AttributeError):
        pass
    try:
        published_item = [r for r in subheader if 'Published' in str(r)][0]
        published_idx = subheader.index(published_item)
        metadata["published"] = subheader[published_idx + 1].text.encode('ascii', errors="ignore")
    except (IndexError, AttributeError):
        pass
metadata["text_idx"] = texts.nrows | |
text = page.find(id="storytext") | |
return metadata, text | |
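

# Example call (hypothetical story id; note that scrape_page relies on the
# module-level `texts` array existing, so it only works after the setup below):
#   page = bs(requests.get("https://www.fanfiction.net/s/1234567/1/").content, 'lxml')
#   meta, story_html = scrape_page(page, chapter=1)

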
if __name__ == "__main__":
    # open the hdf5 file to write to
    h5f = tables.open_file("fanfic.h5", mode="a", title="fanfiction.net")
    # if the metadata table doesn't exist, make it; otherwise get a reference to it
    try:
        tab = h5f.create_table('/', "metadata", description=Metadata)
    except tables.exceptions.NodeError:
        tab = h5f.get_node('/', "metadata")
    # same with texts, though we'll use a variable-length unicode
    # format, which can only be a single-column array.
    try:
        texts = h5f.create_vlarray(h5f.root, 'texts',
                                   tables.VLUnicodeAtom())
    except tables.exceptions.NodeError:
        texts = h5f.get_node('/', 'texts')
    # the row object lets us write iteratively to the pytable
    tab_row = tab.row
    # figure out where we start & end. each page number is a 7-8 digit int.
    # note: text_idx counts chapters, not stories, so resuming from its max
    # only approximates the last story id reached.
    try:
        start_number = int(np.max(tab.col("text_idx")))
    except ValueError:
        start_number = 0
    end_number = 9999999
    # start chugging
    for pn in tqdm(range(start_number, end_number), position=0):
        # every page is tried; if there is an exception it's written to a log file.
        try:
            # zero-pad the number to get the right url
            url_id = str(pn).zfill(7)
            # load the page
            page_url = "https://www.fanfiction.net/s/{}/".format(url_id)
            page = bs(requests.get(page_url).content, 'lxml')
            # determine whether a story exists here; if not, skip this page.
            warning = page.find(class_="gui_warning")
            if warning is not None and warning.text.startswith("Story Not Found"):
                continue
            # figure out if the story has multiple chapters
            chapter_select = page.find(id="chap_select")
            has_chapters = chapter_select is not None
            # scrape the first page and save the data
            metadata, text = scrape_page(page)
            for k, v in metadata.items():
                tab_row[k] = v
            tab_row.append()
            tab.flush()
            texts.append(str(text))
            texts.flush()
            # if the story has chapters, iterate over the rest of them.
            if has_chapters:
                n_chapters = len(list(chapter_select.children))
                for chapter_number in tqdm(range(2, n_chapters + 1), position=2):
                    chap_url = page_url + str(chapter_number) + "/"
                    page = bs(requests.get(chap_url).content, 'lxml')
                    metadata, text = scrape_page(page, chapter=chapter_number)
                    for k, v in metadata.items():
                        tab_row[k] = v
                    tab_row.append()
                    tab.flush()
                    texts.append(str(text))
                    texts.flush()
        except Exception as e:
            # log the failing page number along with the traceback
            with open('fanfic_log.txt', 'a') as f:
                f.write("page {}: {}\n".format(pn, e))
                f.write(traceback.format_exc())
    h5f.flush()
    h5f.close()
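
To read the results back, the metadata table can be queried in-kernel and each row's `text_idx` used to look up the chapter HTML in the `texts` array. A minimal sketch, assuming the scrape above has produced fanfic.h5 (the 100k-word threshold is just an example):

import tables

with tables.open_file("fanfic.h5", mode="r") as h5f:
    meta = h5f.get_node('/', 'metadata')
    texts = h5f.get_node('/', 'texts')
    # in-kernel query: pull every chapter row from stories over 100k words
    for row in meta.where('words > 100000'):
        story_html = texts[row['text_idx']]
        print(row['title'], row['chapter'], row['words'])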