@sneakers-the-rat
Created May 6, 2019 09:02
scraping fanfiction.net
import requests
from bs4 import BeautifulSoup as bs
from tqdm import tqdm
import tables
import numpy as np
import traceback
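
# dependencies (assumed, versions unpinned) -- PyTables ships on PyPI as "tables",
# and lxml is needed for the bs(..., 'lxml') parser calls below:
#   pip install requests beautifulsoup4 lxml tqdm tables numpy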
class Metadata(tables.IsDescription):
    """
    Class to describe columns in the metadata table
    """
    page_title  = tables.StringCol(512)
    title       = tables.StringCol(512)
    description = tables.StringCol(2048)
    rating      = tables.StringCol(64)
    language    = tables.StringCol(128)
    chapters    = tables.UInt16Col()
    chapter     = tables.UInt16Col()
    words       = tables.UInt64Col()
    reviews     = tables.UInt32Col()
    favs        = tables.UInt32Col()
    follows     = tables.UInt32Col()
    updated     = tables.StringCol(64)
    published   = tables.StringCol(64)
    text_idx    = tables.UInt64Col()
    genre       = tables.StringCol(256)
    characters  = tables.StringCol(512)
def scrape_page(page, chapter=1):
    # gather page elements with metadata:
    # a small sub-element with hyphen-separated descriptors
    subheader = list(page.find(id="profile_top").find(class_='xgray xcontrast_txt').children)
    subhead_1 = [t.strip() for t in subheader[2].split('-')]
    subhead_2 = [t.strip() for t in subheader[4].split('-')]

    #############################
    # gather metadata -- the selection criteria are idiosyncratic to the page
    # and its format, but after scraping several thousand pages there are no
    # obvious failures.
    metadata = {}

    # these fields are more or less always present, no position correction needed
    metadata["page_title"] = page.find("title").text.encode('ascii', errors="ignore")
    metadata["title"] = page.find(id="profile_top").find("b").text.encode('ascii', errors="ignore")
    metadata["description"] = page.find(id="profile_top").find('div', recursive=False).text.encode('ascii', errors="ignore")
    metadata["rating"] = subheader[1].text.encode('ascii', errors="ignore")
    metadata["language"], metadata["genre"], metadata["characters"] = (
        subhead_1[1].encode('ascii', errors="ignore"),
        subhead_1[2].encode('ascii', errors="ignore"),
        subhead_1[3].encode('ascii', errors="ignore"))
    metadata['chapter'] = int(chapter)

    # the following fields might be absent, so they require special checks.
    # note that str.strip() removes a *set* of characters from both ends -- safe
    # here because no digit appears in the stripped label.
    try:
        metadata["chapters"] = [int(c.strip('Chapters: ')) for c in subhead_1 if c.startswith("Chapters:")][0]
    except IndexError:
        metadata['chapters'] = 1
    metadata["words"] = [int(c.strip('Words: ').replace(',', '')) for c in subhead_1 if c.startswith("Words: ")][0]
    try:
        # find the item labeled "Reviews" -- the count is in the *next* item
        reviews_item = [r for r in subheader if 'Reviews' in r][0]
        reviews_idx = subheader.index(reviews_item)
        metadata["reviews"] = int(subheader[reviews_idx + 1].text.replace(',', ''))
    except (IndexError, AttributeError, ValueError):
        pass
    try:
        metadata["favs"] = [int(c.strip("Favs: ").replace(',', '')) for c in subhead_2 if c.startswith('Favs')][0]
    except IndexError:
        metadata['favs'] = 0
    try:
        metadata["follows"] = [int(c.strip("Follows: ").replace(',', '')) for c in subhead_2 if c.startswith('Follows')][0]
    except IndexError:
        metadata['follows'] = 0
    try:
        updated_item = [r for r in subheader if 'Updated' in r][0]
        updated_idx = subheader.index(updated_item)
        metadata["updated"] = subheader[updated_idx + 1].text
    except (IndexError, AttributeError):
        pass
    try:
        published_item = [r for r in subheader if 'Published' in r][0]
        published_idx = subheader.index(published_item)
        metadata["published"] = subheader[published_idx + 1].text
    except (IndexError, AttributeError):
        pass

    # index into the module-level `texts` vlarray where this chapter's text will land
    metadata["text_idx"] = texts.nrows
    text = page.find(id="storytext")
    return metadata, text
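
# A minimal standalone usage sketch (commented out so it doesn't execute with
# the script): scrape_page() looks up the module-level `texts` array for
# text_idx, so outside the __main__ block a stand-in with an `nrows` attribute
# is needed. The story id in the url below is hypothetical.
#
#   import types
#   texts = types.SimpleNamespace(nrows=0)
#   page = bs(requests.get("https://www.fanfiction.net/s/1234567/").content, 'lxml')
#   meta, story = scrape_page(page)
#   print(meta["title"], meta["words"])
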
if __name__ == "__main__":
    # open the hdf5 file to write to
    h5f = tables.open_file("fanfic.h5", mode="a", title="fanfiction.net")

    # if the metadata table doesn't exist, make it; otherwise get a reference to it
    try:
        tab = h5f.create_table('/', "metadata", description=Metadata)
    except tables.exceptions.NodeError:
        tab = h5f.get_node('/', "metadata")

    # same with texts, though we'll use a variable-length unicode
    # format, which can only be a single-column array.
    try:
        texts = h5f.create_vlarray(h5f.root, 'texts', tables.VLUnicodeAtom())
    except tables.exceptions.NodeError:
        texts = h5f.get_node('/', 'texts')

    # the row object lets us write iteratively to the pytable
    tab_row = tab.row

    # figure out where we start & end -- each story id is a 7-8 digit int.
    # resume from the highest stored text index; np.max raises ValueError
    # on an empty column, in which case start from the beginning.
    try:
        start_number = int(np.max(tab.col("text_idx")))
    except ValueError:
        start_number = 0
    end_number = 9999999

    # start chugging
    for pn in tqdm(range(start_number, end_number), position=0):
        # every page is tried, and any exception is written to a log file
        try:
            # zero-pad the story id to get the right url
            url_id = str(pn).zfill(7)

            # load the page
            page_url = "https://www.fanfiction.net/s/{}/".format(url_id)
            page = bs(requests.get(page_url).content, 'lxml')

            # determine whether there is a story here; if not, skip this page
            warning = page.find(class_="gui_warning")
            if warning and warning.text.startswith("Story Not Found"):
                continue

            # figure out if the story has multiple chapters
            chapter_select = page.find(id="chap_select")
            has_chapters = chapter_select is not None

            # scrape the first page and save its data
            metadata, text = scrape_page(page)
            for k, v in metadata.items():
                tab_row[k] = v
            tab_row.append()
            tab.flush()
            texts.append(str(text))
            texts.flush()

            # if the story has more chapters, iterate over them
            if has_chapters:
                n_chapters = len(list(chapter_select.children))
                for chapter_number in tqdm(range(2, n_chapters + 1), position=2):
                    chap_url = page_url + str(chapter_number) + "/"
                    page = bs(requests.get(chap_url).content, 'lxml')
                    metadata, text = scrape_page(page, chapter=chapter_number)
                    for k, v in metadata.items():
                        tab_row[k] = v
                    tab_row.append()
                    tab.flush()
                    texts.append(str(text))
                    texts.flush()
        except Exception as e:
            with open('fanfic_log.txt', 'a') as f:
                f.write(str(e))
                f.write(traceback.format_exc())

    h5f.flush()
    h5f.close()
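
# Reading the scraped data back out -- a sketch assuming a populated fanfic.h5
# from a previous run (commented out; the `favs > 100` threshold is arbitrary):
#
#   h5f = tables.open_file("fanfic.h5", mode="r")
#   meta, texts = h5f.root.metadata, h5f.root.texts
#   for row in meta.where('favs > 100'):     # in-kernel query on the favs column
#       print(row['title'], row['words'])
#       story_html = texts[row['text_idx']]  # VLUnicodeAtom rows come back as str
#   h5f.close()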