@sneakers-the-rat
Created May 6, 2019 09:02
scraping fanfiction.net
import requests
from bs4 import BeautifulSoup as bs
from tqdm import tqdm
import tables
import numpy as np
import traceback
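
# dependencies (assumed, versions unpinned) -- PyTables ships on PyPI as "tables",
# and lxml is needed for the bs(..., 'lxml') parser calls below:
#   pip install requests beautifulsoup4 lxml tqdm tables numpy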
class Metadata(tables.IsDescription):
    """
    Class to describe columns in the metadata table
    """
    page_title  = tables.StringCol(512)
    title       = tables.StringCol(512)
    description = tables.StringCol(2048)
    rating      = tables.StringCol(64)
    language    = tables.StringCol(128)
    chapters    = tables.UInt16Col()
    chapter     = tables.UInt16Col()
    words       = tables.UInt64Col()
    reviews     = tables.UInt32Col()
    favs        = tables.UInt32Col()
    follows     = tables.UInt32Col()
    updated     = tables.StringCol(64)
    published   = tables.StringCol(64)
    text_idx    = tables.UInt64Col()
    genre       = tables.StringCol(256)
    characters  = tables.StringCol(512)
def scrape_page(page, chapter=1):
    # gather page elements with metadata:
    # a small sub-element with hyphen-separated descriptors
    subheader = list(page.find(id="profile_top").find(class_='xgray xcontrast_txt').children)
    subhead_1 = [t.strip() for t in subheader[2].split('-')]
    subhead_2 = [t.strip() for t in subheader[4].split('-')]

    #############################
    # gather metadata -- the selection criteria are idiosyncratic to the page
    # and its format, but after scraping several thousand pages there are no
    # obvious failures.
    metadata = {}

    # these fields are more or less always present, no position correction needed
    metadata["page_title"] = page.find("title").text.encode('ascii', errors="ignore")
    metadata["title"] = page.find(id="profile_top").find("b").text.encode('ascii', errors="ignore")
    metadata["description"] = page.find(id="profile_top").find('div', recursive=False).text.encode('ascii', errors="ignore")
    metadata["rating"] = subheader[1].text.encode('ascii', errors="ignore")
    metadata["language"], metadata["genre"], metadata["characters"] = (
        subhead_1[1].encode('ascii', errors="ignore"),
        subhead_1[2].encode('ascii', errors="ignore"),
        subhead_1[3].encode('ascii', errors="ignore"))
    metadata['chapter'] = int(chapter)

    # the following fields might be absent, so they require special checks.
    # note that str.strip() removes a *set* of characters from both ends -- safe
    # here because no digit appears in the stripped label.
    try:
        metadata["chapters"] = [int(c.strip('Chapters: ')) for c in subhead_1 if c.startswith("Chapters:")][0]
    except IndexError:
        metadata['chapters'] = 1
    metadata["words"] = [int(c.strip('Words: ').replace(',', '')) for c in subhead_1 if c.startswith("Words: ")][0]
    try:
        # find the item labeled "Reviews" -- the count is in the *next* item
        reviews_item = [r for r in subheader if 'Reviews' in r][0]
        reviews_idx = subheader.index(reviews_item)
        metadata["reviews"] = int(subheader[reviews_idx + 1].text.replace(',', ''))
    except (IndexError, AttributeError, ValueError):
        pass
    try:
        metadata["favs"] = [int(c.strip("Favs: ").replace(',', '')) for c in subhead_2 if c.startswith('Favs')][0]
    except IndexError:
        metadata['favs'] = 0
    try:
        metadata["follows"] = [int(c.strip("Follows: ").replace(',', '')) for c in subhead_2 if c.startswith('Follows')][0]
    except IndexError:
        metadata['follows'] = 0
    try:
        updated_item = [r for r in subheader if 'Updated' in r][0]
        updated_idx = subheader.index(updated_item)
        metadata["updated"] = subheader[updated_idx + 1].text
    except (IndexError, AttributeError):
        pass
    try:
        published_item = [r for r in subheader if 'Published' in r][0]
        published_idx = subheader.index(published_item)
        metadata["published"] = subheader[published_idx + 1].text
    except (IndexError, AttributeError):
        pass

    # index into the module-level `texts` vlarray where this chapter's text will land
    metadata["text_idx"] = texts.nrows
    text = page.find(id="storytext")
    return metadata, text
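
# A minimal standalone usage sketch (commented out so it doesn't execute with
# the script): scrape_page() looks up the module-level `texts` array for
# text_idx, so outside the __main__ block a stand-in with an `nrows` attribute
# is needed. The story id in the url below is hypothetical.
#
#   import types
#   texts = types.SimpleNamespace(nrows=0)
#   page = bs(requests.get("https://www.fanfiction.net/s/1234567/").content, 'lxml')
#   meta, story = scrape_page(page)
#   print(meta["title"], meta["words"])
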
if __name__ == "__main__":
    # open the hdf5 file to write to
    h5f = tables.open_file("fanfic.h5", mode="a", title="fanfiction.net")

    # if the metadata table doesn't exist, make it; otherwise get a reference to it
    try:
        tab = h5f.create_table('/', "metadata", description=Metadata)
    except tables.exceptions.NodeError:
        tab = h5f.get_node('/', "metadata")

    # same with texts, though we'll use a variable-length unicode
    # format, which can only be a single-column array.
    try:
        texts = h5f.create_vlarray(h5f.root, 'texts', tables.VLUnicodeAtom())
    except tables.exceptions.NodeError:
        texts = h5f.get_node('/', 'texts')

    # the row object lets us write iteratively to the pytable
    tab_row = tab.row

    # figure out where we start & end -- each story id is a 7-8 digit int.
    # resume from the highest stored text index; np.max raises ValueError
    # on an empty column, in which case start from the beginning.
    try:
        start_number = int(np.max(tab.col("text_idx")))
    except ValueError:
        start_number = 0
    end_number = 9999999

    # start chugging
    for pn in tqdm(range(start_number, end_number), position=0):
        # every page is tried, and any exception is written to a log file
        try:
            # zero-pad the story id to get the right url
            url_id = str(pn).zfill(7)

            # load the page
            page_url = "https://www.fanfiction.net/s/{}/".format(url_id)
            page = bs(requests.get(page_url).content, 'lxml')

            # determine whether there is a story here; if not, skip this page
            warning = page.find(class_="gui_warning")
            if warning and warning.text.startswith("Story Not Found"):
                continue

            # figure out if the story has multiple chapters
            chapter_select = page.find(id="chap_select")
            has_chapters = chapter_select is not None

            # scrape the first page and save its data
            metadata, text = scrape_page(page)
            for k, v in metadata.items():
                tab_row[k] = v
            tab_row.append()
            tab.flush()
            texts.append(str(text))
            texts.flush()

            # if the story has more chapters, iterate over them
            if has_chapters:
                n_chapters = len(list(chapter_select.children))
                for chapter_number in tqdm(range(2, n_chapters + 1), position=2):
                    chap_url = page_url + str(chapter_number) + "/"
                    page = bs(requests.get(chap_url).content, 'lxml')
                    metadata, text = scrape_page(page, chapter=chapter_number)
                    for k, v in metadata.items():
                        tab_row[k] = v
                    tab_row.append()
                    tab.flush()
                    texts.append(str(text))
                    texts.flush()
        except Exception as e:
            with open('fanfic_log.txt', 'a') as f:
                f.write(str(e))
                f.write(traceback.format_exc())

    h5f.flush()
    h5f.close()
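
# Reading the scraped data back out -- a sketch assuming a populated fanfic.h5
# from a previous run (commented out; the `favs > 100` threshold is arbitrary):
#
#   h5f = tables.open_file("fanfic.h5", mode="r")
#   meta, texts = h5f.root.metadata, h5f.root.texts
#   for row in meta.where('favs > 100'):     # in-kernel query on the favs column
#       print(row['title'], row['words'])
#       story_html = texts[row['text_idx']]  # VLUnicodeAtom rows come back as str
#   h5f.close()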