@Steven24K
Created May 7, 2021 14:57
A little program to scrape http://shakespeare.mit.edu/
import requests
from html.parser import HTMLParser
import os

class LinkParser(HTMLParser):
    """Collects the page title (first <h1> or <title>) and the href of every anchor."""

    def __init__(self, *args, **kwargs):
        self.title = ""
        self.hasTitle = False
        self.links = []
        super(LinkParser, self).__init__(*args, **kwargs)

    def handle_starttag(self, tag, attrs):
        # Remember that the next text node is the page title.
        if (tag == 'h1' or tag == "title") and self.title == "":
            print("Set title")
            self.hasTitle = True
        if tag == 'a':
            # Keep the href attribute; skip anchors that have none.
            href = dict(attrs).get('href')
            if href is not None:
                print("Encountered a start tag:", tag, href)
                self.links.append(href)

    def handle_data(self, data):
        if self.hasTitle:
            print("Encountered some data :", data)
            self.hasTitle = False
            self.title = data

class ContentParser(HTMLParser):
    def __init__(self, *args, **kwargs):
        super(ContentParser, self).__init__(*args, **kwargs)

    # def handle_starttag(self, tag, attrs):
    #     print("Encountered a start tag:", tag, attrs[0][1])

    # def handle_endtag(self, tag):
    #     print("Encountered an end tag :", tag)

    # def handle_data(self, data):
    #     print("Encountered some data :", data)

def front_matter(title):
    # YAML front matter prepended to every generated Markdown file.
    return """---
title: {title}
---
""".format(title = title)


def category_json(title, pos):
    # JSON sidebar metadata written next to each play's Markdown files.
    return """
{{
    "label": "{title}",
    "position": {position}
}}
""".format(title = title, position = pos)

URL = 'http://shakespeare.mit.edu/'
DIR = "some-dir/"

# Create the output directory; it is fine if it already exists.
try:
    os.mkdir(DIR)
except FileExistsError:
    pass

# Get the list of plays from the index page
r = requests.get(URL)
data = r.text
print(data)

parser = LinkParser()
parser.feed(data)
links = parser.links
print(links)

# For every play, fetch its index page and collect the links to its chapters
position = 0
for link in links:
    req_uri = URL + link
    print('Get: ', req_uri)
    try:
        content_request = requests.get(req_uri)
    except requests.RequestException:
        print("Failed: ", link)
        continue
    content = content_request.text

    link_parser = LinkParser()
    link_parser.feed(content)

    # Build a directory name from the play title, stripping characters that are awkward in paths.
    path = DIR + link_parser.title.replace(" ", "-").replace("'", "").replace(":", "").replace("\n", "") + "/"
    try:
        os.mkdir(path)
    except OSError:
        print(path + " error")

    content_links = link_parser.links
    print("Title: ", link_parser.title)
    print("Position: ", position)

    # TODO: Make edge case for when page has zero links, then write content direct to file
    # For every chapter get the content
    for content_link in content_links:
        # The sonnets index lives at sonnets.html; the plays use index.html.
        if req_uri.endswith("sonnets.html"):
            final_url = req_uri.replace("sonnets.html", "") + content_link
        else:
            final_url = req_uri.replace("index.html", "") + content_link
        print("Get: ", final_url)

        final_request = requests.get(final_url)
        final_content = final_request.text
        final_parser = LinkParser()
        final_parser.feed(final_content)

        # Write the chapter content to a Markdown file and the play metadata to _category.json
        if content_link.endswith(".html"):
            print("Writing: " + content_link)
            try:
                with open(path + content_link.replace(".html", ".md"), 'w') as f:
                    f.write(front_matter(final_parser.title.replace("\n", "").replace(":", "")) + final_content)
                with open(path + "_category.json", "w") as json_file:
                    json_file.write(category_json(link_parser.title.replace("\n", ""), position))
            except OSError as e:
                print("Could not write", content_link, e)

    position = position + 1

print("Done.")