A little program to scrape http://shakespeare.mit.edu/ and save each play as a directory of Markdown files with a small front-matter block, plus a _category.json file per play.
import os

import requests
from html.parser import HTMLParser


class LinkParser(HTMLParser):
    """Collects the page title (first <h1> or <title>) and every <a href> target."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.title = ""
        self.hasTitle = False
        self.links = []

    def handle_starttag(self, tag, attrs):
        # Remember that the next data chunk is the page title.
        if tag in ("h1", "title") and self.title == "":
            self.hasTitle = True
        if tag == "a":
            # Look up href by name instead of assuming it is the first attribute.
            href = dict(attrs).get("href")
            if href:
                self.links.append(href)

    def handle_data(self, data):
        if self.hasTitle:
            self.hasTitle = False
            self.title = data


class ContentParser(HTMLParser):
    """Placeholder for a dedicated content parser; no handlers implemented yet."""
    pass


def front_matter(title):
    """Markdown front matter for a chapter file."""
    return """---
title: {title}
---
""".format(title=title)


def category_json(title, pos):
    """Sidebar metadata (label and position) for a play's directory."""
    return """{{
    "label": "{title}",
    "position": {position}
}}
""".format(title=title, position=pos)


URL = 'http://shakespeare.mit.edu/'
DIR = "some-dir/"

os.makedirs(DIR, exist_ok=True)

# Get the list of plays from the index page.
r = requests.get(URL)
parser = LinkParser()
parser.feed(r.text)
links = parser.links
print(links)

# For every play, fetch its index page to find the chapter links.
position = 0
for link in links:
    req_uri = URL + link
    print("Get:", req_uri)
    try:
        content_request = requests.get(req_uri)
    except requests.RequestException:
        print("Failed:", link)
        continue

    link_parser = LinkParser()
    link_parser.feed(content_request.text)

    print("Title:", link_parser.title)
    print("Position:", position)

    # Turn the play's title into a directory name.
    dir_name = (link_parser.title.replace(" ", "-").replace("'", "")
                .replace(":", "").replace("\n", ""))
    path = DIR + dir_name + "/"
    os.makedirs(path, exist_ok=True)

    # Write the sidebar metadata once per play.
    with open(path + "_category.json", "w", encoding="utf-8") as json_file:
        json_file.write(category_json(link_parser.title.replace("\n", ""), position))

    # TODO: Handle the edge case where a page has zero links and write the
    # content directly to a file instead.

    # For every chapter, fetch the content and save it as Markdown.
    for content_link in link_parser.links:
        # Only chapter pages end in .html; skip anything else.
        if not content_link.endswith(".html"):
            continue
        # The sonnets index is sonnets.html rather than index.html, so the
        # base URL is derived differently.
        if req_uri.endswith("sonnets.html"):
            final_url = req_uri.replace("sonnets.html", "") + content_link
        else:
            final_url = req_uri.replace("index.html", "") + content_link
        print("Get:", final_url)
        try:
            final_request = requests.get(final_url)
        except requests.RequestException:
            print("Failed:", final_url)
            continue

        final_parser = LinkParser()
        final_parser.feed(final_request.text)

        # Prefix the raw page with front matter and save it as Markdown.
        print("Writing:", content_link)
        title = final_parser.title.replace("\n", "").replace(":", "")
        with open(path + content_link.replace(".html", ".md"), "w",
                  encoding="utf-8") as f:
            f.write(front_matter(title) + final_request.text)

    position += 1

print("Done.")