Last active
January 11, 2018 15:41
-
-
Save mutsune/a49a8daa6634d63ccabc92c4f3ac4953 to your computer and use it in GitHub Desktop.
Clone UNIX V6 repository http://minnie.tuhs.org/cgi-bin/utree.pl?file=V6/usr
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import urllib.request | |
import os | |
import html | |
def save(path, content):
    """Write the text ``content`` to the file at ``path``.

    An existing file at ``path`` is overwritten. The text is always encoded
    as UTF-8 so the bytes on disk do not depend on the platform's default
    encoding, and so non-ASCII characters cannot fail to encode on systems
    whose default is a narrower code page.

    Args:
        path: Destination file path; its parent directory must exist.
        content: Text to write.
    """
    with open(path, "w", encoding="utf-8") as file:
        file.write(content)
def get_html(url):
    """Fetch ``url`` and return the response body as text.

    The raw bytes are decoded as ISO-8859-1. That codec maps every possible
    byte to a code point, so the decode step itself cannot fail.

    Args:
        url: Address to open with ``urllib.request.urlopen``.

    Returns:
        The decoded response body as a ``str``.
    """
    with urllib.request.urlopen(url) as response:
        raw = response.read()
    # The decoded value is already a Python str; re-encoding it to UTF-8 and
    # decoding it again would return the identical string, so that is omitted.
    return raw.decode("iso-8859-1")
def get_body(url):
    """Download ``url`` and return the page text from its body tag onward.

    The slice starts one character after the beginning of the first
    ``<body`` occurrence, so the returned text begins with ``body`` (the
    leading ``<`` is not included).

    Raises:
        ValueError: If the page does not contain ``<body``.
    """
    page = get_html(url)
    body_start = page.index("<body") + 1
    return page[body_start:]
def extract_urls(body):
    """Return the double-quoted ``href`` values found in ``body``, in order.

    A page that contains the utree.pl POST form yields an empty list; the
    caller then handles it as a page with no links to follow.

    Each URL is read up to its closing quote rather than by trimming a fixed
    number of characters off the end of the line, so text after the link on
    the same line does not leak into the result. Links that are not written
    as ``href="..."`` (single-quoted, unquoted, or missing the closing quote
    on that line) are skipped instead of raising ``ValueError``.

    Args:
        body: Page text to scan.

    Returns:
        A list of URL strings; empty when there are none.
    """
    # Pages carrying this form are not scanned for links.
    if '<form method="post" action="http://minnie.tuhs.org/cgi-bin/utree.pl"' in body:
        return []
    urls = []
    for line in body.split("\n"):
        # The first chunk precedes any href; every later chunk starts
        # directly with the attribute value.
        for chunk in line.split('href="')[1:]:
            url, closing_quote, _ = chunk.partition('"')
            if closing_quote:
                urls.append(url)
    return urls
def make_path(p_path, url):
    """Build the local path for ``url`` underneath the parent path ``p_path``.

    The name is whatever follows the last ``/`` in ``url`` (the whole string
    when it contains no ``/``); it is appended to ``p_path`` with a ``/``.
    """
    _, _, name = url.rpartition("/")
    return "/".join([p_path, name])
def extract_src(body):
    """Return the unescaped text of the first ``<pre>`` element in ``body``.

    The content runs from just after ``<pre>`` to the next ``</pre>``. One
    line break directly after the opening tag is dropped, as HTML parsers do.
    When no line break follows the tag, the first character of the content is
    kept rather than skipped.

    Args:
        body: Page text containing a ``<pre>...</pre>`` section.

    Returns:
        The section's text with HTML character references decoded.

    Raises:
        ValueError: If ``<pre>`` is missing, or no ``</pre>`` follows it.
    """
    open_tag = "<pre>"
    begin = body.index(open_tag) + len(open_tag)
    # Look for the closing tag only after the opening one, so an earlier
    # stray "</pre>" cannot produce an inverted (empty) slice.
    end = body.index("</pre>", begin)
    # Skip a single leading line break; it is not part of the content.
    if body.startswith("\r\n", begin):
        begin += 2
    elif body.startswith("\n", begin):
        begin += 1
    return html.unescape(body[begin:end])
def mkdir(path):
    """Create the directory ``path`` and any missing parents.

    Nothing happens if the directory already exists. The path is passed
    through unchanged, so an absolute path stays absolute and the directory
    is created at the same location that ``open(path + "/name")`` would
    later write to.

    Args:
        path: Directory path, relative or absolute.
    """
    os.makedirs(path, exist_ok=True)
def get_url(p_path, url):
    """Mirror the page at ``url`` beneath the local path ``p_path``.

    The URL is printed, the page body is fetched, and its links are
    extracted. A page with no links is saved as a file holding the page's
    ``<pre>`` text; otherwise a directory is created and each link is
    processed recursively inside it.
    """
    print(url)
    page_body = get_body(url)
    links = extract_urls(page_body)
    target = make_path(p_path, url)

    # No links to follow: store the page's source text and stop here.
    if not links:
        save(target, extract_src(page_body))
        return

    mkdir(target)
    for link in links:
        get_url(target, link)
if __name__ == "__main__":
    # Start at the V6 /usr listing and mirror it into the current directory.
    get_url(
        p_path=".",
        url="http://minnie.tuhs.org/cgi-bin/utree.pl?file=V6/usr",
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment