archive your bookmarks -- searches exported HTML files (e.g. a Firefox bookmark export) for links to specific websites and downloads them with youtube-dl
#####
# Installation:
#   install python3, then install the "bs4" (beautifulsoup4) and "youtube_dl"
#   packages via pip (see the example command right below this header)
#
# Usage:
#   python3 crawler.py -h
#
# Example:
#   export your bookmarks from Firefox, then crawl them for YouTube and
#   SoundCloud links by executing:
#   python3 crawler.py -f bookmarks.html -w youtube.com -w youtu.be -w soundcloud.com
#####
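# One way to install the dependencies (a sketch, assuming pip is available for
# your python3 interpreter; adjust to your environment, e.g. use a virtualenv):
#
#   python3 -m pip install beautifulsoup4 youtube_dl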
from __future__ import unicode_literals

import argparse
import json
import os

from bs4 import BeautifulSoup
import youtube_dl

# URLs that could not be downloaded; written to failed_downloads.txt at the end.
fails = []
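# The crawler accepts either a plain text file with one URL per line or an
# HTML bookmark export. A Firefox export contains anchors roughly of this
# shape (illustrative only; the exact attributes vary):
#
#   <DT><A HREF="https://www.youtube.com/watch?v=..." ADD_DATE="...">Some title</A>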
def crawler(file, websites):
    # Collect candidate URLs from either a plain text file or an HTML export.
    if file.endswith(".txt"):
        with open(file) as f:
            urls = f.read().splitlines()
    else:
        with open(file, 'r') as html_doc:
            soup = BeautifulSoup(html_doc, 'html.parser')
        urls = []
        for link in soup.find_all('a'):
            href = link.get("href")
            if href:
                urls.append(href)
    # Download every URL whose domain matches one of the requested websites.
    for url in urls:
        # Reduce the URL to its bare domain, e.g. "https://www.youtube.com/..." -> "youtube.com".
        domain = url.replace("https://", '').replace("http://", '') \
                    .replace("www.", '').split('/')[0]
        for website in websites:
            if website == domain:
                print(url)
                if not download(url):
                    fails.append(url)
    # Keep a record of everything that could not be downloaded.
    with open("failed_downloads.txt", 'w') as f:
        for fail in fails:
            f.write("%s\n" % fail)
def download(url):
    audio_format = "m4a"
    ydl_opts = {
        "outtmpl": "downloads/%(uploader)s/%(title)s.%(ext)s",
        "format": "bestaudio",
        "postprocessors": [{
            "key": "FFmpegExtractAudio",
            "preferredcodec": audio_format
        }],
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            info = ydl.extract_info(url, download=False)
            if "_type" in info and info["_type"] == "playlist":
                # Playlists: recurse into every entry so each video gets its
                # own file and metadata.
                for entry in info["entries"]:
                    url = entry["webpage_url"]
                    if not download(url):
                        fails.append(url)
            else:
                title = info["title"]
                uploader = info["uploader"]
                if len(title) > 199:
                    title = title[:197] + "..."
                uploader = prepare_filename(uploader)
                title = prepare_filename(title)
                path = "downloads/{}/{}".format(uploader, title)
                if not os.path.isfile(path + "." + audio_format):
                    ydl.download([url])
                else:
                    print("already downloaded.")
                # Store the extracted metadata next to the audio file.
                with open(path + ".json", 'w') as f:
                    json.dump(info, f)
            return True
        except youtube_dl.utils.DownloadError:
            print("file couldn't be downloaded, possibly deleted!")
            return False
def prepare_filename(filename):
    # Strip or replace characters that are awkward in file names.
    filename = filename.replace("?", "")
    filename = filename.replace("/", "_")
    filename = filename.replace(": ", " - ")
    filename = filename.replace(" :", " -")
    filename = filename.replace(":", "_")
    filename = filename.replace('"', "'")
    filename = filename.replace("||", '|')
    filename = filename.replace("|", '_')
    filename = filename.replace("__", '_')
    filename = filename.replace("*", '_')
    # Trim leading/trailing separators that the replacements may have produced.
    if filename.startswith("_"):
        filename = filename[1:]
    if filename.endswith("_"):
        filename = filename[:-1]
    if filename.startswith("-"):
        filename = "_" + filename[1:]
    if filename.endswith("-"):
        filename = filename[:-1] + "_"
    return filename
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="archive your bookmarks")
    parser.add_argument("-f", "--file", metavar="FILE", required=True,
                        help="bookmark export (.html) or plain text file with one URL per line")
    parser.add_argument("-w", "--website", action="append", required=True,
                        help='websites that should be archived, e.g. "youtube.com"')
    args = parser.parse_args()
    crawler(args.file, args.website)
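# A minimal sketch of calling the crawler from another Python script instead of
# the command line (assumes this file is saved as crawler.py on your import
# path; the module name is an assumption):
#
#   from crawler import crawler
#   crawler("bookmarks.html", ["youtube.com", "youtu.be", "soundcloud.com"])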