magzdb.org download
# Must have wget installed on the system.
# Instructions: log in on magzdb.org and export your cookies to a cookies.txt
# file in the same directory as this gist, then run it.
import os
import re

import requests
from bs4 import BeautifulSoup

url = 'http://magzdb.org'

# TODO: make mag a command-line parameter (see the argparse sketch below)
mag = '/j/1341'  # 2600 Hackers Quarterly

r = requests.get(url + mag)
soup = BeautifulSoup(r.text, 'html.parser')

# Remove the site name and colons from the page title, and replace spaces
# with underscores so the title can be used in filenames.
title = soup.title.text.split(' | ')[0].replace(":", "").replace(" ", "_")

# yl references the yellow links corresponding to available issues.
# TODO: report which issues got downloaded and which ones are missing
for yl in soup.find_all(style="background-color: yellow"):
    yl_year, yl_issue = yl.parent['title'].split(' №')
    yl_href = yl.parent['href']
    yl_r = requests.get(url + yl_href)
    yl_soup = BeautifulSoup(yl_r.text, 'html.parser')
    yl_dl = yl_soup.find(href=re.compile("file"))['href']
    yl_dl_url = yl_dl.split('..')[1]
    # Using the system wget seemed like less of a hassle
    # (see the requests-based alternative sketched below).
    # TODO: use directories instead of filenames?
    wget_cmd = ('wget --load-cookies=./cookies.txt ' + url + yl_dl_url +
                ' -O "' + title + '_' + yl_year + '_' + yl_issue + '.pdf"')
    os.system(wget_cmd)
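
One way the mag TODO above could be handled is with argparse. This is only a sketch; the --mag flag name and its default are assumptions, not part of the original gist:

import argparse

# Hypothetical sketch: expose the magazine path as a command-line flag
# instead of hard-coding it. The flag name and default are assumptions.
parser = argparse.ArgumentParser(description='Download magazine issues from magzdb.org')
parser.add_argument('--mag', default='/j/1341',
                    help='magazine path on magzdb.org, e.g. /j/1341 for 2600 Hackers Quarterly')
args = parser.parse_args()
mag = args.mag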
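The comment about shelling out to wget hints at an alternative: the download could stay inside requests by loading cookies.txt and streaming the PDF to disk. A minimal sketch, assuming cookies.txt is in the Netscape format that wget and browser cookie exporters produce (the helper name download_issue is hypothetical):

import http.cookiejar

import requests

# Hypothetical sketch: download one issue with requests instead of wget.
# Assumes cookies.txt is a Netscape-format cookie file in the working directory.
jar = http.cookiejar.MozillaCookieJar('./cookies.txt')
jar.load()

def download_issue(dl_url, filename):
    # Stream the response so large PDFs are not held entirely in memory.
    with requests.get(dl_url, cookies=jar, stream=True) as resp:
        resp.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)

Inside the loop, the os.system(wget_cmd) call would then become something like download_issue(url + yl_dl_url, title + '_' + yl_year + '_' + yl_issue + '.pdf').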