Created
August 8, 2019 10:09
-
-
Save jinie/b8b67c80cb4e18477469ea29488cd1ac to your computer and use it in GitHub Desktop.
Script to archive all electronic issues of Linux Journal
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import os
import re

import requests
from bs4 import BeautifulSoup
account_no = '000000' #LJ Account number, zero prefixed | |
baseurl = 'https://secure2.linuxjournal.com' | |
def get_filename_from_cd(cd):
    """Extract the filename from a Content-Disposition header value.

    Args:
        cd: The Content-Disposition header string (may be None or empty).

    Returns:
        The raw filename token exactly as it appears in the header
        (surrounding quotes included, if any), or None when the header
        is missing or carries no filename parameter.
    """
    if not cd:
        return None
    # re.search is the idiomatic "first match" form; the original used
    # findall() plus a length check. Behavior is identical here.
    match = re.search(r'filename=(.+)', cd)
    return match.group(1) if match else None
def soup_filter(tag):
    """BeautifulSoup matcher for download links.

    Args:
        tag: A BeautifulSoup Tag under consideration.

    Returns:
        True when *tag* is an <a> whose direct parent is a
        <div class="downloadbtn">, else False.
    """
    # Use .get('class', []) rather than tag.parent['class']: subscripting
    # raises KeyError when the parent <div> has no class attribute.
    # Also guard against a detached tag with no parent at all.
    return (tag.name == 'a' and
            tag.parent is not None and
            tag.parent.name == 'div' and
            'downloadbtn' in tag.parent.get('class', []))
def get_archive_list(url):
    """Fetch *url* and return the hrefs of every download-button link.

    Covers all offered formats on the archive page (PDF, EPUB, MOBI).
    """
    response = requests.get(url)
    page = BeautifulSoup(response.content, 'html.parser')
    download_anchors = page.findAll(soup_filter)
    return [anchor['href'] for anchor in download_anchors]
def login(session):
    """POST the account number to the download page to authenticate.

    Args:
        session: A requests.Session that will hold the login cookies.

    Returns:
        The requests.Response from the login POST.

    Raises:
        Exception: Re-raises anything the POST raises, after printing it.
    """
    try:
        data = {'ucLJFooter_accountnumber': account_no}
        return session.post('https://secure2.linuxjournal.com/pdf/dljdownload.php', data)
    except Exception as e:
        print(e)
        # Bare `raise` preserves the original traceback; `raise e`
        # would rewrite it to point at this line instead.
        raise
def get_download_link(session, url):
    """Resolve the real file URL from the interstitial download page.

    Fetches the "your download should begin soon" page at *url* using
    the authenticated *session* and returns the href of its first <a>.
    """
    response = session.get(url)
    page = BeautifulSoup(response.content, 'html.parser')
    first_anchor = page.find('a')
    return first_anchor['href']
def download_file(session, url):
    """Stream one issue's binary file to the current directory.

    Args:
        session: Authenticated requests.Session.
        url: Site-relative download path; `baseurl` is prepended.

    Raises:
        requests.HTTPError: On a non-2xx response (via raise_for_status).
    """
    url = baseurl + url
    # Fallback name in case Content-Disposition doesn't hold a filename.
    local_filename = url.split('/')[-1]
    with session.get(url, stream=True) as r:
        r.raise_for_status()
        filename = get_filename_from_cd(r.headers.get('content-disposition'))
        if filename is not None:
            # strip('"') drops surrounding quotes in one call; basename
            # guards against a malicious header escaping the working
            # directory via path separators.
            local_filename = os.path.basename(filename.strip('"'))
        print(local_filename)
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:  # filter out keep-alive chunks
                    f.write(chunk)
def _main():
    """Log in, list every archive issue, and download all formats.

    The original script also assigned a module-level `url` that was
    never used (login() hardcodes the endpoint); it has been removed.
    """
    session = requests.session()
    response = login(session)
    soup = BeautifulSoup(response.content, 'html.parser')
    archive = [link['href'] for link in soup.findAll(soup_filter)]
    for page_url in archive:
        dlink = get_download_link(session, page_url)
        download_file(session, dlink)


if __name__ == '__main__':
    _main()
It’s not pretty, but considering it’s a one time thing, and I spent just about 20 minutes writing it, I think it’s “good enough”(TM) :-)
+1
This looks good, but how does one find the account number? For me the way to access the site is using an e-mail id & zip code ...
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Awesome 👍