Skip to content

Instantly share code, notes, and snippets.

@KernelPanicAUS
Created June 16, 2020 11:11
Show Gist options
  • Save KernelPanicAUS/429e88c65c2556a22d486b2dfff994b7 to your computer and use it in GitHub Desktop.
Save KernelPanicAUS/429e88c65c2556a22d486b2dfff994b7 to your computer and use it in GitHub Desktop.
springer-dl
from lxml import html
import requests
import os
# Index page that main() scrapes for the per-category book links.
root_url = "https://hnarayanan.github.io/springer-books/"

# Text of the <h2> heading whose books are downloaded; it also names the
# output folder below.
category = "Computer Science"

# Output folder: <cwd>/downloads/<category lower-cased, spaces -> underscores>.
download_path = f"{os.getcwd()}/downloads/{'_'.join(category.lower().split(' '))}"

# Browser-like request headers sent by download_file() with every request to
# link.springer.com.
# NOTE(review): the Cookie value looks like it was captured from a browser
# session and has presumably expired -- confirm whether it is still needed.
# NOTE(review): "br" is advertised in Accept-Encoding; verify the installed
# HTTP stack can actually decode brotli responses.
headers = {
    "Host": "link.springer.com",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "Referer": "https://link.springer.com/",
    "DNT": "1",
    "Connection": "keep-alive",
    "Cookie": "sim-inst-token=1::1592330235412:4c812914; trackid=1897426dcf7a4d8f9c24344cd; recaptcha=8K1/HkRi4MLVzheCLwngJ5CMeCvMypHSbPY0yWv2KFc=",
    "Upgrade-Insecure-Requests": "1",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache",
}
def download_file(url, filename):
    """Download one file from link.springer.com into ``download_path``.

    The response is streamed to ``<local name>.part`` and renamed to its final
    name only after the transfer has finished, so an interrupted download
    never leaves a truncated file under the final name.

    Args:
        url: Site-relative path of the file; it is appended to
            ``https://link.springer.com``.
        filename: Base name for the local file, without extension. Any ``/``
            is replaced by ``_`` so the name stays inside ``download_path``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    # Take the extension from the last path component only (ignoring any
    # query string or fragment), so a dot elsewhere in the URL cannot drag
    # path separators into the local file name.
    url_path = url.split("?", 1)[0].split("#", 1)[0]
    extension = os.path.splitext(url_path)[1]
    local_filename = f"{download_path}/{filename.replace('/', '_')}{extension}"
    partial_filename = f"{local_filename}.part"

    # Do not rely on the caller having created the target folder.
    os.makedirs(download_path, exist_ok=True)

    # Without a timeout a stalled connection would block the script forever.
    with requests.get(
        f"https://link.springer.com{url}",
        headers=headers,
        stream=True,
        timeout=60,
    ) as r:
        r.raise_for_status()
        with open(partial_filename, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:  # skip empty keep-alive chunks
                    f.write(chunk)

    # Publish the file under its final name only once it is complete.
    os.replace(partial_filename, local_filename)
def extract_download_links(link):
    """Visit one book page and download every file it links to.

    The book title is read from the page's ``page-title`` heading and, once
    lower-cased with spaces turned into underscores, is used as the base name
    for every downloaded file.

    Args:
        link: URL of the book page to visit.

    Raises:
        requests.HTTPError: If the book page answers with an error status.
        requests.RequestException: On connection failures or timeouts, here
            or while downloading one of the files.
    """
    print(f"Visiting link {link}")
    response = requests.get(link, timeout=60)
    # Fail with a clear HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    book_page = html.fromstring(response.content)

    titles = book_page.xpath('//div[@class="page-title"]/h1[1]/text()')
    if not titles:
        # Without a title there is nothing to name the files after.
        print(f"No title found on {link}, skipping")
        return
    # Surrounding whitespace/newlines must not end up in the file name.
    title = titles[0].strip()

    # The same href can appear more than once on a page; drop repeats while
    # keeping the original order so each file is fetched only once.
    links = list(
        dict.fromkeys(
            book_page.xpath('//a[contains(@class,"c-button__icon-right")][*]/@href')
        )
    )
    print(f"Title [{title}]")
    print(links)
    print("=======================")

    file_stem = title.lower().replace(" ", "_")
    for book_link in links:
        download_file(book_link, file_stem)
def main():
    """Download every book listed under ``category`` on the index page.

    Fetches ``root_url``, collects the book-page links that follow the
    ``<h2>`` heading whose text equals ``category``, makes sure
    ``download_path`` exists, and processes each book page in turn.

    Raises:
        requests.HTTPError: If the index page answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    page = requests.get(root_url, timeout=60)
    page.raise_for_status()
    tree = html.fromstring(page.content)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(download_path, exist_ok=True)

    # The category is passed as an XPath variable rather than interpolated
    # into the expression, so a name containing a quote cannot break it.
    links = tree.xpath(
        "//h2[text() = $category]"
        "/following-sibling::div[@class='row mt-2'][1]"
        "/div[*]/div[@class='card mb-2']/div/a/@href",
        category=category,
    )
    for link in links:
        extract_download_links(link)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment