Created
February 14, 2016 15:53
-
-
Save tpoisot/931590ac85a2f6f9f059 to your computer and use it in GitHub Desktop.
#ICanHazPDF
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
import re | |
import requests | |
import tempfile | |
import urllib.request | |
import random | |
import sys | |
import os | |
# ANSI escape codes for colored terminal output
class acol:
    """Namespace of terminal escape sequences used to style printed output.

    Each attribute is a string that is concatenated into the text passed
    to ``print``; ``END`` is appended after styled text by the callers.
    """

    # Text attributes
    BOLD = '\033[1m'

    # Foreground colors
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    MAGENTA = '\033[95m'
    CYAN = '\033[96m'

    # Reset sequence
    END = '\033[0m'
def get_scihub_pdf(doi):
    """Look up a DOI on a Sci-Hub mirror and return the URL of its PDF.

    A mirror is picked at random, the page for the DOI is fetched, and the
    PDF address is extracted from the ``<iframe ... id = "pdf">`` element
    of the returned HTML. Progress is printed to stdout.

    Args:
        doi: The DOI to look up; it is appended as-is to the mirror URL.

    Returns:
        The PDF URL captured from the iframe's ``src`` attribute.

    Raises:
        ValueError: If the request fails, the mirror redirects to a
            different host, or no PDF iframe is found in the page. The
            underlying error is attached as ``__cause__``.
    """
    _root = random.choice(["sci-hub.io", "sci-hub.cc"])
    _doi_url = "http://" + _root + "/" + doi
    print("\t" + acol.MAGENTA + "SciHub:\t" + acol.END + _doi_url + acol.END)
    # Raw string so that ``\.`` reaches the regex engine as an escaped dot
    # instead of being an invalid string escape.
    getpdf = re.compile(r'<iframe src = "(.+\.pdf)" id = "pdf">')
    try:
        # A single request is enough: redirects are followed, so the
        # response carries both the final URL and the final page content.
        # The timeout keeps a dead mirror from hanging the script forever.
        response = requests.get(_doi_url, timeout=30)
        _redirect_url = response.url
        if _root not in _redirect_url:
            # Landing outside the mirror means it has no copy to serve.
            print("\t" + acol.MAGENTA + "Goto:\t" + acol.END + _redirect_url + acol.END)
            raise ValueError("Redirected")
        search_result = getpdf.search(response.text)
        if search_result is None:
            raise ValueError("Unable to read PDF")
    except Exception as err:
        # Every failure is reported under one message, as before, but
        # Ctrl-C / SystemExit are no longer swallowed and the root cause
        # stays visible through exception chaining.
        raise ValueError("No PDF known to SciHub") from err
    print("\t" + acol.BLUE + "PDF at:\t" + acol.END + search_result.group(1) + acol.END)
    return search_result.group(1)
def download_file(url, fname):
    """Download the file itself.

    Streams the content at ``url`` into the local file ``fname`` in small
    chunks so the whole body is never held in memory.

    Args:
        url: Address of the file to fetch.
        fname: Path of the local file to (over)write, opened in binary mode.

    Returns:
        ``fname``, unchanged.
    """
    # Send a browser-like User-Agent rather than the default client one
    # (presumably because some servers refuse non-browser clients).
    header = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0',}
    # With stream=True the connection stays open until the response is
    # closed, so use it as a context manager; the timeout prevents an
    # unresponsive server from blocking forever.
    with requests.get(url, stream=True, headers=header, timeout=30) as r:
        with open(fname, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # skip empty keep-alive chunks
                    f.write(chunk)
    return fname
def looks_like_captcha(fname):
    """Return True when the downloaded file appears to be a captcha page.

    The file is read as bytes so that binary PDF content cannot trigger a
    decoding error. A file carrying the ``%PDF-`` signature near its start
    is treated as a real PDF; anything else is a captcha page if it
    mentions "captcha" (in any letter case).

    Args:
        fname: Path of the downloaded file.

    Returns:
        True if the file should be discarded as a captcha page, else False.
    """
    with open(fname, 'rb') as f:
        content = f.read()
    # Tolerate a little leading junk before the PDF signature.
    if b'%PDF-' in content[:1024]:
        return False
    return b'captcha' in content.lower()


if __name__ == "__main__":
    if len(sys.argv) != 2:
        # Print the usage line on stderr and exit non-zero, no traceback.
        sys.exit("usage: ./icanhazpdf.py doi")
    doi = sys.argv[1]
    try:
        _url = get_scihub_pdf(doi)
    except Exception as err:
        # Keep the original message but chain the real cause.
        raise ValueError("Something went wrong.") from err
    if _url is not None:
        print("\t" + acol.YELLOW + "PDF URL:\t" + acol.END + _url)
        # Slashes in the DOI become dots so it is a valid file name.
        _fname = doi.replace('/', '.') + ".pdf"
        download_file(_url, _fname)
        # Sci Hub might answer with a captcha page instead of the PDF.
        if looks_like_captcha(_fname):
            os.remove(_fname)
            print("\t" + acol.BOLD + acol.RED + "Captcha'ed, try again later" + acol.END + "\n")
        else:
            print("\t" + acol.BOLD + acol.GREEN + "#ICanHazPDF! " + acol.END + "\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment