Created
January 18, 2024 18:09
-
-
Save octvs/016856d55a208a0c1fde108f4489070a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/papis/scihub.py b/papis/scihub.py | |
new file mode 100644 | |
index 00000000..77fb76df | |
--- /dev/null | |
+++ b/papis/scihub.py | |
@@ -0,0 +1,117 @@ | |
+import doi | |
+import scihub | |
+import webbrowser | |
+import papis.importer | |
+import papis.crossref | |
+import tempfile | |
+import colorama | |
+import warnings | |
+import urllib.request | |
+ | |
+ | |
+# Banner text that Importer.get_files() logs at warning level before | |
+# contacting Sci-Hub. The placeholders are filled in by the .format() call | |
+# below: {bb} switches to white-on-black, {rb} switches the background to | |
+# red for emphasis, and {ns} resets all colorama styling. | |
+# NOTE(review): the spacing inside the banner looks collapsed in this copy | |
+# of the patch -- compare against the upstream file before relying on it. | |
+WARNING_NOTICE = ''' | |
+{bb} .+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+. {ns} | |
+{bb}( ){ns} | |
+{bb} ) {rb} WARNING NOTICE {bb} \ | |
+( {ns} | |
+{bb}( ---------------- ){ns} | |
+{bb} ) ( {ns} | |
+{bb}( This script uses the platform {rb}SCIHUB{bb}, which may or MAY NOT \ | |
+){ns} | |
+{bb} ) be in conflict with local laws in your country. Use it at ( {ns} | |
+{bb}( your own risk, {rb}the author bears no responsibility{bb}. \ | |
+){ns} | |
+{bb} ) ( {ns} | |
+{bb}( papis team ){ns} | |
+{bb} "+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+" {ns}\ | |
+'''.format( | |
+ bb=colorama.Back.BLACK + colorama.Fore.WHITE, | |
+ ns=colorama.Style.RESET_ALL, | |
+ rb=colorama.Back.RED, | |
+) | |
+ | |
+ | |
+class Importer(papis.importer.Importer): | |
+ | |
+ """Importer that tries to get files and data first from crossref, | |
+ and if no files are found on crossref, try to get them from scihub. | |
+ """ | |
+ | |
+ def __init__(self, **kwargs): | |
+ papis.importer.Importer.__init__(self, name='scihub', **kwargs) | |
+ self.doi = None | |
+ | |
+ @classmethod | |
+ def match(cls, uri): | |
+ try: | |
+ doi.validate_doi(uri) | |
+ except ValueError: | |
+ return None | |
+ else: | |
+ return Importer(uri=uri) | |
+ | |
+ def fetch(self): | |
+ doi_str = ( | |
+ doi.find_doi_in_text(self.uri) or | |
+ doi.find_doi_in_text( | |
+ urllib.request.urlopen(self.uri).read().decode('utf-8') | |
+ ) or | |
+ self.uri | |
+ ) | |
+ ctx = self.fetch_from_doi(doi_str) | |
+ if ctx: | |
+ if ctx.data: | |
+ self.ctx.data = ctx.data | |
+ if ctx.files: | |
+ self.ctx.files = ctx.files | |
+ return | |
+ self.get_files() | |
+ | |
+ def fetch_from_doi(self, doi_str): | |
+ doi_imp = papis.importer.get_importer_by_name('doi').match(doi_str) | |
+ if doi_imp is not None: | |
+ self.logger.info('getting data through doi') | |
+ doi_imp.fetch() | |
+ return doi_imp.ctx | |
+ | |
+ def get_files(self): | |
+ # ignore the https warnings for scihub | |
+ warnings.simplefilter('ignore') | |
+ self.logger.warning(WARNING_NOTICE) | |
+ sh = scihub.SciHub(self.uri) | |
+ try: | |
+ ctx = sh.fetch() | |
+ except scihub.CaptchaNeededException as e: | |
+ curl = e.captcha_url | |
+ self.logger.warning( | |
+ 'You have to solve the catcha in \n\t' | |
+ '{c.Back.RED}{c.Fore.WHITE}{url}{c.Style.RESET_ALL}' | |
+ .format(url=curl, c=colorama) | |
+ ) | |
+ self.logger.info('opening a browser for you...') | |
+ webbrowser.open(curl, new=1, autoraise=True) | |
+ if papis.utils.confirm('Try again?'): | |
+ ctx = sh.fetch() | |
+ except scihub.DocumentUrlNotFound: | |
+ self.logger.error( | |
+ 'Sorry, it does not appear to be possible to find and url' | |
+ ' for the given document using scihub' | |
+ ) | |
+ except Exception as e: | |
+ print(type(e)) | |
+ self.logger.error(e) | |
+ else: | |
+ assert(ctx is not None) | |
+ assert(ctx.url is not None) | |
+ assert(ctx.pdf is not None) | |
+ out = tempfile.mktemp(suffix='.pdf') | |
+ self.logger.info('got file from: {0}'.format(ctx.url)) | |
+ self.logger.info('writing file in: {0}'.format(out)) | |
+ with open(out, 'wb+') as fd: | |
+ fd.write(ctx.pdf) | |
+ self.ctx.files = [out] | |
+ if not self.ctx.data and ctx.doi: | |
+ doi_ctx = self.fetch_from_doi(ctx.doi) | |
+ if doi_ctx.data: | |
+ self.logger.info('got data from doi {0}'.format(ctx.doi)) | |
+ self.ctx.data = doi_ctx.data | |
diff --git a/setup.py b/setup.py | |
index 97678f9f..b746bd63 100644 | |
--- a/setup.py | |
+++ b/setup.py | |
@@ -99,6 +99,7 @@ setup( | |
"python-doi>=0.1.1", | |
"python-slugify>=1.2.6", | |
"requests>=2.11.1", | |
+ "scihub>=0.0.1", | |
"stevedore>=1.30", | |
"tqdm>=4.1", | |
], | |
@@ -195,6 +196,7 @@ setup( | |
"pdf2arxivid=papis.arxiv:ArxividFromPdfImporter", | |
"pdf2doi=papis.crossref:DoiFromPdfImporter", | |
"pmid=papis.pubmed:Importer", | |
+ "scihub=papis.scihub:Importer", | |
"yaml=papis.yaml:Importer", | |
], | |
"papis.picker": [ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment