Skip to content

Instantly share code, notes, and snippets.

@octvs
Created January 18, 2024 18:09
Show Gist options
  • Save octvs/016856d55a208a0c1fde108f4489070a to your computer and use it in GitHub Desktop.
Save octvs/016856d55a208a0c1fde108f4489070a to your computer and use it in GitHub Desktop.
diff --git a/papis/scihub.py b/papis/scihub.py
new file mode 100644
index 00000000..77fb76df
--- /dev/null
+++ b/papis/scihub.py
@@ -0,0 +1,171 @@
+import doi
+import scihub
+import webbrowser
+import papis.importer
+import papis.crossref
+import papis.utils
+import tempfile
+import colorama
+import warnings
+import urllib.request
+
+
+WARNING_NOTICE = '''
+{bb} .+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+. {ns}
+{bb}( ){ns}
+{bb} ) {rb} WARNING NOTICE {bb} \
+( {ns}
+{bb}( ---------------- ){ns}
+{bb} ) ( {ns}
+{bb}( This script uses the platform {rb}SCIHUB{bb}, which may or MAY NOT \
+){ns}
+{bb} ) be in conflict with local laws in your country. Use it at ( {ns}
+{bb}( your own risk, {rb}the author bears no responsibility{bb}. \
+){ns}
+{bb} ) ( {ns}
+{bb}( papis team ){ns}
+{bb} "+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+"+.+" {ns}\
+'''.format(
+    bb=colorama.Back.BLACK + colorama.Fore.WHITE,
+    ns=colorama.Style.RESET_ALL,
+    rb=colorama.Back.RED,
+)
+
+
+class Importer(papis.importer.Importer):
+
+    """Importer that first asks the ``doi`` importer for data and files and,
+    when that yields no files, tries to download the document from scihub.
+    """
+
+    def __init__(self, **kwargs):
+        """Initialise the importer under the name ``scihub``."""
+        papis.importer.Importer.__init__(self, name='scihub', **kwargs)
+        self.doi = None
+
+    @classmethod
+    def match(cls, uri):
+        """Return an importer for ``uri``, or None.
+
+        None is returned when ``doi.validate_doi`` raises ValueError.
+        """
+        try:
+            doi.validate_doi(uri)
+        except ValueError:
+            return None
+        # ``cls`` instead of the hard-coded class so that subclasses are
+        # instantiated as themselves.
+        return cls(uri=uri)
+
+    def fetch(self):
+        """Fill ``self.ctx`` with data and files for ``self.uri``.
+
+        The ``doi`` importer is tried first; scihub is only contacted when
+        it did not provide any file.
+        """
+        doi_str = (
+            doi.find_doi_in_text(self.uri) or
+            self._find_doi_in_page(self.uri) or
+            self.uri
+        )
+        ctx = self.fetch_from_doi(doi_str)
+        if ctx:
+            if ctx.data:
+                self.ctx.data = ctx.data
+            if ctx.files:
+                self.ctx.files = ctx.files
+                return
+        self.get_files()
+
+    def _find_doi_in_page(self, uri):
+        """Download ``uri`` and return a DOI found in its body, or None.
+
+        ``uri`` may be a bare DOI rather than a URL, and the download may
+        fail; both cases return None so the caller can fall back.
+        """
+        try:
+            with urllib.request.urlopen(uri) as response:
+                text = response.read().decode('utf-8', errors='replace')
+        except (ValueError, OSError) as e:
+            self.logger.debug('could not download {0}: {1}'.format(uri, e))
+            return None
+        return doi.find_doi_in_text(text)
+
+    def fetch_from_doi(self, doi_str):
+        """Fetch ``doi_str`` through the ``doi`` importer.
+
+        Returns the importer's context, or None when the ``doi`` importer
+        does not match ``doi_str``.
+        """
+        doi_imp = papis.importer.get_importer_by_name('doi').match(doi_str)
+        if doi_imp is None:
+            return None
+        self.logger.info('getting data through doi')
+        doi_imp.fetch()
+        return doi_imp.ctx
+
+    def get_files(self):
+        """Download the document from scihub into a temporary pdf file.
+
+        On success the path is stored in ``self.ctx.files``; when no data
+        has been collected yet, it is looked up from the DOI scihub reports.
+        """
+        self.logger.warning(WARNING_NOTICE)
+        ctx = self._fetch_from_scihub(scihub.SciHub(self.uri))
+        if ctx is None:
+            return
+        if ctx.url is None or ctx.pdf is None:
+            self.logger.error('scihub did not return a document')
+            return
+        # NamedTemporaryFile creates the file itself, unlike the race-prone
+        # tempfile.mktemp; delete=False keeps the stored path valid.
+        with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as fd:
+            out = fd.name
+            self.logger.info('got file from: {0}'.format(ctx.url))
+            self.logger.info('writing file in: {0}'.format(out))
+            fd.write(ctx.pdf)
+        self.ctx.files = [out]
+        if not self.ctx.data and ctx.doi:
+            doi_ctx = self.fetch_from_doi(ctx.doi)
+            # fetch_from_doi returns None when the doi importer rejects it
+            if doi_ctx is not None and doi_ctx.data:
+                self.logger.info('got data from doi {0}'.format(ctx.doi))
+                self.ctx.data = doi_ctx.data
+
+    def _fetch_from_scihub(self, sh):
+        """Return what ``sh.fetch()`` yields, or None if nothing was fetched.
+
+        When scihub asks for a captcha the url is opened in a browser and
+        the fetch is repeated for as long as the user asks to try again.
+        """
+        while True:
+            # ignore the https warnings for scihub, but only around the
+            # request so the process-wide warning filters stay untouched
+            with warnings.catch_warnings():
+                warnings.simplefilter('ignore')
+                try:
+                    return sh.fetch()
+                except scihub.CaptchaNeededException as e:
+                    curl = e.captcha_url
+                except scihub.DocumentUrlNotFound:
+                    self.logger.error(
+                        'Sorry, it does not appear to be possible to find'
+                        ' a url for the given document using scihub'
+                    )
+                    return None
+                except Exception as e:
+                    self.logger.error(
+                        '{0}: {1}'.format(type(e).__name__, e)
+                    )
+                    return None
+            self.logger.warning(
+                'You have to solve the captcha in \n\t'
+                '{c.Back.RED}{c.Fore.WHITE}{url}{c.Style.RESET_ALL}'
+                .format(url=curl, c=colorama)
+            )
+            self.logger.info('opening a browser for you...')
+            webbrowser.open(curl, new=1, autoraise=True)
+            # NOTE(review): papis.utils.confirm is not shown here; verify it
+            # exists in the targeted papis version.
+            if not papis.utils.confirm('Try again?'):
+                return None
diff --git a/setup.py b/setup.py
index 97678f9f..b746bd63 100644
--- a/setup.py
+++ b/setup.py
@@ -99,6 +99,7 @@ setup(
"python-doi>=0.1.1",
"python-slugify>=1.2.6",
"requests>=2.11.1",
+ "scihub>=0.0.1",
"stevedore>=1.30",
"tqdm>=4.1",
],
@@ -195,6 +196,7 @@ setup(
"pdf2arxivid=papis.arxiv:ArxividFromPdfImporter",
"pdf2doi=papis.crossref:DoiFromPdfImporter",
"pmid=papis.pubmed:Importer",
+ "scihub=papis.scihub:Importer",
"yaml=papis.yaml:Importer",
],
"papis.picker": [
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment