Skip to content

Instantly share code, notes, and snippets.

@vilhalmer
Created January 27, 2017 01:06
Show Gist options
  • Save vilhalmer/8f49a9cd2a044e6df4abcfff7c860d7e to your computer and use it in GitHub Desktop.
Save vilhalmer/8f49a9cd2a044e6df4abcfff7c860d7e to your computer and use it in GitHub Desktop.
Download a summary file from a search on https://www.ncbi.nlm.nih.gov/pmc/ (using the 'Send to' menu) and feed it to this script to archive everything.
#!/usr/bin/env python
from __future__ import print_function
import os
from sys import argv, exit
import re
from time import sleep
import requests
# Browser-like User-Agent header sent with every PDF request.
USER_AGENT = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'
)
# Seconds to pause after each successful download.
SLEEP_TIME = 30
# Seconds to wait on the server before abandoning a single request.
REQUEST_TIMEOUT = 60

_PMCID_RE = re.compile(r'PMCID: (.*)$')


def extract_pmcids(lines):
    """Return the PMC ids found in an iterable of summary-file lines.

    Everything following ``PMCID: `` up to the end of a line is taken as the
    id. Surrounding whitespace (for example a stray carriage return from a
    CRLF file) is stripped, and lines whose id is empty are ignored so they
    cannot produce a malformed URL or a file named ``.pdf``.

    Args:
        lines: iterable of text lines, e.g. an open text file.

    Returns:
        A list of id strings in the order they were encountered.
    """
    found = []
    for line in lines:
        match = _PMCID_RE.search(line)
        if not match:
            continue
        pmcid = match.group(1).strip()
        if pmcid:
            found.append(pmcid)
    return found


def main(args):
    """Download the PDF of every article listed in a PMC summary file.

    Each article is saved as ``<PMCID>.pdf`` in the current directory.
    Articles whose file already exists are skipped, so an interrupted run
    can simply be restarted.

    Args:
        args: command-line argument list; ``args[1]`` is the summary file.

    Exits with status 2 when no summary file is given and with status 1 as
    soon as the server answers 403.
    """
    if len(args) < 2:
        print('Usage: {} SUMMARY_FILE'.format(args[0] if args else 'script'))
        exit(2)

    with open(args[1], 'r') as source:
        items = extract_pmcids(source)

    for pmcid in items:
        url = 'https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf/'.format(pmcid)
        path = '{}.pdf'.format(pmcid)

        if os.path.exists(path):
            print('Skipping {}'.format(path))
            continue
        print('Retrieving {}'.format(url))

        try:
            # Without a timeout a single stalled connection would hang the
            # whole run indefinitely.
            response = requests.get(
                url,
                headers={'user-agent': USER_AGENT},
                timeout=REQUEST_TIMEOUT,
            )
        except requests.RequestException as error:
            print(' Failed to retrieve: {}'.format(error))
            continue

        if response.status_code != 200:
            print(' Failed to retrieve: {}'.format(response))
            if response.status_code == 403:
                # Stop immediately rather than keep sending requests that
                # will be refused.
                print('Got blocked!')
                exit(1)
            continue

        # response.content is bytes: the file must be opened in binary mode,
        # otherwise the write fails on Python 3 and newline translation can
        # corrupt the PDF on Windows.
        with open(path, 'wb') as out:
            out.write(response.content)

        print(' Done! Waiting {} seconds'.format(SLEEP_TIME))
        sleep(SLEEP_TIME)  # Give the server a break


if __name__ == '__main__':
    main(argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment