vilhalmer · January 27, 2017 01:06
diff --git a/nih-grab.py b/nih-grab.py
 #!/usr/bin/env python

 from __future__ import print_function
 import os
 from sys import argv, exit
 import re
 from time import sleep

 import requests

 if __name__ == '__main__':
    # Download the PDF of every PubMed Central article listed in the file
    # named by the first command-line argument.
    #
    # Article IDs are taken from lines containing "PMCID: <id>". Each PDF is
    # saved as "<id>.pdf" in the current directory; files that already exist
    # are skipped, so an interrupted run can simply be started again.
    id_regex = re.compile(r'PMCID: (.*)$')

    items = []
    with open(argv[1], 'r') as source:
        for line in source:
            match = id_regex.search(line)
            if match:
                # '$' matches before the trailing '\n' only, so the capture
                # can still carry a '\r' (CRLF input) or trailing blanks.
                # Strip them so they do not leak into the URL or file name.
                found = match.group(1).strip()
                if found:
                    items.append(found)

    for pmcid in items:
        url = 'https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf/'.format(pmcid)
        path = '{}.pdf'.format(pmcid)
        if os.path.exists(path):
            print('Skipping {}'.format(path))
            continue
        else:
            print('Retrieving {}'.format(url))

        # The request is sent with a browser-like user-agent header.
        response = requests.get(url, headers={
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'
        })

        if response.status_code != 200:
            print('  Failed to retrieve: {}'.format(response))
            if response.status_code == 403:
                # A 403 is treated as "the server blocked us": stop the whole
                # run instead of continuing with the remaining articles.
                print('Got blocked!')
                exit(1)

            continue

        # response.content is raw bytes, so the file must be opened in binary
        # mode: text mode raises TypeError on Python 3 and may corrupt the
        # PDF through newline translation on Windows under Python 2.
        with open(path, 'wb') as out:
            out.write(response.content)

        sleep_time = 30
        print('  Done! Waiting {} seconds'.format(sleep_time))
        sleep(sleep_time) # Give the server a break
	#!/usr/bin/env python

	from __future__ import print_function
	import os
	from sys import argv, exit
	import re
	from time import sleep

	import requests

	if __name__ == '__main__':
	    # Download the PDF of every PubMed Central article listed in the file
	    # named by the first command-line argument.
	    #
	    # Article IDs are taken from lines containing "PMCID: <id>". Each PDF is
	    # saved as "<id>.pdf" in the current directory; files that already exist
	    # are skipped, so an interrupted run can simply be started again.
	    id_regex = re.compile(r'PMCID: (.*)$')

	    items = []
	    with open(argv[1], 'r') as source:
	        for line in source:
	            match = id_regex.search(line)
	            if match:
	                # '$' matches before the trailing '\n' only, so the capture
	                # can still carry a '\r' (CRLF input) or trailing blanks.
	                # Strip them so they do not leak into the URL or file name.
	                found = match.group(1).strip()
	                if found:
	                    items.append(found)

	    for pmcid in items:
	        url = 'https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf/'.format(pmcid)
	        path = '{}.pdf'.format(pmcid)
	        if os.path.exists(path):
	            print('Skipping {}'.format(path))
	            continue
	        else:
	            print('Retrieving {}'.format(url))

	        # The request is sent with a browser-like user-agent header.
	        response = requests.get(url, headers={
	            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'
	        })

	        if response.status_code != 200:
	            print(' Failed to retrieve: {}'.format(response))
	            if response.status_code == 403:
	                # A 403 is treated as "the server blocked us": stop the whole
	                # run instead of continuing with the remaining articles.
	                print('Got blocked!')
	                exit(1)

	            continue

	        # response.content is raw bytes, so the file must be opened in binary
	        # mode: text mode raises TypeError on Python 3 and may corrupt the
	        # PDF through newline translation on Windows under Python 2.
	        with open(path, 'wb') as out:
	            out.write(response.content)

	        sleep_time = 30
	        print(' Done! Waiting {} seconds'.format(sleep_time))
	        sleep(sleep_time) # Give the server a break