slowkow · February 21, 2013 19:58
diff --git a/ncbi_fetch.py b/ncbi_fetch.py
 #!/usr/bin/env python

 """ncbi_fetch.py

 Query NCBI and fetch the results into gzipped files. Files will be saved in
 the current directory as: 001-050.gz, 051-100.gz, etc.

 You can then do:

    zcat 001-050.gz | head

 Usage:
    ncbi_fetch.py <query> [--database=nuccore] [--rettype=fasta]
    ncbi_fetch.py <query> [options] [--count]

 Options:
    -h --help               show this help message and exit
    --version               show version and exit
    -c --count              just print the number of results and exit
    -b --batch=N            download N results at a time [default: 50]
    -d --database=DATABASE  NCBI database [default: nuccore]
    -t --rettype=TYPE       returned data format [default: fasta]
    -e --email=EMAIL        your email address [default: [email protected]]
 """

 import gzip
 import sys

 from Bio import Entrez
 from docopt import docopt

 if __name__ == '__main__':
    # Script entry point: run the search, then either print the number of
    # hits or download every hit in batches, one gzipped file per batch.
    args = docopt(__doc__, version='0.1.0')

    Entrez.email = args['--email']

    # The history server (WebEnv/QueryKey) is only needed when records are
    # going to be fetched afterwards, so skip it for a plain --count.
    handle = Entrez.esearch(db=args['--database'],
                            term=args['<query>'],
                            usehistory='N' if args['--count'] else 'Y')
    try:
        search_results = Entrez.read(handle)
    finally:
        # Release the connection even if the response cannot be parsed.
        handle.close()

    count = int(search_results['Count'])

    if args['--count']:
        print(count)
        sys.exit(0)

    webenv = search_results['WebEnv']
    query_key = search_results['QueryKey']

    # Create output files like 001-050.gz, 051-100.gz, etc. Both numbers are
    # zero-padded to the width of the total count so the names sort in order.
    d = str(len(str(count)))
    name = '{:0' + d + '}-{:0' + d + '}.gz'

    # Download this many results at a time.
    batch_size = int(args['--batch'])
    if batch_size < 1:
        # Zero would make range() raise and a negative step would silently
        # download nothing, so reject both with a clear message.
        sys.exit('--batch must be a positive integer')

    # Print progress to stderr.
    progress = 'Downloading records {}-{} of {}\n'

    for start in range(0, count, batch_size):
        end = min(count, start + batch_size)

        sys.stderr.write(progress.format(start + 1, end, count))

        # retmax is the NUMBER of records to return, not the index of the
        # last one; passing `end` made every batch after the first overlap
        # with the following ones.
        handle = Entrez.efetch(db=args['--database'],
                               rettype=args['--rettype'],
                               retmode='text',
                               retstart=start,
                               retmax=end - start,
                               webenv=webenv,
                               query_key=query_key)
        try:
            data = handle.read()
        finally:
            handle.close()

        # The file is opened in binary mode, so text responses (Python 3)
        # have to be encoded first; byte strings are written unchanged.
        if not isinstance(data, bytes):
            data = data.encode('utf-8')

        with gzip.open(name.format(start + 1, end), 'wb') as f:
            f.write(data)
	#!/usr/bin/env python

	"""ncbi_fetch.py

	Query NCBI and fetch the results into gzipped files. Files will be saved in
	the current directory as: 001-050.gz, 051-100.gz, etc.

	You can then do:

	    zcat 001-050.gz | head

	Usage:
	ncbi_fetch.py <query> [--database=nuccore] [--rettype=fasta]
	ncbi_fetch.py <query> [options] [--count]

	Options:
	    -h --help               show this help message and exit
	    --version               show version and exit
	    -c --count              just print the number of results and exit
	    -b --batch=N            download N results at a time [default: 50]
	    -d --database=DATABASE  NCBI database [default: nuccore]
	    -t --rettype=TYPE       returned data format [default: fasta]
	    -e --email=EMAIL        your email address [default: [email protected]]
	"""

	import gzip
	import sys

	from Bio import Entrez
	from docopt import docopt

	if __name__ == '__main__':
	    # Script entry point: run the search, then either print the number of
	    # hits or download every hit in batches, one gzipped file per batch.
	    args = docopt(__doc__, version='0.1.0')

	    Entrez.email = args['--email']

	    # The history server (WebEnv/QueryKey) is only needed when records are
	    # going to be fetched afterwards, so skip it for a plain --count.
	    handle = Entrez.esearch(db=args['--database'],
	                            term=args['<query>'],
	                            usehistory='N' if args['--count'] else 'Y')
	    try:
	        search_results = Entrez.read(handle)
	    finally:
	        # Release the connection even if the response cannot be parsed.
	        handle.close()

	    count = int(search_results['Count'])

	    if args['--count']:
	        print(count)
	        sys.exit(0)

	    webenv = search_results['WebEnv']
	    query_key = search_results['QueryKey']

	    # Create output files like 001-050.gz, 051-100.gz, etc. Both numbers are
	    # zero-padded to the width of the total count so the names sort in order.
	    d = str(len(str(count)))
	    name = '{:0' + d + '}-{:0' + d + '}.gz'

	    # Download this many results at a time.
	    batch_size = int(args['--batch'])
	    if batch_size < 1:
	        # Zero would make range() raise and a negative step would silently
	        # download nothing, so reject both with a clear message.
	        sys.exit('--batch must be a positive integer')

	    # Print progress to stderr.
	    progress = 'Downloading records {}-{} of {}\n'

	    for start in range(0, count, batch_size):
	        end = min(count, start + batch_size)

	        sys.stderr.write(progress.format(start + 1, end, count))

	        # retmax is the NUMBER of records to return, not the index of the
	        # last one; passing `end` made every batch after the first overlap
	        # with the following ones.
	        handle = Entrez.efetch(db=args['--database'],
	                               rettype=args['--rettype'],
	                               retmode='text',
	                               retstart=start,
	                               retmax=end - start,
	                               webenv=webenv,
	                               query_key=query_key)
	        try:
	            data = handle.read()
	        finally:
	            handle.close()

	        # The file is opened in binary mode, so text responses (Python 3)
	        # have to be encoded first; byte strings are written unchanged.
	        if not isinstance(data, bytes):
	            data = data.encode('utf-8')

	        with gzip.open(name.format(start + 1, end), 'wb') as f:
	            f.write(data)