@daler
Last active August 29, 2015 14:02
Download supplemental data for the metaseq manuscript
#!/usr/bin/env python
import sys
import os
import errno
import argparse
import logging
from urllib2 import urlopen
from urllib2 import Request
from socket import error as SocketError
usage = """
This script downloads and unpacks supplemental scripts and processed data
for the metaseq manuscript.

Data are hosted on figshare (http://figshare.com) and are split into
chunks that are < 500 MB each.

Downloading from figshare, which is actually a direct connection to
Amazon's S3 storage, can be unstable. This script provides a robust
downloader that is able to resume downloads if the connection is reset.
Furthermore, a download will be skipped if its file size is identical to
that on the server, so this script can be re-run to refresh only the files
that need updating.

About 6 GB of data will be downloaded. When uncompressed, the data will
occupy about 10 GB.
"""
ap = argparse.ArgumentParser(usage=usage)
ap.add_argument('dest',
                help='''Destination directory. Scripts and processed data will
                be downloaded and unpacked here. Will be created if it does
                not exist.''')
ap.add_argument('--keep-downloaded-archives', '-k', action='store_true',
                help='''Do not delete the downloaded archives after they are
                unpacked.''')
args = ap.parse_args()

logging.basicConfig(
    level=logging.DEBUG,
    format='[%(name)s] [%(asctime)s]: %(message)s')
logger = logging.getLogger(os.path.basename(sys.argv[0]))

DIR = args.dest
if not os.path.exists(DIR):
    os.system('mkdir -p %s' % DIR)


def get_remote_size(url):
    """Return the size, in bytes, of the file at `url`."""
    return int(urlopen(url).headers.getheader("Content-Length"))


def get_local_size(dest):
    """Return the size, in bytes, of the local file `dest`, or 0 if missing."""
    if not os.path.exists(dest):
        return 0
    return os.path.getsize(dest)


def downloader(url, dest, start=None, block_size=8192):
    """
    Download data from `url` to `dest`.

    If `start` is provided, resume the download from that byte offset using
    an HTTP Range request and append to `dest`.
    """
    size = get_remote_size(url)
    if start is None:
        u = urlopen(url)
        current_size = 0
    else:
        req = Request(url)
        req.headers['Range'] = 'bytes=%s-%s' % (start, size)
        u = urlopen(req)
        current_size = start
    last_percent = None
    with open(dest, 'ab') as f:
        while True:
            # Report progress whenever the percentage (to 0.1%) changes.
            percent = round(float(current_size) / size * 100, 1)
            if percent != last_percent:
                print "\r %s%% complete" % percent,
                sys.stdout.flush()
                last_percent = percent
            try:
                buffer = u.read(block_size)
            except SocketError as e:
                if e.errno == errno.ECONNRESET:
                    return 'Error 104', current_size
                raise
            if not buffer:
                if current_size == size:
                    break
                else:
                    return "empty buffer", current_size
            current_size += len(buffer)
            f.write(buffer)
    return 'complete', current_size
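
# downloader() returns a (status, bytes_written_so_far) tuple:
#   ('complete', n)      -- the whole file was downloaded
#   ('Error 104', n)     -- the connection was reset; resume from byte n
#   ('empty buffer', n)  -- the server stopped sending early; retry from byte n
# download() below loops on this contract until the file is complete.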


def download(url, dest):
    """
    Download `url` to `dest` via a temporary file, resuming after dropped
    connections until the local size matches the remote size.
    """
    tmp_dest = dest + '.tmp'
    if os.path.exists(tmp_dest):
        os.unlink(tmp_dest)
    remote_size = get_remote_size(url)
    start = None
    while get_local_size(tmp_dest) < remote_size:
        msg, current_size = downloader(url, tmp_dest, start)
        start = current_size
        if msg == 'complete':
            assert remote_size == current_size
            print ("\n Done. Remote size == downloaded size "
                   "== %s bytes" % current_size)
            break
        elif msg == 'Error 104':
            print (
                "[Error 104 connection reset by peer]. "
                "Resuming download at %s bytes" % current_size)
        elif msg == 'empty buffer':
            print msg, remote_size, current_size
            continue
        else:
            print msg, remote_size, current_size
            break
    os.rename(tmp_dest, dest)
    return dest
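
# A standalone usage sketch for download() (the destination path is
# hypothetical; the URL is the README entry from the MANIFEST below):
#
#   download('http://files.figshare.com/1579075/README.rst',
#            '/data/metaseq-supplement/README.rst')
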
# TODO: For review, files will be hosted on helix.nih.gov. Upon acceptance,
# files will be migrated to figshare and scripts moved to a public github
# repository.
MANIFEST = """
metaseq supplemental data (README file) | http://files.figshare.com/1579075/README.rst | 2 KB
metaseq supplemental data (annotations) | http://files.figshare.com/1577681/annotations.tar.gz | 38.09 MB
metaseq supplemental data (code to generate figures) | http://files.figshare.com/1577690/metaseq_figures_code.tar.gz | 648 KB
metaseq supplemental data (modENCODE bigWigs code) | http://files.figshare.com/1577718/metaseq_modencode_chip_chip_analysis_00.tar.gz | 38 KB
metaseq supplemental data (modENCODE bigWigs 01) | http://files.figshare.com/1577719/metaseq_modencode_chip_chip_analysis_01.tar | 215.77 MB
metaseq supplemental data (modENCODE bigWigs 02) | http://files.figshare.com/1577720/metaseq_modencode_chip_chip_analysis_02.tar | 215.50 MB
metaseq supplemental data (modENCODE bigWigs 03) | http://files.figshare.com/1577721/metaseq_modencode_chip_chip_analysis_03.tar | 215.73 MB
metaseq supplemental data (modENCODE bigWigs 04) | http://files.figshare.com/1577722/metaseq_modencode_chip_chip_analysis_04.tar | 188.36 MB
metaseq supplemental data (modENCODE bigWigs 05) | http://files.figshare.com/1577723/metaseq_modencode_chip_chip_analysis_05.tar | 208.32 MB
metaseq supplemental data (modENCODE bigWigs 06) | http://files.figshare.com/1577724/metaseq_modencode_chip_chip_analysis_06.tar | 215.46 MB
metaseq supplemental data (modENCODE bigWigs 07) | http://files.figshare.com/1577726/metaseq_modencode_chip_chip_analysis_07.tar | 215.90 MB
metaseq supplemental data (modENCODE bigWigs 08) | http://files.figshare.com/1577725/metaseq_modencode_chip_chip_analysis_08.tar | 215.94 MB
metaseq supplemental data (modENCODE bigWigs 09) | http://files.figshare.com/1577727/metaseq_modencode_chip_chip_analysis_09.tar | 201.88 MB
metaseq supplemental data (modENCODE bigWigs 10) | http://files.figshare.com/1577728/metaseq_modencode_chip_chip_analysis_10.tar | 215.80 MB
metaseq supplemental data (modENCODE bigWigs 11) | http://files.figshare.com/1577729/metaseq_modencode_chip_chip_analysis_11.tar | 242.95 MB
metaseq supplemental data (preprocessed data for figures 1) | http://files.figshare.com/1577712/metaseq_figures_preprocessed_data_1.tar.gz | 115.83 MB
metaseq supplemental data (preprocessed data for figures 2) | http://files.figshare.com/1577715/metaseq_figures_preprocessed_data_2.tar.gz | 353.94 MB
metaseq supplemental data (preprocessed data for figures 3) | http://files.figshare.com/1577711/metaseq_figures_preprocessed_data_3.tar.gz | 131.09 MB
metaseq supplemental data (preprocessed data for figures 4) | http://files.figshare.com/1577714/metaseq_figures_preprocessed_data_4.tar.gz | 106.12 MB
metaseq supplemental data (preprocessed data for figures 5) | http://files.figshare.com/1577717/metaseq_figures_preprocessed_data_5.tar.gz | 207.35 MB
metaseq supplemental data (preprocessed data for figures 6) | http://files.figshare.com/1577716/metaseq_figures_preprocessed_data_6.tar.gz | 74.61 MB
metaseq supplemental data (RIP-seq bigWigs, Kc) | http://files.figshare.com/1577692/kc_ripseq_bigwigs.tar | 228.05 MB
metaseq supplemental data (ChIP-seq bigWigs, Kc) | http://files.figshare.com/1577691/kc_chipseq_bigwigs.tar | 167.33 MB
metaseq supplemental data (called peaks) | http://files.figshare.com/1577688/called_peaks.tar.gz | 4.90 MB
metaseq supplemental data (DESeq results) | http://files.figshare.com/1577689/deseq_results.tar.gz | 3.95 MB
metaseq supplemental data (RIP-seq bigWigs, BG3 cells) | http://files.figshare.com/1577687/bg3_ripseq_bigwigs.tar | 195.22 MB
metaseq supplemental data (ChIP-seq bigWig, BG3 cells) | http://files.figshare.com/1577686/bg3_chipseq_bigwigs.tar | 87.76 MB
metaseq supplemental data (ChIP-seq BAMs, Kc Shep) | http://files.figshare.com/1579094/kc_shep_chipseq_bams.tar | 369.92 MB
metaseq supplemental data (ChIP-seq BAMs, BG3 Shep) | http://files.figshare.com/1579095/bg3_shep_chipseq_bams.tar | 265.85 MB
metaseq supplemental data (ChIP-seq BAMs, Kc Su(Hw)) | http://files.figshare.com/1579093/kc_suhw_chipseq_bams.tar | 237.23 MB
metaseq supplemental data (ChIP-seq BAMs, BG3 Su(Hw)) | http://files.figshare.com/1579092/bg3_suhw_chipseq_bams.tar | 150.78 MB
metaseq supplemental data (ChIP-seq BAMs, BG3 input) | http://files.figshare.com/1579097/bg3_input_chipseq_bams.tar | 252.48 MB
metaseq supplemental data (ChIP-seq BAMs, Kc input) | http://files.figshare.com/1579096/kc_input_chipseq_bams.tar | 236.90 MB
"""
DATA = []
for item in MANIFEST.splitlines():
    item = item.strip()
    if len(item) == 0:
        continue
    desc, url, size = item.split('|')
    DATA.append((desc.strip(), url.strip(), size.strip()))

for desc, url, size in DATA:
    dest = os.path.join(DIR, os.path.basename(url))
    if get_local_size(dest) == get_remote_size(url):
        logger.info("%s exists and is the right size; skipping" % dest)
        continue
    msg = """
    url: {url}
    description: {desc}
    size: {size}""".format(**locals())
    logger.info(msg)
    download(url, dest)

# Unpack each archive in the destination directory; gzipped tarballs get the
# "z" flag. Optionally remove the archive afterwards.
BASE = [os.path.basename(i) for _, i, _ in DATA]
for b in BASE:
    if b.endswith('.gz'):
        z = "z"
    elif b.endswith('.tar'):
        z = ""
    else:
        continue
    cmds = '(cd {DIR}; tar -x{z}f {b})'.format(**locals())
    logger.info('unpacking: %s; cmds: %s' % (b, cmds))
    os.system(cmds)
    if not args.keep_downloaded_archives:
        cmds = '(cd {DIR}; rm {b})'.format(**locals())
        logger.info('removing: %s; cmds: %s' % (b, cmds))
        os.system(cmds)
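
# For example, with a hypothetical destination of /data/metaseq-supplement,
# the two commands above expand for the annotations archive to:
#
#   (cd /data/metaseq-supplement; tar -xzf annotations.tar.gz)
#   (cd /data/metaseq-supplement; rm annotations.tar.gz)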

print """
Please cd to {DIR} and check the README.rst file.

Briefly: pre-processed data are included for convenience. See
{DIR}/manuscript-figures for the scripts to run, and
{DIR}/manuscript-figures/README.rst for more details.
""".format(**locals())