@daler
Last active August 29, 2015 14:02
Download supplemental data for the metaseq manuscript
#!/usr/bin/env python
import sys
import os
import errno
import argparse
import logging
from urllib2 import urlopen
from urllib2 import Request
from socket import error as SocketError
usage = """
This script downloads and unpacks supplemental scripts and processed data
for the metaseq manuscript.

Data are hosted on figshare (http://figshare.com) and are split into
chunks that are < 500 MB each.

Downloading from figshare, which is actually a direct connection to
Amazon's S3 storage, can be unstable. This script provides a robust
downloader that is able to resume downloads if the connection is reset.
Furthermore, a download will be skipped if its file size is identical to
that on the server, so this script can be re-run to refresh only the files
that need updating.

About 6 GB of data will be downloaded. When uncompressed, the data will
occupy about 10 GB.
"""
ap = argparse.ArgumentParser(usage=usage)
ap.add_argument('dest',
                help='''Destination directory. Scripts and processed data will
                be downloaded and unpacked here. Will be created if it does
                not exist.''')
ap.add_argument('--keep-downloaded-archives', '-k', action='store_true',
                help='''Do not delete the downloaded archives after they are
                unpacked.''')
args = ap.parse_args()

logging.basicConfig(
    level=logging.DEBUG,
    format='[%(name)s] [%(asctime)s]: %(message)s')
logger = logging.getLogger(os.path.basename(sys.argv[0]))

DIR = args.dest
if not os.path.exists(DIR):
    os.system('mkdir -p %s' % DIR)


def get_remote_size(url):
    """Return the size, in bytes, of the file at `url`."""
    return int(urlopen(url).headers.getheader("Content-Length"))


def get_local_size(dest):
    """Return the size, in bytes, of the local file `dest`, or 0 if missing."""
    if not os.path.exists(dest):
        return 0
    return os.path.getsize(dest)


def downloader(url, dest, start=None, block_size=8192):
    """
    Download data from `url` to `dest`.

    If `start` is provided, resume the download from that byte offset using
    an HTTP Range request and append to `dest`.
    """
    size = get_remote_size(url)
    if start is None:
        u = urlopen(url)
        current_size = 0
    else:
        req = Request(url)
        req.headers['Range'] = 'bytes=%s-%s' % (start, size)
        u = urlopen(req)
        current_size = start
    last_percent = None
    with open(dest, 'ab') as f:
        while True:
            # Report progress whenever the percentage (to 0.1%) changes.
            percent = round(float(current_size) / size * 100, 1)
            if percent != last_percent:
                print "\r %s%% complete" % percent,
                sys.stdout.flush()
                last_percent = percent
            try:
                buffer = u.read(block_size)
            except SocketError as e:
                if e.errno == errno.ECONNRESET:
                    return 'Error 104', current_size
                raise
            if not buffer:
                if current_size == size:
                    break
                else:
                    return "empty buffer", current_size
            current_size += len(buffer)
            f.write(buffer)
    return 'complete', current_size
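
# downloader() returns a (status, bytes_written_so_far) tuple:
#   ('complete', n)      -- the whole file was downloaded
#   ('Error 104', n)     -- the connection was reset; resume from byte n
#   ('empty buffer', n)  -- the server stopped sending early; retry from byte n
# download() below loops on this contract until the file is complete.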


def download(url, dest):
    """
    Download `url` to `dest` via a temporary file, resuming after dropped
    connections until the local size matches the remote size.
    """
    tmp_dest = dest + '.tmp'
    if os.path.exists(tmp_dest):
        os.unlink(tmp_dest)
    remote_size = get_remote_size(url)
    start = None
    while get_local_size(tmp_dest) < remote_size:
        msg, current_size = downloader(url, tmp_dest, start)
        start = current_size
        if msg == 'complete':
            assert remote_size == current_size
            print ("\n Done. Remote size == downloaded size "
                   "== %s bytes" % current_size)
            break
        elif msg == 'Error 104':
            print (
                "[Error 104 connection reset by peer]. "
                "Resuming download at %s bytes" % current_size)
        elif msg == 'empty buffer':
            print msg, remote_size, current_size
            continue
        else:
            print msg, remote_size, current_size
            break
    os.rename(tmp_dest, dest)
    return dest
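
# A standalone usage sketch for download() (the destination path is
# hypothetical; the URL is the README entry from the MANIFEST below):
#
#   download('http://files.figshare.com/1579075/README.rst',
#            '/data/metaseq-supplement/README.rst')
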
# TODO: For review, files will be hosted on helix.nih.gov. Upon acceptance,
# files will be migrated to figshare and scripts moved to a public github
# repository.
MANIFEST = """
metaseq supplemental data (README file) | http://files.figshare.com/1579075/README.rst | 2 KB
metaseq supplemental data (annotations) | http://files.figshare.com/1577681/annotations.tar.gz | 38.09 MB
metaseq supplemental data (code to generate figures) | http://files.figshare.com/1577690/metaseq_figures_code.tar.gz | 648 KB
metaseq supplemental data (modENCODE bigWigs code) | http://files.figshare.com/1577718/metaseq_modencode_chip_chip_analysis_00.tar.gz | 38 KB
metaseq supplemental data (modENCODE bigWigs 01) | http://files.figshare.com/1577719/metaseq_modencode_chip_chip_analysis_01.tar | 215.77 MB
metaseq supplemental data (modENCODE bigWigs 02) | http://files.figshare.com/1577720/metaseq_modencode_chip_chip_analysis_02.tar | 215.50 MB
metaseq supplemental data (modENCODE bigWigs 03) | http://files.figshare.com/1577721/metaseq_modencode_chip_chip_analysis_03.tar | 215.73 MB
metaseq supplemental data (modENCODE bigWigs 04) | http://files.figshare.com/1577722/metaseq_modencode_chip_chip_analysis_04.tar | 188.36 MB
metaseq supplemental data (modENCODE bigWigs 05) | http://files.figshare.com/1577723/metaseq_modencode_chip_chip_analysis_05.tar | 208.32 MB
metaseq supplemental data (modENCODE bigWigs 06) | http://files.figshare.com/1577724/metaseq_modencode_chip_chip_analysis_06.tar | 215.46 MB
metaseq supplemental data (modENCODE bigWigs 07) | http://files.figshare.com/1577726/metaseq_modencode_chip_chip_analysis_07.tar | 215.90 MB
metaseq supplemental data (modENCODE bigWigs 08) | http://files.figshare.com/1577725/metaseq_modencode_chip_chip_analysis_08.tar | 215.94 MB
metaseq supplemental data (modENCODE bigWigs 09) | http://files.figshare.com/1577727/metaseq_modencode_chip_chip_analysis_09.tar | 201.88 MB
metaseq supplemental data (modENCODE bigWigs 10) | http://files.figshare.com/1577728/metaseq_modencode_chip_chip_analysis_10.tar | 215.80 MB
metaseq supplemental data (modENCODE bigWigs 11) | http://files.figshare.com/1577729/metaseq_modencode_chip_chip_analysis_11.tar | 242.95 MB
metaseq supplemental data (preprocessed data for figures 1) | http://files.figshare.com/1577712/metaseq_figures_preprocessed_data_1.tar.gz | 115.83 MB
metaseq supplemental data (preprocessed data for figures 2) | http://files.figshare.com/1577715/metaseq_figures_preprocessed_data_2.tar.gz | 353.94 MB
metaseq supplemental data (preprocessed data for figures 3) | http://files.figshare.com/1577711/metaseq_figures_preprocessed_data_3.tar.gz | 131.09 MB
metaseq supplemental data (preprocessed data for figures 4) | http://files.figshare.com/1577714/metaseq_figures_preprocessed_data_4.tar.gz | 106.12 MB
metaseq supplemental data (preprocessed data for figures 5) | http://files.figshare.com/1577717/metaseq_figures_preprocessed_data_5.tar.gz | 207.35 MB
metaseq supplemental data (preprocessed data for figures 6) | http://files.figshare.com/1577716/metaseq_figures_preprocessed_data_6.tar.gz | 74.61 MB
metaseq supplemental data (RIP-seq bigWigs, Kc) | http://files.figshare.com/1577692/kc_ripseq_bigwigs.tar | 228.05 MB
metaseq supplemental data (ChIP-seq bigWigs, Kc) | http://files.figshare.com/1577691/kc_chipseq_bigwigs.tar | 167.33 MB
metaseq supplemental data (called peaks) | http://files.figshare.com/1577688/called_peaks.tar.gz | 4.90 MB
metaseq supplemental data (DESeq results) | http://files.figshare.com/1577689/deseq_results.tar.gz | 3.95 MB
metaseq supplemental data (RIP-seq bigWigs, BG3 cells) | http://files.figshare.com/1577687/bg3_ripseq_bigwigs.tar | 195.22 MB
metaseq supplemental data (ChIP-seq bigWig, BG3 cells) | http://files.figshare.com/1577686/bg3_chipseq_bigwigs.tar | 87.76 MB
metaseq supplemental data (ChIP-seq BAMs, Kc Shep) | http://files.figshare.com/1579094/kc_shep_chipseq_bams.tar | 369.92 MB
metaseq supplemental data (ChIP-seq BAMs, BG3 Shep) | http://files.figshare.com/1579095/bg3_shep_chipseq_bams.tar | 265.85 MB
metaseq supplemental data (ChIP-seq BAMs, Kc Su(Hw)) | http://files.figshare.com/1579093/kc_suhw_chipseq_bams.tar | 237.23 MB
metaseq supplemental data (ChIP-seq BAMs, BG3 Su(Hw)) | http://files.figshare.com/1579092/bg3_suhw_chipseq_bams.tar | 150.78 MB
metaseq supplemental data (ChIP-seq BAMs, BG3 input) | http://files.figshare.com/1579097/bg3_input_chipseq_bams.tar | 252.48 MB
metaseq supplemental data (ChIP-seq BAMs, Kc input) | http://files.figshare.com/1579096/kc_input_chipseq_bams.tar | 236.90 MB
"""
DATA = []
for item in MANIFEST.splitlines():
    item = item.strip()
    if len(item) == 0:
        continue
    desc, url, size = item.split('|')
    DATA.append((desc.strip(), url.strip(), size.strip()))

for desc, url, size in DATA:
    dest = os.path.join(DIR, os.path.basename(url))
    if get_local_size(dest) == get_remote_size(url):
        logger.info("%s exists and is the right size; skipping" % dest)
        continue
    msg = """
    url: {url}
    description: {desc}
    size: {size}""".format(**locals())
    logger.info(msg)
    download(url, dest)

# Unpack each archive in the destination directory; gzipped tarballs get the
# "z" flag. Optionally remove the archive afterwards.
BASE = [os.path.basename(i) for _, i, _ in DATA]
for b in BASE:
    if b.endswith('.gz'):
        z = "z"
    elif b.endswith('.tar'):
        z = ""
    else:
        continue
    cmds = '(cd {DIR}; tar -x{z}f {b})'.format(**locals())
    logger.info('unpacking: %s; cmds: %s' % (b, cmds))
    os.system(cmds)
    if not args.keep_downloaded_archives:
        cmds = '(cd {DIR}; rm {b})'.format(**locals())
        logger.info('removing: %s; cmds: %s' % (b, cmds))
        os.system(cmds)
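
# For example, with a hypothetical destination of /data/metaseq-supplement,
# the two commands above expand for the annotations archive to:
#
#   (cd /data/metaseq-supplement; tar -xzf annotations.tar.gz)
#   (cd /data/metaseq-supplement; rm annotations.tar.gz)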

print """
Please cd to {DIR} and check the README.rst file.

Briefly: pre-processed data are included for convenience. See
{DIR}/manuscript-figures for the scripts to run, and
{DIR}/manuscript-figures/README.rst for more details.
""".format(**locals())