Last active
August 29, 2015 14:02
-
-
Save daler/aad942c746fbffda566e to your computer and use it in GitHub Desktop.
Download supplemental data for the metaseq manuscript
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sys | |
import os | |
import errno | |
import argparse | |
import logging | |
from urllib2 import urlopen | |
from urllib2 import Request | |
from socket import error as SocketError | |
usage = """
This script downloads and unpacks supplemental scripts and processed data
for the metaseq manuscript.
Data are hosted on figshare (http://figshare.com), and are split into
chunks that are <500MB.
Downloading from figshare, which is actually a direct connection to
Amazon's S3 storage, can be unstable. This script provides a robust
downloader that is able to resume downloads if the connection is reset.
Furthermore, a download will be skipped if its file size is identical to
that on the server, so this script can be re-run to refresh only the files
that need updating.
About 6 GB of data will be downloaded. When uncompressed, the data will
occupy about 10 GB.
"""
ap = argparse.ArgumentParser(usage=usage)
ap.add_argument('dest',
                help='''Destination directory. Scripts and processed data will
                be downloaded and unpacked here. Will be created if it does
                not exist.''')
ap.add_argument('--keep-downloaded-archives', '-k', action='store_true',
                help='''Do not delete the downloaded archives after they are
                unpacked.''')
args = ap.parse_args()

# DEBUG level so every skip/resume/download decision is shown to the user.
logging.basicConfig(
    level=logging.DEBUG,
    format='[%(name)s] [%(asctime)s]: %(message)s')
logger = logging.getLogger(os.path.basename(sys.argv[0]))

DIR = args.dest
# Use os.makedirs rather than shelling out to `mkdir -p`: portable, and
# immune to shell metacharacters in the user-supplied path.  EEXIST is
# tolerated in case the directory appears between the exists() check and
# the makedirs() call.
if not os.path.exists(DIR):
    try:
        os.makedirs(DIR)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
def get_remote_size(url):
    """Return the Content-Length (in bytes) the server reports for `url`."""
    response = urlopen(url)
    content_length = response.headers.getheader("Content-Length")
    return int(content_length)
def get_local_size(dest):
    """
    Return the size of local file `dest` in bytes, or 0 if it does not
    exist.

    Uses EAFP instead of an exists()/getsize() pair, which was racy: the
    file could disappear between the two calls and raise unexpectedly.
    """
    try:
        return os.path.getsize(dest)
    except OSError:
        return 0
def downloader(url, dest, start=None, block_size=8192):
    """
    Download data from `url` to `dest`.

    Data are APPENDED to `dest`, so a resumed download continues where
    the previous attempt stopped.

    Parameters
    ----------
    url : remote URL to fetch.
    dest : local filename opened in append-binary mode.
    start : if not None, byte offset to resume from; an HTTP Range
        header is sent so the server only transmits the remainder.
    block_size : number of bytes read per iteration.

    Returns
    -------
    (msg, current_size) tuple, where msg is one of:
      'complete'     -- all `size` bytes received;
      'Error 104'    -- connection reset by peer; caller should resume
                        at byte offset `current_size`;
      'empty buffer' -- server stopped sending before the full size
                        arrived; caller should resume likewise.
    """
    size = get_remote_size(url)
    if start is None:
        # Fresh download: fetch from the beginning.
        u = urlopen(url)
        current_size = 0
    else:
        # Resume: request only the remaining byte range.
        req = Request(url)
        req.headers['Range'] = 'bytes=%s-%s' % (start, size)
        u = urlopen(req)
        current_size = start
    last_percent = None
    with open(dest, 'ab') as f:
        while True:
            # NOTE(review): raises ZeroDivisionError if the server
            # reports Content-Length 0 -- presumably never true for the
            # manifest files; confirm before reusing elsewhere.
            percent = round(float(current_size) / size * 100, 1)
            # Only print when the rounded percentage changes, to keep
            # terminal output manageable; '\r' rewrites the same line.
            if percent != last_percent:
                print "\r %s%% complete" % percent,
                sys.stdout.flush()
                last_percent = percent
            try:
                # NOTE(review): `buffer` shadows the Python 2 builtin of
                # the same name; harmless here but worth renaming.
                buffer = u.read(block_size)
            except SocketError as e:
                # Connection reset by peer: report back so the caller
                # can resume from current_size instead of crashing.
                if e.errno == errno.ECONNRESET:
                    return 'Error 104', current_size
                raise
            if not buffer:
                if current_size == size:
                    break
                else:
                    # Server closed the stream before sending all bytes.
                    return "empty buffer", current_size
            current_size += len(buffer)
            f.write(buffer)
    return 'complete', current_size
def download(url, dest): | |
tmp_dest = dest + '.tmp' | |
if os.path.exists(tmp_dest): | |
os.unlink(tmp_dest) | |
local_size = get_local_size(dest) | |
remote_size = get_remote_size(url) | |
start = None | |
while get_local_size(tmp_dest) < remote_size: | |
msg, current_size = downloader(url, tmp_dest, start) | |
start = current_size | |
if msg == 'complete': | |
assert remote_size == current_size | |
print ("\n Done. Remote size == downloaded size " | |
"== %s bytes" % current_size) | |
break | |
elif msg == 'Error 104': | |
print ( | |
"[Error 104 connection reset by peer]. " | |
"Resuming download at %s bytes" % current_size) | |
elif msg == 'empty buffer': | |
print msg, remote_size, current_size | |
continue | |
else: | |
print msg, remote_size, current_size | |
break | |
os.rename(dest + '.tmp', dest) | |
return dest | |
# TODO: For review, files will be hosted on helix.nih.gov. Upon acceptance,
# files will be migrated to figshare and scripts moved to a public github
# repository.

# Each non-empty manifest line is "description | URL | human-readable size".
MANIFEST = """
metaseq supplemental data (README file) | http://files.figshare.com/1579075/README.rst | 2 KB
metaseq supplemental data (annotations) | http://files.figshare.com/1577681/annotations.tar.gz | 38.09 MB
metaseq supplemental data (code to generate figures) | http://files.figshare.com/1577690/metaseq_figures_code.tar.gz | 648 KB
metaseq supplemental data (modENCODE bigWigs code) | http://files.figshare.com/1577718/metaseq_modencode_chip_chip_analysis_00.tar.gz | 38 KB
metaseq supplemental data (modENCODE bigWigs 01) | http://files.figshare.com/1577719/metaseq_modencode_chip_chip_analysis_01.tar | 215.77 MB
metaseq supplemental data (modENCODE bigWigs 02) | http://files.figshare.com/1577720/metaseq_modencode_chip_chip_analysis_02.tar | 215.50 MB
metaseq supplemental data (modENCODE bigWigs 03) | http://files.figshare.com/1577721/metaseq_modencode_chip_chip_analysis_03.tar | 215.73 MB
metaseq supplemental data (modENCODE bigWigs 04) | http://files.figshare.com/1577722/metaseq_modencode_chip_chip_analysis_04.tar | 188.36 MB
metaseq supplemental data (modENCODE bigWigs 05) | http://files.figshare.com/1577723/metaseq_modencode_chip_chip_analysis_05.tar | 208.32 MB
metaseq supplemental data (modENCODE bigWigs 06) | http://files.figshare.com/1577724/metaseq_modencode_chip_chip_analysis_06.tar | 215.46 MB
metaseq supplemental data (modENCODE bigWigs 07) | http://files.figshare.com/1577726/metaseq_modencode_chip_chip_analysis_07.tar | 215.90 MB
metaseq supplemental data (modENCODE bigWigs 08) | http://files.figshare.com/1577725/metaseq_modencode_chip_chip_analysis_08.tar | 215.94 MB
metaseq supplemental data (modENCODE bigWigs 09) | http://files.figshare.com/1577727/metaseq_modencode_chip_chip_analysis_09.tar | 201.88 MB
metaseq supplemental data (modENCODE bigWigs 10) | http://files.figshare.com/1577728/metaseq_modencode_chip_chip_analysis_10.tar | 215.80 MB
metaseq supplemental data (modENCODE bigWigs 11) | http://files.figshare.com/1577729/metaseq_modencode_chip_chip_analysis_11.tar | 242.95 MB
metaseq supplemental data (preprocessed data for figures 1) | http://files.figshare.com/1577712/metaseq_figures_preprocessed_data_1.tar.gz | 115.83 MB
metaseq supplemental data (preprocessed data for figures 2) | http://files.figshare.com/1577715/metaseq_figures_preprocessed_data_2.tar.gz | 353.94 MB
metaseq supplemental data (preprocessed data for figures 3) | http://files.figshare.com/1577711/metaseq_figures_preprocessed_data_3.tar.gz | 131.09 MB
metaseq supplemental data (preprocessed data for figures 4) | http://files.figshare.com/1577714/metaseq_figures_preprocessed_data_4.tar.gz | 106.12 MB
metaseq supplemental data (preprocessed data for figures 5) | http://files.figshare.com/1577717/metaseq_figures_preprocessed_data_5.tar.gz | 207.35 MB
metaseq supplemental data (preprocessed data for figures 6) | http://files.figshare.com/1577716/metaseq_figures_preprocessed_data_6.tar.gz | 74.61 MB
metaseq supplemental data (RIP-seq bigWigs, Kc) | http://files.figshare.com/1577692/kc_ripseq_bigwigs.tar | 228.05 MB
metaseq supplemental data (ChIP-seq bigWigs, Kc) | http://files.figshare.com/1577691/kc_chipseq_bigwigs.tar | 167.33 MB
metaseq supplemental data (called peaks) | http://files.figshare.com/1577688/called_peaks.tar.gz | 4.90 MB
metaseq supplemental data (DESeq results) | http://files.figshare.com/1577689/deseq_results.tar.gz | 3.95 MB
metaseq supplemental data (RIP-seq bigWigs, BG3 cells) | http://files.figshare.com/1577687/bg3_ripseq_bigwigs.tar | 195.22 MB
metaseq supplemental data (ChIP-seq bigWig, BG3 cells) | http://files.figshare.com/1577686/bg3_chipseq_bigwigs.tar | 87.76 MB
metaseq supplemental data (ChIP-seq BAMs, Kc Shep) | http://files.figshare.com/1579094/kc_shep_chipseq_bams.tar | 369.92 MB
metaseq supplemental data (ChIP-seq BAMs, BG3 Shep) | http://files.figshare.com/1579095/bg3_shep_chipseq_bams.tar | 265.85 MB
metaseq supplemental data (ChIP-seq BAMs, Kc Su(Hw)) | http://files.figshare.com/1579093/kc_suhw_chipseq_bams.tar | 237.23 MB
metaseq supplemental data (ChIP-seq BAMs, BG3 Su(Hw)) | http://files.figshare.com/1579092/bg3_suhw_chipseq_bams.tar | 150.78 MB
metaseq supplemental data (ChIP-seq BAMs, BG3 input) | http://files.figshare.com/1579097/bg3_input_chipseq_bams.tar | 252.48 MB
metaseq supplemental data (ChIP-seq BAMs, Kc input) | http://files.figshare.com/1579096/kc_input_chipseq_bams.tar | 236.90 MB
"""


def _parse_manifest(text):
    """Parse pipe-delimited manifest text into (description, url, size) tuples."""
    entries = []
    for line in text.splitlines():
        if not line.strip():
            continue
        desc, url, size = (field.strip() for field in line.split('|'))
        entries.append((desc, url, size))
    return entries


DATA = _parse_manifest(MANIFEST)
# Fetch every manifest entry.  A file whose local size already matches the
# remote Content-Length is skipped, so re-running the script only refreshes
# files that are missing or incomplete.
for desc, url, size in DATA:
    dest = os.path.join(DIR, os.path.basename(url))
    if get_local_size(dest) == get_remote_size(url):
        logger.info("%s exists and is the right size; skipping" % dest)
        continue
    msg = """
    url: {url}
    description: {desc}
    size: {size}""".format(**locals())
    logger.info(msg)
    # Reuse `dest` rather than recomputing os.path.join(...) a second time.
    download(url, dest)
BASE = [os.path.basename(i) for _, i, _ in DATA] | |
for b in BASE: | |
if b.endswith('.gz'): | |
z = "z" | |
elif b.endswith('.tar'): | |
z = "" | |
else: | |
continue | |
cmds = '(cd {DIR}; tar -x{z}f {b})'.format(**locals()) | |
logger.info('unpacking: %s; cmds: %s' % (b, cmds)) | |
os.system(cmds) | |
if not args.keep_downloaded_archives: | |
cmds = '(cd {DIR}; rm {b})'.format(**locals()) | |
logger.info('removing: %s; cmds: %s ' % (b, cmds)) | |
os.system(cmds) | |
print """ | |
Please cd to {DIR} and check the README.rst file. | |
Briefly: | |
Pre-processed data is included for convenience. | |
See {DIR}/manuscript-figures for the scripts to run, and see | |
{DIR}/manuscript-figures/README.rst for more details. | |
""".format(**locals()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment