Created
March 31, 2015 01:27
-
-
Save moonwatcher/813bad4f11fd5e1f898c to your computer and use it in GitHub Desktop.
recursive fastq reference resolver for cleaning up the scratch file system
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# recursive fastq reference resolver for cleaning up the scratch file system
# Author: Lior Galanti < [email protected] >
# NYU Center for Genetics and System Biology 2015
import sys | |
import re | |
import json | |
import io | |
import os | |
# Patterns for parsing a long-format recursive directory listing read from
# stdin (the shapes look like `ls -lR` output). Raw strings are used so that
# regex escapes such as \s, \. and \+ are not interpreted as (invalid) Python
# string escapes.

# Section header naming the directory whose entries follow, e.g. "/a/b:".
folder = re.compile(r'^(?P<dirname>(?:/[^/:]+)+):$')

# Regular file with link count 1, owner gencore, name ending in .fastq.gz.
long_file = re.compile(r'^-[rwxs-]{9}(?:\+)?[\s]+[1][\s]+gencore[\s]+(?:cgsb|users)[\s]+[0-9\.MKG]+[\s]+[A-Za-z]+[\s]+[0-9]+[\s]+[0-9:]+[\s]+(?P<basename>.+\.fastq\.gz)$')

# Symbolic link with link count 1, owner gencore: "<basename> -> <reference>".
long_symlink = re.compile(r'^l[rwxs-]{9}(?:\+)?[\s]+[1][\s]+gencore[\s]+(?:cgsb|users)[\s]+[0-9]+[\s]+[A-Za-z]+[\s]+[0-9]+[\s]+[0-9:]+[\s]+(?P<basename>.+) -> (?P<reference>.+)$')

# Directory entry, owner gencore. The group alternation is non-capturing
# "(?:...)"; the earlier "(:?...)" spelling was a capturing group that also
# accepted a stray leading colon before "cgsb".
long_directory = re.compile(r'^d[rwxs-]{9}(?:\+)?[\s]+[0-9]+[\s]+gencore[\s]+(?:cgsb|users|root)[\s]+[0-9\.MKG]+[\s]+[A-Za-z]+[\s]+[0-9]+[\s]+[0-9:]+[\s]+(?P<basename>.+)$')

# True for any path that ends in ".fastq.gz".
fastq_reference = re.compile(r'\.fastq\.gz$')
# Shared in-memory index used by every pass of the script.
db = dict(
    record=[],      # directory records, in the order index_record() accepted them
    directory={},   # directory records keyed by their 'dirname'
    file={},        # fastq file / symlink entries keyed by absolute path
    result=[],      # filled by third_pass() with the entries sorted by path
)
def to_json(node):
    """Serialize *node* as human-readable JSON text.

    Keys are sorted, nesting is indented by four spaces, and non-ASCII
    characters are written as-is instead of being escaped.
    """
    options = {'sort_keys': True, 'indent': 4, 'ensure_ascii': False}
    return json.dumps(node, **options)
def index_record(record):
    """Register a directory record and its fastq entries in ``db``.

    A record is ignored unless it has a 'dirname' and at least one of the
    'directory', 'file' or 'symlink' lists. Accepted records are appended to
    ``db['record']`` and stored in ``db['directory']`` under their dirname.

    Every entry of ``record['file']`` is added to ``db['file']`` keyed by its
    absolute path. Entries of ``record['symlink']`` are added the same way,
    but only when their reference matches ``fastq_reference``. The first
    mapping of a path wins; later duplicates are reported and skipped.

    Entries are expected to already carry an 'absolute' key (set by
    process_record() or by the caller).
    """
    if 'dirname' not in record:
        return
    if not any(key in record for key in ('directory', 'file', 'symlink')):
        return

    dirname = record['dirname']
    db['record'].append(record)
    db['directory'][dirname] = record

    for entry in record.get('file', ()):
        path = entry['absolute']
        if path in db['file']:
            # Diagnostics go to stderr so they cannot corrupt the JSON that
            # the script prints on stdout; the path is now actually
            # substituted into the message.
            print('FILE ERROR: path {} mapped more than once'.format(path), file=sys.stderr)
            continue
        db['file'][path] = {
            'dirname': dirname,
            'basename': entry['basename'],
            'path': path,
            'inode type': 'file',
        }

    for symlink in record.get('symlink', ()):
        # Links that do not point at a fastq file are not indexed here.
        if not fastq_reference.search(symlink['reference']):
            continue
        path = symlink['absolute']
        if path in db['file']:
            print('SYMLINK ERROR: path {} mapped more than once'.format(path), file=sys.stderr)
            continue
        db['file'][path] = {
            'dirname': dirname,
            'basename': symlink['basename'],
            'path': path,
            'inode type': 'symlink',
            'reference': symlink['reference'],
        }
def process_record(record, source_root='/data/cgsb', target_root='/scratch/cgsb'):
    """Attach normalized absolute paths to every entry of a directory record.

    The record is modified in place; nothing is returned. Records without a
    'dirname' are left untouched.

    * Each entry in 'directory', 'file' and 'symlink' gets an 'absolute' key:
      its basename joined to the record's dirname and normalized.
    * Each symlink's 'reference' is resolved relative to the dirname and
      normalized. If the result is ``source_root`` itself or lies below it,
      that leading prefix (and only that prefix) is swapped for
      ``target_root``.

    Args:
        record: dict with 'dirname' and optional 'directory', 'file' and
            'symlink' lists of dicts carrying 'basename' (and 'reference'
            for symlinks).
        source_root: directory prefix to rewrite in symlink references.
        target_root: replacement for ``source_root``.
    """
    if 'dirname' not in record:
        return
    dirname = record['dirname']

    def absolute(name):
        # os.path.join discards dirname when name is already absolute.
        return os.path.abspath(os.path.join(dirname, name))

    for kind in ('directory', 'file', 'symlink'):
        for entry in record.get(kind, ()):
            entry['absolute'] = absolute(entry['basename'])

    for link in record.get('symlink', ()):
        target = absolute(link['reference'])
        # Compare whole path components: a character-wise prefix test would
        # also match siblings such as "/data/cgsbX", and str.replace would
        # rewrite later occurrences of the prefix inside the path.
        if target == source_root or target.startswith(source_root + '/'):
            target = target_root + target[len(source_root):]
        link['reference'] = target
def first_pass(stream=None):
    """Parse a recursive long-format directory listing into indexed records.

    The input is read line by line. A line matching ``folder`` opens a new
    record for that directory; the following lines are classified with
    ``long_file``, ``long_directory`` and ``long_symlink`` (in that order)
    and collected under the record's 'file', 'directory' and 'symlink'
    lists. Lines matching none of the patterns are ignored, as are the '.'
    and '..' directory entries. A blank line closes the record, which is
    then passed to process_record() and index_record().

    The record that is still open when the input ends is committed as well,
    so a listing that does not finish with a blank line keeps its last
    directory.

    Args:
        stream: iterable of text lines; defaults to ``sys.stdin``.
    """
    if stream is None:
        stream = sys.stdin

    def commit(finished):
        process_record(finished)
        index_record(finished)

    record = None  # the directory section currently being collected, if any
    for line in stream:
        line = line.strip()

        if record is None:
            # Outside a section: wait for the next "<dirname>:" header.
            match = folder.search(line)
            if match:
                record = {'dirname': match.group('dirname')}
            continue

        if line == '':
            commit(record)
            record = None
            continue

        match = long_file.search(line)
        if match:
            record.setdefault('file', []).append({'basename': match.group('basename')})
            continue

        match = long_directory.search(line)
        if match:
            basename = match.group('basename')
            if basename not in ('.', '..'):
                record.setdefault('directory', []).append({'basename': basename})
            continue

        match = long_symlink.search(line)
        if match:
            record.setdefault('symlink', []).append({
                'basename': match.group('basename'),
                'reference': match.group('reference'),
            })

    # End of input without a closing blank line: do not lose the last section.
    if record is not None:
        commit(record)
def second_pass():
    """Expand symlinks that point at indexed directories.

    For every indexed symlink whose reference does not look like a fastq
    file, the referenced directory (if it is known to ``db['directory']``)
    is mirrored under the link's own path: sub-directories are mirrored
    recursively and each file of the referenced directory becomes a symlink
    entry pointing at the real file. The mirrored records are registered
    through index_record().
    """
    def fake(real, link):
        # Nothing to mirror when the target directory was never listed.
        if real not in db['directory']:
            return

        target = db['directory'][real]
        mirrored = {'dirname': link}

        if 'directory' in target:
            mirrored['directory'] = []
            for child in target['directory']:
                name = child['basename']
                entry = {
                    'basename': name,
                    'absolute': os.path.join(link, name)
                }
                mirrored['directory'].append(entry)
                # Children are mirrored (and indexed) before their parent.
                fake(os.path.join(real, name), entry['absolute'])

        if 'file' in target:
            mirrored['symlink'] = [
                {
                    'basename': item['basename'],
                    'absolute': os.path.join(link, item['basename']),
                    'reference': item['absolute']
                }
                for item in target['file']
            ]

        index_record(mirrored)

    # index_record() appends to db['record'] while this loop is running, so
    # records created by fake() are visited by the loop as well.
    for record in db['record']:
        if 'symlink' not in record:
            continue
        for symlink in record['symlink']:
            if fastq_reference.search(symlink['reference']):
                continue
            fake(symlink['reference'], symlink['absolute'])
def third_pass():
    """Store all indexed file entries in ``db['result']``, ordered by path."""
    entries = list(db['file'].values())
    entries.sort(key=lambda entry: entry['path'])
    db['result'] = entries
# Run the three passes in order, then print the sorted result as JSON.
for run_pass in (first_pass, second_pass, third_pass):
    run_pass()
print(to_json(db['result']))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment