Skip to content

Instantly share code, notes, and snippets.

@moonwatcher
Created March 31, 2015 01:27
Show Gist options
  • Save moonwatcher/813bad4f11fd5e1f898c to your computer and use it in GitHub Desktop.
Save moonwatcher/813bad4f11fd5e1f898c to your computer and use it in GitHub Desktop.
recursive fastq reference resolver for cleaning up the scratch file system
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# recursive fastq reference resolver for cleaning up the scratch file system
# Author: Lior Galanti < [email protected] >
# NYU Center for Genomics and Systems Biology 2015
import sys
import re
import json
import io
import os
# Patterns for the lines of a recursive long-format directory listing read
# from stdin (presumably the output of `ls -lR`; confirm against the command
# that actually feeds this script). All patterns are raw strings so that the
# regex escapes (\s, \., \+) are not interpreted as string escapes.

# Section header: an absolute directory path followed by a colon.
folder = re.compile(r'^(?P<dirname>(?:/[^/:]+)+):$')

# Regular file entry ('-' type flag) with a link count of exactly 1, the
# literal owner `gencore`, group `cgsb` or `users`, and a name ending in
# `.fastq.gz`.
long_file = re.compile(r'^-[rwxs-]{9}(?:\+)?[\s]+[1][\s]+gencore[\s]+(?:cgsb|users)[\s]+[0-9\.MKG]+[\s]+[A-Za-z]+[\s]+[0-9]+[\s]+[0-9:]+[\s]+(?P<basename>.+\.fastq\.gz)$')

# Symbolic link entry ('l' type flag): captures the link name and the target
# printed after ` -> `.
long_symlink = re.compile(r'^l[rwxs-]{9}(?:\+)?[\s]+[1][\s]+gencore[\s]+(?:cgsb|users)[\s]+[0-9]+[\s]+[A-Za-z]+[\s]+[0-9]+[\s]+[0-9:]+[\s]+(?P<basename>.+) -> (?P<reference>.+)$')

# Directory entry ('d' type flag); any link count, group may also be `root`.
# The group alternation is non-capturing, so `basename` is the only group.
long_directory = re.compile(r'^d[rwxs-]{9}(?:\+)?[\s]+[0-9]+[\s]+gencore[\s]+(?:cgsb|users|root)[\s]+[0-9\.MKG]+[\s]+[A-Za-z]+[\s]+[0-9]+[\s]+[0-9:]+[\s]+(?P<basename>.+)$')

# A path that ends in `.fastq.gz`.
fastq_reference = re.compile(r'\.fastq\.gz$')
# Shared in-memory index used by all passes.
db = dict(
    record=[],      # every indexed directory record, in indexing order
    directory={},   # directory path -> its record
    file={},        # absolute path -> file or symlink entry
    result=[],      # final list of entries, filled in by third_pass
)
def to_json(node):
    """Serialize *node* to a JSON string.

    Keys are sorted, nesting is indented by four spaces, and non-ASCII
    characters are written as-is rather than escaped.
    """
    options = {
        'sort_keys': True,
        'indent': 4,
        'ensure_ascii': False,
    }
    return json.dumps(node, **options)
def index_record(record):
    """Add a processed directory record to the global ``db`` index.

    A record is indexed only when it has a ``dirname`` and at least one of
    the ``directory``, ``file`` or ``symlink`` lists. Indexing appends the
    record to ``db['record']``, maps its ``dirname`` to it in
    ``db['directory']``, and registers in ``db['file']``:

    * every entry of ``record['file']``, as inode type ``file``;
    * every entry of ``record['symlink']`` whose ``reference`` ends in
      ``.fastq.gz``, as inode type ``symlink``.

    Entries are expected to carry an ``absolute`` key already. A path that
    is already present in ``db['file']`` is left untouched and reported on
    stderr, so that diagnostics never mix with the JSON written to stdout.
    """
    if 'dirname' not in record:
        return
    if not ('directory' in record or 'file' in record or 'symlink' in record):
        return

    dirname = record['dirname']
    db['record'].append(record)
    db['directory'][dirname] = record

    for entry in record.get('file', ()):
        path = entry['absolute']
        if path in db['file']:
            print(
                'FILE ERROR: path {} mapped more than once'.format(path),
                file=sys.stderr,
            )
            continue
        db['file'][path] = {
            'dirname': dirname,
            'basename': entry['basename'],
            'path': path,
            'inode type': 'file',
        }

    for link in record.get('symlink', ()):
        # Only links that point straight at a fastq file are indexed here.
        if not fastq_reference.search(link['reference']):
            continue
        path = link['absolute']
        if path in db['file']:
            print(
                'SYMLINK ERROR: path {} mapped more than once'.format(path),
                file=sys.stderr,
            )
            continue
        db['file'][path] = {
            'dirname': dirname,
            'basename': link['basename'],
            'path': path,
            'inode type': 'symlink',
            'reference': link['reference'],
        }
def process_record(record):
    """Resolve every entry of a directory record to absolute paths, in place.

    For each entry in the optional ``directory``, ``file`` and ``symlink``
    lists an ``absolute`` key is added: the entry's ``basename`` joined to
    ``record['dirname']`` and normalized. A symlink's ``reference`` is
    resolved the same way (relative targets are taken relative to the
    record's directory), and a resolved reference that is ``/data/cgsb`` or
    lies beneath it is rewritten to the same location under
    ``/scratch/cgsb``.

    A record without a ``dirname`` key is left unchanged. Returns ``None``.
    """
    if 'dirname' not in record:
        return

    dirname = record['dirname']
    data_root = '/data/cgsb'
    scratch_root = '/scratch/cgsb'

    def resolve(name):
        # Join onto the record's directory and collapse '.' / '..' parts.
        return os.path.abspath(os.path.join(dirname, name))

    for entry in record.get('directory', ()):
        entry['absolute'] = resolve(entry['basename'])

    for entry in record.get('file', ()):
        entry['absolute'] = resolve(entry['basename'])

    for link in record.get('symlink', ()):
        link['absolute'] = resolve(link['basename'])
        reference = resolve(link['reference'])
        # Match on whole path components: a sibling such as /data/cgsbfoo
        # must not be remapped, and only the leading prefix is rewritten,
        # never a later occurrence of the same text inside the path.
        if reference == data_root or reference.startswith(data_root + '/'):
            reference = scratch_root + reference[len(data_root):]
        link['reference'] = reference
def first_pass():
    """Parse the directory listing on stdin into records and index them.

    The input is read as a sequence of sections: a header line matching
    ``folder`` opens a record for that directory, the lines that follow are
    classified as fastq file, directory or symlink entries, and a blank line
    closes the record, which is then passed to ``process_record`` and
    ``index_record``. Lines outside a section, and entry lines matching none
    of the patterns, are ignored. The ``.`` and ``..`` directory entries are
    skipped.

    A section still open when the input ends is closed as well, so the last
    directory of a listing that lacks a trailing blank line is not lost.
    """
    record = None  # the section being collected; None while between sections

    for line in sys.stdin:
        line = line.strip()

        if record is None:
            match = folder.search(line)
            if match:
                record = {'dirname': match.group('dirname')}
            continue

        if line == '':
            process_record(record)
            index_record(record)
            record = None
            continue

        match = long_file.search(line)
        if match:
            record.setdefault('file', []).append(
                {'basename': match.group('basename')}
            )
            continue

        match = long_directory.search(line)
        if match:
            name = match.group('basename')
            if name not in ('.', '..'):
                record.setdefault('directory', []).append({'basename': name})
            continue

        match = long_symlink.search(line)
        if match:
            record.setdefault('symlink', []).append({
                'basename': match.group('basename'),
                'reference': match.group('reference'),
            })

    # End of input closes the last section even without a blank line.
    if record is not None:
        process_record(record)
        index_record(record)
def second_pass():
    """Expand symlinks that do not point at a fastq file.

    For every indexed symlink whose reference does not end in ``.fastq.gz``,
    the reference is looked up in ``db['directory']``. If it is an indexed
    directory, a synthetic record is built at the link's path that mirrors
    the directory's sub-directories (recursively) and exposes each of its
    files as a symlink to the real file. Synthetic records go through
    ``index_record`` like parsed ones.
    """
    def fake(real, link):
        # Mirror the indexed directory `real` under the path `link`.
        try:
            target = db['directory'][real]
        except KeyError:
            return

        mirrored = {'dirname': link}

        if 'directory' in target:
            children = [
                {
                    'basename': child['basename'],
                    'absolute': os.path.join(link, child['basename']),
                }
                for child in target['directory']
            ]
            mirrored['directory'] = children
            # Sub-directories are mirrored (and indexed) before this one.
            for child in children:
                fake(os.path.join(real, child['basename']), child['absolute'])

        if 'file' in target:
            mirrored['symlink'] = [
                {
                    'basename': entry['basename'],
                    'absolute': os.path.join(link, entry['basename']),
                    'reference': entry['absolute'],
                }
                for entry in target['file']
            ]

        index_record(mirrored)

    # index_record appends to db['record'], so records created by fake()
    # are visited by this same loop once it reaches them.
    for record in db['record']:
        for symlink in record.get('symlink', ()):
            if fastq_reference.search(symlink['reference']):
                continue
            fake(symlink['reference'], symlink['absolute'])
def third_pass():
    """Store every indexed file entry in ``db['result']``, ordered by path."""
    entries = list(db['file'].values())
    entries.sort(key=lambda entry: entry['path'])
    db['result'] = entries
# Run the three passes in order, then write the sorted result to stdout as JSON.
for stage in (first_pass, second_pass, third_pass):
    stage()
print(to_json(db['result']))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment