Skip to content

Instantly share code, notes, and snippets.

@bgruening
Last active August 29, 2015 14:13
Show Gist options
  • Save bgruening/43084476b46d2e711419 to your computer and use it in GitHub Desktop.
Save bgruening/43084476b46d2e711419 to your computer and use it in GitHub Desktop.
CompressedArchive: Galaxy filetype that will not be unpacked and sniffed during upload.
diff -r eb61b02da186 config/datatypes_conf.xml.sample
--- a/config/datatypes_conf.xml.sample Sun Jan 11 21:43:13 2015 -0500
+++ b/config/datatypes_conf.xml.sample Mon Jan 12 17:37:25 2015 +0100
@@ -140,6 +140,8 @@
<datatype extension="rgb" type="galaxy.datatypes.images:Rgb" mimetype="image/rgb"/>
<datatype extension="pbm" type="galaxy.datatypes.images:Pbm" mimetype="image/pbm"/>
<datatype extension="pgm" type="galaxy.datatypes.images:Pgm" mimetype="image/pgm"/>
+ <datatype extension="searchgui_archive" type="galaxy.datatypes.binary:CompressedArchive" subclass="True" display_in_upload="True"/>
+ <datatype extension="peptideshaker_archive" type="galaxy.datatypes.binary:CompressedArchive" subclass="True" display_in_upload="True"/>
<datatype extension="eps" type="galaxy.datatypes.images:Eps" mimetype="image/eps"/>
<datatype extension="rast" type="galaxy.datatypes.images:Rast" mimetype="image/rast"/>
<datatype extension="laj" type="galaxy.datatypes.images:Laj"/>
diff -r eb61b02da186 lib/galaxy/datatypes/binary.py
--- a/lib/galaxy/datatypes/binary.py Sun Jan 11 21:43:13 2015 -0500
+++ b/lib/galaxy/datatypes/binary.py Mon Jan 12 17:37:25 2015 +0100
@@ -106,6 +106,30 @@
Binary.register_unsniffable_binary_ext("ab1")
+class CompressedArchive( Binary ):
+ """
+ Class describing a compressed binary file
+ This class can be subclassed to implement archive filetypes that will not be unpacked by upload.py.
+ """
+ file_ext = "compressed_archive"
+ compressed = True
+
+ def set_peek( self, dataset, is_multi_byte=False ):
+ if not dataset.dataset.purged:
+ dataset.peek = "Compressed binary file"
+ dataset.blurb = data.nice_size( dataset.get_size() )
+ else:
+ dataset.peek = 'file does not exist'
+ dataset.blurb = 'file purged from disk'
+
+ def display_peek( self, dataset ):
+ try:
+ return dataset.peek
+ except:
+ return "Compressed binary file (%s)" % ( data.nice_size( dataset.get_size() ) )
+
+Binary.register_unsniffable_binary_ext("compressed_archive")
+
class GenericAsn1Binary( Binary ):
"""Class for generic ASN.1 binary format"""
diff -r eb61b02da186 tools/data_source/upload.py
--- a/tools/data_source/upload.py Sun Jan 11 21:43:13 2015 -0500
+++ b/tools/data_source/upload.py Mon Jan 12 17:37:25 2015 +0100
@@ -120,171 +120,176 @@
data_type = type_info[0]
ext = type_info[1]
if not data_type:
- # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
- is_gzipped, is_valid = check_gzip( dataset.path )
- if is_gzipped and not is_valid:
- file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file )
- return
- elif is_gzipped and is_valid:
- if link_data_only == 'copy_files':
- # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
- CHUNK_SIZE = 2**20 # 1Mb
- fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
- gzipped_file = gzip.GzipFile( dataset.path, 'rb' )
- while 1:
- try:
- chunk = gzipped_file.read( CHUNK_SIZE )
- except IOError:
- os.close( fd )
- os.remove( uncompressed )
- file_err( 'Problem decompressing gzipped data', dataset, json_file )
- return
- if not chunk:
- break
- os.write( fd, chunk )
- os.close( fd )
- gzipped_file.close()
- # Replace the gzipped file with the decompressed file if it's safe to do so
- if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
- dataset.path = uncompressed
- else:
- shutil.move( uncompressed, dataset.path )
- os.chmod(dataset.path, 0644)
- dataset.name = dataset.name.rstrip( '.gz' )
- data_type = 'gzip'
- if not data_type and bz2 is not None:
- # See if we have a bz2 file, much like gzip
- is_bzipped, is_valid = check_bz2( dataset.path )
- if is_bzipped and not is_valid:
+ root_datatype = registry.get_datatype_by_extension( dataset.file_type )
+ if getattr( root_datatype, 'compressed', False ):
+ data_type = 'compressed archive'
+ ext = dataset.file_type
+ else:
+ # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
+ is_gzipped, is_valid = check_gzip( dataset.path )
+ if is_gzipped and not is_valid:
file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file )
return
- elif is_bzipped and is_valid:
+ elif is_gzipped and is_valid:
if link_data_only == 'copy_files':
- # We need to uncompress the temp_name file
+ # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
CHUNK_SIZE = 2**20 # 1Mb
- fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
- bzipped_file = bz2.BZ2File( dataset.path, 'rb' )
+ fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+ gzipped_file = gzip.GzipFile( dataset.path, 'rb' )
while 1:
try:
- chunk = bzipped_file.read( CHUNK_SIZE )
+ chunk = gzipped_file.read( CHUNK_SIZE )
except IOError:
os.close( fd )
os.remove( uncompressed )
- file_err( 'Problem decompressing bz2 compressed data', dataset, json_file )
+ file_err( 'Problem decompressing gzipped data', dataset, json_file )
return
if not chunk:
break
os.write( fd, chunk )
os.close( fd )
- bzipped_file.close()
- # Replace the bzipped file with the decompressed file if it's safe to do so
+ gzipped_file.close()
+ # Replace the gzipped file with the decompressed file if it's safe to do so
if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
dataset.path = uncompressed
else:
shutil.move( uncompressed, dataset.path )
os.chmod(dataset.path, 0644)
- dataset.name = dataset.name.rstrip( '.bz2' )
- data_type = 'bz2'
- if not data_type:
- # See if we have a zip archive
- is_zipped = check_zip( dataset.path )
- if is_zipped:
- if link_data_only == 'copy_files':
- CHUNK_SIZE = 2**20 # 1Mb
- uncompressed = None
- uncompressed_name = None
- unzipped = False
- z = zipfile.ZipFile( dataset.path )
- for name in z.namelist():
- if name.endswith('/'):
- continue
- if unzipped:
- stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
- break
- fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
- if sys.version_info[:2] >= ( 2, 6 ):
- zipped_file = z.open( name )
- while 1:
+ dataset.name = dataset.name.rstrip( '.gz' )
+ data_type = 'gzip'
+ if not data_type and bz2 is not None:
+ # See if we have a bz2 file, much like gzip
+ is_bzipped, is_valid = check_bz2( dataset.path )
+ if is_bzipped and not is_valid:
+ file_err( 'The bz2 compressed uploaded file contains inappropriate content', dataset, json_file )
+ return
+ elif is_bzipped and is_valid:
+ if link_data_only == 'copy_files':
+ # We need to uncompress the temp_name file
+ CHUNK_SIZE = 2**20 # 1Mb
+ fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+ bzipped_file = bz2.BZ2File( dataset.path, 'rb' )
+ while 1:
+ try:
+ chunk = bzipped_file.read( CHUNK_SIZE )
+ except IOError:
+ os.close( fd )
+ os.remove( uncompressed )
+ file_err( 'Problem decompressing bz2 compressed data', dataset, json_file )
+ return
+ if not chunk:
+ break
+ os.write( fd, chunk )
+ os.close( fd )
+ bzipped_file.close()
+ # Replace the bzipped file with the decompressed file if it's safe to do so
+ if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
+ dataset.path = uncompressed
+ else:
+ shutil.move( uncompressed, dataset.path )
+ os.chmod(dataset.path, 0644)
+ dataset.name = dataset.name.rstrip( '.bz2' )
+ data_type = 'bz2'
+ if not data_type:
+ # See if we have a zip archive
+ is_zipped = check_zip( dataset.path )
+ if is_zipped:
+ if link_data_only == 'copy_files':
+ CHUNK_SIZE = 2**20 # 1Mb
+ uncompressed = None
+ uncompressed_name = None
+ unzipped = False
+ z = zipfile.ZipFile( dataset.path )
+ for name in z.namelist():
+ if name.endswith('/'):
+ continue
+ if unzipped:
+ stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
+ break
+ fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+ if sys.version_info[:2] >= ( 2, 6 ):
+ zipped_file = z.open( name )
+ while 1:
+ try:
+ chunk = zipped_file.read( CHUNK_SIZE )
+ except IOError:
+ os.close( fd )
+ os.remove( uncompressed )
+ file_err( 'Problem decompressing zipped data', dataset, json_file )
+ return
+ if not chunk:
+ break
+ os.write( fd, chunk )
+ os.close( fd )
+ zipped_file.close()
+ uncompressed_name = name
+ unzipped = True
+ else:
+ # python < 2.5 doesn't have a way to read members in chunks(!)
try:
- chunk = zipped_file.read( CHUNK_SIZE )
+ outfile = open( uncompressed, 'wb' )
+ outfile.write( z.read( name ) )
+ outfile.close()
+ uncompressed_name = name
+ unzipped = True
except IOError:
os.close( fd )
os.remove( uncompressed )
file_err( 'Problem decompressing zipped data', dataset, json_file )
return
- if not chunk:
- break
- os.write( fd, chunk )
- os.close( fd )
- zipped_file.close()
- uncompressed_name = name
- unzipped = True
+ z.close()
+ # Replace the zipped file with the decompressed file if it's safe to do so
+ if uncompressed is not None:
+ if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
+ dataset.path = uncompressed
+ else:
+ shutil.move( uncompressed, dataset.path )
+ os.chmod(dataset.path, 0644)
+ dataset.name = uncompressed_name
+ data_type = 'zip'
+ if not data_type:
+ # TODO refactor this logic. check_binary isn't guaranteed to be
+ # correct since it only looks at whether the first 100 chars are
+ # printable or not. If someone specifies a known unsniffable
+ # binary datatype and check_binary fails, the file gets mangled.
+ if check_binary( dataset.path ) or Binary.is_ext_unsniffable(dataset.file_type):
+ # We have a binary dataset, but it is not Bam, Sff or Pdf
+ data_type = 'binary'
+ #binary_ok = False
+ parts = dataset.name.split( "." )
+ if len( parts ) > 1:
+ ext = parts[-1].strip().lower()
+ if not Binary.is_ext_unsniffable(ext):
+ file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file )
+ return
+ elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
+ err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext )
+ file_err( err_msg, dataset, json_file )
+ return
+ if not data_type:
+ # We must have a text file
+ if check_html( dataset.path ):
+ file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file )
+ return
+ if data_type != 'binary':
+ if link_data_only == 'copy_files':
+ if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]:
+ in_place = False
+ # Convert universal line endings to Posix line endings, but allow the user to turn it off,
+ # so that it becomes possible to upload gzip, bz2 or zip files with binary data without
+ # corrupting the content of those files.
+ if dataset.to_posix_lines:
+ tmpdir = output_adjacent_tmpdir( output_path )
+ tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
+ if dataset.space_to_tab:
+ line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
else:
- # python < 2.5 doesn't have a way to read members in chunks(!)
- try:
- outfile = open( uncompressed, 'wb' )
- outfile.write( z.read( name ) )
- outfile.close()
- uncompressed_name = name
- unzipped = True
- except IOError:
- os.close( fd )
- os.remove( uncompressed )
- file_err( 'Problem decompressing zipped data', dataset, json_file )
- return
- z.close()
- # Replace the zipped file with the decompressed file if it's safe to do so
- if uncompressed is not None:
- if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
- dataset.path = uncompressed
- else:
- shutil.move( uncompressed, dataset.path )
- os.chmod(dataset.path, 0644)
- dataset.name = uncompressed_name
- data_type = 'zip'
- if not data_type:
- # TODO refactor this logic. check_binary isn't guaranteed to be
- # correct since it only looks at whether the first 100 chars are
- # printable or not. If someone specifies a known unsniffable
- # binary datatype and check_binary fails, the file gets mangled.
- if check_binary( dataset.path ) or Binary.is_ext_unsniffable(dataset.file_type):
- # We have a binary dataset, but it is not Bam, Sff or Pdf
- data_type = 'binary'
- #binary_ok = False
- parts = dataset.name.split( "." )
- if len( parts ) > 1:
- ext = parts[-1].strip().lower()
- if not Binary.is_ext_unsniffable(ext):
- file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file )
- return
- elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
- err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext )
- file_err( err_msg, dataset, json_file )
- return
- if not data_type:
- # We must have a text file
- if check_html( dataset.path ):
- file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file )
- return
- if data_type != 'binary':
- if link_data_only == 'copy_files':
- if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]:
- in_place = False
- # Convert universal line endings to Posix line endings, but allow the user to turn it off,
- # so that is becomes possible to upload gzip, bz2 or zip files with binary data without
- # corrupting the content of those files.
- if dataset.to_posix_lines:
- tmpdir = output_adjacent_tmpdir( output_path )
- tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
- if dataset.space_to_tab:
- line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
- else:
- line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
- if dataset.file_type == 'auto':
- ext = sniff.guess_ext( dataset.path, registry.sniff_order )
- else:
- ext = dataset.file_type
- data_type = ext
+ line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
+ if dataset.file_type == 'auto':
+ ext = sniff.guess_ext( dataset.path, registry.sniff_order )
+ else:
+ ext = dataset.file_type
+ data_type = ext
# Save job info for the framework
if ext == 'auto' and dataset.ext:
ext = dataset.ext
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment