CompressedArchive: Galaxy filetype that will not be unpacked and sniffed during upload.
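The patch below (against galaxy-dist changeset eb61b02da186) touches three files: it registers two example archive extensions in config/datatypes_conf.xml.sample, adds a CompressedArchive base datatype to lib/galaxy/datatypes/binary.py, and teaches tools/data_source/upload.py to leave any datatype that declares compressed = True alone instead of decompressing and sniffing it during upload.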
diff -r eb61b02da186 config/datatypes_conf.xml.sample
--- a/config/datatypes_conf.xml.sample Sun Jan 11 21:43:13 2015 -0500
+++ b/config/datatypes_conf.xml.sample Mon Jan 12 17:37:25 2015 +0100
@@ -140,6 +140,8 @@
         <datatype extension="rgb" type="galaxy.datatypes.images:Rgb" mimetype="image/rgb"/>
         <datatype extension="pbm" type="galaxy.datatypes.images:Pbm" mimetype="image/pbm"/>
         <datatype extension="pgm" type="galaxy.datatypes.images:Pgm" mimetype="image/pgm"/>
+        <datatype extension="searchgui_archive" type="galaxy.datatypes.binary:CompressedArchive" subclass="True" display_in_upload="True"/>
+        <datatype extension="peptideshaker_archive" type="galaxy.datatypes.binary:CompressedArchive" subclass="True" display_in_upload="True"/>
         <datatype extension="eps" type="galaxy.datatypes.images:Eps" mimetype="image/eps"/>
         <datatype extension="rast" type="galaxy.datatypes.images:Rast" mimetype="image/rast"/>
         <datatype extension="laj" type="galaxy.datatypes.images:Laj"/>
diff -r eb61b02da186 lib/galaxy/datatypes/binary.py
--- a/lib/galaxy/datatypes/binary.py Sun Jan 11 21:43:13 2015 -0500
+++ b/lib/galaxy/datatypes/binary.py Mon Jan 12 17:37:25 2015 +0100
@@ -106,6 +106,30 @@
 Binary.register_unsniffable_binary_ext("ab1")
 
+class CompressedArchive( Binary ):
+    """
+    Class describing a compressed binary file.
+    This class can be subclassed to implement archive filetypes that will not be unpacked by upload.py.
+ """ | |
+ file_ext = "compressed_archive" | |
+ compressed = True | |
+ | |
+ def set_peek( self, dataset, is_multi_byte=False ): | |
+ if not dataset.dataset.purged: | |
+ dataset.peek = "Compressed binary file" | |
+ dataset.blurb = data.nice_size( dataset.get_size() ) | |
+ else: | |
+ dataset.peek = 'file does not exist' | |
+ dataset.blurb = 'file purged from disk' | |
+ | |
+ def display_peek( self, dataset ): | |
+ try: | |
+ return dataset.peek | |
+ except: | |
+ return "Compressed binary file (%s)" % ( data.nice_size( dataset.get_size() ) ) | |
+ | |
+Binary.register_unsniffable_binary_ext("compressed_archive") | |
+ | |
class GenericAsn1Binary( Binary ): | |
"""Class for generic ASN.1 binary format""" | |
diff -r eb61b02da186 tools/data_source/upload.py
--- a/tools/data_source/upload.py Sun Jan 11 21:43:13 2015 -0500
+++ b/tools/data_source/upload.py Mon Jan 12 17:37:25 2015 +0100
@@ -120,171 +120,176 @@
             data_type = type_info[0]
             ext = type_info[1]
     if not data_type:
-        # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
-        is_gzipped, is_valid = check_gzip( dataset.path )
-        if is_gzipped and not is_valid:
-            file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file )
-            return
-        elif is_gzipped and is_valid:
-            if link_data_only == 'copy_files':
-                # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
-                CHUNK_SIZE = 2**20 # 1Mb
-                fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
-                gzipped_file = gzip.GzipFile( dataset.path, 'rb' )
-                while 1:
-                    try:
-                        chunk = gzipped_file.read( CHUNK_SIZE )
-                    except IOError:
-                        os.close( fd )
-                        os.remove( uncompressed )
-                        file_err( 'Problem decompressing gzipped data', dataset, json_file )
-                        return
-                    if not chunk:
-                        break
-                    os.write( fd, chunk )
-                os.close( fd )
-                gzipped_file.close()
-                # Replace the gzipped file with the decompressed file if it's safe to do so
-                if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
-                    dataset.path = uncompressed
-                else:
-                    shutil.move( uncompressed, dataset.path )
-                    os.chmod(dataset.path, 0644)
-            dataset.name = dataset.name.rstrip( '.gz' )
-            data_type = 'gzip'
-        if not data_type and bz2 is not None:
-            # See if we have a bz2 file, much like gzip
-            is_bzipped, is_valid = check_bz2( dataset.path )
-            if is_bzipped and not is_valid:
+        root_datatype = registry.get_datatype_by_extension( dataset.file_type )
+        if getattr( root_datatype, 'compressed', False ):
+            data_type = 'compressed archive'
+            ext = dataset.file_type
+        else:
+            # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
+            is_gzipped, is_valid = check_gzip( dataset.path )
+            if is_gzipped and not is_valid:
                 file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file )
                 return
-            elif is_bzipped and is_valid:
+            elif is_gzipped and is_valid:
                 if link_data_only == 'copy_files':
-                    # We need to uncompress the temp_name file
+                    # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
                     CHUNK_SIZE = 2**20 # 1Mb
-                    fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
-                    bzipped_file = bz2.BZ2File( dataset.path, 'rb' )
+                    fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+                    gzipped_file = gzip.GzipFile( dataset.path, 'rb' )
                     while 1:
                         try:
-                            chunk = bzipped_file.read( CHUNK_SIZE )
+                            chunk = gzipped_file.read( CHUNK_SIZE )
                         except IOError:
                             os.close( fd )
                             os.remove( uncompressed )
-                            file_err( 'Problem decompressing bz2 compressed data', dataset, json_file )
+                            file_err( 'Problem decompressing gzipped data', dataset, json_file )
                             return
                         if not chunk:
                             break
                         os.write( fd, chunk )
                     os.close( fd )
-                    bzipped_file.close()
-                    # Replace the bzipped file with the decompressed file if it's safe to do so
+                    gzipped_file.close()
+                    # Replace the gzipped file with the decompressed file if it's safe to do so
                     if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
                         dataset.path = uncompressed
                     else:
                         shutil.move( uncompressed, dataset.path )
                         os.chmod(dataset.path, 0644)
-                dataset.name = dataset.name.rstrip( '.bz2' )
-                data_type = 'bz2'
-        if not data_type:
-            # See if we have a zip archive
-            is_zipped = check_zip( dataset.path )
-            if is_zipped:
-                if link_data_only == 'copy_files':
-                    CHUNK_SIZE = 2**20 # 1Mb
-                    uncompressed = None
-                    uncompressed_name = None
-                    unzipped = False
-                    z = zipfile.ZipFile( dataset.path )
-                    for name in z.namelist():
-                        if name.endswith('/'):
-                            continue
-                        if unzipped:
-                            stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
-                            break
-                        fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
-                        if sys.version_info[:2] >= ( 2, 6 ):
-                            zipped_file = z.open( name )
-                            while 1:
+            dataset.name = dataset.name.rstrip( '.gz' )
+            data_type = 'gzip'
+            if not data_type and bz2 is not None:
+                # See if we have a bz2 file, much like gzip
+                is_bzipped, is_valid = check_bz2( dataset.path )
+                if is_bzipped and not is_valid:
+                    file_err( 'The bz2 compressed uploaded file contains inappropriate content', dataset, json_file )
+                    return
+                elif is_bzipped and is_valid:
+                    if link_data_only == 'copy_files':
+                        # We need to uncompress the temp_name file
+                        CHUNK_SIZE = 2**20 # 1Mb
+                        fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+                        bzipped_file = bz2.BZ2File( dataset.path, 'rb' )
+                        while 1:
+                            try:
+                                chunk = bzipped_file.read( CHUNK_SIZE )
+                            except IOError:
+                                os.close( fd )
+                                os.remove( uncompressed )
+                                file_err( 'Problem decompressing bz2 compressed data', dataset, json_file )
+                                return
+                            if not chunk:
+                                break
+                            os.write( fd, chunk )
+                        os.close( fd )
+                        bzipped_file.close()
+                        # Replace the bzipped file with the decompressed file if it's safe to do so
+                        if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
+                            dataset.path = uncompressed
+                        else:
+                            shutil.move( uncompressed, dataset.path )
+                            os.chmod(dataset.path, 0644)
+                    dataset.name = dataset.name.rstrip( '.bz2' )
+                    data_type = 'bz2'
+            if not data_type:
+                # See if we have a zip archive
+                is_zipped = check_zip( dataset.path )
+                if is_zipped:
+                    if link_data_only == 'copy_files':
+                        CHUNK_SIZE = 2**20 # 1Mb
+                        uncompressed = None
+                        uncompressed_name = None
+                        unzipped = False
+                        z = zipfile.ZipFile( dataset.path )
+                        for name in z.namelist():
+                            if name.endswith('/'):
+                                continue
+                            if unzipped:
+                                stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
+                                break
+                            fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+                            if sys.version_info[:2] >= ( 2, 6 ):
+                                zipped_file = z.open( name )
+                                while 1:
+                                    try:
+                                        chunk = zipped_file.read( CHUNK_SIZE )
+                                    except IOError:
+                                        os.close( fd )
+                                        os.remove( uncompressed )
+                                        file_err( 'Problem decompressing zipped data', dataset, json_file )
+                                        return
+                                    if not chunk:
+                                        break
+                                    os.write( fd, chunk )
+                                os.close( fd )
+                                zipped_file.close()
+                                uncompressed_name = name
+                                unzipped = True
+                            else:
+                                # python < 2.5 doesn't have a way to read members in chunks(!)
                                 try:
-                                    chunk = zipped_file.read( CHUNK_SIZE )
+                                    outfile = open( uncompressed, 'wb' )
+                                    outfile.write( z.read( name ) )
+                                    outfile.close()
+                                    uncompressed_name = name
+                                    unzipped = True
                                 except IOError:
                                     os.close( fd )
                                     os.remove( uncompressed )
                                     file_err( 'Problem decompressing zipped data', dataset, json_file )
                                     return
-                                if not chunk:
-                                    break
-                                os.write( fd, chunk )
-                            os.close( fd )
-                            zipped_file.close()
-                            uncompressed_name = name
-                            unzipped = True
+                        z.close()
+                        # Replace the zipped file with the decompressed file if it's safe to do so
+                        if uncompressed is not None:
+                            if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
+                                dataset.path = uncompressed
+                            else:
+                                shutil.move( uncompressed, dataset.path )
+                                os.chmod(dataset.path, 0644)
+                            dataset.name = uncompressed_name
+                    data_type = 'zip'
+            if not data_type:
+                # TODO refactor this logic. check_binary isn't guaranteed to be
+                # correct since it only looks at whether the first 100 chars are
+                # printable or not. If someone specifies a known unsniffable
+                # binary datatype and check_binary fails, the file gets mangled.
+                if check_binary( dataset.path ) or Binary.is_ext_unsniffable(dataset.file_type):
+                    # We have a binary dataset, but it is not Bam, Sff or Pdf
+                    data_type = 'binary'
+                    #binary_ok = False
+                    parts = dataset.name.split( "." )
+                    if len( parts ) > 1:
+                        ext = parts[-1].strip().lower()
+                        if not Binary.is_ext_unsniffable(ext):
+                            file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file )
+                            return
+                        elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
+                            err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext )
+                            file_err( err_msg, dataset, json_file )
+                            return
+            if not data_type:
+                # We must have a text file
+                if check_html( dataset.path ):
+                    file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file )
+                    return
+            if data_type != 'binary':
+                if link_data_only == 'copy_files':
+                    if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]:
+                        in_place = False
+                    # Convert universal line endings to Posix line endings, but allow the user to turn it off,
+                    # so that it becomes possible to upload gzip, bz2 or zip files with binary data without
+                    # corrupting the content of those files.
+                    if dataset.to_posix_lines:
+                        tmpdir = output_adjacent_tmpdir( output_path )
+                        tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
+                        if dataset.space_to_tab:
+                            line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
                         else:
-                            # python < 2.5 doesn't have a way to read members in chunks(!)
-                            try:
-                                outfile = open( uncompressed, 'wb' )
-                                outfile.write( z.read( name ) )
-                                outfile.close()
-                                uncompressed_name = name
-                                unzipped = True
-                            except IOError:
-                                os.close( fd )
-                                os.remove( uncompressed )
-                                file_err( 'Problem decompressing zipped data', dataset, json_file )
-                                return
-                    z.close()
-                    # Replace the zipped file with the decompressed file if it's safe to do so
-                    if uncompressed is not None:
-                        if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
-                            dataset.path = uncompressed
-                        else:
-                            shutil.move( uncompressed, dataset.path )
-                            os.chmod(dataset.path, 0644)
-                        dataset.name = uncompressed_name
-                data_type = 'zip'
-        if not data_type:
-            # TODO refactor this logic. check_binary isn't guaranteed to be
-            # correct since it only looks at whether the first 100 chars are
-            # printable or not. If someone specifies a known unsniffable
-            # binary datatype and check_binary fails, the file gets mangled.
-            if check_binary( dataset.path ) or Binary.is_ext_unsniffable(dataset.file_type):
-                # We have a binary dataset, but it is not Bam, Sff or Pdf
-                data_type = 'binary'
-                #binary_ok = False
-                parts = dataset.name.split( "." )
-                if len( parts ) > 1:
-                    ext = parts[-1].strip().lower()
-                    if not Binary.is_ext_unsniffable(ext):
-                        file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file )
-                        return
-                    elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
-                        err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext )
-                        file_err( err_msg, dataset, json_file )
-                        return
-        if not data_type:
-            # We must have a text file
-            if check_html( dataset.path ):
-                file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file )
-                return
-        if data_type != 'binary':
-            if link_data_only == 'copy_files':
-                if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]:
-                    in_place = False
-                # Convert universal line endings to Posix line endings, but allow the user to turn it off,
-                # so that is becomes possible to upload gzip, bz2 or zip files with binary data without
-                # corrupting the content of those files.
-                if dataset.to_posix_lines:
-                    tmpdir = output_adjacent_tmpdir( output_path )
-                    tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
-                    if dataset.space_to_tab:
-                        line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
-                    else:
-                        line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
-            if dataset.file_type == 'auto':
-                ext = sniff.guess_ext( dataset.path, registry.sniff_order )
-            else:
-                ext = dataset.file_type
-            data_type = ext
+                            line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
+                if dataset.file_type == 'auto':
+                    ext = sniff.guess_ext( dataset.path, registry.sniff_order )
+                else:
+                    ext = dataset.file_type
+                data_type = ext
     # Save job info for the framework
     if ext == 'auto' and dataset.ext:
         ext = dataset.ext
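Most of the upload.py hunk is re-indentation: the existing gzip/bz2/zip/binary/text handling simply moves into the new else branch. The behavioural change is the dispatch at the top of the block, distilled below (a simplified sketch; the names match the patch, the surrounding plumbing of upload.py's add_file() routine is omitted):

    # Runs only when sniffing has not already assigned a data_type.
    root_datatype = registry.get_datatype_by_extension( dataset.file_type )
    if getattr( root_datatype, 'compressed', False ):
        # The chosen datatype opted out: keep the archive byte-for-byte.
        data_type = 'compressed archive'
        ext = dataset.file_type
    else:
        pass  # fall through to the unchanged gzip/bz2/zip/binary/text checks

Because getattr defaults to False, every datatype without a compressed attribute behaves exactly as before. Note that the check keys off dataset.file_type, so the user has to pick searchgui_archive (or another CompressedArchive subclass) explicitly in the upload form's File Format menu; with 'auto' the usual decompression and sniffing still apply.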