tgs · April 19, 2023 16:06 · tgs · Apr 19, 2023
diff --git a/ncbi2na.py b/ncbi2na.py
 #!/usr/bin/env python
 """
 Translate compact hex sequence representation to standard bases.

 If you have NCBI ASN.1 data, the easiest way to get a FASTA version is to
 use NCBI's asn2fasta tool, which is available here:
 https://ftp.ncbi.nlm.nih.gov/asn1-converters/by_program/asn2fasta/

 NCBI2na is a compact representation of sequence data, where each base just
 takes 2 bits.  It's commonly found in text ASN.1 files as hexadecimal strings.
 In binary ASN.1, it is found as binary data - this script does not handle that.
 In a string of hexadecimal digits, each digit represents two nucleotide bases.
 The hex string encodes whole bytes (two hex digits, i.e. four bases, each),
 so only even numbers of hex digits are valid.  Because the last byte may be
 only partly used, any given hex string could represent four different
 numbers of nucleotides.  This means you need to provide the length of the
 nucleotide sequence you expect.

    >>> import ncbi2na
    >>> ncbi2na.decode_ncbi2na(20, u'0123456789')
    'AAACAGATCACCCGCTGAGC'
    >>> ncbi2na.decode_ncbi2na(17, u'0123456789')
    'AAACAGATCACCCGCTG'

 Since we have the expected length, we can sanity-check it against the input
 string:

    >>> ncbi2na.decode_ncbi2na(16, u'0123456789')
    Traceback (most recent call last):
        ...
    ValueError: Found more nuc bases than expected
    >>> ncbi2na.decode_ncbi2na(21, u'0123456789')
    Traceback (most recent call last):
        ...
    ValueError: Found fewer nuc bases than expected

 This should work from Python 2 or 3, but the hexadecimal string
 must be unicode because the `translate` method works very differently
 in byte strings.
 """
 from itertools import product

 __all__ = ['decode_ncbi2na']

 # Each base takes 2 bits, so one hex digit (4 bits) always expands to exactly
 # two bases.
 _nucs = u'ACGT'
 _hex_digits = u'0123456789ABCDEF'
 # product() yields the pairs in the order AA, AC, AG, AT, CA, ..., TT, which
 # lines up with the hex digits 0..F.
 _replacements = list(map(u''.join, product(_nucs, _nucs)))
 # Translation table for the text-string ``translate`` method, keyed by code
 # point.
 _hex2nuc = dict(zip(map(ord, _hex_digits), _replacements))
 # Hex is case-insensitive: give lowercase digits the same expansions so that
 # 'a'-'f' are decoded instead of being copied into the output unchanged.
 _hex2nuc.update(zip(map(ord, _hex_digits.lower()), _replacements))

 if bytes is str:  # python2
     _text_type = unicode
 else:
     _text_type = str


 def decode_ncbi2na(length, hex_str):
     """Decode an NCBI2na hexadecimal string into nucleotide bases.

     Args:
         length: Number of bases the sequence is expected to contain.  It
             may be up to three less than the number of bases encoded by
             ``hex_str``; the surplus is treated as padding and dropped.
         hex_str: Text (unicode) string of hexadecimal digits, in upper or
             lower case.

     Returns:
         The decoded sequence, trimmed to exactly ``length`` characters.

     Raises:
         TypeError: If ``hex_str`` is not a text string.
         ValueError: If ``length`` is negative, if ``hex_str`` contains a
             character that is not a hexadecimal digit, or if ``length``
             does not fit the number of bases encoded by ``hex_str``.
     """
     if not isinstance(hex_str, _text_type):
         raise TypeError("hex_str must be unicode/text type")
     if length < 0:
         # A negative value would otherwise slip through the range checks for
         # very short inputs and be used as a negative slice bound.
         raise ValueError("length must not be negative")
     # translate() copies unmapped characters through untouched, so bad input
     # has to be rejected up front or it would end up in the result (a literal
     # 'G' or 'T' would even look like a valid base).
     invalid = sorted(ch for ch in set(hex_str) if ord(ch) not in _hex2nuc)
     if invalid:
         raise ValueError(
             "hex_str contains non-hexadecimal characters: %r"
             % u''.join(invalid))
     seq_untrimmed = hex_str.translate(_hex2nuc)
     raw_len = len(seq_untrimmed)
     if length > raw_len:
         raise ValueError("Found fewer nuc bases than expected")
     # Tolerate at most three surplus bases; anything beyond that means the
     # input is longer than the caller expected.
     if length < raw_len - 3:
         raise ValueError("Found more nuc bases than expected")
     return seq_untrimmed[:length]
	#!/usr/bin/env python
	"""
	Translate compact hex sequence representation to standard bases.

	If you have NCBI ASN.1 data, the easiest way to get a FASTA version is to
	use NCBI's asn2fasta tool, which is available here:
	https://ftp.ncbi.nlm.nih.gov/asn1-converters/by_program/asn2fasta/

	NCBI2na is a compact representation of sequence data, where each base just
	takes 2 bits. It's commonly found in text ASN.1 files as hexadecimal strings.
	In binary ASN.1, it is found as binary data - this script does not handle that.
	In a string of hexadecimal digits, each digit represents two nucleotide bases.
	The hex string encodes whole bytes (two hex digits, i.e. four bases, each),
	so only even numbers of hex digits are valid. Because the last byte may be
	only partly used, any given hex string could represent four different
	numbers of nucleotides. This means you need to provide the length of the
	nucleotide sequence you expect.

	>>> import ncbi2na
	>>> ncbi2na.decode_ncbi2na(20, u'0123456789')
	'AAACAGATCACCCGCTGAGC'
	>>> ncbi2na.decode_ncbi2na(17, u'0123456789')
	'AAACAGATCACCCGCTG'

	Since we have the expected length, we can sanity-check it against the input
	string:

	>>> ncbi2na.decode_ncbi2na(16, u'0123456789')
	Traceback (most recent call last):
	...
	ValueError: Found more nuc bases than expected
	>>> ncbi2na.decode_ncbi2na(21, u'0123456789')
	Traceback (most recent call last):
	...
	ValueError: Found fewer nuc bases than expected

	This should work from Python 2 or 3, but the hexadecimal string
	must be unicode because the `translate` method works very differently
	in byte strings.
	"""
	from itertools import product

	__all__ = ['decode_ncbi2na']

	# Each base takes 2 bits, so one hex digit (4 bits) always expands to exactly
	# two bases.
	_nucs = u'ACGT'
	_hex_digits = u'0123456789ABCDEF'
	# product() yields the pairs in the order AA, AC, AG, AT, CA, ..., TT, which
	# lines up with the hex digits 0..F.
	_replacements = list(map(u''.join, product(_nucs, _nucs)))
	# Translation table for the text-string ``translate`` method, keyed by code
	# point.
	_hex2nuc = dict(zip(map(ord, _hex_digits), _replacements))
	# Hex is case-insensitive: give lowercase digits the same expansions so that
	# 'a'-'f' are decoded instead of being copied into the output unchanged.
	_hex2nuc.update(zip(map(ord, _hex_digits.lower()), _replacements))

	if bytes is str:  # python2
	    _text_type = unicode
	else:
	    _text_type = str


	def decode_ncbi2na(length, hex_str):
	    """Decode an NCBI2na hexadecimal string into nucleotide bases.

	    Args:
	        length: Number of bases the sequence is expected to contain.  It
	            may be up to three less than the number of bases encoded by
	            ``hex_str``; the surplus is treated as padding and dropped.
	        hex_str: Text (unicode) string of hexadecimal digits, in upper or
	            lower case.

	    Returns:
	        The decoded sequence, trimmed to exactly ``length`` characters.

	    Raises:
	        TypeError: If ``hex_str`` is not a text string.
	        ValueError: If ``length`` is negative, if ``hex_str`` contains a
	            character that is not a hexadecimal digit, or if ``length``
	            does not fit the number of bases encoded by ``hex_str``.
	    """
	    if not isinstance(hex_str, _text_type):
	        raise TypeError("hex_str must be unicode/text type")
	    if length < 0:
	        # A negative value would otherwise slip through the range checks for
	        # very short inputs and be used as a negative slice bound.
	        raise ValueError("length must not be negative")
	    # translate() copies unmapped characters through untouched, so bad input
	    # has to be rejected up front or it would end up in the result (a literal
	    # 'G' or 'T' would even look like a valid base).
	    invalid = sorted(ch for ch in set(hex_str) if ord(ch) not in _hex2nuc)
	    if invalid:
	        raise ValueError(
	            "hex_str contains non-hexadecimal characters: %r"
	            % u''.join(invalid))
	    seq_untrimmed = hex_str.translate(_hex2nuc)
	    raw_len = len(seq_untrimmed)
	    if length > raw_len:
	        raise ValueError("Found fewer nuc bases than expected")
	    # Tolerate at most three surplus bases; anything beyond that means the
	    # input is longer than the caller expected.
	    if length < raw_len - 3:
	        raise ValueError("Found more nuc bases than expected")
	    return seq_untrimmed[:length]
No results found