notbanker · August 29, 2015 14:05
diff --git a/shortlei.py b/shortlei.py
 import math
 import func

 # These coincide with the choices hardwired into the spreadsheet provided to ISDA, so don't change them
 ISDA_PERMITTED_CHARS = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'  # Don't change!
 ISDA_PRIME = 59575553


 def _avoidReserved( s ):
    """ Avoid clash with reserved namespace used by CFTC """
    DIGITS = '0123456789'
    if (s[0] in DIGITS) and (s[1] in DIGITS) and (s[2] in DIGITS):
        # If s starts with three digits we make one of them alpha-numeric
        if s[3] in '0123456789ABC':
            pos = 0
        elif s[3] in 'DEFGHIJKLMNOP':
            pos = 1
        else:
            pos = 2
        # What to substitute?
        j   = ISDA_PERMITTED_CHARS.index( s[pos] ) + 12
        sub = ISDA_PERMITTED_CHARS[ j ]
        s   = s[:pos] + sub + s[pos+1:]
    if (s[0]=='E') and (s[1] in DIGITS) and (s[2] in DIGITS):
        s   = 'X' + s[1:]  # Avoid ESME's reserved space
    return s

 def leiHash( lei, permittedChars = ISDA_PERMITTED_CHARS ):
    """ Shorten 20 character alpha-numeric Legal Entity Identifier into 10 character alpha-numeric "short LEI" that
        can be used as a Unique Trade Identifier (UTI) prefix. The algorithm is simply the modulo operation lifted from
        integers to the space of alpha-numeric case-insensitive strings.

         Argument
         --------
         s          str     (length 20 typically)

         Returns
         -------
         str                (length 10)

    """
    # This will break a combination of issuer and firm identifiers into two eight char strings, compress each down to five, and join
    # LEI = LEI Issuer (4 char alphanumeric)" ++ "2 reserved characters always same" 00 ++ firm identifier (12 char alphanumeric) + 2 check digits.
    if len(lei)==20:
        issuer = lei[:4]
        firm   = lei[6:18]
        leiWithoutRedundantChars = issuer + firm
    else:
        leiWithoutRedundantChars = lei

    h = len( leiWithoutRedundantChars )/2
    firstHalf  = leiWithoutRedundantChars[:h]
    secondHalf = leiWithoutRedundantChars[h:]

    firstHalfHash  = _primeHash( s = firstHalf,  newLen = 5, prime = ISDA_PRIME, permittedChars = permittedChars )
    secondHalfHash = _primeHash( s = secondHalf, newLen = 5, prime = ISDA_PRIME, permittedChars = permittedChars )

    firstHalfHash  = _avoidReserved( firstHalfHash )

    return firstHalfHash + secondHalfHash

 def leiHashCollisionProbability( nLei, nPermittedChars = 36, firstPrime = ISDA_PRIME, secondPrime = ISDA_PRIME ):
    """ Probability of at least one collision if (>=20 character) LEI's are drawn from uniform distribution with replacement nLei times """
    nPossibilities = firstPrime*secondPrime
    return 1- math.exp( -nLei*(nLei-1.)/(2.*nPossibilities) )

 @func.memoize_plus
 def _powersModP( nPermittedChars, p ):
    """ Cached powers of nPermittedChars mod p """
    return [ pow( nPermittedChars, k ) % p for k in xrange( 25 ) ]

 def _intFromAlphaModP( s, permittedChars = ISDA_PERMITTED_CHARS, p = ISDA_PRIME ):
    """ Convert alpha-numeric to integer modulo p """
    # We merge the operations of converting to an int and taking mod p not because python can't handle large numbers, which
    # it does perfectly well, but to make translation to Excel/VBA straightforward. {n^k mod p} are cached for the same reason,
    # and are hardwired in the VBA version provided to ISDA.
    i = 0
    nPermittedChars = len( permittedChars )
    for k, char in enumerate( reversed(s) ):
        digit     = permittedChars.find( char )
        i        += _powersModP( nPermittedChars, p )[k]*digit
        i         = i % p
    return i

 def _alphaFromInt( i, permittedChars = ISDA_PERMITTED_CHARS ):
    """ Convert back again

        Arguments
        ---------
        reserved     int     The CFTC has reserved the first three characters for their prefixes. To avoid clashes we don't allow digits there.

    """
    # (This is an injection from 0..PRIME into the subset of alpha-numeric strings THAT don't begin with three digits)
    a = ''
    nPermittedChars = len( permittedChars )
    while i:
        a = permittedChars[i % nPermittedChars] + a
        i = i // nPermittedChars
    if not a:
        a = permittedChars[0]
    return a

 def _primeHash( s, permittedChars = ISDA_PERMITTED_CHARS, prime = ISDA_PRIME, newLen = 5 ):
    """ Hash case-insensitive string to one of shorter length by applying modulus operation """
    # (Doesn't really have to be a prime)
    nPermittedChars = len( permittedChars )
    if pow( nPermittedChars, newLen ) < prime:
        raise ValueError("You may wish to choose a larger prime so the map from integers mod p to alpha-numeric strings is an injection")

    iModP = _intFromAlphaModP( s, permittedChars = permittedChars, p = prime )
    alf   = _alphaFromInt( iModP, permittedChars )
    return alf.ljust( 5, 'Z' )



 GEEK_EXPLANATION = r""" The modulo operation lifted from integers to alpha-numeric strings """

 LAY_EXPLANATION = r"""
 The idea behind the scheme for shortening a 20 character Legal Entity identifier to an 10-character
 "Short Legal Entity Identifier" (SLEI) is essentially modulo arithmetic.

                 If LEI's were just digits ...
                 -----------------------------

 First imagine that LEI's were 20 digit numbers instead of 20 digit alpha-numeric strings. A poor
 scheme would take the remainder modulo 10,000,000,000 (that is, keep subtracting 10,000,000,000 from
 out 20 digit number until it is less than 10,000,000,000 and hence a 10 digit number at most). This
 would not be wise, as it is equivalent to only using the last ten digits of our 20 digit LEI. If two
 LEI's differed only in the first ten digits they will evidently have the same SLEI.

                 ... We'd do the following:
                 -------------------------

 To fix this, we would instead choose a large prime number P close to but less than 10,000,000,000 and divide by
 that instead. Thus if LEI's were composes only of digits we could say:

             SLEI = LEI modulus P

 Motivation:

    i.  If we change any one digit of the LEI the SLEI will change.
    ii. If we change (only) two digits of the LEI the SLEI will change.
    ...
    iii. If we change any *nine* digits of the LEI the SLEI will change.

 Thus for two LEI's to have the same LEI they must differ in at least ten of the twenty digits. Simple
 schemes for incrementing LEI's (and so forth) are unlikely to lead to collisions.

                 Since LEIs are alpha-numeric we make minor changes
                 --------------------------------------------------

 The algorithm here is essentially just "SLEI = LEI modulus P" with a few superficial changes.

    a) Because we are using alpha-numeric strings, not just digits, we convert from the former to the latter
    b) We take advantage of the following redundancies in LEIs:
           - Characters 5,6 are always the same
           - Characters 19,20 are check-sum digits
       We throw these characters away prior to encoding
    c) To avoid arithmetic involving large integers (which is troublesome in Excel, which some users might like to use)
       we split the non-redudant sixteen characters into two groups of eight, compress each to five, and then combine.
    d) The CFTC has reserved some UTI prefixes, namedly those whose first three characters are digits (running 000-999). We avoid these.


               Collision probabilities
               -----------------------

 Probability of a clash for one million LEI's is 0.000140864907758 or 1 in 7099.0
 Probability of a clash for ten million LEI's is 0.0139887312515 or 1 in 71.0

 LEI's are not randomly chosen however. Indeed issuer and firm identifiers are presumably unique. So this calculation
 overstates the actual collision probability.

 """
	import math
	import func

	# These coincide with the choices hardwired into the spreadsheet provided to ISDA, so don't change them
	ISDA_PERMITTED_CHARS = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ' # Don't change!
	ISDA_PRIME = 59575553


	def _avoidReserved( s ):
	""" Avoid clash with reserved namespace used by CFTC """
	DIGITS = '0123456789'
	if (s[0] in DIGITS) and (s[1] in DIGITS) and (s[2] in DIGITS):
	# If s starts with three digits we make one of them alpha-numeric
	if s[3] in '0123456789ABC':
	pos = 0
	elif s[3] in 'DEFGHIJKLMNOP':
	pos = 1
	else:
	pos = 2
	# What to substitute?
	j = ISDA_PERMITTED_CHARS.index( s[pos] ) + 12
	sub = ISDA_PERMITTED_CHARS[ j ]
	s = s[:pos] + sub + s[pos+1:]
	if (s[0]=='E') and (s[1] in DIGITS) and (s[2] in DIGITS):
	s = 'X' + s[1:] # Avoid ESME's reserved space
	return s

	def leiHash( lei, permittedChars = ISDA_PERMITTED_CHARS ):
	""" Shorten 20 character alpha-numeric Legal Entity Identifier into 10 character alpha-numeric "short LEI" that
	can be used as a Unique Trade Identifier (UTI) prefix. The algorithm is simply the modulo operation lifted from
	integers to the space of alpha-numeric case-insensitive strings.

	Argument
	--------
	s str (length 20 typically)

	Returns
	-------
	str (length 10)

	"""
	# This will break a combination of issuer and firm identifiers into two eight char strings, compress each down to five, and join
	# LEI = LEI Issuer (4 char alphanumeric)" ++ "2 reserved characters always same" 00 ++ firm identifier (12 char alphanumeric) + 2 check digits.
	if len(lei)==20:
	issuer = lei[:4]
	firm = lei[6:18]
	leiWithoutRedundantChars = issuer + firm
	else:
	leiWithoutRedundantChars = lei

	h = len( leiWithoutRedundantChars )/2
	firstHalf = leiWithoutRedundantChars[:h]
	secondHalf = leiWithoutRedundantChars[h:]

	firstHalfHash = _primeHash( s = firstHalf, newLen = 5, prime = ISDA_PRIME, permittedChars = permittedChars )
	secondHalfHash = _primeHash( s = secondHalf, newLen = 5, prime = ISDA_PRIME, permittedChars = permittedChars )

	firstHalfHash = _avoidReserved( firstHalfHash )

	return firstHalfHash + secondHalfHash

	def leiHashCollisionProbability( nLei, nPermittedChars = 36, firstPrime = ISDA_PRIME, secondPrime = ISDA_PRIME ):
	""" Probability of at least one collision if (>=20 character) LEI's are drawn from uniform distribution with replacement nLei times """
	nPossibilities = firstPrime*secondPrime
	return 1- math.exp( -nLei(nLei-1.)/(2.nPossibilities) )

	@func.memoize_plus
	def _powersModP( nPermittedChars, p ):
	""" Cached powers of nPermittedChars mod p """
	return [ pow( nPermittedChars, k ) % p for k in xrange( 25 ) ]

	def _intFromAlphaModP( s, permittedChars = ISDA_PERMITTED_CHARS, p = ISDA_PRIME ):
	""" Convert alpha-numeric to integer modulo p """
	# We merge the operations of converting to an int and taking mod p not because python can't handle large numbers, which
	# it does perfectly well, but to make translation to Excel/VBA straightforward. {n^k mod p} are cached for the same reason,
	# and are hardwired in the VBA version provided to ISDA.
	i = 0
	nPermittedChars = len( permittedChars )
	for k, char in enumerate( reversed(s) ):
	digit = permittedChars.find( char )
	i += _powersModP( nPermittedChars, p )[k]*digit
	i = i % p
	return i

	def _alphaFromInt( i, permittedChars = ISDA_PERMITTED_CHARS ):
	""" Convert back again

	Arguments
	---------
	reserved int The CFTC has reserved the first three characters for their prefixes. To avoid clashes we don't allow digits there.

	"""
	# (This is an injection from 0..PRIME into the subset of alpha-numeric strings THAT don't begin with three digits)
	a = ''
	nPermittedChars = len( permittedChars )
	while i:
	a = permittedChars[i % nPermittedChars] + a
	i = i // nPermittedChars
	if not a:
	a = permittedChars[0]
	return a

	def _primeHash( s, permittedChars = ISDA_PERMITTED_CHARS, prime = ISDA_PRIME, newLen = 5 ):
	""" Hash case-insensitive string to one of shorter length by applying modulus operation """
	# (Doesn't really have to be a prime)
	nPermittedChars = len( permittedChars )
	if pow( nPermittedChars, newLen ) < prime:
	raise ValueError("You may wish to choose a larger prime so the map from integers mod p to alpha-numeric strings is an injection")

	iModP = _intFromAlphaModP( s, permittedChars = permittedChars, p = prime )
	alf = _alphaFromInt( iModP, permittedChars )
	return alf.ljust( 5, 'Z' )



	GEEK_EXPLANATION = r""" The modulo operation lifted from integers to alpha-numeric strings """

	LAY_EXPLANATION = r"""
	The idea behind the scheme for shortening a 20 character Legal Entity identifier to an 10-character
	"Short Legal Entity Identifier" (SLEI) is essentially modulo arithmetic.

	If LEI's were just digits ...
	-----------------------------

	First imagine that LEI's were 20 digit numbers instead of 20 digit alpha-numeric strings. A poor
	scheme would take the remainder modulo 10,000,000,000 (that is, keep subtracting 10,000,000,000 from
	out 20 digit number until it is less than 10,000,000,000 and hence a 10 digit number at most). This
	would not be wise, as it is equivalent to only using the last ten digits of our 20 digit LEI. If two
	LEI's differed only in the first ten digits they will evidently have the same SLEI.

	... We'd do the following:
	-------------------------

	To fix this, we would instead choose a large prime number P close to but less than 10,000,000,000 and divide by
	that instead. Thus if LEI's were composes only of digits we could say:

	SLEI = LEI modulus P

	Motivation:

	i. If we change any one digit of the LEI the SLEI will change.
	ii. If we change (only) two digits of the LEI the SLEI will change.
	...
	iii. If we change any nine digits of the LEI the SLEI will change.

	Thus for two LEI's to have the same LEI they must differ in at least ten of the twenty digits. Simple
	schemes for incrementing LEI's (and so forth) are unlikely to lead to collisions.

	Since LEIs are alpha-numeric we make minor changes
	--------------------------------------------------

	The algorithm here is essentially just "SLEI = LEI modulus P" with a few superficial changes.

	a) Because we are using alpha-numeric strings, not just digits, we convert from the former to the latter
	b) We take advantage of the following redundancies in LEIs:
	- Characters 5,6 are always the same
	- Characters 19,20 are check-sum digits
	We throw these characters away prior to encoding
	c) To avoid arithmetic involving large integers (which is troublesome in Excel, which some users might like to use)
	we split the non-redudant sixteen characters into two groups of eight, compress each to five, and then combine.
	d) The CFTC has reserved some UTI prefixes, namedly those whose first three characters are digits (running 000-999). We avoid these.


	Collision probabilities
	-----------------------

	Probability of a clash for one million LEI's is 0.000140864907758 or 1 in 7099.0
	Probability of a clash for ten million LEI's is 0.0139887312515 or 1 in 71.0

	LEI's are not randomly chosen however. Indeed issuer and firm identifiers are presumably unique. So this calculation
	overstates the actual collision probability.

	"""