-
-
Save david-sanabria/0d3ff67eb56d2750502aed4186d6a4a7 to your computer and use it in GitHub Desktop.
/* | |
* This script will create base62_encode() and base62_decode() in the current schema of a postgresql database. | |
* Give it a star if you find it useful. | |
*/ | |
CREATE OR REPLACE FUNCTION base62_encode( long_number bigint ) | |
RETURNS text | |
AS $BODY$ | |
/* | |
* base62_encode() | |
* | |
* This function accepts a small or big number (base 10) and reduces its length into a string | |
* that is URI-safe using the upper and lower case 26-letter English alphabet | |
* as well as the numbers 0 - 9. The result is returned as a text string that can be decoded | |
* based to base10 using the base62_decode() function. | |
* | |
* You can find a handy explainer at https://helloacm.com/base62/ | |
* | |
* | |
* HISTORY | |
* 2018-03-13 david sanabria, office of systems integration | |
* - New function | |
* | |
*/ | |
declare | |
k_base constant integer := 62; | |
k_alphabet constant text[] := string_to_array( '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'::text, null); | |
v_return_text text := ''; | |
v_remainder integer; | |
v_interim bigint; | |
begin | |
v_interim := abs( long_number ); -- Negative Numbers (sign) are ignored | |
--Conversion Loop | |
loop | |
v_remainder := v_interim % k_base; | |
v_interim := v_interim / k_base; | |
v_return_text := ''|| k_alphabet[ (v_remainder + 1) ] || v_return_text ; | |
exit when v_interim <= 0; | |
end loop ; | |
return v_return_text; | |
end;$BODY$ | |
LANGUAGE plpgsql | |
immutable /* Makes no changes to data in tables */ | |
returns null ON NULL INPUT /* Don't bother to call if the value is NULL */ | |
SECURITY INVOKER /* No reason to use DEFINER for security */ | |
cost 5 /* A made up number. Any advice? */ | |
; | |
CREATE OR REPLACE FUNCTION base62_decode( encoded_text text ) | |
RETURNS bigint | |
AS $BODY$ | |
/* | |
* base62_decode() | |
* | |
* This function accepts a string that has been base62 encoded. Any characters that are not valid | |
* are simply ignored, so you can safely pass a formatted string and still get a valid result like you | |
* do with a UUID field. | |
* | |
* HISTORY | |
* 2018-03-13 david sanabria, office of systems integration | |
* - New function | |
* | |
*/ | |
declare | |
k_base constant integer := 62; | |
k_alphabet constant text := '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'; | |
v_encoded_arr text[]; | |
v_return_result bigint := 0; | |
v_interim bigint; | |
v_index integer; -- Pointer to input array | |
v_token text; | |
v_power integer := 0; -- reverse pointer, used for position exponent (e.g. 2^32) | |
begin | |
-- check for guard values | |
if encoded_text is null or length( encoded_text ) = 0 then | |
return null; | |
end if; | |
-- reverse the input string to make the exponent math simpler below | |
v_encoded_arr := string_to_array( reverse( encoded_text ) , null ); | |
--Conversion Loop | |
foreach v_token in array v_encoded_arr | |
loop | |
v_index := strpos( k_alphabet, v_token ); | |
if v_index = 0 then | |
raise notice 'Token ignored "%"', v_token; | |
--ignore invalid tokens, which allows formatted strings to be processed (e.g. '{abc-1Lg}') | |
else | |
v_return_result := v_return_result + ( ( v_index - 1) * pow( k_base, v_power) ); | |
v_power := 1 + v_power; --increment after each valid loop | |
end if; | |
end loop; | |
return v_return_result; | |
end;$BODY$ | |
LANGUAGE plpgsql | |
immutable /* Makes no changes to data in tables */ | |
returns null ON NULL INPUT /* Don't bother to call if the value is NULL */ | |
SECURITY INVOKER /* No reason to use DEFINER for security */ | |
cost 5 /* A made up number. Any advice? */ | |
; | |
/* | |
-- TEST Query | |
-- * Requires base62_decode() and base62_encode() | |
-- * Tested in PostgreSQL v9.6.7 | |
with arr as ( | |
select | |
62::integer as v_base | |
,73::bigint as v_test | |
,74400::bigint as v_test2 | |
,'JM0'::text as v_test2_b | |
,09::bigint as v_test3 | |
,999888777666::bigint as v_test4 | |
, string_to_array( '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'::text, null) as v_alphabet | |
) | |
select (v_test % v_base)::integer as test_mod | |
, (v_test / v_base)::bigint as test_div | |
,abs( v_test ) as test_abs | |
,v_alphabet[ (v_test % v_base) + 1 ] as test_arr | |
,cwsdoc.base62_encode( v_test2 ) as test_encode | |
,cwsdoc.base62_encode( null ) as test_encode2 | |
,cwsdoc.base62_encode( v_test4 ) as test_encode4 | |
,cwsdoc.base62_decode( cwsdoc.base62_encode( v_test ) ) as test_decode1 | |
,cwsdoc.base62_decode( cwsdoc.base62_encode( v_test2 ) ) as test_decode2 | |
,cwsdoc.base62_decode( cwsdoc.base62_encode( v_test3 ) ) as test_decode3 | |
,cwsdoc.base62_decode( cwsdoc.base62_encode( v_test4 ) ) as test_decode4 | |
,cwsdoc.base62_decode( null ) as test_decode5 | |
,cwsdoc.base62_decode( '{JM-0}' ) as test_decode6 | |
,arr.* | |
from arr; | |
*/ |
I figured out the problem where some decodes are incorrect has to do with inlining v_interim
in the source code. If you change the body of the else statement in base62_decode
to include the v_interim
variable then it works without a problem:
v_interim := ( ( v_index - 1) * pow( k_base, v_power) );
v_return_result := v_return_result + v_interim;
v_power := 1 + v_power; --increment after each valid loop
(and yes select base62_encode(base62_decode('qltxDQbGP7')); -- returns qltxDQbGP7 correctly
)
In my case, I'd like to encode a hash, larger than a bigint (which is 8 bytes AFAIK), so what would be the variation that encodes from a bytea
and decodes to a bytea
? I'm trying to move existing code in C++ that runs client side, to the server-side, that is used to convert arbitrarily long names into DB-specific ROLE names, which uses base62 to ensure unicity of the names per-DB (since ROLE names are cluster-wide).
This worked for a lot of values, but not all values.