Skip to content

Instantly share code, notes, and snippets.

@david-sanabria
Last active August 21, 2024 20:31
Show Gist options
  • Save david-sanabria/0d3ff67eb56d2750502aed4186d6a4a7 to your computer and use it in GitHub Desktop.
Save david-sanabria/0d3ff67eb56d2750502aed4186d6a4a7 to your computer and use it in GitHub Desktop.
Base62 Encode/Decode functions for PostgreSQL.
/*
* This script will create base62_encode() and base62_decode() in the current schema of a postgresql database.
* Give it a star if you find it useful.
*/
CREATE OR REPLACE FUNCTION base62_encode( long_number bigint )
RETURNS text
AS $BODY$
/*
* base62_encode()
*
* This function accepts a small or big number (base 10) and reduces its length into a string
* that is URI-safe using the upper and lower case 26-letter English alphabet
* as well as the numbers 0 - 9. The result is returned as a text string that can be decoded
* based to base10 using the base62_decode() function.
*
* You can find a handy explainer at https://helloacm.com/base62/
*
*
* HISTORY
* 2018-03-13 david sanabria, office of systems integration
* - New function
*
*/
declare
k_base constant integer := 62;
k_alphabet constant text[] := string_to_array( '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'::text, null);
v_return_text text := '';
v_remainder integer;
v_interim bigint;
begin
v_interim := abs( long_number ); -- Negative Numbers (sign) are ignored
--Conversion Loop
loop
v_remainder := v_interim % k_base;
v_interim := v_interim / k_base;
v_return_text := ''|| k_alphabet[ (v_remainder + 1) ] || v_return_text ;
exit when v_interim <= 0;
end loop ;
return v_return_text;
end;$BODY$
LANGUAGE plpgsql
immutable /* Makes no changes to data in tables */
returns null ON NULL INPUT /* Don't bother to call if the value is NULL */
SECURITY INVOKER /* No reason to use DEFINER for security */
cost 5 /* A made up number. Any advice? */
;
CREATE OR REPLACE FUNCTION base62_decode( encoded_text text )
RETURNS bigint
AS $BODY$
/*
* base62_decode()
*
* This function accepts a string that has been base62 encoded. Any characters that are not valid
* are simply ignored, so you can safely pass a formatted string and still get a valid result like you
* do with a UUID field.
*
* HISTORY
* 2018-03-13 david sanabria, office of systems integration
* - New function
*
*/
declare
k_base constant integer := 62;
k_alphabet constant text := '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
v_encoded_arr text[];
v_return_result bigint := 0;
v_interim bigint;
v_index integer; -- Pointer to input array
v_token text;
v_power integer := 0; -- reverse pointer, used for position exponent (e.g. 2^32)
begin
-- check for guard values
if encoded_text is null or length( encoded_text ) = 0 then
return null;
end if;
-- reverse the input string to make the exponent math simpler below
v_encoded_arr := string_to_array( reverse( encoded_text ) , null );
--Conversion Loop
foreach v_token in array v_encoded_arr
loop
v_index := strpos( k_alphabet, v_token );
if v_index = 0 then
raise notice 'Token ignored "%"', v_token;
--ignore invalid tokens, which allows formatted strings to be processed (e.g. '{abc-1Lg}')
else
v_return_result := v_return_result + ( ( v_index - 1) * pow( k_base, v_power) );
v_power := 1 + v_power; --increment after each valid loop
end if;
end loop;
return v_return_result;
end;$BODY$
LANGUAGE plpgsql
immutable /* Makes no changes to data in tables */
returns null ON NULL INPUT /* Don't bother to call if the value is NULL */
SECURITY INVOKER /* No reason to use DEFINER for security */
cost 5 /* A made up number. Any advice? */
;
/*
-- TEST Query
-- * Requires base62_decode() and base62_encode()
-- * Tested in PostgreSQL v9.6.7
with arr as (
select
62::integer as v_base
,73::bigint as v_test
,74400::bigint as v_test2
,'JM0'::text as v_test2_b
,09::bigint as v_test3
,999888777666::bigint as v_test4
, string_to_array( '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'::text, null) as v_alphabet
)
select (v_test % v_base)::integer as test_mod
, (v_test / v_base)::bigint as test_div
,abs( v_test ) as test_abs
,v_alphabet[ (v_test % v_base) + 1 ] as test_arr
,cwsdoc.base62_encode( v_test2 ) as test_encode
,cwsdoc.base62_encode( null ) as test_encode2
,cwsdoc.base62_encode( v_test4 ) as test_encode4
,cwsdoc.base62_decode( cwsdoc.base62_encode( v_test ) ) as test_decode1
,cwsdoc.base62_decode( cwsdoc.base62_encode( v_test2 ) ) as test_decode2
,cwsdoc.base62_decode( cwsdoc.base62_encode( v_test3 ) ) as test_decode3
,cwsdoc.base62_decode( cwsdoc.base62_encode( v_test4 ) ) as test_decode4
,cwsdoc.base62_decode( null ) as test_decode5
,cwsdoc.base62_decode( '{JM-0}' ) as test_decode6
,arr.*
from arr;
*/
@ehfeng
Copy link

ehfeng commented May 10, 2019

This worked for a lot of values, but not all values.

select base62_encode(base62_decode('qltxDQbGP7')); -- returns qltxDQbGPQ

@cfeduke
Copy link

cfeduke commented Oct 28, 2019

I figured out the problem where some decodes are incorrect has to do with inlining v_interim in the source code. If you change the body of the else statement in base62_decode to include the v_interim variable then it works without a problem:

      v_interim := ( ( v_index - 1) *  pow( k_base, v_power) );
      v_return_result := v_return_result + v_interim;
      v_power := 1 + v_power; --increment after each valid loop

(and yes select base62_encode(base62_decode('qltxDQbGP7')); -- returns qltxDQbGP7 correctly)

@ddevienne
Copy link

In my case, I'd like to encode a hash, larger than a bigint (which is 8 bytes AFAIK), so what would be the variation that encodes from a bytea and decodes to a bytea? I'm trying to move existing code in C++ that runs client side, to the server-side, that is used to convert arbitrarily long names into DB-specific ROLE names, which uses base62 to ensure unicity of the names per-DB (since ROLE names are cluster-wide).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment