Skip to content

Instantly share code, notes, and snippets.

@jefft
Created May 23, 2017 05:48
Show Gist options
  • Save jefft/7df0f7e05b2e3ea7b111e9f8b6c8a420 to your computer and use it in GitHub Desktop.
Save jefft/7df0f7e05b2e3ea7b111e9f8b6c8a420 to your computer and use it in GitHub Desktop.
A variant of the PostgreSQL convert_from() function that avoids the 'invalid byte sequence for encoding' error by emitting hex escapes for any bytes that don't encode
-- A variant of the built-in 'convert_from' function that doesn't give up when it encounters invalid characters, but rather replaces them with 0x hex equivalents.
-- The last 'locator' arg lets you specify which record you're processing.
--
-- Sample use:
-- jira=# select convert_from_or_escape('Invalid chars: \344\274\232\345\223\241\350\252?\350\250\274\343\203\207\343\203\242\343\203\232\343\203\274\343\202\270\343?\270\343\202\210\343?\206\343?\223\343??\357\274?</h3>'::bytea, 'utf-8', 'At table foo line 123');
-- NOTICE: At table foo line 123: Replacing bad chars '0xe8 0xaa 0x3f' with '0xe8aa3f'
-- NOTICE: At table foo line 123: Replacing bad chars '0xe3 0x3f 0xb8' with '0xe33fb8'
-- NOTICE: At table foo line 123: Replacing bad chars '0xe3 0x3f 0x86' with '0xe33f86'
-- NOTICE: At table foo line 123: Replacing bad chars '0xe3 0x3f 0x93' with '0xe33f93'
-- NOTICE: At table foo line 123: Replacing bad chars '0xe3 0x3f 0x3f' with '0xe33f3f'
-- NOTICE: At table foo line 123: Replacing bad chars '0xef 0xbc 0x3f' with '0xefbc3f'
-- ┌────────────────────────────────────────────────────────────────────────────────────────┐
-- │ convert_from_or_escape │
-- ├────────────────────────────────────────────────────────────────────────────────────────┤
-- │ Invalid chars: 会員0xe8aa3f証デモページ0xe33fb8よ0xe33f860xe33f930xe33f3f0xefbc3f</h3> │
-- └────────────────────────────────────────────────────────────────────────────────────────┘
-- (1 row)
-- [email protected], 23/May/17
CREATE OR REPLACE FUNCTION convert_from_or_escape(input bytea, encoding varchar, locator varchar DEFAULT null) RETURNS text AS
$$
DECLARE
temp bytea := null; -- Working copy of our input
oldtemp bytea := null;
errormessage text;
errstate text;
invalidchars bytea;
invalidchars_str varchar;
invalidchars_str_trimmed varchar;
BEGIN
temp = input;
-- Repeatedly attempt to convert_from() the string held in temp. Each time it fails, replace the chars that caused the failure and try again.
LOOP
BEGIN
RETURN convert_from(temp, encoding);
EXCEPTION WHEN OTHERS THEN
GET STACKED DIAGNOSTICS errormessage = MESSAGE_TEXT, errstate = RETURNED_SQLSTATE;
IF errstate = '22021' THEN -- This is the 'Invalid byte sequence for encoding' error
-- Here, errormessage is something like: invalid byte sequence for encoding "UTF8": 0xc3 0x3f
invalidchars_str = regexp_replace(errormessage, '^.*: ', ''); -- Strip the 'invalid byte sequence for encoding "UTF8": ' part of the error message, leaving just the invalid chars, e.g. '0xc3 0x3f'
invalidchars_str_trimmed = regexp_replace(invalidchars_str, '(^| )0x', '', 'g'); -- Strip '0x' and whitespace to leave just the hex digits
invalidchars = decode(invalidchars_str_trimmed, 'hex'); -- Decode hex with decode(..., 'hex'), giving us a bytea result
RAISE NOTICE '%Replacing bad chars ''%'' with ''0x%''', coalesce(locator || ': ', ''), invalidchars_str, invalidchars_str_trimmed;
oldtemp = temp;
temp = replace_binary(temp, invalidchars, '0x' || invalidchars_str_trimmed::bytea);
IF oldtemp = temp THEN
RAISE EXCEPTION 'We were unable to fix bad characters by applying regex % to %', invalidchars, encode(oldtemp, 'escape');
END IF;
ELSE
RAISE EXCEPTION 'Unhandled error: %', errormessage;
END IF;
END;
END LOOP;
END;
$$
LANGUAGE plpgsql;
@thiagohenriquegaspar
Copy link

Thanks!!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment