Created
May 23, 2017 05:48
-
-
Save jefft/7df0f7e05b2e3ea7b111e9f8b6c8a420 to your computer and use it in GitHub Desktop.
A variant of the PostgreSQL convert_from() function that avoids the 'invalid byte sequence for encoding' error by emitting hex escapes for any bytes that don't encode
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- A variant of the built-in 'convert_from' function that doesn't give up when it encounters invalid characters, but rather replaces them with 0x hex equivalents. | |
-- The last 'locator' arg lets you specify which record you're processing. | |
-- | |
-- Sample use: | |
-- jira=# select convert_from_or_escape('Invalid chars: \344\274\232\345\223\241\350\252?\350\250\274\343\203\207\343\203\242\343\203\232\343\203\274\343\202\270\343?\270\343\202\210\343?\206\343?\223\343??\357\274?</h3>'::bytea, 'utf-8', 'At table foo line 123'); | |
-- NOTICE: At table foo line 123: Replacing bad chars '0xe8 0xaa 0x3f' with '0xe8aa3f' | |
-- NOTICE: At table foo line 123: Replacing bad chars '0xe3 0x3f 0xb8' with '0xe33fb8' | |
-- NOTICE: At table foo line 123: Replacing bad chars '0xe3 0x3f 0x86' with '0xe33f86' | |
-- NOTICE: At table foo line 123: Replacing bad chars '0xe3 0x3f 0x93' with '0xe33f93' | |
-- NOTICE: At table foo line 123: Replacing bad chars '0xe3 0x3f 0x3f' with '0xe33f3f' | |
-- NOTICE: At table foo line 123: Replacing bad chars '0xef 0xbc 0x3f' with '0xefbc3f' | |
-- ┌────────────────────────────────────────────────────────────────────────────────────────┐ | |
-- │ convert_from_or_escape │ | |
-- ├────────────────────────────────────────────────────────────────────────────────────────┤ | |
-- │ Invalid chars: 会員0xe8aa3f証デモページ0xe33fb8よ0xe33f860xe33f930xe33f3f0xefbc3f</h3> │ | |
-- └────────────────────────────────────────────────────────────────────────────────────────┘ | |
-- (1 row) | |
-- [email protected], 23/May/17 | |
CREATE OR REPLACE FUNCTION convert_from_or_escape(input bytea, encoding varchar, locator varchar DEFAULT null) RETURNS text AS | |
$$ | |
DECLARE | |
temp bytea := null; -- Working copy of our input | |
oldtemp bytea := null; | |
errormessage text; | |
errstate text; | |
invalidchars bytea; | |
invalidchars_str varchar; | |
invalidchars_str_trimmed varchar; | |
BEGIN | |
temp = input; | |
-- Repeatedly attempt to convert_from() the string held in temp. Each time it fails, replace the chars that caused the failure and try again. | |
LOOP | |
BEGIN | |
RETURN convert_from(temp, encoding); | |
EXCEPTION WHEN OTHERS THEN | |
GET STACKED DIAGNOSTICS errormessage = MESSAGE_TEXT, errstate = RETURNED_SQLSTATE; | |
IF errstate = '22021' THEN -- This is the 'Invalid byte sequence for encoding' error | |
-- Here, errormessage is something like: invalid byte sequence for encoding "UTF8": 0xc3 0x3f | |
invalidchars_str = regexp_replace(errormessage, '^.*: ', ''); -- Strip the 'invalid byte sequence for encoding "UTF8": ' part of the error message, leaving just the invalid chars, e.g. '0xc3 0x3f' | |
invalidchars_str_trimmed = regexp_replace(invalidchars_str, '(^| )0x', '', 'g'); -- Strip '0x' and whitespace to leave just the hex digits | |
invalidchars = decode(invalidchars_str_trimmed, 'hex'); -- Decode hex with decode(..., 'hex'), giving us a bytea result | |
RAISE NOTICE '%Replacing bad chars ''%'' with ''0x%''', coalesce(locator || ': ', ''), invalidchars_str, invalidchars_str_trimmed; | |
oldtemp = temp; | |
temp = replace_binary(temp, invalidchars, '0x' || invalidchars_str_trimmed::bytea); | |
IF oldtemp = temp THEN | |
RAISE EXCEPTION 'We were unable to fix bad characters by applying regex % to %', invalidchars, encode(oldtemp, 'escape'); | |
END IF; | |
ELSE | |
RAISE EXCEPTION 'Unhandled error: %', errormessage; | |
END IF; | |
END; | |
END LOOP; | |
END; | |
$$ | |
LANGUAGE plpgsql; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks!!