Created
October 5, 2011 10:53
-
-
Save danielholmstrom/1264170 to your computer and use it in GitHub Desktop.
Fix double encoded UTF-8 in mysql with an SQL-query.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Example of how to fix double encoded UTF-8 in mysql with a single mysql query. | |
-- This example assumes that the data has been inserted with NAMES=latin1. | |
-- | |
-- XXX: Do not rely on this to work without testing it. XXX | |
-- | |
-- XXX: If this is run on a column with a UNIQUE index and the COLLATE is | |
-- case-insensitive this might fail. The reason for this is that double-encoded | |
-- upper and lower case chars are considered different but when encoded correctly | |
-- they will be considered the same. | |
DROP TABLE IF EXISTS double_encoded_utf8; | |
CREATE TABLE double_encoded_utf8( | |
`text` varchar(500) COLLATE utf8_swedish_ci | |
) ENGINE=MyISAM DEFAULT CHARSET=utf8 COLLATE=utf8_swedish_ci; | |
-- Insert text with NAMES=latin1 | |
SET NAMES latin1; | |
INSERT INTO double_encoded_utf8 VALUES("!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"); | |
SELECT `text` AS '=== Test selected with NAMES=latin1' FROM double_encoded_utf8; | |
SELECT count(*) as '=== Number of matches before encoding' FROM double_encoded_utf8 WHERE `text`="!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"; | |
-- Switch to UTF8 | |
SET NAMES utf8; | |
SELECT `text` AS '=== Test selected with NAMES=utf8 before converting' FROM double_encoded_utf8; | |
SELECT count(*) as '=== Number of matches before encoding' FROM double_encoded_utf8 WHERE `text`="!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"; | |
-- Convert | |
UPDATE double_encoded_utf8 SET `text`=convert(cast(cast(`text` AS CHAR CHARACTER SET latin1) AS BINARY) USING utf8); | |
SELECT `text` AS '=== Test selected with NAMES=utf8 after converting' FROM double_encoded_utf8; | |
SELECT count(*) as '=== Number of matches after encoding' FROM double_encoded_utf8 WHERE `text`="!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment