Created
March 2, 2015 06:07
-
-
Save johnf/f1920d7ad93aad12d872 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Perform charset encoding detection as per http://tools.ietf.org/html/draft-abarth-mime-sniff-05
#
# Expects three locals from the surrounding scope:
#   c    - HTTP client object exposing the raw response headers via #header_str
#          (NOTE(review): presumably a Curl::Easy handle - confirm against caller)
#   body - the response body String; it is re-tagged, transcoded and reassigned
#   url  - only used in the error message
#
# Detection order: Content-Type header, then byte-order mark, then a
# charset/encoding declaration within the first 1024 characters of the body.
# Raises RuntimeError when none of these identify the character set.
encoding = 'UTF-8'
# HTTP header names are case-insensitive, hence the /i flag.
if c.header_str =~ /Content-Type: (.*?)$/mi
  content_type = Regexp.last_match(1)
  # Any other Content-Type (explicit UTF-8, the
  # application/vnd.google.gdata.error+xml error payload, or a header with no
  # recognised charset) keeps the UTF-8 default.
  encoding = 'ISO-8859-1' if content_type =~ /ISO-8859-1/i
elsif body.getbyte(0) == 0xFE && body.getbyte(1) == 0xFF
  # String#[] returns a one-character String, so the BOM has to be compared
  # byte-wise with getbyte (which returns an Integer, or nil when out of range).
  encoding = 'UTF-16BE'
elsif body.getbyte(0) == 0xFF && body.getbyte(1) == 0xFE
  encoding = 'UTF-16LE'
elsif body.getbyte(0) == 0xEF && body.getbyte(1) == 0xBB && body.getbyte(2) == 0xBF
  encoding = 'UTF-8'
else
  # Only sniff the beginning of the document for an in-band declaration.
  start = body[0, 1024]
  if start =~ /(charset|encoding)=iso-8859-1/i
    encoding = 'ISO-8859-1'
  elsif start =~ /(charset|encoding)=utf-8/i
    encoding = 'UTF-8'
  else
    raise "Unknown character set at url: #{url}"
  end
end

body.force_encoding encoding
body.encode!('UTF-8', :invalid => :replace, :undef => :replace, :replace => '?')
# Transcoding UTF-8 to UTF-8 is a no-op, so invalid byte sequences in a body
# that was already tagged UTF-8 survive encode!; scrub them explicitly so the
# gsub below cannot raise on a broken string.
body.scrub!('?')

# Until we convert the database to utf8mb4 strip out the 4 byte unicode characters
# (every code point above the BMP, i.e. U+10000 through U+10FFFF).
body = body.gsub(/[\u{10000}-\u{10FFFF}]/, '?')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment