Created
June 25, 2024 16:09
-
-
Save mdchaney/e2b05eafab81cbdc4dfed6dd2f8e69a6 to your computer and use it in GitHub Desktop.
fix_encoding_2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Heuristics for turning byte strings of unknown encoding (ASCII, UTF-8,
# Windows-1252 or ISO-8859-1) into UTF-8 strings.
#
# All helpers inspect raw bytes, so they accept strings carrying any encoding
# tag; the tag on the input is never trusted.
module FixEncoding
  # The UTF-8 byte order mark, as raw bytes.
  UTF8_BOM = "\xEF\xBB\xBF".b.freeze

  # A UTF-8 lead-byte candidate immediately followed by a continuation byte.
  UTF8_PAIR = /[\xc0-\xf7][\x80-\xbf]/n.freeze

  # Windows-1252 typographic quote bytes (plus 0xB4, the acute accent) and
  # the ASCII quote each one is replaced with, position by position.
  QUOTE_BYTES = "\x82\x8b\x91\x92\x9b\xb4\x84\x93\x94".b.freeze
  QUOTE_REPLACEMENTS = "''''''\"\"\"".freeze
  QUOTE_PATTERN = /[\x82\x8b\x91\x92\x9b\xb4\x84\x93\x94]/n.freeze

  # Bytes 0x80-0x9F: printable in Windows-1252, C1 controls in ISO-8859-1.
  WIN1252_PATTERN = /[\x80-\x9f]/n.freeze

  # Strips a leading UTF-8 byte order mark, if present.
  #
  # The comparison is done on raw bytes: comparing a UTF-8-tagged slice with
  # a binary literal via == is always false when both contain high bytes, so
  # a tagged string would otherwise keep its BOM.
  #
  # @param str [String]
  # @return [String] +str+ itself when there is no BOM, otherwise a new
  #   string without the first three bytes (same encoding tag as +str+)
  def self.remove_bom(str)
    return str unless str.byteslice(0, 3).b == UTF8_BOM

    str.byteslice(3..-1)
  end

  # Byte-level heuristic: does +str+ contain something that looks like the
  # start of a multi-byte UTF-8 sequence? A match does NOT prove the string
  # is valid UTF-8.
  #
  # @param str [String]
  # @return [MatchData, nil]
  def self.has_utf8?(str)
    str.b.match(UTF8_PAIR)
  end

  # Does +str+ contain any of the bytes listed in QUOTE_BYTES?
  #
  # @param str [String]
  # @return [MatchData, nil]
  def self.has_stupid_quotes?(str)
    str.b.match(QUOTE_PATTERN)
  end

  # Replaces each QUOTE_BYTES byte with a plain ASCII ' or ".
  #
  # @param str [String]
  # @return [String] a new binary-tagged string
  def self.replace_stupid_quotes(str)
    # Work on a binary copy so tr never sees incompatible encodings.
    str.b.tr(QUOTE_BYTES, QUOTE_REPLACEMENTS)
  end

  # Does +str+ contain any byte in 0x80-0x9F?
  #
  # @param str [String]
  # @return [MatchData, nil]
  def self.has_win1252?(str)
    str.b.match(WIN1252_PATTERN)
  end

  # Guesses which single-byte encoding the bytes of +str+ are in.
  #
  # @param str [String]
  # @return [String] "ASCII-8BIT", "WINDOWS-1252" or "ISO-8859-1"
  def self.likely_8bit_encoding(str)
    if str.ascii_only?
      "ASCII-8BIT"
    elsif has_win1252?(str)
      "WINDOWS-1252"
    else
      "ISO-8859-1"
    end
  end

  # Transcodes +str+ to UTF-8, reading its bytes as the encoding guessed by
  # likely_8bit_encoding.
  #
  # Bytes with no mapping in the guessed encoding (Windows-1252 leaves a few
  # code points unassigned) become U+FFFD instead of raising
  # Encoding::UndefinedConversionError.
  #
  # @param str [String]
  # @return [String] a new UTF-8 string
  def self.transcode_to_utf8(str)
    str.encode("UTF-8", likely_8bit_encoding(str), undef: :replace)
  end

  # Returns a UTF-8 version of +str+, whatever encoding its bytes are in.
  #
  # * ASCII-only input is simply re-tagged as UTF-8.
  # * A leading UTF-8 BOM is removed.
  # * Bytes that form valid UTF-8 are kept and tagged as UTF-8.
  # * Anything else is treated as Windows-1252 / ISO-8859-1: typographic
  #   quotes are flattened to ASCII quotes and the rest is transcoded.
  #
  # The argument is never modified, so frozen strings are accepted.
  #
  # @param str [String]
  # @return [String] a new UTF-8 string
  def self.fix_encoding(str)
    # dup first: force_encoding changes its receiver in place.
    return str.dup.force_encoding('UTF-8') if str.ascii_only?

    bytes = remove_bom(str.b)

    if has_utf8?(bytes)
      candidate = bytes.dup.force_encoding('UTF-8')
      # The heuristic also fires on Latin-1 text such as "\xE9\xBB", so only
      # accept the UTF-8 reading when every byte sequence is well formed.
      return candidate if candidate.valid_encoding?
    end

    bytes = replace_stupid_quotes(bytes) if has_stupid_quotes?(bytes)
    transcode_to_utf8(bytes)
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment