Created
September 24, 2013 22:59
-
-
Save raypereda/6692493 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'charlock_holmes'
# Reports whether the bytes of +data+ form a valid UTF-8 sequence.
#
# The check runs on a copy: String#force_encoding relabels its receiver in
# place, so calling it on +data+ directly would change the caller's string
# and raise on frozen input.
#
# @param data [String] the bytes to inspect; left untouched
# @return [Boolean] true when the bytes are well-formed UTF-8
def is_utf8?(data)
  data.dup.force_encoding('UTF-8').valid_encoding?
end
# Asks CharlockHolmes (ICU-based detection) for the most likely encoding.
#
# The detector's result is treated as optional: when it yields nothing, or a
# result without an :encoding entry, this method returns nil instead of
# raising, so a caller can fall back to another strategy.
#
# @param data [String] the bytes to analyse
# @return [String, nil] the detected encoding name, or nil when unavailable
def detect_with_charlock_holmes(data)
  detection = CharlockHolmes::EncodingDetector.detect(data)
  detection && detection[:encoding]
end
# Tries every encoding known to Ruby, in Encoding.list order, and returns the
# name of the first one under which the bytes are valid and can be converted
# to UTF-8.
#
# Works on a private copy of +data+, so the caller's string keeps its
# encoding label and frozen input is accepted.
#
# @param data [String] the bytes to analyse; left untouched
# @return [String, nil] the first matching encoding name, or nil when no
#   encoding fits (never the Encoding.list array itself)
def detect_with_brut_force(data)
  # One copy is enough: it is relabelled for each candidate in turn.
  probe = data.dup

  match = Encoding.list.find do |enc|
    probe.force_encoding(enc)
    next false unless probe.valid_encoding?

    begin
      Encoding::Converter.new(enc.name, 'UTF-8').convert(probe).valid_encoding?
    rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError, Encoding::ConverterNotFoundError
      # This candidate cannot be converted to UTF-8; move on to the next one.
      false
    end
  end

  match && match.name
end
# Determines the encoding name for +data+.
#
# Strategy, in order: a UTF-8 validity check, then CharlockHolmes detection,
# then the brute-force scan when CharlockHolmes gives no answer.
#
# @param data [String] the bytes to analyse
# @return the string 'UTF-8', or whatever the first successful fallback returns
def detect(data)
  return 'UTF-8' if is_utf8?(data)

  detect_with_charlock_holmes(data) || detect_with_brut_force(data)
end
# Script entry point: read all input from ARGF and print the detected
# encoding name on standard output.
puts detect(ARGF.read)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment