Created
February 12, 2013 07:43
-
-
Save charl/4760815 to your computer and use it in GitHub Desktop.
Encoding::UndefinedConversionError: "\xE3" from ASCII-8BIT to UTF-8
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ jruby ./test.rb
SUCCESS | |
======================================================================= | |
Native: {"created_at"=>"Mon Feb 11 16:33:45 +0000 2013", "id"=>301006116974915600, "id_str"=>"301006116974915586", "text"=>"やっとここまできた: ひとりでやるRiak Advent Calendar 2012 day14 - Haskell Client - kuenishi's blog - http://t.co/QXFzD0Sn", "source"=>"<a href=\"http://sites.google.com/site/yorufukurou/\" rel=\"nofollow\">YoruFukurou</a>", "truncated"=>false, "in_reply_to_status_id"=>nil, "in_reply_to_status_id_str"=>nil, "in_reply_to_user_id"=>nil, "in_reply_to_user_id_str"=>nil, "in_reply_to_screen_name"=>nil, "user"=>{"id"=>5576192, "id_str"=>"5576192", "name"=>"UENISHI Kota", "screen_name"=>"kuenishi", "location"=>"Tokyo, Japan", "description"=>"Just got senior: these tweets are my own; forever. WishList http://t.co/50iJIDzM", "url"=>"http://kuenishi.github.com/", "entities"=>{"url"=>{"urls"=>[{"url"=>"http://kuenishi.github.com/", "expanded_url"=>nil, "indices"=>[0, 27]}]}, "description"=>{"urls"=>[{"url"=>"http://t.co/50iJIDzM", "expanded_url"=>"http://www.amazon.co.jp/registry/wishlist/1P6IW44XCM1H2", "display_url"=>"amazon.co.jp/registry/wishl…", "indices"=>[61, 81]}]}}, "protected"=>false, "followers_count"=>1432, "friends_count"=>495, "listed_count"=>149, "created_at"=>"Sat Apr 28 04:09:38 +0000 2007", "favourites_count"=>766, "utc_offset"=>32400, "time_zone"=>"Tokyo", "geo_enabled"=>false, "verified"=>false, "statuses_count"=>52154, "lang"=>"en", "contributors_enabled"=>false, "is_translator"=>false, "profile_background_color"=>"022330", "profile_background_image_url"=>"http://a0.twimg.com/profile_background_images/673049678/e5d613f19df540861b6328670b894e74.jpeg", "profile_background_image_url_https"=>"https://si0.twimg.com/profile_background_images/673049678/e5d613f19df540861b6328670b894e74.jpeg", "profile_background_tile"=>false, "profile_image_url"=>"http://a0.twimg.com/profile_images/2031012144/549099_3529711883603_1296410328_33489801_1920286049_n_normal.jpeg", 
"profile_image_url_https"=>"https://si0.twimg.com/profile_images/2031012144/549099_3529711883603_1296410328_33489801_1920286049_n_normal.jpeg", "profile_banner_url"=>"https://si0.twimg.com/profile_banners/5576192/1348992450", "profile_link_color"=>"0084B4", "profile_sidebar_border_color"=>"FFFFFF", "profile_sidebar_fill_color"=>"C0DFEC", "profile_text_color"=>"333333", "profile_use_background_image"=>true, "default_profile"=>false, "default_profile_image"=>false, "following"=>true, "follow_request_sent"=>false, "notifications"=>nil}, "geo"=>nil, "coordinates"=>nil, "place"=>nil, "contributors"=>nil, "retweet_count"=>0, "entities"=>{"hashtags"=>[], "urls"=>[{"url"=>"http://t.co/QXFzD0Sn", "expanded_url"=>"http://bit.ly/XCxYrq", "display_url"=>"bit.ly/XCxYrq", "indices"=>[86, 106]}], "user_mentions"=>[]}, "favorited"=>false, "retweeted"=>false, "possibly_sensitive"=>false} | |
JSON: {"created_at":"Mon Feb 11 16:33:45 +0000 2013","id":301006116974915600,"id_str":"301006116974915586","text":"やっとここまできた: ひとりでやるRiak Advent Calendar 2012 day14 - Haskell Client - kuenishi's blog - http://t.co/QXFzD0Sn","source":"<a href=\"http://sites.google.com/site/yorufukurou/\" rel=\"nofollow\">YoruFukurou</a>","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":5576192,"id_str":"5576192","name":"UENISHI Kota","screen_name":"kuenishi","location":"Tokyo, Japan","description":"Just got senior: these tweets are my own; forever. WishList http://t.co/50iJIDzM","url":"http://kuenishi.github.com/","entities":{"url":{"urls":[{"url":"http://kuenishi.github.com/","expanded_url":null,"indices":[0,27]}]},"description":{"urls":[{"url":"http://t.co/50iJIDzM","expanded_url":"http://www.amazon.co.jp/registry/wishlist/1P6IW44XCM1H2","display_url":"amazon.co.jp/registry/wishl…","indices":[61,81]}]}},"protected":false,"followers_count":1432,"friends_count":495,"listed_count":149,"created_at":"Sat Apr 28 04:09:38 +0000 
2007","favourites_count":766,"utc_offset":32400,"time_zone":"Tokyo","geo_enabled":false,"verified":false,"statuses_count":52154,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"022330","profile_background_image_url":"http://a0.twimg.com/profile_background_images/673049678/e5d613f19df540861b6328670b894e74.jpeg","profile_background_image_url_https":"https://si0.twimg.com/profile_background_images/673049678/e5d613f19df540861b6328670b894e74.jpeg","profile_background_tile":false,"profile_image_url":"http://a0.twimg.com/profile_images/2031012144/549099_3529711883603_1296410328_33489801_1920286049_n_normal.jpeg","profile_image_url_https":"https://si0.twimg.com/profile_images/2031012144/549099_3529711883603_1296410328_33489801_1920286049_n_normal.jpeg","profile_banner_url":"https://si0.twimg.com/profile_banners/5576192/1348992450","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"C0DFEC","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":true,"follow_request_sent":false,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http://t.co/QXFzD0Sn","expanded_url":"http://bit.ly/XCxYrq","display_url":"bit.ly/XCxYrq","indices":[86,106]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false} | |
FAIL! | |
======================================================================= | |
Native: [{"id"=>"301006116974915600", "created_at"=>"Mon Feb 11 16:33:45 +0000 2013", "entities_urls_display_url"=>"bit.ly/XCxYrq", "entities_urls_expanded_url"=>"http://bit.ly/XCxYrq", "entities_urls_indices"=>"86 106", "entities_urls_url"=>"http://t.co/QXFzD0Sn", "favorited"=>"false", "id_str"=>"301006116974915586", "possibly_sensitive"=>"false", "retweet_count"=>"0", "retweeted"=>"false", "source"=>"<a href=\"http://sites.google.com/site/yorufukurou/\" rel=\"nofollow\">YoruFukurou</a>", "text"=>"\xE3\x82\x84\xE3\x81\xA3\xE3\x81\xA8\xE3\x81\x93\xE3\x81\x93\xE3\x81\xBE\xE3\x81\xA7\xE3\x81\x8D\xE3\x81\x9F: \xE3\x81\xB2\xE3\x81\xA8\xE3\x82\x8A\xE3\x81\xA7\xE3\x82\x84\xE3\x82\x8BRiak Advent Calendar 2012 day14 - Haskell Client - kuenishi's blog - http://t.co/QXFzD0Sn", "truncated"=>"false", "user_contributors_enabled"=>"false", "user_created_at"=>"Sat Apr 28 04:09:38 +0000 2007", "user_default_profile"=>"false", "user_default_profile_image"=>"false", "user_description"=>"Just got senior: these tweets are my own; forever. 
WishList http://t.co/50iJIDzM", "user_entities_description_urls_display_url"=>"amazon.co.jp/registry/wishl\xE2\x80\xA6", "user_entities_description_urls_expanded_url"=>"http://www.amazon.co.jp/registry/wishlist/1P6IW44XCM1H2", "user_entities_description_urls_indices"=>"61 81", "user_entities_description_urls_url"=>"http://t.co/50iJIDzM", "user_entities_url_urls_indices"=>"0 27", "user_entities_url_urls_url"=>"http://kuenishi.github.com/", "user_favourites_count"=>"766", "user_follow_request_sent"=>"false", "user_followers_count"=>"1432", "user_following"=>"true", "user_friends_count"=>"495", "user_geo_enabled"=>"false", "user_id"=>"5576192", "user_id_str"=>"5576192", "user_is_translator"=>"false", "user_lang"=>"en", "user_listed_count"=>"149", "user_location"=>"Tokyo, Japan", "user_name"=>"UENISHI Kota", "user_profile_background_color"=>"022330", "user_profile_background_image_url"=>"http://a0.twimg.com/profile_background_images/673049678/e5d613f19df540861b6328670b894e74.jpeg", "user_profile_background_image_url_https"=>"https://si0.twimg.com/profile_background_images/673049678/e5d613f19df540861b6328670b894e74.jpeg", "user_profile_background_tile"=>"false", "user_profile_banner_url"=>"https://si0.twimg.com/profile_banners/5576192/1348992450", "user_profile_image_url"=>"http://a0.twimg.com/profile_images/2031012144/549099_3529711883603_1296410328_33489801_1920286049_n_normal.jpeg", "user_profile_image_url_https"=>"https://si0.twimg.com/profile_images/2031012144/549099_3529711883603_1296410328_33489801_1920286049_n_normal.jpeg", "user_profile_link_color"=>"0084B4", "user_profile_sidebar_border_color"=>"FFFFFF", "user_profile_sidebar_fill_color"=>"C0DFEC", "user_profile_text_color"=>"333333", "user_profile_use_background_image"=>"true", "user_protected"=>"false", "user_screen_name"=>"kuenishi", "user_statuses_count"=>"52154", "user_time_zone"=>"Tokyo", "user_url"=>"http://kuenishi.github.com/", "user_utc_offset"=>"32400", "user_verified"=>"false"}] | |
Encoding::UndefinedConversionError: "\xE3" from ASCII-8BIT to UTF-8
encode at org/jruby/RubyString.java:7563
generate at json/ext/GeneratorState.java:210
generate at /home/charl/.rvm/gems/jruby-1.7.2@riak-encoding/gems/json-1.7.7-java/lib/json/common.rb:223
(root) at ./test.rb:39
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Reproduction script for an encoding mismatch in the Riak Ruby client.
#
# A tweet is stored in Riak and then read back twice:
#   1. directly by key  -> strings come back as UTF-8 and JSON.generate works;
#   2. through search   -> strings come back as ASCII-8BIT and JSON.generate
#                          raises Encoding::UndefinedConversionError.
#
# The stored object is removed in an `ensure` clause so that the expected
# failure in step 2 does not leave the test document behind in the bucket.
require 'bundler/setup'
require "json"
Bundler.require

# raw.txt is split on blank lines (LF or CRLF) and the second chunk is the
# JSON payload that gets parsed below.
txt = File.open("raw.txt", "rb:UTF-8") do |f|
  f.read.split(/\r?\n\r?\n/)[1]
end

tweet = Riak::JSON.parse(txt)

c = Riak::Client.new(:protocol => 'pbc')
bucket = "tweets"

obj = Riak::RObject.new(c.bucket(bucket), tweet["id_str"])
obj.content_type = 'application/json'
obj.data = tweet
obj.store

# Keys to look up through search; a single id here, but the query builder
# below handles any number of them.
data = [tweet["id_str"]]

begin
  # Your way of grabbing the doc directly via the client and bucket works as the
  # text is presented back as UTF-8.
  puts "SUCCESS"
  puts "======================================================================="
  reply = c[bucket][tweet["id_str"]].data
  puts "Native: #{reply.inspect}"
  #puts "Encoding: #{c[bucket][tweet["id_str"]].raw_data.encoding}"
  puts "JSON: #{JSON.generate reply}"

  print "\n\n"

  # Grabbing the doc my way via search causes the text to be presented as
  # ASCII-8BIT instead of UTF-8 causing the JSON lib to barf when trying to
  # convert the native hash to a JSON string.
  puts "FAIL!"
  puts "======================================================================="
  # Builds "id_str:<id> OR id_str:<id> ..." from the list of ids.
  query = data.map { |id| "id_str:#{id}" }.join(" OR ")
  reply = c.search(bucket, query, {:rows => 15000})["docs"]
  puts "Native: #{reply.inspect}"
  puts "JSON: #{JSON.generate reply.first}"
ensure
  # Runs even when JSON.generate raises above; the exception still propagates
  # after the cleanup, so the failure remains visible.
  obj.delete
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment