Created
September 20, 2011 19:08
-
-
Save coryschires/1230008 to your computer and use it in GitHub Desktop.
Script to replace bad encoding with HTML entities.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Repairs text in which single stray bytes (the Windows-1252 code points for
# "smart" punctuation) have leaked into a string that is otherwise UTF-8.
#
# Known stray bytes are replaced using ENCODING_TO_ENTITY. Bytes whose original
# character cannot be determined are removed: a typo is preferred over a string
# that raises encoding errors later on.
class EncodingFixer
  # Stray byte (written as a one-byte octal escape) => replacement text.
  # The keys are deliberately not valid UTF-8 on their own.
  ENCODING_TO_ENTITY = {
    "\222" => "’",
    "\223" => "“",
    "\224" => "”",
    "\205" => "…",
    "\226" => "–",
    "\227" => "—"
  }.freeze

  # The same mapping keyed by integer byte value. Looking bytes up as integers
  # avoids comparing or searching with invalid UTF-8 strings, which Ruby's
  # string search does not handle reliably.
  BYTE_TO_ENTITY = ENCODING_TO_ENTITY.map { |encoding, entity| [encoding.getbyte(0), entity] }.to_h.freeze
  private_constant :BYTE_TO_ENTITY

  # Encodings whose content can safely be re-read as UTF-8 (they carry no
  # character semantics of their own for bytes >= 0x80).
  RETAGGABLE_ENCODINGS = [Encoding::BINARY, Encoding::US_ASCII].freeze
  private_constant :RETAGGABLE_ENCODINGS

  # Returns a repaired copy of +text+; the argument itself is never mutated.
  #
  # @param text [String] UTF-8 (or binary / US-ASCII tagged) text that may
  #   contain stray bytes
  # @return [String] text with known stray bytes replaced, unknown invalid
  #   bytes dropped, and embedded "?" placeholders removed
  def self.fix(text)
    utf8 = RETAGGABLE_ENCODINGS.include?(text.encoding) ? text.dup.force_encoding(Encoding::UTF_8) : text
    # Strings in any other encoding are left to the "?" clean-up only; the
    # replacement characters could not be spliced into them safely.
    repaired = utf8.encoding == Encoding::UTF_8 ? replace_stray_bytes(utf8) : utf8
    remove_encoding_when_original_character_cannot_be_determined(repaired)
  end

  # Removes every "?" that sits directly between two non-whitespace characters,
  # on the assumption that it is the placeholder an earlier, lossy conversion
  # left behind (e.g. "d?cor"). Leading, trailing and free-standing question
  # marks are kept.
  #
  # NOTE: this also strips legitimate embedded question marks, such as the one
  # in a URL query string.
  #
  # @param text [String]
  # @return [String] a new string with the embedded "?" characters removed
  def self.remove_encoding_when_original_character_cannot_be_determined(text)
    # Lookarounds keep the neighbours out of the match, so adjacent
    # occurrences such as "a?b?c" are all handled in a single pass.
    text.gsub(/(?<=\S)\?(?=\S)/, "")
  end

  # Replaces each invalid byte of a UTF-8 string with its mapped text, or with
  # nothing when the byte is not in the mapping. Valid multibyte characters are
  # never touched, even when they contain the same byte values.
  def self.replace_stray_bytes(utf8)
    utf8.scrub do |invalid|
      # scrub may hand over more than one byte (a truncated multibyte
      # sequence), so every byte is translated individually.
      invalid.each_byte.map { |byte| BYTE_TO_ENTITY.fetch(byte, "") }.join
    end
  end
  private_class_method :replace_stray_bytes
end
# --- TESTS ---
# RSpec examples for EncodingFixer.fix. Escapes such as "\222" in the samples
# are octal byte values standing in for the badly encoded punctuation.
describe EncodingFixer do
  it "should replace apostrophe" do
    sample = "you couldn\222t ask for a better teacher"
    expect(EncodingFixer.fix(sample)).to eq("you couldn’t ask for a better teacher")
  end

  it "should replace left double quotes" do
    sample = "At Salon 3, the \2233..."
    expect(EncodingFixer.fix(sample)).to eq("At Salon 3, the “3...")
  end

  it "should replace right double quotes" do
    sample = "...3\224 are your resident hair experts"
    expect(EncodingFixer.fix(sample)).to eq("...3” are your resident hair experts")
  end

  it "should replace both left and right double quotes" do
    sample = "At Salon 3, the \2233\224 are your resident hair experts"
    expect(EncodingFixer.fix(sample)).to eq("At Salon 3, the “3” are your resident hair experts")
  end

  it "should replace an ellipsis" do
    sample = "a $45 value\205</strong>now that"
    expect(EncodingFixer.fix(sample)).to eq("a $45 value…</strong>now that")
  end

  it "should replace an n-dash" do
    sample = "Tuesday \226 Friday only"
    expect(EncodingFixer.fix(sample)).to eq("Tuesday – Friday only")
  end

  it "should replace an m-dash" do
    sample = "Tuesday \227 Friday only"
    expect(EncodingFixer.fix(sample)).to eq("Tuesday — Friday only")
  end

  it "should replace a bunch of things at once" do
    sample = "an\205ellipsis, it\222s an apostrophe, and \223double quotes\224, then \226 finally \226 an n-dash"
    expect(EncodingFixer.fix(sample)).to eq("an…ellipsis, it’s an apostrophe, and “double quotes”, then – finally – an n-dash")
  end

  context "when the original character cannot be determined" do
    it "should remove the bad encoding (a typo is better than throwing encoding errors)" do
      # garbled input => expected output with the "?" placeholder removed
      expectations = {
        "hard to find gift and d?cor items" => "hard to find gift and dcor items",
        "garnished with Jalape?o" => "garnished with Jalapeo",
        "Bon app?tit." => "Bon apptit.",
        "Home D?cor" => "Home Dcor"
      }

      expectations.each do |sample, fixed|
        expect(EncodingFixer.fix(sample)).to eq(fixed)
      end
    end

    it "should not change trailing question marks" do
      sample = "garnished with Jalapeno?"
      expect(EncodingFixer.fix(sample)).to eq("garnished with Jalapeno?")
    end

    it "should not change initial question marks" do
      sample = "?garnished with Jalapeno"
      expect(EncodingFixer.fix(sample)).to eq("?garnished with Jalapeno")
    end

    it "should not change floating question marks" do
      sample = "? garnished ? with Jalapeno"
      expect(EncodingFixer.fix(sample)).to eq("? garnished ? with Jalapeno")
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment