Skip to content

Instantly share code, notes, and snippets.

@coryschires
Created September 20, 2011 19:08
Show Gist options
  • Select an option

  • Save coryschires/1230008 to your computer and use it in GitHub Desktop.

Select an option

Save coryschires/1230008 to your computer and use it in GitHub Desktop.
Script to replace bad encoding with HTML entities.
# Repairs text containing stray Windows-1252 "smart punctuation" bytes (curly
# quotes, dashes, ellipsis) that are not valid in the string's own encoding,
# replacing each one with the equivalent HTML entity.
#
# Bytes that are invalid but whose intended character is unknown are dropped,
# and question marks wedged between two non-space characters (the usual
# residue of an earlier lossy conversion, e.g. "d?cor") are removed: a typo is
# preferred over an encoding error further down the pipeline.
class EncodingFixer
  # Stray single byte (written as an octal escape) => HTML entity for the
  # character that byte represents in Windows-1252.
  ENCODING_TO_ENTITY = {
    "\222" => "&rsquo;",
    "\223" => "&ldquo;",
    "\224" => "&rdquo;",
    "\205" => "&hellip;",
    "\226" => "&ndash;",
    "\227" => "&mdash;"
  }.freeze

  # Same table keyed by integer byte value. The string keys above are not
  # valid UTF-8 on their own, so they cannot be used reliably as search
  # patterns or compared against chunks taken from differently-encoded text.
  BYTE_TO_ENTITY = ENCODING_TO_ENTITY.each_with_object({}) do |(bad, entity), table|
    table[bad.getbyte(0)] = entity
  end.freeze
  private_constant :BYTE_TO_ENTITY

  # A "?" with a non-whitespace character on both sides. Lookarounds keep the
  # neighbours out of the match so adjacent occurrences ("a?b?c") are all hit.
  ORPHANED_QUESTION_MARK = /(?<=\S)\?(?=\S)/
  private_constant :ORPHANED_QUESTION_MARK

  # Returns a cleaned copy of +text+; the argument itself is not modified.
  #
  # @param text [String] possibly containing bytes invalid in its encoding
  # @return [String] valid string with known bad bytes turned into entities,
  #   unknown bad bytes removed, and embedded "?" placeholders removed
  def self.fix(text)
    # scrub yields only the byte runs that are invalid in text's encoding, so
    # legitimate multibyte characters that happen to contain 0x85/0x92-0x97
    # as continuation bytes are left untouched.
    repaired = text.scrub do |invalid|
      invalid.each_byte.map { |byte| BYTE_TO_ENTITY.fetch(byte, "") }.join
    end
    remove_encoding_when_original_character_cannot_be_determined(repaired)
  end

  # Removes every "?" that sits directly between two non-whitespace
  # characters. Leading, trailing, and free-standing question marks are kept.
  #
  # NOTE: this is a heuristic; a genuine "?" followed immediately by another
  # non-space character (e.g. "?!" or a URL query string) is removed as well.
  #
  # @param text [String] text valid in its own encoding
  # @return [String] copy of +text+ without the embedded question marks
  def self.remove_encoding_when_original_character_cannot_be_determined(text)
    text.gsub(ORPHANED_QUESTION_MARK, "")
  end
end
# --- TESTS ---
# Specs for EncodingFixer.fix. Inputs use octal escapes for the stray
# Windows-1252 bytes; every expectation is written with HTML entities so the
# single-character cases agree with the combined case below.
describe EncodingFixer do
  it "should replace apostrophe" do
    sample = "you couldn\222t ask for a better teacher"
    EncodingFixer.fix(sample).should == "you couldn&rsquo;t ask for a better teacher"
  end

  it "should replace left double quotes" do
    sample = "At Salon 3, the \2233..."
    EncodingFixer.fix(sample).should == "At Salon 3, the &ldquo;3..."
  end

  it "should replace right double quotes" do
    sample = "...3\224 are your resident hair experts"
    EncodingFixer.fix(sample).should == "...3&rdquo; are your resident hair experts"
  end

  it "should replace both left and right double quotes" do
    sample = "At Salon 3, the \2233\224 are your resident hair experts"
    EncodingFixer.fix(sample).should == "At Salon 3, the &ldquo;3&rdquo; are your resident hair experts"
  end

  it "should replace an ellipsis" do
    sample = "a $45 value\205</strong>now that"
    EncodingFixer.fix(sample).should == "a $45 value&hellip;</strong>now that"
  end

  it "should replace an n-dash" do
    sample = "Tuesday \226 Friday only"
    EncodingFixer.fix(sample).should == "Tuesday &ndash; Friday only"
  end

  it "should replace an m-dash" do
    sample = "Tuesday \227 Friday only"
    EncodingFixer.fix(sample).should == "Tuesday &mdash; Friday only"
  end

  it "should replace a bunch of things at once" do
    sample = "an\205ellipsis, it\222s an apostrophe, and \223double quotes\224, then \226 finally \226 an n-dash"
    EncodingFixer.fix(sample).should == "an&hellip;ellipsis, it&rsquo;s an apostrophe, and &ldquo;double quotes&rdquo;, then &ndash; finally &ndash; an n-dash"
  end

  # A "?" left behind by an earlier lossy conversion carries no information
  # about the original character, so it can only be removed.
  context "when the original character cannot be determined" do
    it "should remove the bad encoding (a typo is better than throwing encoding errors)" do
      sample_1 = "hard to find gift and d?cor items"
      EncodingFixer.fix(sample_1).should == "hard to find gift and dcor items"
      sample_2 = "garnished with Jalape?o"
      EncodingFixer.fix(sample_2).should == "garnished with Jalapeo"
      sample_3 = "Bon app?tit."
      EncodingFixer.fix(sample_3).should == "Bon apptit."
      sample_4 = "Home D?cor"
      EncodingFixer.fix(sample_4).should == "Home Dcor"
    end

    it "should not change trailing question marks" do
      sample = "garnished with Jalapeno?"
      EncodingFixer.fix(sample).should == "garnished with Jalapeno?"
    end

    it "should not change initial question marks" do
      sample = "?garnished with Jalapeno"
      EncodingFixer.fix(sample).should == "?garnished with Jalapeno"
    end

    it "should not change floating question marks" do
      sample = "? garnished ? with Jalapeno"
      EncodingFixer.fix(sample).should == "? garnished ? with Jalapeno"
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment