Last active
September 22, 2022 11:02
-
-
Save agemooij/15a0eaebc2c1ddd5ddf4 to your computer and use it in GitHub Desktop.
Scala text normalization
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package rfs.rebb | |
package common | |
/**
 * Turns arbitrary text into a lowercase, ASCII-only, dash-separated token.
 *
 * The input is trimmed and lowercased, decomposed with the standard Java/Unicode
 * normalizer (NFD) and stripped of its combining marks. A few extra replacements
 * then deal with characters the normalizer leaves alone.
 *
 * JVM/Unicode normalization references (warning: learning curve black hole, beware!):
 *
 * - http://docs.oracle.com/javase/7/docs/api/java/text/Normalizer.html
 * - http://stackoverflow.com/questions/5697171/regex-what-is-incombiningdiacriticalmarks
 * - http://stackoverflow.com/questions/1453171/%C5%84-%C7%B9-%C5%88-%C3%B1-%E1%B9%85-%C5%86-%E1%B9%87-%E1%B9%8B-%E1%B9%89-%CC%88-%C9%B2-%C6%9E-%E1%B6%87-%C9%B3-%C8%B5-n-or-remove-diacritical-marks-from-unicode-cha
 * - http://lipn.univ-paris13.fr/~cerin/BD/unicode.html
 * - http://www.unicode.org/reports/tr15/tr15-23.html
 * - http://www.unicode.org/reports/tr44/#Properties
 *
 * Some special cases, like "ø" and "ß", are not stripped/replaced by the
 * Java/Unicode normalizer, so they are replaced explicitly.
 */
trait NormalizeSupport {
  import java.text.Normalizer

  /**
   * Normalizes `in` to a string made up of `a-z`, `0-9` and single `-` separators.
   *
   * Steps, in order:
   *  1. trim and lowercase (locale-independent);
   *  2. NFD-decompose and drop combining marks, modifier letters and modifier symbols;
   *  3. drop every `'s`, replace "ß" with "ss" and "ø" with "o";
   *  4. replace every run of other characters with a single `-`;
   *  5. collapse repeated `-` and remove one trailing `-`.
   *
   * Note that a leading `-` is kept: only the suffix is stripped.
   *
   * @param in the text to normalize; must not be `null`
   * @return the normalized text, possibly empty
   */
  def normalize(in: String): String = {
    // Locale.ROOT keeps the result independent of the JVM default locale. With a
    // Turkish default locale "I" would lowercase to the dotless "ı", which NFD does
    // not decompose and which would therefore end up as a "-" in the output.
    val cleaned = in.trim.toLowerCase(java.util.Locale.ROOT)

    // NFD splits accented characters into base character + combining marks, so the
    // marks can be removed on their own.
    val decomposed = Normalizer.normalize(cleaned, Normalizer.Form.NFD)
    val unmarked =
      decomposed.replaceAll("[\\p{InCombiningDiacriticalMarks}\\p{IsM}\\p{IsLm}\\p{IsSk}]+", "")

    unmarked
      .replaceAll("'s", "")
      .replaceAll("ß", "ss")
      .replaceAll("ø", "o")
      .replaceAll("[^a-zA-Z0-9-]+", "-")
      .replaceAll("-+", "-")
      .stripSuffix("-")
  }
}

/** Stand-alone access to [[NormalizeSupport.normalize]] without mixing in the trait. */
object NormalizeSupport extends NormalizeSupport
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package rfs.rebb | |
package common | |
import org.scalatest._ | |
import Matchers._ | |
import common._ | |
/**
 * Specification for `NormalizeSupport.normalize`.
 *
 * Combines a set of hand-written examples with the `input||expected` pairs read
 * from the `normalization-checks.csv` test resource.
 */
class NormalizeSupportSpec extends UnitSpec with NormalizeSupport {

  "NormalizeSupport" should {
    "correctly normalize non -ASCII characters" in {
      normalize("ÀÁÂÃĀĂȦÄẢÅǍȀȂĄẠḀẦẤàáâä") shouldBe "aaaaaaaaaaaaaaaaaaaaaa"
      normalize("ÉÊẼĒĔËȆȄȨĖèéêẽēȅë") shouldBe "eeeeeeeeeeeeeeeee"
      normalize("ÌÍÏïØøÒÖÔöÜüŇñÇçß") shouldBe "iiiioooooouunnccss"
    }

    "normalize 's to nothing" in {
      normalize("aa'sbba") shouldBe "aabba"
    }

    "normalize & for -" in {
      normalize("aa & bb") shouldBe "aa-bb"
      normalize("aa&& & &&& bb") shouldBe "aa-bb"
    }

    "normalize brackets to -" in {
      normalize("aa(bb)cc") shouldBe "aa-bb-cc"
      normalize("aa((((bb)))cc") shouldBe "aa-bb-cc"
    }

    "normalize multiples of '-' to a single '-'" in {
      normalize("a----a--b-b-------a") shouldBe "a-a-b-b-a"
    }

    "normalize to lowercase" in {
      normalize("AAbAbbB") shouldBe "aababbb"
    }

    // Despite the test name, this input mixes 's, brackets, symbols, underscores,
    // dashes and upper case rather than diacritical marks.
    "normalize a string with several diacritical marks" in {
      normalize("a'sa((%%$ & b___--BB a") shouldBe "aa-b-bb-a"
    }

    // One generated test per line of the shared fixture file.
    normalizationTestCasesSharedWithNl.foreach {
      case (input, expectedOutput) =>
        s"""normalize "${input}" to "${expectedOutput}".""" in {
          normalize(input) shouldBe expectedOutput
        }
    }
  }

  /**
   * Reads the `normalization-checks.csv` resource and returns its lines as
   * `(input, expectedOutput)` pairs. Each line is expected to have the form
   * `input||expectedOutput`; any fields after the second are ignored.
   *
   * @throws IllegalArgumentException if a line does not contain the `||` separator
   */
  private def normalizationTestCasesSharedWithNl: List[(String, String)] = {
    import org.parboiled.common.FileUtils

    val data = FileUtils.readAllTextFromResource("normalization-checks.csv")
    val lines = data.trim.split("""\r?\n""").toList

    lines.map { line =>
      // A negative limit keeps trailing empty fields, so a line such as "&&&||"
      // (empty expected output) yields ("&&&", "") instead of a one-element array.
      line.split("""\|\|""", -1) match {
        case Array(input, expectedOutput, _*) => (input, expectedOutput)
        case _ =>
          throw new IllegalArgumentException(
            s"""Malformed line in normalization-checks.csv, expected "input||expected": $line"""
          )
      }
    }
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
thanks for sharing this! a lifesaver :)