Last active
May 25, 2024 10:19
-
-
Save dacr/2b76d147f283b4b1145d823b603b9ebb to your computer and use it in GitHub Desktop.
Advanced operations on strings with regular expressions / published by https://github.com/dacr/code-examples-manager #41e3e12c-dbbf-4f43-9c4b-5cf05869dffa/f1899e8d472222d605654a0d924e6d55053265a0
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// summary : Advanced operations on strings with regular expressions
// keywords : scala, scalatest, regex, cheatsheet, @testable
// publish : gist
// authors : David Crosson
// license : Apache NON-AI License Version 2.0 (https://raw.githubusercontent.com/non-ai-licenses/non-ai-licenses/main/NON-AI-APACHE2)
// id : 41e3e12c-dbbf-4f43-9c4b-5cf05869dffa
// created-on : 2021-11-19T07:29:40+01:00
// managed-by : https://github.com/dacr/code-examples-manager
// run-with : scala-cli $file
// ---------------------
//> using scala "3.4.2"
//> using dep "org.scalatest::scalatest:3.2.16"
//> using objectWrapper
// ---------------------
import org.scalatest._ | |
import flatspec._ | |
import matchers._ | |
import OptionValues._ | |
import java.util.Locale | |
import java.text.{DecimalFormat, NumberFormat} | |
/** Executable cheat sheet for advanced string operations based on regular expressions.
  *
  * Every test is a small, self-contained recipe built either on `String.split` or on a
  * `scala.util.matching.Regex` obtained through `.r`. Regex values used as extractors keep an
  * upper-case first letter because a pattern requires a stable identifier.
  */
class AdvancedRegexOperations extends AnyFlatSpec with should.Matchers {
  // Name displayed in the test report in place of the class name.
  override def suiteName: String = "AdvancedStringOperations"

  // ---------------------------------------------------------------------------------------------
  "split" should "be able to split in 2 parts using the last dot thanks to zero-width positive lookahead regexp" in {
    val aType = "ab.cd.de"
    // The lookahead only accepts a dot that is followed by no other dot up to the end of input,
    // i.e. the last one.
    val Array(aPackage, aClassName) = aType.split("[.](?=[^.]*$)", 2)
    aPackage shouldBe "ab.cd"
    aClassName shouldBe "de"
  }

  it should "be able to split on characters while preserving those characters" in {
    val in = "abc. truc? blah, blu."
    // The lookbehind is zero-width: the punctuation is required but not consumed,
    // only the surrounding whitespace is removed by the split.
    in.split("""\s*(?<=[?.,])\s*""").toList shouldBe List("abc.", "truc?", "blah,", "blu.")
  }

  // ---------------------------------------------------------------------------------------------
  "regexp" should "fully match" in {
    // Used as an extractor, a Regex has to match the whole input.
    val MyRE = "TO(.*)TA".r
    "TOTUTA" match {
      case MyRE(inside) => inside shouldBe "TU"
      case _            => fail()
    }
    "TA" match {
      case MyRE(_) => fail("an anchored regex must not match an input lacking the TO prefix")
      case _       => succeed
    }
  }

  it should "partially match" in {
    // `unanchored` lets the extractor succeed on any matching sub-sequence of the input.
    val MyRE = "TO(.*)TA".r.unanchored
    "xxTOTUTAxx" match {
      case MyRE(inside) => inside shouldBe "TU"
      case _            => fail()
    }
  }

  it should "provide alternative styles" in {
    val MyRE = "TO(.*)TA".r
    val sample = "TOTUTA"
    // 1. pattern binding in a val definition (throws MatchError when the input does not match)
    val MyRE(sub) = sample
    sub shouldBe "TU"
    // 2. match expression
    (sample match { case MyRE(in) => in }) shouldBe "TU"
    // 3. partial function lifted over an Option
    Some(sample).collect { case MyRE(in) => in }.value shouldBe "TU"
    // 4. explicit access to the Match object
    MyRE.findFirstMatchIn(sample).map(_.group(1)).value shouldBe "TU"
  }

  it should "match several arguments with explicit unitary groups" in {
    val EntryRE = """(\d+),(\d+),(\d+)""".r
    EntryRE.matches("1,2,3") shouldBe true
    val EntryRE(a, b, c) = "1,2,3"
    a shouldBe "1"
    b shouldBe "2"
    c shouldBe "3"
  }

  it should "no match several arguments using repeats unfortunately" in {
    val EntryRE = """(\d+)(?:,(\d+)){2}""".r
    EntryRE.matches("1,2,3") shouldBe true
    info("So take care as matching doesn't work with complex regular expression :(")
    // The repeated group is still a single capturing group: the regex exposes two groups only,
    // so a three-variable extractor pattern cannot match and the binding throws.
    intercept[MatchError] {
      val EntryRE(a, b, c) = "1,2,3"
      fail(s"unexpected three-group match: $a, $b, $c")
    }
    // What is really captured: the first number, then the LAST iteration of the repeated group.
    val EntryRE(first, last) = "1,2,3"
    first shouldBe "1"
    last shouldBe "3"
  }

  /*
  it should "be possible to create interpolated regexp" in {
    // requires : "dev.bgahagan" %% "scala-regex-interpolation" % "1.0.0"
    import dev.bgahagan.regex.intrpl._
    val key="truc"
    val MyRE=r"$key-(.*)-machin"
    val MyRE(word) = "truc-bidule-machin"
    word shouldBe "bidule"
    "truc-bidule-machin" match {
      case r"""\w+-(\w+)$centerWord-\w+""" => centerWord shouldBe "bidule"
    }
    "truc-bidule-machin" match {
      case r"""(\w+)$a-(\w+)$b-(\w+)$c""" => (a,b,c) shouldBe ("truc","bidule","machin")
    }
  }
  */

  it should "be possible to use named arguments" in {
    // The group is declared with the (?<name>...) syntax and read back by name.
    val MyRE = "TO(?<in>.*)TA".r
    val sample = "TOTUTA"
    MyRE.findFirstMatchIn(sample).map(_.group("in")).value shouldBe "TU"
  }

  it should "be easy to filter collections" in {
    // No capturing group: bind the whole element with `@` and use an empty extractor.
    val input = List("1", "a", "2")
    val NumRE = """\d+""".r
    input.collect { case d @ NumRE() => d } shouldBe List("1", "2")

    // One capturing group: the extractor yields the captured text directly.
    val input2 = List("1", "a", "2", "3", "b")
    val NumRE2 = """(\d+)""".r
    input2.collect { case NumRE2(d) => d } shouldBe List("1", "2", "3")

    // Unanchored: the digits are extracted from anywhere inside each element.
    val input3 = List("t1", "a", "2w", "a3k", "b", "4")
    val NumRE3 = """(\d+)""".r.unanchored
    input3.collect { case NumRE3(d) => d } shouldBe List("1", "2", "3", "4")
  }

  it should "be possible to find all matches" in {
    val re = """(\[\w+\])""".r
    // Matches are produced in input order, so the exact resulting list can be asserted.
    val found = re.findAllMatchIn("A [1] [B] [CD] [123] truc").map(_.group(1)).toList
    found shouldBe List("[1]", "[B]", "[CD]", "[123]")
  }

  it should "be possible to minimize matches length using lazy quantifiers: ?? *? +? {m,n}?" in {
    val sample = """A12B123B"""
    info(s"default is take the most, here $sample")
    val re1 = """A.*B""".r
    re1.findFirstIn(sample).value shouldBe "A12B123B"
    info("now we want the smallest match; here A12B, this is done through lazy quantifiers")
    val re2 = """A.*?B""".r
    re2.findFirstIn(sample).value shouldBe "A12B"
    info("interesting blog post : https://mariusschulz.com/2014/06/03/why-using-in-regular-expressions-is-almost-never-what-you-actually-want")
    info("known in java Pattern doc as Reluctant quantifiers - but not well explained")
  }

  it should "support advanced characters intervals" in {
    // Negation combined with an intersection (&&) of character ranges.
    val re1 = """[^a-h&&d-p]+""".r
    re1.matches("mnp") shouldBe true
    re1.matches("hij") shouldBe false

    // Intersection with a nested negated class.
    val re2 = """[^a-h&&[^r-z]]+""".r
    re2.matches("mnop") shouldBe true
    re2.matches("hr") shouldBe false

    // Predefined classes (\d, \w) can be combined inside brackets.
    val re3 = """[\d\w]+""".r
    re3.matches("dave42") shouldBe true
    re3.matches("dAVe42") shouldBe true
    re3.matches("john-doe") shouldBe false

    // Predefined classes can be negated together with literal characters:
    // exactly one character which is neither 'a', nor a digit, nor 'c'.
    val re4 = """[^a\dc]""".r
    re4.matches("b") shouldBe true
    re4.matches("a") shouldBe false
    re4.matches("c") shouldBe false
    re4.matches("7") shouldBe false
    re4.matches("bb") shouldBe false
  }
}
// Script entry point: hand the suite over to the ScalaTest command-line runner.
//   -oDF : standard-output reporter, showing durations (D) and full stack traces (F)
//   -s   : fully-qualified name of the suite class to execute
val suiteClassName = classOf[AdvancedRegexOperations].getName
org.scalatest.tools.Runner.main(Array("-oDF", "-s", suiteClassName))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment