Created
December 25, 2015 06:03
-
-
Save ryanuber/3e50b307e5afd86d877e to your computer and use it in GitHub Desktop.
Fast text scanner in Go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/bench_test.go b/bench_test.go | |
new file mode 100644 | |
index 0000000..1431cfb | |
--- /dev/null | |
+++ b/bench_test.go | |
@@ -0,0 +1,174 @@ | |
+package license | |
+ | |
+import ( | |
+ "bytes" | |
+ "strings" | |
+ "testing" | |
+) | |
+ | |
+// The benchmarks in this file test the performance of our text scanner. This | |
+// text scanning must be repeated (at least in the current go-license code), | |
+// and as more licenses are added, this process becomes more expensive. The | |
+// performance we get out of these benchmarks is typically around 2x faster | |
+// than other comparable functions from the stdlib. This is a special case | |
+// scanner, and may not yield such results in all applications. For good | |
+// measure, we also benchmark the same data set with some stdlib functions. | |
+ | |
+func BenchmarkScan(b *testing.B) { | |
+ for n := 0; n < b.N; n++ { | |
+ if !scan(text, match) { | |
+ b.Fatalf("should match") | |
+ } | |
+ } | |
+} | |
+ | |
+func BenchmarkScanNoMatch(b *testing.B) { | |
+ for n := 0; n < b.N; n++ { | |
+ if scan(text, noMatch) { | |
+ b.Fatalf("should not match") | |
+ } | |
+ } | |
+} | |
+ | |
+func BenchmarkStringsContains(b *testing.B) { | |
+ for n := 0; n < b.N; n++ { | |
+ if !strings.Contains(text, match) { | |
+ b.Fatalf("should match") | |
+ } | |
+ } | |
+} | |
+ | |
+func BenchmarkStringsContainsNoMatch(b *testing.B) { | |
+ for n := 0; n < b.N; n++ { | |
+ if strings.Contains(text, noMatch) { | |
+ b.Fatalf("should not match") | |
+ } | |
+ } | |
+} | |
+ | |
+func BenchmarkStringsIndex(b *testing.B) { | |
+ for n := 0; n < b.N; n++ { | |
+ if strings.Index(text, match) == -1 { | |
+ b.Fatalf("should match") | |
+ } | |
+ } | |
+} | |
+ | |
+func BenchmarkStringsIndexNoMatch(b *testing.B) { | |
+ for n := 0; n < b.N; n++ { | |
+ if strings.Index(text, noMatch) != -1 { | |
+ b.Fatalf("should not match") | |
+ } | |
+ } | |
+} | |
+ | |
+func BenchmarkBytesContains(b *testing.B) { | |
+ subject := []byte(text) | |
+ pattern := []byte(match) | |
+ for n := 0; n < b.N; n++ { | |
+ if !bytes.Contains(subject, pattern) { | |
+ b.Fatalf("should match") | |
+ } | |
+ } | |
+} | |
+ | |
+func BenchmarkBytesContainsNoMatch(b *testing.B) { | |
+ subject := []byte(text) | |
+ pattern := []byte(noMatch) | |
+ for n := 0; n < b.N; n++ { | |
+ if bytes.Contains(subject, pattern) { | |
+ b.Fatalf("should match") | |
+ } | |
+ } | |
+} | |
+ | |
+func BenchmarkBytesIndex(b *testing.B) { | |
+ subject := []byte(text) | |
+ pattern := []byte(match) | |
+ for n := 0; n < b.N; n++ { | |
+ if bytes.Index(subject, pattern) == -1 { | |
+ b.Fatalf("should match") | |
+ } | |
+ } | |
+} | |
+ | |
+func BenchmarkBytesIndexNoMatch(b *testing.B) { | |
+ subject := []byte(text) | |
+ pattern := []byte(noMatch) | |
+ for n := 0; n < b.N; n++ { | |
+ if bytes.Index(subject, pattern) != -1 { | |
+ b.Fatalf("should match") | |
+ } | |
+ } | |
+} | |
+ | |
+const match = "Persius maluisset eu per, per ad agam voluptua." | |
+const noMatch = "This text doesn't match anything." | |
+const text = ` | |
+Lorem ipsum dolor sit amet, eos cu prima adipiscing interesset, ad accusam | |
+reformidans pro. Nostrum consequat dissentiet ad sed, nullam graeci voluptua | |
+usu et. Ea clita percipitur eloquentiam est. Per ea mutat nominavi, probo | |
+dolorem ex has. | |
+ | |
+Ut cum saperet quaestio erroribus, ea alii integre cum. Vis cu assum soleat | |
+dissentiet, pro at modus malorum partiendo. Ei duis reprimique per, consulatu | |
+cotidieque qui eu. Diceret temporibus sea ne. Pro in alii falli inermis, consul | |
+mediocrem cu vel. | |
+ | |
+Omnes viderer invenire vis ut, pro no iuvaret persecuti, hinc omnis aperiri pro | |
+ne. Eos minim denique pertinacia no, pro eripuit postulant in. Vix ad illud | |
+ornatus tractatos, aliquam facilis noluisse duo ne, mei aeque viris primis te. | |
+Diceret expetenda consulatu ea sed, ius mucius essent cu. Ex amet illud falli | |
+eos, mei eu falli lobortis aliquando. | |
+ | |
+Eu homero omittam usu. Ut pertinacia disputando pro, vel cu wisi prompta, eu | |
+sit putant legendos. At sit consul mediocrem voluptatibus, pri assum inani in. | |
+Eu cum alia ullum dicam, his nemore adipisci inimicus ei, alii numquam | |
+urbanitas nam ex. Probo sanctus detracto eos ut. Ut sed cibo tacimates, prompta | |
+utroque sed ad. | |
+ | |
+Sit ei odio nisl praesent. At democritum deterruisset eos. Ex eligendi | |
+qualisque eos. Est et elit dicam menandri. Pri adipiscing efficiantur ei, | |
+labitur feugiat in pri, quo in utroque repudiare. | |
+ | |
+Postea fabellas recusabo vim ad, dolore singulis eu cum, mei autem gubergren | |
+vituperatoribus in. Ad aeque dolorem accusam pri, nec error saepe ex. Vim ad | |
+nostro labores, ea sumo animal salutandi has. Ius facete detraxit in, porro | |
+atqui his ne. Mel ei mucius hendrerit, nulla mazim posidonium usu cu, mei te | |
+percipit forensibus disputationi. | |
+ | |
+Ex sea audire fabellas, est ut iusto constituto, nam at causae tractatos | |
+consectetuer. Duo dicant iisque suscipit ei, simul legere no has, cu quo vero | |
+lucilius dissentiet. Ut sed vidit probo dignissim. Melius gubergren ex sed. Usu | |
+novum ludus melius et. Id clita impetus mea. | |
+ | |
+Ea mutat timeam feugait nam, no erant nostro similique nam. Et sea nulla | |
+bonorum, usu torquatos expetendis an. At eum expetenda adversarium, pri ne | |
+vocent postulant facilisis. Nec solum malorum noluisse cu, et nam sale propriae | |
+voluptatibus. Te mel dissentias eloquentiam, inani tantas delectus his et, an | |
+moderatius concludaturque sea. Eam oportere erroribus theophrastus in. | |
+ | |
+Saepe iriure eripuit ea per. Nobis graeco constituam cu vix. Vim everti | |
+argumentum scripserit et. In vis accumsan expetendis, denique fabellas in eos. | |
+ | |
+Est et nullam eligendi. Ei vim explicari abhorreant vituperatoribus, vidit | |
+dolore consulatu vim ea. Vitae debitis constituto sit ex. Cu soluta epicuri | |
+est, vix no tincidunt assueverit, per latine nonumes percipitur ea. | |
+ | |
+Usu primis petentium consequat ne, ei verear eruditi sea. Quo an liber soluta | |
+molestiae, ne per debet principes, te mea animal feugiat adversarium. Dicunt | |
+volutpat adversarium an eum, ea epicuri recusabo expetendis quo. Mel causae | |
+omittam facilisis ea, consul audiam mei an. Ad pri utamur dolorum, mea alia | |
+noluisse id. Te sit rebum reprimique. | |
+ | |
+Persius maluisset eu per, per ad agam voluptua. Facilis tincidunt sed eu. Quo | |
+ne eros nobis. Eros debet honestatis cum at, ius nihil percipit no. | |
+ | |
+Harum vituperata pro cu, simul feugait an per. Cu vix alii adhuc, usu no omnis | |
+prodesset comprehensam. Ut vis facete moderatius. Vim posse maiestatis an, vel | |
+nulla quodsi ne, his ne dictas malorum vulputate. Eos ut quod latine dolorem. | |
+In singulis eleifend mel, ius ut offendit recteque. | |
+ | |
+Aliquip oportere ei vix, vel ad integre rationibus. Vis et aliquip maiorum | |
+intellegat. Nullam doctus deserunt ei has. Ut everti volumus vis. | |
+` | |
diff --git a/license.go b/license.go | |
index d6d0a8f..a84b2f9 100644 | |
--- a/license.go | |
+++ b/license.go | |
@@ -289,9 +289,34 @@ func (l *License) GuessType() error { | |
return nil | |
} | |
-// scan is a shortcut function to check for a literal match within a string | |
-// of text. Any text transformation should be done prior to calling this | |
-// function so that it need not be repeated for every check. | |
-func scan(text, match string) bool { | |
- return strings.Contains(text, match) | |
+// scan is a fast substring matcher. | |
+func scan(text, pattern string) bool { | |
+ // Get the last index in the pattern. Guard passing an empty | |
+ // pattern string by returning false. | |
+ last := len(pattern) - 1 | |
+ if last == -1 { | |
+ return false | |
+ } | |
+ | |
+ // The last possible starting index for a match. If out iterator | |
+ // passes this index, we can safely assume there are no matches. | |
+ lindex := len(text) - last | |
+ | |
+OUTER: | |
+ for i := 0; i < lindex; i++ { | |
+ // Compare just the first byte of the pattern. | |
+ if text[i] != pattern[0] { | |
+ continue | |
+ } | |
+ | |
+ // Reverse scan the rest of the chunk which possibly contains | |
+ // a match. This deprioritizes scanning common prefixes. | |
+ for n := last; n > 0; n-- { | |
+ if text[i+n] != pattern[n] { | |
+ continue OUTER | |
+ } | |
+ } | |
+ return true | |
+ } | |
+ return false | |
} | |
diff --git a/license_test.go b/license_test.go | |
index 5849fb0..59d64c3 100644 | |
--- a/license_test.go | |
+++ b/license_test.go | |
@@ -226,3 +226,20 @@ func TestLicenseFiles(t *testing.T) { | |
t.Fatalf("expect %d files, got: %d", n, len(files)) | |
} | |
} | |
+ | |
+func TestScan(t *testing.T) { | |
+ input := "This is a test. Hello, world!" | |
+ tcases := map[string]bool{ | |
+ "Hello": true, // Matches in the middle | |
+ "world!": true, // Matches at the end | |
+ "This": true, // Matches at the beginning | |
+ "nope": false, // Doesn't match | |
+ "": false, // Empty pattern guard | |
+ "This is longer than the input string": false, | |
+ } | |
+ for pattern, result := range tcases { | |
+ if out := scan(input, pattern); out != result { | |
+ t.Fatalf("bad result: %q (%v)", pattern, out) | |
+ } | |
+ } | |
+} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment