Mix.install([:benchee, :benchee_markdown])
defmodule BulkMatch32 do
defguardp is_inner_byte(c) when c >= 128 and c < 192
def valid?(str), do: do_valid?(str)
defp do_valid?(<<a::32, rest::binary>>) when Bitwise.band(a, 0x80808080) == 0,
do: do_valid?(rest)
defp do_valid?(<<a::8, rest::binary>>) when a < 128, do: do_valid?(rest)
defp do_valid?(<<a::8, b::8, rest::binary>>) when a >= 192 and a < 224 and is_inner_byte(b),
do: do_valid?(rest)
defp do_valid?(<<a::8, b::8, c::8, rest::binary>>)
when a >= 224 and a < 240 and is_inner_byte(b) and is_inner_byte(c),
do: do_valid?(rest)
defp do_valid?(<<a::8, b::8, c::8, d::8, rest::binary>>)
when a >= 240 and is_inner_byte(b) and is_inner_byte(c) and is_inner_byte(d),
do: do_valid?(rest)
defp do_valid?(<<>>), do: true
defp do_valid?(_str), do: false
end
defmodule BulkMatchBandBor do
def valid?(<<a::8, rest::binary>>) when a < 128,
do: valid?(rest)
def valid?(<<a::16, rest::binary>>)
when Bitwise.band(a, 0b1100000010000000) == 49280 and
Bitwise.bor(a, 0b1101111110111111) == 57279,
do: valid?(rest)
def valid?(<<a::24, rest::binary>>)
when Bitwise.band(a, 0b111000001000000010000000) == 14_712_960 and
Bitwise.bor(a, 0b111011111011111110111111) == 15_712_191,
do: valid?(rest)
def valid?(<<a::32, rest::binary>>) do
cond do
Bitwise.band(a, 0b11110000100000001000000010000000) == 4_034_953_344 and
Bitwise.bor(a, 0b11110111101111111011111110111111) == 4_156_538_815 ->
valid?(rest)
Bitwise.band(a, 0x80808080) == 0 ->
valid?(rest)
:otherwise ->
false
end
end
def valid?(<<>>), do: true
def valid?(_str), do: false
end
BulkMatchBandBor.valid?("a") == true and
BulkMatchBandBor.valid?("ø") == true and
BulkMatchBandBor.valid?(<<0xFFFF::16>>) == false and
BulkMatchBandBor.valid?(<<0xEF, 0xB7, 0x90>>) == true and
BulkMatchBandBor.valid?("asd" <> <<0xFFFF::16>>) == false and
BulkMatchBandBor.valid?("$") and
BulkMatchBandBor.valid?("£") and
BulkMatchBandBor.valid?("€") and
BulkMatchBandBor.valid?("𐍈")
defmodule BulkMatchBandBor2 do
def valid?(<<a::8, rest::binary>>) do
cond do
a < 128 -> valid?(rest)
a > 191 and a < 224 -> valid?(:two, rest)
a > 223 and a < 240 -> valid?(:three, rest)
a > 239 -> valid?(:four, rest)
end
end
def valid?(<<>>), do: true
def valid?(_str), do: false
@compile {:inline, valid?: 2}
def valid?(:two, <<a::8, rest::binary>>)
when Bitwise.band(a, 0b10000000) == 128 and
Bitwise.bor(a, 0b10111111) == 191,
do: valid?(rest)
def valid?(:three, <<a::16, rest::binary>>)
when Bitwise.band(a, 0b1000000010000000) == 32896 and
Bitwise.bor(a, 0b1011111110111111) == 49087,
do: valid?(rest)
def valid?(:four, <<a::24, rest::binary>>)
when Bitwise.band(a, 0b100000001000000010000000) == 8_421_504 and
Bitwise.bor(a, 0b101111111011111110111111) == 12_566_463,
do: valid?(rest)
def valid?(_, _), do: false
end
BulkMatchBandBor2.valid?("a") == true and
BulkMatchBandBor2.valid?("ø") == true and
BulkMatchBandBor2.valid?(<<0xFFFF::16>>) == false and
BulkMatchBandBor2.valid?(<<0xEF, 0xB7, 0x90>>) == true and
BulkMatchBandBor2.valid?("asd" <> <<0xFFFF::16>>) == false and
BulkMatchBandBor2.valid?("$") and
BulkMatchBandBor2.valid?("£") and
BulkMatchBandBor2.valid?("€") and
BulkMatchBandBor2.valid?("𐍈")
defmodule BulkMatchBandBor3 do
def valid?(<<a::8, rest::binary>>) do
cond do
a < 128 ->
valid_one?(rest)
a > 191 and a < 224 ->
valid_two?(rest)
a > 223 and a < 240 ->
valid_three?(rest)
a > 239 ->
valid_four?(rest)
end
end
def valid?(<<>>), do: true
def valid?(_str), do: false
@compile {:inline, valid_two?: 1}
def valid_two?(<<a::8, rest::binary>>)
when a < 192,
do: valid?(rest)
def valid_two?(_),
do: false
@compile {:inline, valid_three?: 1}
def valid_three?(<<a::16, rest::binary>>)
when Bitwise.band(a, 0x8080) == 0x8080 and
Bitwise.bor(a, 0xBFBF) == 0xBFBF,
do: valid?(rest)
def valid_three?(_),
do: false
@compile {:inline, valid_four?: 1}
def valid_four?(<<a::24, rest::binary>>)
when Bitwise.band(a, 0x808080) == 0x808080 and
Bitwise.bor(a, 0xBFBFBF) == 0xBFBFBF,
do: valid?(rest)
def valid_four?(_),
do: false
@compile {:inline, valid_one?: 1}
def valid_one?(<<a::56, rest::binary>>)
when Bitwise.band(a, 0x80808080808080) == 0,
do: valid?(rest)
def valid_one?(rest),
do: valid?(rest)
end
BulkMatchBandBor3.valid?("a") == true and
BulkMatchBandBor3.valid?("ø") == true and
BulkMatchBandBor3.valid?(<<0xFFFF::16>>) == false and
BulkMatchBandBor3.valid?(<<0xEF, 0xB7, 0x90>>) == true and
BulkMatchBandBor3.valid?("asd" <> <<0xFFFF::16>>) == false and
BulkMatchBandBor3.valid?("$") and
BulkMatchBandBor3.valid?("£") and
BulkMatchBandBor3.valid?("€") and
BulkMatchBandBor3.valid?("𐍈")
defmodule AndOr4 do
def valid?(<<>>), do: true
def valid?(str) when not is_binary(str), do: false
def valid?(str), do: do_valid?(str)
def do_valid?(<<a::56, rest::binary>>)
when Bitwise.band(a, 0x80808080808080) == 0,
do: do_valid?(rest)
def do_valid?(<<a::8, rest::binary>>) do
rest =
cond do
a <= 127 ->
rest
a >= 194 and a <= 223 ->
case rest do
<<a::8, rest::binary>> when a < 192 -> rest
_ -> false
end
a >= 224 and a <= 239 ->
case rest do
<<b::16, rest::binary>>
when Bitwise.band(b, 0x8080) == 0x8080 and
Bitwise.bor(b, 0xBFBF) == 0xBFBF ->
cond do
a == 224 ->
c = Bitwise.bsr(a, 8)
cond do
c >= 128 and c <= 159 -> false
:otherwise -> rest
end
a == 237 ->
c = Bitwise.bsr(a, 8)
cond do
c >= 160 and c <= 191 -> false
:otherwise -> rest
end
(a >= 225 and a <= 236) or a == 238 or a == 239 ->
rest
:otherwise ->
false
end
_ ->
false
end
a >= 240 ->
case rest do
<<b::24, rest::binary>>
when Bitwise.band(b, 0x808080) == 0x808080 and
Bitwise.bor(b, 0xBFBFBF) == 0xBFBFBF ->
cond do
a == 240 ->
c = Bitwise.bsr(b, 16)
cond do
c >= 128 and c <= 143 -> false
:otherwise -> rest
end
a >= 241 and a <= 243 ->
rest
a == 244 ->
c = Bitwise.bsr(b, 16)
cond do
c >= 160 and c <= 191 -> false
:otherwise -> rest
end
:otherwise ->
false
end
_ ->
false
end
:otherwise ->
false
end
do_valid?(rest)
end
def do_valid?(<<>>), do: true
def do_valid?(false), do: false
end
AndOr4.valid?("a") == true and
AndOr4.valid?("ø") == true and
AndOr4.valid?(<<0xFFFF::16>>) == false and
AndOr4.valid?(<<0xEF, 0xB7, 0x90>>) == true and
AndOr4.valid?("asd" <> <<0xFFFF::16>>) == false and
AndOr4.valid?("$") and
AndOr4.valid?("£") and
AndOr4.valid?("€") and
AndOr4.valid?("𐍈")
# AndOr4.valid?(<<0b11100000, 0b10000000, 0b10000000>>)
defmodule Cowboy do
def valid?(str), do: validate_utf8(str, 0) == 0
defp validate_utf8(<<>>, state), do: state
defp validate_utf8(<<c::8, rest::bits>>, 0) when c < 128, do: validate_utf8(rest, 0)
defp validate_utf8(<<c::8, rest::bits>>, 2) when c >= 128 and c < 144,
do: validate_utf8(rest, 0)
defp validate_utf8(<<c::8, rest::bits>>, 3) when c >= 128 and c < 144,
do: validate_utf8(rest, 2)
defp validate_utf8(<<c::8, rest::bits>>, 5) when c >= 128 and c < 144,
do: validate_utf8(rest, 2)
defp validate_utf8(<<c::8, rest::bits>>, 7) when c >= 128 and c < 144,
do: validate_utf8(rest, 3)
defp validate_utf8(<<c::8, rest::bits>>, 8) when c >= 128 and c < 144,
do: validate_utf8(rest, 3)
defp validate_utf8(<<c::8, rest::bits>>, 2) when c >= 144 and c < 160,
do: validate_utf8(rest, 0)
defp validate_utf8(<<c::8, rest::bits>>, 3) when c >= 144 and c < 160,
do: validate_utf8(rest, 2)
defp validate_utf8(<<c::8, rest::bits>>, 5) when c >= 144 and c < 160,
do: validate_utf8(rest, 2)
defp validate_utf8(<<c::8, rest::bits>>, 6) when c >= 144 and c < 160,
do: validate_utf8(rest, 3)
defp validate_utf8(<<c::8, rest::bits>>, 7) when c >= 144 and c < 160,
do: validate_utf8(rest, 3)
defp validate_utf8(<<c::8, rest::bits>>, 2) when c >= 160 and c < 192,
do: validate_utf8(rest, 0)
defp validate_utf8(<<c::8, rest::bits>>, 3) when c >= 160 and c < 192,
do: validate_utf8(rest, 2)
defp validate_utf8(<<c::8, rest::bits>>, 4) when c >= 160 and c < 192,
do: validate_utf8(rest, 2)
defp validate_utf8(<<c::8, rest::bits>>, 6) when c >= 160 and c < 192,
do: validate_utf8(rest, 3)
defp validate_utf8(<<c::8, rest::bits>>, 7) when c >= 160 and c < 192,
do: validate_utf8(rest, 3)
defp validate_utf8(<<c::8, rest::bits>>, 0) when c >= 194 and c < 224,
do: validate_utf8(rest, 2)
defp validate_utf8(<<224::8, rest::bits>>, 0), do: validate_utf8(rest, 4)
defp validate_utf8(<<c::8, rest::bits>>, 0) when c >= 225 and c < 237,
do: validate_utf8(rest, 3)
defp validate_utf8(<<237::8, rest::bits>>, 0), do: validate_utf8(rest, 5)
defp validate_utf8(<<c::8, rest::bits>>, 0) when c == 238 and c == 239,
do: validate_utf8(rest, 3)
defp validate_utf8(<<240::8, rest::bits>>, 0), do: validate_utf8(rest, 6)
defp validate_utf8(<<c::8, rest::bits>>, 0) when c == 241 and c == 242 and c == 243,
do: validate_utf8(rest, 7)
defp validate_utf8(<<244::8, rest::bits>>, 0), do: validate_utf8(rest, 8)
defp validate_utf8(_, _), do: 1
end
defmodule New.Macro do
defmacro utf8_accept() do
quote do
0
end
end
defmacro utf8_reject() do
quote do
12
end
end
defguard are_all_ascii_plain(b1, b2, b3, b4, b5, b6, b7, b8)
when b1 <= 127 and
b2 <= 127 and
b3 <= 127 and
b4 <= 127 and
b5 <= 127 and
b6 <= 127 and
b7 <= 127 and
b8 <= 127
# This is an adapted table from "Flexible and Economical UTF-8 Decoding" by Bjoern Hoehrmann.
# http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
# Map character to character class
defmacro utf8t() do
quote do
{
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
8,
8,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
10,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
4,
3,
3,
11,
6,
6,
6,
5,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8
}
end
end
# Transition table mapping combination of state & class to next state
defmacro utf8s() do
quote do
{
12,
24,
36,
60,
96,
84,
12,
12,
12,
48,
72,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
0,
12,
12,
12,
12,
12,
0,
12,
0,
12,
12,
12,
24,
12,
12,
12,
12,
12,
24,
12,
24,
12,
12,
12,
12,
12,
12,
12,
12,
12,
24,
12,
12,
12,
12,
12,
24,
12,
12,
12,
12,
12,
12,
12,
24,
12,
12,
12,
12,
12,
12,
12,
12,
12,
36,
12,
36,
12,
12,
12,
36,
12,
12,
12,
12,
12,
36,
12,
36,
12,
12,
12,
36,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12
}
end
end
# Optimisation for 1st byte direct state lookup,
# we know starting state is 0 and ASCII bytes were already handled
defmacro utf8s0() do
quote do
{
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
24,
24,
24,
24,
24,
24,
24,
24,
24,
24,
24,
24,
24,
24,
24,
24,
24,
24,
24,
24,
24,
24,
24,
24,
24,
24,
24,
24,
24,
24,
48,
36,
36,
36,
36,
36,
36,
36,
36,
36,
36,
36,
36,
60,
36,
36,
72,
84,
84,
84,
96,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12
}
end
end
end
defmodule New do
import New.Macro
def valid?(<<byte, rest::bits>>) when byte <= 127,
do: valid?(rest)
def valid?(<<byte, rest::bytes>>) do
case elem(utf8s0(), byte - 127 - 1) do
utf8_reject() -> false
state -> valid_utf8?(rest, state)
end
end
def valid?(<<>>), do: true
def valid_utf8?(<<byte, rest::binary>>, state) do
type = elem(utf8t(), byte)
case elem(utf8s(), state + type - 1) do
utf8_accept() -> valid_ascii?(rest)
utf8_reject() -> false
state -> valid_utf8?(rest, state)
end
end
def valid_utf8?(<<>>, utf8_accept()), do: true
def valid_utf8?(_, _), do: false
def valid_ascii?(binary) do
case binary do
<<b1, b2, b3, b4, b5, b6, b7, b8, rest::binary>>
when are_all_ascii_plain(b1, b2, b3, b4, b5, b6, b7, b8) ->
valid_ascii?(rest)
other ->
valid?(other)
end
end
end
New.valid?("a") == true and
New.valid?("ø") == true and
New.valid?(<<0xFFFF::16>>) == false and
New.valid?(<<0xEF, 0xB7, 0x90>>) == true and
New.valid?("asd" <> <<0xFFFF::16>>) == false
defmodule Benchmark do
def native(str), do: String.valid?(str)
def native_fast_ascii(str), do: String.valid?(str, :fast_ascii)
def lookup(str), do: New.valid?(str)
def cowboy(str), do: Cowboy.valid?(str)
def and_or(str), do: BulkMatchBandBor.valid?(str)
def and_or2(str), do: BulkMatchBandBor2.valid?(str)
def and_or3(str), do: BulkMatchBandBor3.valid?(str)
def and_or4(str), do: AndOr4.valid?(str)
def bulk_32(str), do: BulkMatch32.valid?(str)
def run() do
Benchee.run(
%{
# "Native" => &native/1,
"Native :fast_ascii" => &native_fast_ascii/1,
"Lookup" => &lookup/1,
"Cowboy" => &cowboy/1,
# "AndOr" => &and_or/1,
# "AndOr2" => &and_or2/1,
"AndOr3" => &and_or3/1,
"AndOr4" => &and_or4/1
# "Bulk32" => &bulk_32/1
},
time: 2,
# memory_time: 2,
inputs: %{
"1byte" => String.duplicate("$", 10_000),
# "2byte" => String.duplicate("£", 10_000),
# "3byte" => String.duplicate("€", 10_000),
# "4byte" => String.duplicate("𐍈", 10_000),
"mixed" => String.duplicate("$£€𐍈", 10_000),
# "mixed rev" => String.duplicate("𐍈€£$", 10_000),
"less than 64 bytes" => String.duplicate("$", 63)
}
)
end
end
Benchmark.run()