Let's take a look at Phobos's float parsing algorithm. I'll work with a simplified version of the code. The nan and inf handling is easy to see from the source (it's just character comparison), so I'll skip it. Even so, it's still fairly long.
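Before digging into the implementation, here is a minimal sketch of how it's called (the input string is made up for illustration). parse takes the range by ref and leaves whatever it didn't consume:

import std.conv : parse;
import std.stdio : writeln;

void main()
{
    // parse consumes characters from the front of the range passed in
    string s = "1.2e1 rest";
    double d = parse!double(s);
    writeln(d); // 12
    writeln(s); // " rest" -- the unconsumed tail stays in the range
}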
/**
* Parses a character range to a floating point number.
*
* Params:
* Target = a floating point type
* source = the lvalue of the range to _parse
*
* Returns:
* $(UL
* $(LI A floating point number of type `Target`))
*
* Throws:
* A $(LREF ConvException) if `source` is empty, if no number could be
* parsed, or if an overflow occurred.
*/
auto parse(Target, Source)(ref Source source)
if (isInputRange!Source && isSomeChar!(ElementType!Source) && !is(Source == enum) &&
isFloatingPoint!Target && !is(Target == enum))
{
alias p = source;
static immutable real[14] negtab =
[ 1e-4096L,1e-2048L,1e-1024L,1e-512L,1e-256L,1e-128L,1e-64L,1e-32L,
1e-16L,1e-8L,1e-4L,1e-2L,1e-1L,1.0L ];
static immutable real[13] postab =
[ 1e+4096L,1e+2048L,1e+1024L,1e+512L,1e+256L,1e+128L,1e+64L,1e+32L,
1e+16L,1e+8L,1e+4L,1e+2L,1e+1L ];
ConvException bailOut()(string msg = null, string fn = __FILE__, size_t ln = __LINE__)
{
if (msg == null)
msg = "Floating point conversion error";
return new ConvException(text(msg, " for input \"", source, "\"."), fn, ln);
}
enforce(!p.empty, bailOut());
bool sign = false;
switch (p.front)
{
case '-':
sign = true;
p.popFront();
enforce(!p.empty, bailOut());
// (inf handling is elided in this simplified version, so the
// "if (toLower(p.front) == 'i') goto case 'i';" check is dropped as well)
break;
case '+':
p.popFront();
enforce(!p.empty, bailOut());
break;
default: {}
}
bool isHex = false;
bool startsWithZero = p.front == '0';
if (startsWithZero)
{
p.popFront();
if (p.empty)
{
return sign ? -0.0 : 0.0;
}
isHex = p.front == 'x' || p.front == 'X';
if (isHex)
{
p.popFront();
}
}
else if (toLower(p.front) == 'n')
{
// nan
p.popFront();
enforce(!p.empty && toUpper(p.front) == 'A',
bailOut("error converting input to floating point"));
p.popFront();
enforce(!p.empty && toUpper(p.front) == 'N',
bailOut("error converting input to floating point"));
// skip past the last 'n'
p.popFront();
return typeof(return).nan;
}
/*
* The following algorithm consists of 2 steps:
* 1) parseDigits processes the textual input into msdec and possibly
* lsdec/msscale variables, followed by the exponent parser which sets
* exp below.
* Hex: input is 0xaaaaa...p+000... where aaaa is the mantissa in hex
* and 000 is the exponent in decimal format with base 2.
* Decimal: input is 0.00333...p+000... where 0.0033 is the mantissa
* in decimal and 000 is the exponent in decimal format with base 10.
* 2) Convert msdec/lsdec and exp into native real format
*/
real ldval = 0.0;
char dot = 0; /* if decimal point has been seen */
int exp = 0;
ulong msdec = 0, lsdec = 0;
ulong msscale = 1;
bool sawDigits;
enum { hex, decimal }
// sets msdec, lsdec/msscale, and sawDigits by parsing the mantissa digits
void parseDigits(alias FloatFormat)()
{
static if (FloatFormat == hex)
{
enum uint base = 16;
enum ulong msscaleMax = 0x1000_0000_0000_0000UL; // largest power of 16 a ulong holds
enum ubyte expIter = 4; // iterate the base-2 exponent by 4 for every hex digit
alias checkDigit = isHexDigit;
/*
* convert letter to binary representation: First clear bit
* to convert lowercase chars to uppercase, then -('A'-10)
* converts letter A to 10, letter B to 11, ...
*/
alias convertDigit = (int x) => isAlpha(x) ? ((x & ~0x20) - ('A' - 10)) : x - '0';
sawDigits = false;
}
else static if (FloatFormat == decimal)
{
enum uint base = 10;
enum ulong msscaleMax = 10_000_000_000_000_000_000UL; // largest power of 10 a ulong holds
enum ubyte expIter = 1; // iterate the base-10 exponent once for every decimal digit
alias checkDigit = isDigit;
alias convertDigit = (int x) => x - '0';
// Used to enforce that any mantissa digits are present
sawDigits = startsWithZero;
}
else
static assert(false, "Unrecognized floating-point format used.");
while (!p.empty)
{
int i = p.front;
while (checkDigit(i))
{
sawDigits = true; /* must have at least 1 digit */
i = convertDigit(i);
if (msdec < (ulong.max - base)/base)
{
// For base 16: Y = ... + y3*16^3 + y2*16^2 + y1*16^1 + y0*16^0
msdec = msdec * base + i;
}
else if (msscale < msscaleMax)
{
lsdec = lsdec * base + i;
msscale *= base;
}
else
{
exp += expIter;
}
exp -= dot;
p.popFront();
if (p.empty)
break;
i = p.front;
if (i == '_')
{
p.popFront();
if (p.empty)
break;
i = p.front;
}
}
if (i == '.' && !dot)
{
p.popFront();
dot += expIter;
}
else
break;
}
// Have we seen any mantissa digits so far?
enforce(sawDigits, bailOut("no digits seen"));
static if (FloatFormat == hex)
enforce(!p.empty && (p.front == 'p' || p.front == 'P'),
bailOut("Floating point parsing: exponent is required"));
}
if (isHex)
parseDigits!hex;
else
parseDigits!decimal;
if (isHex || (!p.empty && (p.front == 'e' || p.front == 'E')))
{
char sexp = 0;
int e = 0;
p.popFront();
enforce(!p.empty, new ConvException("Unexpected end of input"));
switch (p.front)
{
case '-': sexp++;
goto case;
case '+': p.popFront();
break;
default: {}
}
sawDigits = false;
while (!p.empty && isDigit(p.front))
{
if (e < 0x7FFFFFFF / 10 - 10) // prevent integer overflow
{
e = e * 10 + p.front - '0';
}
p.popFront();
sawDigits = true;
}
exp += (sexp) ? -e : e;
enforce(sawDigits, new ConvException("No digits seen."));
}
ldval = msdec;
if (msscale != 1) /* if stuff was accumulated in lsdec */
ldval = ldval * msscale + lsdec;
if (isHex)
{
import core.math : ldexp;
// Exponent is power of 2, not power of 10
ldval = ldexp(ldval,exp);
}
else if (ldval)
{
uint u = 0;
int pow = 4096;
while (exp > 0)
{
while (exp >= pow)
{
ldval *= postab[u];
exp -= pow;
}
pow >>= 1;
u++;
}
while (exp < 0)
{
while (exp <= -pow)
{
ldval *= negtab[u];
enforce(ldval != 0, new ConvException("Range error"));
exp += pow;
}
pow >>= 1;
u++;
}
}
// if overflow occurred
enforce(ldval != real.infinity, new ConvException("Range error"));
return cast (Target) (sign ? -ldval : ldval);
}
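The Throws contract in the doc comment can be exercised directly. A quick hedged sketch (the inputs are made up):

import std.conv : parse, ConvException;
import std.exception : collectException;
import std.stdio : writeln;

void main()
{
    // empty input and input without any parsable digits both raise ConvException
    string empty = "";
    string junk = "abc";
    writeln(collectException!ConvException(parse!double(empty)) !is null); // true
    writeln(collectException!ConvException(parse!double(junk)) !is null);  // true
}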
The handling differs between decimal and hexadecimal input, so let's look at the two cases separately, ignoring as much as possible everything outside the core algorithm. We'll also trace a concrete value such as 1.2 through the code.
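As a quick sanity check that both notations go through the same entry point (a hedged sketch; 1.5 is used here only because it has a short exact hex-float spelling):

import std.conv : parse;
import std.stdio : writeln;

void main()
{
    // the same value written in decimal and in hex-float notation
    string dec = "1.5";
    string hex = "0x1.8p+0";
    writeln(parse!double(dec)); // 1.5
    writeln(parse!double(hex)); // 1.5
}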
static immutable real[14] negtab =
[ 1e-4096L,1e-2048L,1e-1024L,1e-512L,1e-256L,1e-128L,1e-64L,1e-32L,
1e-16L,1e-8L,1e-4L,1e-2L,1e-1L,1.0L ];
static immutable real[13] postab =
[ 1e+4096L,1e+2048L,1e+1024L,1e+512L,1e+256L,1e+128L,1e+64L,1e+32L,
1e+16L,1e+8L,1e+4L,1e+2L,1e+1L ];
/*
* The following algorithm consists of 2 steps:
* 1) parseDigits processes the textual input into msdec and possibly
* lsdec/msscale variables, followed by the exponent parser which sets
* exp below.
* Decimal: input is 0.00333...p+000... where 0.0033 is the mantissa
* in decimal and 000 is the exponent in decimal format with base 10.
* 2) Convert msdec/lsdec and exp into native real format
*/
real ldval = 0.0;
char dot = 0; /* if decimal point has been seen */
int exp = 0;
ulong msdec = 0, lsdec = 0;
ulong msscale = 1;
bool sawDigits;
enum { decimal }
// sets msdec, lsdec/msscale, and sawDigits by parsing the mantissa digits
void parseDigits(alias FloatFormat)()
{
enum ulong msscaleMax = 10_000_000_000_000_000_000UL; // largest power of 10 a ulong holds
enum ubyte expIter = 1; // iterate the base-10 exponent once for every decimal digit
alias checkDigit = isDigit;
// Used to enforce that any mantissa digits are present
sawDigits = startsWithZero;
// assume the input "1.2" arrives here
while (!p.empty)
{
int i = p.front; // the ASCII code of the current character
while (checkDigit(i))
{
sawDigits = true; /* must have at least 1 digit */
i = i - '0'; // convert the ASCII digit to its integer value
if (msdec < (ulong.max - 10) / 10)
{
msdec = msdec * 10 + i;
// "1" がくるとき: msdec = 0 * 10 + 1 = 1
// "2" がくるとき: msdec = 1 * 10 + 2 = 12
}
else if (msscale < msscaleMax)
{
lsdec = lsdec * 10 + i;
msscale *= 10;
}
else
{
exp += expIter;
}
exp -= dot; // when '2' arrives dot is already 1, so exp becomes -1
p.popFront();
if (p.empty)
break;
i = p.front;
}
if (i == '.' && !dot) // when '.' arrives
{
p.popFront();
dot += expIter; // dot = 1
}
else
break;
}
}
parseDigits!decimal;
// msdec = 12
// dot = 1
// exp = -1
// handles an 'e' or 'E' exponent; "1.2" has none, so this block is skipped
if (!p.empty && (p.front == 'e' || p.front == 'E'))
{
char sexp = 0; // the 's' prefix stands for the sign of the exponent (set when negative)
int e = 0;
p.popFront();
switch (p.front)
{
case '-': sexp++;
goto case;
case '+': p.popFront();
break;
default: {}
}
sawDigits = false;
while (!p.empty && isDigit(p.front))
{
if (e < 0x7FFFFFFF / 10 - 10) // prevent integer overflow
{
e = e * 10 + p.front - '0';
}
p.popFront();
sawDigits = true;
}
exp += (sexp) ? -e : e;
}
ldval = msdec; // ldval = 12
if (msscale != 1) /* if stuff was accumulated in lsdec */
ldval = ldval * msscale + lsdec;
uint u = 0;
int pow = 4096;
while (exp > 0)
{
while (exp >= pow)
{
ldval *= postab[u];
exp -= pow;
}
pow >>= 1;
u++;
}
// ldval = 12
// exp = -1
while (exp < 0)
{
// on the first pass, -1 <= -4096 is false, so the inner loop is skipped.
// pow is halved each iteration, so this path isn't entered until pow reaches 1 (where -1 <= -pow holds).
// in the meantime u++ keeps advancing the index into negtab.
while (exp <= -pow)
{
ldval *= negtab[u]; // by now u has advanced to 12, so this is ldval *= 1e-1L -> 12 * 1e-1L -> 1.2
exp += pow; // -1 + 1 -> 0, so both loops terminate here.
}
pow >>= 1; // 4096 -> 2048
u++; // 0 -> 1 -> ... -> 12 (negtab[12] == 1e-1L)
}
// if overflow occurred
enforce(ldval != real.infinity, new ConvException("Range error"));
return cast (Target) (sign ? -ldval : ldval); // and 1.2 is finally returned.
}
So, as the original comment says, the function can be thought of in two main sections:
- the part that parses the number given as a string
- the part that computes the actual floating-point value from the parsed information
The parsing part simply walks the input one character at a time, converting each digit into an integer and remembering when the decimal point appears.
The part that computes the floating-point value multiplies that integer representation by the power-table entry matching how far the decimal point shifted it: for "1.2" that is 12 * 10^-1 = 1.2.
The Eisel-Lemire algorithm computes small values with the same idea.
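To make the two-step idea concrete, here is a toy sketch (not the Phobos code; toyParse is a made-up helper that only handles plain "digits.digits" input):

import std.ascii : isDigit;
import std.stdio : writeln;

// step 1: accumulate the digits into an integer and count how many of them
// came after the decimal point; step 2: scale by the matching power of ten.
double toyParse(string s)
{
    ulong mantissa = 0;
    int exp = 0;
    bool seenDot = false;
    foreach (char c; s)
    {
        if (c == '.')
        {
            seenDot = true;
            continue;
        }
        assert(c.isDigit);
        mantissa = mantissa * 10 + (c - '0');
        if (seenDot)
            --exp; // every digit after the dot shifts the value by another 10^-1
    }
    double v = mantissa;
    while (exp < 0) // "1.2" -> mantissa 12, exp -1 -> 12 * 10^-1
    {
        v *= 0.1;
        ++exp;
    }
    return v;
}

void main()
{
    writeln(toyParse("1.2")); // 1.2 (up to the usual binary rounding)
}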
The basic algorithm for hexadecimal is almost identical; the differences are mainly that each hex digit shifts the binary exponent by 4 and that ldexp is used for the final scaling.
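For intuition about that last step, here's what the hex path ends up computing, sketched with core.math.ldexp (0x1.8p1 is just an example input, not taken from the walkthrough above):

import core.math : ldexp;
import std.stdio : writeln;

void main()
{
    // "0x1.8p1": the mantissa digits are 0x18 == 24; the one hex digit after
    // the dot subtracts 4 from the binary exponent and "p1" adds 1,
    // so the value is 24 * 2^(1-4) == 3
    writeln(ldexp(24.0L, 1 - 4)); // 3
    // D source accepts the same hex-float notation directly
    writeln(0x1.8p1); // 3
}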
// hexadecimal input always begins with "0x", so startsWithZero == true
bool startsWithZero = true; /* p.front == '0'; */
p.popFront();
bool isHex = true; /* p.front == 'x' || p.front == 'X'; */
p.popFront();
/*
* The following algorithm consists of 2 steps:
* 1) parseDigits processes the textual input into msdec and possibly
* lsdec/msscale variables, followed by the exponent parser which sets
* exp below.
* Hex: input is 0xaaaaa...p+000... where aaaa is the mantissa in hex
* and 000 is the exponent in decimal format with base 2.
* Decimal: input is 0.00333...p+000... where 0.0033 is the mantissa
* in decimal and 000 is the exponent in decimal format with base 10.
* 2) Convert msdec/lsdec and exp into native real format
*/
real ldval = 0.0;
char dot = 0; /* if decimal point has been seen */
int exp = 0;
ulong msdec = 0, lsdec = 0;
ulong msscale = 1;
bool sawDigits;
enum { hex }
// sets msdec, lsdec/msscale, and sawDigits by parsing the mantissa digits
void parseDigits(alias FloatFormat)()
{
enum ulong msscaleMax = 0x1000_0000_0000_0000UL; // largest power of 16 a ulong holds
enum ubyte expIter = 4; // iterate the base-2 exponent by 4 for every hex digit
alias checkDigit = isHexDigit;
/*
* convert letter to binary representation: First clear bit
* to convert lowercase chars to uppercase, then -('A'-10)
* converts letter A to 10, letter B to 11, ...
*/
alias convertDigit = (int x) => isAlpha(x) ? ((x & ~0x20) - ('A' - 10)) : x - '0';
sawDigits = false;
while (!p.empty)
{
int i = p.front;
while (checkDigit(i))
{
sawDigits = true; /* must have at least 1 digit */
i = convertDigit(i);
if (msdec < (ulong.max - 16) / 16)
{
// For base 16: Y = ... + y3*16^3 + y2*16^2 + y1*16^1 + y0*16^0
msdec = msdec * 16 + i;
}
else if (msscale < msscaleMax)
{
lsdec = lsdec * 16 + i;
msscale *= 16;
}
else
{
exp += expIter;
}
exp -= dot;
p.popFront();
if (p.empty)
break;
i = p.front;
}
if (i == '.' && !dot)
{
p.popFront();
dot += expIter;
}
else
break;
}
}
parseDigits!hex;
char sexp = 0;
int e = 0;
p.popFront(); // skip past the 'p'/'P' that introduces the binary exponent
switch (p.front)
{
case '-': sexp++;
goto case;
case '+': p.popFront();
break;
default: {}
}
sawDigits = false;
while (!p.empty && isDigit(p.front))
{
if (e < 0x7FFFFFFF / 10 - 10) // prevent integer overflow
{
e = e * 10 + p.front - '0';
}
p.popFront();
sawDigits = true;
}
exp += (sexp) ? -e : e;
ldval = msdec;
if (msscale != 1) /* if stuff was accumulated in lsdec */
ldval = ldval * msscale + lsdec;
// multiply ldval by 2 to the power of exp
import core.math : ldexp;
// Exponent is power of 2, not power of 10
ldval = ldexp(ldval, exp);
return cast (Target) (sign ? -ldval : ldval);
}