unixdj · October 26, 2022 16:13
diff --git a/angle-parse.go b/angle-parse.go
 package main

 /*
 [...] angles [...] are specified as
 up to three numbers representing degrees, minutes and seconds.  The
 last number may be fractional.  The numbers are either separated by
 <,> (commas) or suffixed, in order:

  degrees by <°> (degree sign),
  minutes by <′> (prime)        or <'> (apostrophe),
  seconds by <″> (double prime) or <"> (quotation mark).

 The last suffix may be omitted.
 */

 // angleStringOK is a cheap, byte-wise pre-check for angle strings.
 // ok is true when every byte is acceptable and the separator limits hold;
 // suffixed is true when at least one suffix byte was seen, i.e. <">, <'>
 // or the lead byte of a multibyte UTF-8 sequence.
 func angleStringOK(s string) (ok, suffixed bool) {
 	// What this check is for:
 	//   - rejecting signs (<+> and <->), so that numbers need not be
 	//     validated again before going to strconv.ParseFloat
 	//   - rejecting more than 2 commas, to avoid excessive allocation
 	//   - telling comma-separated input apart from input suffixed with
 	//     <°> etc.
 	// As a side effect it also rejects:
 	//   - ASCII characters other than digits, <.>, <,>, <'> and <">
 	//   - more than 3 suffixes or multibyte characters
 	//   - commas mixed with suffixes or multibyte characters
 	// It is deliberately crude and fast.  It does not:
 	//   - look at which multibyte character is present (each one simply
 	//     counts as a suffix)
 	//   - check the order of characters
 	//   - validate UTF-8 (stray continuation bytes pass, and the invalid
 	//     bytes 0xc0, 0xc1, 0xfe, 0xff count as lead bytes)
 	type class byte
 	const (
 		reject    class = iota // zero value: any table slot left unset rejects
 		accept                 // digit, <.> or UTF-8 continuation byte
 		separator              // <,>
 		marker                 // <">, <'> or UTF-8 lead byte
 		ascii                  // 0x20 - 0x3f: classify via the second table
 	)
 	// First-level table, indexed by the top 3 bits of the byte.
 	var byTop [8]class
 	byTop[0x20>>5] = ascii
 	byTop[0x80>>5], byTop[0xa0>>5] = accept, accept // 10xxxxxx
 	byTop[0xc0>>5], byTop[0xe0>>5] = marker, marker // 11xxxxxx
 	// Second-level table for 0x20 - 0x3f, indexed by the bottom 5 bits.
 	var byLow [32]class
 	byLow['"'-0x20] = marker
 	byLow['\''-0x20] = marker
 	byLow[','-0x20] = separator
 	byLow['.'-0x20] = accept
 	for d := byte('0'); d <= '9'; d++ {
 		byLow[d-0x20] = accept
 	}
 	var nSep, nMark int
 	for _, c := range []byte(s) {
 		kind := byTop[c>>5]
 		if kind == ascii {
 			kind = byLow[c&0x1f]
 		}
 		switch kind {
 		case accept:
 			continue
 		case reject:
 			return false, false
 		case separator:
 			// At most 2 commas, and never after a suffix.
 			if nSep >= 2 || nMark > 0 {
 				return false, false
 			}
 			nSep++
 		default:
 			// At most 3 suffixes, and never after a comma.
 			nMark++
 			if nMark > 3 || nSep > 0 {
 				return false, false
 			}
 		}
 	}
 	return true, nMark > 0
 }
	package main

	/*
	[...] angles [...] are specified as
	up to three numbers representing degrees, minutes and seconds. The
	last number may be fractional. The numbers are either separated by
	<,> (commas) or suffixed, in order:

	degrees by <°> (degree sign),
	minutes by <′> (prime) or <'> (apostrophe),
	seconds by <″> (double prime) or <"> (quotation mark).

	The last suffix may be omitted.
	*/

	// angleStringOK returns ok if the string looks valid and suffixed if it
	// contains <">, <'> or multibyte UTF-8 characters.
	//
	// When ok is false, suffixed is always false as well.
	func angleStringOK(s string) (ok, suffixed bool) {
		// The main goals of this function are:
		//   - to disallow strings with:
		//     - signs (<+> and <->), to avoid validating strings before
		//       passing them to strconv.ParseFloat
		//     - more than 2 commas, to avoid excessive allocation
		//   - to decide whether the numbers are separated by commas or
		//     suffixed with <°> etc.
		// Additionally, it disallows:
		//   - illegal ASCII characters
		//   - more than 3 suffixes or multibyte characters
		//   - strings having both commas and multibyte characters
		// The validation is preliminary, optimised for speed and very crude.
		// In particular, it:
		//   - doesn't check which multibyte characters are present and counts
		//     any such character as a suffix
		//   - doesn't validate syntax (character sequences)
		//   - doesn't even validate UTF-8 (byte sequences and invalid bytes
		//     0xc0, 0xc1, 0xfe, 0xff)
		const (
			invalid = iota // zero value, so unlisted table entries are invalid
			valid
			comma
			suffix
			tbl20
		)
		var commas, suffixes byte
		// map of 8 byte ranges (top 3 bits)
		hi := [8]byte{
			0x20 >> 5: tbl20,  // 0x20 - 0x3f: see table lo20
			0x80 >> 5: valid,  // 0x80 - 0x9f: 10xxxxxx, UTF-8 continuation
			0xa0 >> 5: valid,  // 0xa0 - 0xbf: 10xxxxxx, UTF-8 continuation
			0xc0 >> 5: suffix, // 0xc0 - 0xdf: 11xxxxxx, UTF-8 start
			0xe0 >> 5: suffix, // 0xe0 - 0xff: 11xxxxxx, UTF-8 start
		}
		// map of valid characters in the 0x20 - 0x3f range (bottom 5 bits)
		lo20 := [32]byte{
			'"' - 0x20:  suffix, // 0x22: count suffixes
			'\'' - 0x20: suffix, // 0x27: count suffixes
			',' - 0x20:  comma,  // 0x2c: count commas
			'.' - 0x20:  valid,  // 0x2e
			'0' - 0x20:  valid,  // 0x30
			'1' - 0x20:  valid,  // 0x31
			'2' - 0x20:  valid,  // 0x32
			'3' - 0x20:  valid,  // 0x33
			'4' - 0x20:  valid,  // 0x34
			'5' - 0x20:  valid,  // 0x35
			'6' - 0x20:  valid,  // 0x36
			'7' - 0x20:  valid,  // 0x37
			'8' - 0x20:  valid,  // 0x38
			'9' - 0x20:  valid,  // 0x39
		}
		for i := 0; i < len(s); i++ {
			c := s[i]
			act := hi[c>>5]
			if act == tbl20 {
				act = lo20[c&0x1f]
			}
			switch act {
			case invalid:
				return false, false
			case valid:
			case comma:
				// up to 2 commas, no suffixes
				// (commas>>1 is non-zero once 2 commas have been seen)
				if commas>>1|suffixes != 0 {
					return false, false
				}
				commas++
			default:
				// up to 3 suffixes, no commas
				// (suffixes>>2 is non-zero once the count reaches 4)
				if suffixes++; suffixes>>2|commas != 0 {
					return false, false
				}
			}
		}
		return true, suffixes != 0
	}