Last active
June 14, 2022 12:01
-
-
Save karwa/b26f537020fa259c4fcb897e6cc71d9a to your computer and use it in GitHub Desktop.
proof of concept bootstring encoding for docc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Copyright The swift-url Contributors. | |
// | |
// Licensed under the Apache License, Version 2.0 (the "License"); | |
// you may not use this file except in compliance with the License. | |
// You may obtain a copy of the License at | |
// | |
// http://www.apache.org/licenses/LICENSE-2.0 | |
// | |
// Unless required by applicable law or agreed to in writing, software | |
// distributed under the License is distributed on an "AS IS" BASIS, | |
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
// See the License for the specific language governing permissions and | |
// limitations under the License. | |
printEncoded("somesymbol123") | |
printEncoded("bytebufferview.suffix(_:)") | |
printEncoded("perform(loghandle:)") | |
printEncoded("天空") | |
printEncoded("test-[]!{}@<>|£:}{!@^&£)$&") | |
func printEncoded(_ symbol: String) { | |
var encoded = "" | |
let success = Punycode.encode(symbol.unicodeScalars) { ascii in | |
encoded.unicodeScalars.append(Unicode.Scalar(ascii)) | |
} | |
guard success else { | |
print("❌ Failed to encode \(symbol)") | |
return | |
} | |
print("Original:", symbol) | |
print("Encoded: ", encoded) | |
var encodedScalars = Array(encoded.unicodeScalars) | |
switch Punycode.decodeInPlace(&encodedScalars) { | |
case .success(let count): | |
encodedScalars.removeSubrange(count...) | |
case .notPunycode: | |
break // Nothing to do. | |
case .failed: | |
print("❌ Failed to decode \(encoded)") | |
} | |
var decoded = "" | |
decoded.unicodeScalars += encodedScalars | |
print("Decoded: ", decoded) | |
} | |
/// Punycode is a simple and efficient transfer encoding syntax designed for use with Internationalized Domain Names | |
/// in Applications (IDNA). It uniquely and reversibly transforms a Unicode string into an ASCII string. | |
/// | |
/// Punycode is defined by [RFC-3492][rfc-3492]. | |
/// | |
/// > Note: | |
/// > | |
/// > Punycode is only an implementation detail of IDNA, and Punycode encoding or decoding | |
/// > alone is **not sufficient** to transform an IDN to/from its Unicode presentation. | |
/// > The functions ``IDNA/IDNA/toASCII(utf8:beStrict:writer:)`` and ``IDNA/IDNA/toUnicode(utf8:beStrict:writer:)`` | |
/// > should be used instead, and their implementation makes use of Punycode where appropriate. | |
/// | |
/// [rfc-3492]: https://datatracker.ietf.org/doc/html/rfc3492 | |
/// | |
public enum Punycode {} | |
extension Punycode { | |
// Parameters defining the "Punycode" variant of the Bootstring format. | |
@inlinable internal static var Base: UInt32 { 36 } | |
@inlinable internal static var TMin: UInt32 { 1 } | |
@inlinable internal static var TMax: UInt32 { 26 } | |
@inlinable internal static var Damp: UInt32 { 700 } | |
@inlinable internal static var Skew: UInt32 { 38 } | |
@inlinable internal static var InitialDecoderState: (codePoint: UInt32, bias: UInt32) { (codePoint: 0, bias: 72) } | |
/// Whether this scalar is a 'basic' code-point. | |
/// | |
/// Basic codepoints are ASCII and may be written directly in a Punycode-encoded label. | |
/// | |
@inlinable | |
internal static func isBasic(_ codepoint: Unicode.Scalar) -> Bool { | |
guard codepoint.isASCII else { return false } | |
switch codepoint { | |
case "-", "a"..."z", "A"..."Z", "0"..."9": return true | |
default: return false | |
} | |
} | |
/// Encodes a number within the range `0..<36` to the ASCII characters `a-z,0-9`. | |
/// | |
/// For example, `0 => a`, `10 => k`, and `30` returns the ASCII character `"4"` in this number system. | |
/// This function returns lowercase characters. | |
/// | |
/// The given number is assumed to already be within the range `0..<36`. If it is not within that range, | |
/// the return value is not specified. | |
/// | |
@inlinable | |
internal static func encode_digit(_ number: UInt32) -> UInt8 { | |
let table: StaticString = "abcdefghijklmnopqrstuvwxyz0123456789" | |
return table.withUTF8Buffer { $0[Int(min(number, 35))] } | |
} | |
/// Decodes an ASCII alphanumeric codepoint as a number within the range `0..<36` | |
/// | |
/// For example, `a => 0`, `K => 10`, and the character `"4"` returns the number `30`. | |
/// This function also decodes uppercase ASCII alphas, even though that is not strictly required by IDNA | |
/// (domain labels are case-folded before they are decoded anyway). | |
/// | |
@inlinable | |
internal static func decode_digit(_ character: Unicode.Scalar) -> UInt32? { | |
switch character { | |
case "a"..."z": | |
return character.value &- Unicode.Scalar("a").value | |
case "0"..."9": | |
return 26 &+ (character.value &- Unicode.Scalar("0").value) | |
case "A"..."Z": | |
return character.value &- Unicode.Scalar("A").value | |
default: | |
return nil | |
} | |
} | |
// https://datatracker.ietf.org/doc/html/rfc3492#section-6.1 | |
@inlinable | |
internal static func calculateAdjustedBias( | |
delta: UInt32, countProcessedScalars: UInt32, isFirstAdjustment: Bool | |
) -> UInt32 { | |
// Since most things here are constants, the compiler can provide away most traps by itself. | |
assert(countProcessedScalars > 0, "countProcessedScalars should be incremented before adjusting the bias") | |
var delta = isFirstAdjustment ? delta / Damp : delta / 2 | |
delta += delta.dividedReportingOverflow(by: countProcessedScalars).partialValue | |
var k = UInt32.zero | |
while delta > ((Base - TMin) * TMax) / 2 { | |
delta = delta / (Base - TMin) | |
k &+= Punycode.Base | |
} | |
return k &+ (((Base - TMin + 1) &* delta) / (delta + Skew)) | |
} | |
} | |
// -------------------------------------------- | |
// MARK: - Encoding | |
// -------------------------------------------- | |
extension Punycode { | |
/// Encodes the given Unicode domain label as an ASCII Punycode string. | |
/// | |
/// The encoded label is written by means of the given closure, which is invoked with each ASCII byte | |
/// of the result. If the label already consists only of ASCII codepoints, it does not require encoding and | |
/// will be passed through unchanged. This also applies to labels that are already Punycode-encoded; | |
/// they will not be decoded or validated by this function and will simply be passed through as they are. | |
/// | |
/// If the label _does_ require Punycode encoding, the closure will also be invoked to write | |
/// the ACE prefix (`"xn--"`). | |
/// | |
/// ```swift | |
/// func encodeDomainLabel(_ input: String) -> String? { | |
/// var asciiString = [UInt8]() | |
/// let success = Punycode.encode(input.unicodeScalars) { byte in | |
/// asciiString.append(byte) | |
/// } | |
/// return success ? String(decoding: asciiString, as: UTF8.self) : nil | |
/// } | |
/// | |
/// encodeDomainLabel("你好你好") // "xn--6qqa088eba" | |
/// ``` | |
/// | |
/// > Note: | |
/// > | |
/// > Punycode is only an implementation detail of IDNA, and Punycode encoding | |
/// > alone is **not sufficient** to construct an Internationalized Domain Name | |
/// > from its Unicode presentation. If that is what you are looking to do, | |
/// > use the ``IDNA/IDNA/toASCII(utf8:beStrict:writer:)`` function instead. | |
/// | |
/// - parameters: | |
/// - source: The string to encode, as a collection of Unicode scalars. | |
/// - writeASCII: A closure which is invoked with each ASCII character of the result. | |
/// | |
/// - returns: Whether or not the string was successfully encoded. | |
/// If `false`, any data written by `writeASCII` should be discarded. | |
/// | |
@inlinable | |
public static func encode<Source>( | |
_ source: Source, into writeASCII: (UInt8) -> Void | |
) -> Bool where Source: Collection, Source.Element == Unicode.Scalar { | |
// 1. Make sure there is at least one non-basic codepoint for us to encode. | |
// Otherwise it is empty/ASCII-only and should be passed-through. | |
guard !source.isEmpty, source.contains(where: { !isBasic($0) }) else { | |
for basicScalar in source { writeASCII(UInt8(truncatingIfNeeded: basicScalar.value)) } | |
return true | |
} | |
// 2. Write the "xn--" ACE prefix. | |
writeASCII(UInt8(ascii: "x")) | |
writeASCII(UInt8(ascii: "n")) | |
writeASCII(UInt8(ascii: "-")) | |
writeASCII(UInt8(ascii: "-")) | |
// 3. Write the basic code-points and delimiter. | |
var countBasicScalars = UInt32.zero | |
var countAllScalars = UInt32.zero | |
for scalar in source { | |
countAllScalars &+= 1 | |
if isBasic(scalar) { | |
writeASCII(UInt8(truncatingIfNeeded: scalar.value)) | |
countBasicScalars &+= 1 | |
} | |
} | |
if countBasicScalars > 0 { | |
writeASCII(UInt8(ascii: "-")) | |
} | |
// From the RFC: | |
// | |
// > For example, if the encoder were to verify that no | |
// > input code points exceed M and that the input length does not exceed | |
// > L, then no delta could ever exceed (M - initial_n) * (L + 1), and | |
// > hence no overflow could occur if integer variables were capable of | |
// > representing values that large. | |
// https://datatracker.ietf.org/doc/html/rfc3492#section-6.4 | |
// | |
// The RFC prefers to detect overflow rather than enforcing limits which rule it out, | |
// but since the Unicode.Scalar type is already validated, M is already set at 0x10FFFF. | |
// Using 32-bit integers, we find (L + 1) = 0xFOF, or L = 3854 scalars to rule out overflow. | |
// Every scalar needs at least 1 ASCII char in the output, and we are limited to 63 chars in IDNA anyway, | |
// so this is easily enough. | |
guard countAllScalars < 0xF0F else { | |
return false | |
} | |
// 3. Write the tail, encoding the non-basic code-points. | |
// Punycode works with a { codepoint, offset } state, which starts at an initial { 0x80, 0 } | |
// and is incremented by a series of deltas, each of which describes an offset at which to insert the code-point. | |
// | |
// The deltas are written in codepoint order - i.e. the offsets at which to insert every "a", then every "b", | |
// "j", "z", etc..., but for every Unicode code-point in the string. Also, to advance the codepoint, it encodes | |
// a big enough delta to wrap around the string N times -- accounting for the fact that the string is actually | |
// growing between every delta as we insert codepoints. | |
// | |
// To encode, we basically simulate that process. Finding codepoints to insert and writing how many steps | |
// it took us to get there. | |
var delta = UInt32.zero | |
var countProcessedScalars = countBasicScalars | |
var (codePoint, bias) = InitialDecoderState | |
while countProcessedScalars < countAllScalars { | |
// i. Find the smallest codepoint we haven't processed yet, | |
// then calculate the delta which advances the state's codepoint to { nextCodePoint, 0 }. | |
do { | |
var nextCodePoint = UInt32(0x10FFFF) | |
for scalar in source { | |
if scalar.value >= codePoint, scalar.value < nextCodePoint, !isBasic(scalar) { | |
nextCodePoint = scalar.value | |
} | |
} | |
let stateIncrement = (nextCodePoint &- codePoint) &* (countProcessedScalars &+ 1) | |
delta &+= stateIncrement | |
codePoint = nextCodePoint | |
} | |
// ii. Find the offsets at which the decoder should insert this codepoint. | |
// - For each offset it already decoded (value < codepoint), | |
// increment delta to tell it to skip. | |
// - At each offset where the new codepoint occurs (value == codepoint), we want to insert a value. | |
// So write the current value of delta as a variable-length integer, then reset delta to 0. | |
for scalar in source { | |
if scalar.value < codePoint || isBasic(scalar) { | |
delta &+= 1 | |
} else if scalar.value == codePoint { | |
var q = delta | |
var k = Base | |
while true { | |
let t = (k <= bias) ? TMin : min(k &- bias, TMax) | |
if q < t { | |
writeASCII(Punycode.encode_digit(q)) | |
break | |
} | |
let digit: UInt32 | |
(q, digit) = (q &- t).quotientAndRemainder(dividingBy: Base &- t) | |
writeASCII(Punycode.encode_digit(digit &+ t)) | |
k &+= Punycode.Base | |
} | |
// Adjust the bias, reset delta, etc. For the next offset/codepoint. | |
bias = calculateAdjustedBias( | |
delta: delta, | |
countProcessedScalars: countProcessedScalars &+ 1, | |
isFirstAdjustment: countProcessedScalars == countBasicScalars | |
) | |
delta = 0 | |
countProcessedScalars &+= 1 | |
} | |
} | |
// iii. Finished processing this codepoint. On to the next one. | |
delta &+= 1 | |
codePoint &+= 1 | |
} | |
return true | |
} | |
} | |
// -------------------------------------------- | |
// MARK: - Decoding | |
// -------------------------------------------- | |
extension Punycode { | |
/// The result of decoding an ASCII Punycode label to Unicode. | |
/// | |
public enum DecodeInPlaceResult { | |
/// The string was decoded successfully. | |
/// It can be found in the first `count` elements of the buffer. | |
/// | |
case success(count: Int) | |
/// The string is invalid and could not be decoded. | |
/// The contents of the buffer should be considered invalid and discarded. | |
/// | |
case failed | |
/// The string lacks the Punycode ACE prefix (`"xn--"`) and so has not been decoded. | |
/// | |
case notPunycode | |
} | |
/// Whether or not the given buffer begins with the Punycode ACE prefix (`"xn--"`). | |
/// This function checks for the prefix case-sensitively. | |
/// | |
@inlinable | |
internal static func hasACEPrefix<Buffer>( | |
_ buffer: Buffer | |
) -> Bool where Buffer: Collection, Buffer.Element == Unicode.Scalar { | |
var iter = buffer.makeIterator() | |
guard iter.next() == "x", iter.next() == "n", iter.next() == "-", iter.next() == "-" else { | |
return false | |
} | |
return true | |
} | |
/// Decodes the given ASCII Punycode label to Unicode. | |
/// | |
/// The label, expressed as a buffer of Unicode codepoints, is decoded in-place over the existing contents. | |
/// If decoding is successful, the return value ``DecodeInPlaceResult/success(count:)`` communicates | |
/// the length of the decoded string within the buffer. | |
/// | |
/// The label must contain the ACE prefix (`"xn--"`), otherwise no decoding will be performed | |
/// and the function will return ``DecodeInPlaceResult/notPunycode``. | |
/// | |
/// If the label contains the ACE prefix but is ill-formed, or is so long that it would cause | |
/// internal calculations to overflow (~3800 scalars), the function will return ``DecodeInPlaceResult/failed``. | |
/// The contents of the buffer should be considered invalid and discarded in that case. | |
/// | |
/// ```swift | |
/// func decodeDomainLabel(_ input: String) -> String? { | |
/// var buffer = Array(input.unicodeScalars) | |
/// switch Punycode.decodeInPlace(&buffer) { | |
/// case .success(let count): | |
/// var result = "" | |
/// result.unicodeScalars.append(contentsOf: buffer.prefix(count)) | |
/// return result | |
/// case .notPunycode: | |
/// return input // No "xn--" prefix. | |
/// case .failed: | |
/// return nil // "xn--" prefix but nonsense punycode. | |
/// } | |
/// } | |
/// | |
/// decodeDomainLabel("xn--6qqa088eba") // "你好你好" | |
/// ``` | |
/// | |
/// > Note: | |
/// > | |
/// > Punycode is just an implementation detail of IDNA, and decoding a domain label is not enough | |
/// > to properly handle Internationalized Domain Names (IDNs). If you wish to decode an encoded IDN | |
/// > to its Unicode/display form, use the ``IDNA/IDNA/toUnicode(utf8:beStrict:writer:)`` function instead. | |
/// | |
@inlinable | |
public static func decodeInPlace<Buffer>( | |
_ buffer: inout Buffer | |
) -> DecodeInPlaceResult where Buffer: RandomAccessCollection & MutableCollection, Buffer.Element == Unicode.Scalar { | |
// 1. Make sure there is something for us to decode. | |
// If there is no ACE prefix, we don't consider it a Punycode string. | |
guard hasACEPrefix(buffer) else { | |
return .notPunycode | |
} | |
// 2. Find the base string containing the basic (ASCII) codepoints, and copy them to the front. | |
// The rest is the tail, containing a sequence of encoded deltas. | |
var tailCursor = buffer.index(buffer.startIndex, offsetBy: 4) | |
var countProcessedScalars = UInt32.zero | |
if let tailDelimiter = buffer[tailCursor...].lastIndex(of: "-") { | |
var src = tailCursor | |
var dst = buffer.startIndex | |
while src < tailDelimiter { | |
guard isBasic(buffer[src]) else { return .failed } | |
buffer[dst] = buffer[src] | |
buffer.formIndex(after: &src) | |
buffer.formIndex(after: &dst) | |
} | |
tailCursor = buffer.index(after: tailDelimiter) | |
countProcessedScalars = UInt32(buffer.distance(from: buffer.startIndex, to: dst)) | |
} | |
// 3. Consume the tail, decoding deltas, which are instructions | |
// to insert a scalar at a "random" (non-sequential) location. | |
var insertionPoint = buffer.startIndex | |
var (codePoint, bias) = InitialDecoderState | |
while tailCursor < buffer.endIndex { | |
// 3a. Decode a variable-length integer into delta. | |
var delta = UInt32.zero | |
do { | |
var (weight, k) = (UInt32(1), Base) | |
while true { | |
guard tailCursor < buffer.endIndex, let digit = decode_digit(buffer[tailCursor]) else { return .failed } | |
buffer.formIndex(after: &tailCursor) | |
guard digit <= (.max &- delta).dividedReportingOverflow(by: weight).partialValue else { return .failed } | |
delta &+= digit &* weight | |
let t = (k <= bias) ? TMin : min(k &- bias, TMax) | |
if digit < t { break } | |
let weight_overflow: Bool | |
(weight, weight_overflow) = weight.multipliedReportingOverflow(by: Base &- t) | |
guard !weight_overflow else { return .failed } | |
k &+= Base | |
} | |
} | |
// 3b. Process delta. | |
// Use it to adjust the bias for the next integer, then use it to increment our | |
// { codePoint, insertionPoint } state. | |
do { | |
// Increment 'countProcessedScalars' now even though it is actually written later; | |
// it's how Punycode works, and it is important for this to never be zero. | |
countProcessedScalars += 1 | |
bias = calculateAdjustedBias( | |
delta: delta, | |
countProcessedScalars: countProcessedScalars, | |
isFirstAdjustment: insertionPoint == buffer.startIndex | |
) | |
let stepsToEnd = countProcessedScalars &- UInt32(buffer.distance(from: buffer.startIndex, to: insertionPoint)) | |
if delta < stepsToEnd { | |
// Same codepoint, repeated at a different offset. | |
buffer.formIndex(&insertionPoint, offsetBy: Int(delta)) | |
} else { | |
// Different codepoint. | |
delta &-= stepsToEnd | |
let (codepointDelta, insertionOffset) = delta.quotientAndRemainder(dividingBy: countProcessedScalars) | |
let codePoint_overflow: Bool | |
(codePoint, codePoint_overflow) = codePoint.addingReportingOverflow(1 &+ codepointDelta) | |
guard !codePoint_overflow else { return .failed } | |
insertionPoint = buffer.index(buffer.startIndex, offsetBy: Int(insertionOffset)) | |
} | |
} | |
guard let scalarToInsert = Unicode.Scalar(codePoint) else { return .failed } | |
// 3c. Insert the scalar. | |
// Since we consumed *at least* one scalar from the front of the tail to get this decoded scalar, | |
// we can just move everything after the insertion point down by 1 place, and overwrite the tail | |
// without growing. | |
do { | |
var src = buffer.index(buffer.startIndex, offsetBy: Int(countProcessedScalars)) | |
var dst = buffer.index(after: src) | |
assert(dst <= tailCursor, "We only write over parts of the tail that have already been consumed") | |
while src >= insertionPoint { | |
buffer[dst] = buffer[src] | |
buffer.formIndex(before: &dst) | |
buffer.formIndex(before: &src) | |
} | |
buffer[insertionPoint] = scalarToInsert | |
} | |
buffer.formIndex(after: &insertionPoint) | |
} | |
// 4. Finished. | |
// MutableCollection does not allow us to remove the junk at the end, so return the new length | |
// and let the caller handle that problem. | |
return .success(count: Int(countProcessedScalars)) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment