Skip to content

Instantly share code, notes, and snippets.

@algal
Forked from jpsim/LineEndsFind.mm
Created April 22, 2018 22:48
Show Gist options
  • Save algal/8ecfae13cb1b5044ef9a6526f8e5a9ab to your computer and use it in GitHub Desktop.
Save algal/8ecfae13cb1b5044ef9a6526f8e5a9ab to your computer and use it in GitHub Desktop.
A function to find the offsets of newlines ('\n') in UTF-16 encoded string. Try as I might, I cannot get a Swift version within an order of magnitude of the C++ version. Both routines must return arrays of the same size and with equal elements.

With "πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦\n1\n2\n" as input:

$ swiftc LineEndsFind.swift && ./LineEndsFind
Unicode unsafe time for 1M strings: 1.12317534297472
Unicode safe time for 1M strings: 3.4386172790546
Unicode unsafe result: [11, 13, 15]
Unicode safe result: [1, 3, 5]
$ clang++ LineEndsFind.mm -ObjC++ -std=c++14 -fobjc-arc -framework QuartzCore -o main && ./main
Unicode unsafe time for 1M strings: 0.876858
Unicode unsafe result: 11 13 15
#import <Foundation/Foundation.h>
#import <QuartzCore/QuartzCore.h>
#import <iostream>
#import <vector> // Needed for gist to compile.
#pragma mark - Pure Implementation Functions
const static unichar kUTF16Newline = (unichar)'\n'; // old naming habits die hard!
/**
* Calculates an array of line end "positions" for a given string.
* The equivalent Swift function was `(String) -> [Int]` or `(NSString) -> [Int]`
*
* In this context a "position" is the zero-based index of a newline
* character in the string as if it were an array of UTF-16 codepoints.
*
* @param s the string.
* @return: an array of newline positions.
*/
std::vector<size_t> LineEndsFind(NSString* s) {
assert(s);
std::vector<size_t> lineEnds;
unichar *const start = (unichar *)[s cStringUsingEncoding:NSUTF16StringEncoding];
unichar *current = start;
while (*current != 0) {
unichar c = *current;
if (c == kUTF16Newline) {
lineEnds.push_back(current - start);
}
current++;
}
return lineEnds;
}
int main() {
auto t1 = CACurrentMediaTime();
for (int i = 0; i < 1'000'000; ++i) {
LineEndsFind(@"πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦\n1\n2\n");
}
auto t2 = CACurrentMediaTime();
auto duration = t2 -t1;
std::cout << "Unicode unsafe time for 1M strings: " << duration << '\n';
std::cout << "Unicode unsafe result: ";
for (const auto & pos : LineEndsFind(@"πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦\n1\n2\n")) {
std::cout << pos << ' ';
}
std::cout << '\n';
return 0;
}
import Foundation
import QuartzCore
// Fast unicode unsafe implementation
public func makeLineEndArray(string: NSString) -> [Int] {
var lineEnds = [Int]()
for i in 0..<string.length where string.character(at: i) == 10 {
lineEnds.append(i)
}
return lineEnds
}
// Slow unicode-safe implementation
public func makeUnicodeSafeLineEndArray(string: String) -> [Int] {
var lineEnds = [Int]()
for (index, char) in string.enumerated() where char == "\n" {
lineEnds.append(index)
}
return lineEnds
/* Even slower functional approach:
return string.enumerated()
.filter { $0.1 == "\n" }
.map { $0.0 }
*/
}
do {
let t1 = CACurrentMediaTime()
for _ in 0..<1_000_000 {
_ = makeLineEndArray(string: "πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦\n1\n2\n")
}
let t2 = CACurrentMediaTime()
print("Unicode unsafe time for 1M strings: \(t2-t1)")
}
do {
let t1 = CACurrentMediaTime()
for _ in 0..<1_000_000 {
_ = makeUnicodeSafeLineEndArray(string: "πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦\n1\n2\n")
}
let t2 = CACurrentMediaTime()
print("Unicode safe time for 1M strings: \(t2-t1)")
}
print("Unicode unsafe result: \(makeLineEndArray(string: "πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦\n1\n2\n"))")
print("Unicode safe result: \(makeUnicodeSafeLineEndArray(string: "πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦\n1\n2\n"))")
/// 2nd more Swifty attempt UTF16 view.
/// Didn't return just for testiing
public class func makeLineEndArray2(string: String) {
precondition(!string.isEmpty)
var lineEnds = [Int]()
for i in 0..<string.utf16.count {
if string.utf16[String.UTF16View.Index(i)] == 10 {
lineEnds.append(i)
}
}
print("From swift: lineEndsCount = \(lineEnds.count)")
}
import Foundation
import QuartzCore
// Not as Fast unicode unsafe implementation
// (I don't know why this is slower than an optimized build using NSString.character(at:) --AG)
public func makeLineEndArray(string: NSString) -> [Int] {
var lineEnds = [Int]()
guard let startPtr:UnsafePointer<Int8> = string.cString(using: String.Encoding.utf16.rawValue)
else {
return []
}
startPtr.withMemoryRebound(to: Int16.self, capacity: string.length) {
(p:UnsafePointer<Int16>) -> Void in
for i in 0..<string.length where p.advanced(by: i).pointee == 10
{
lineEnds.append(i)
}
}
return lineEnds
}
// Slow unicode-safe implementation
public func makeUnicodeSafeLineEndArray(string: String) -> [Int] {
var lineEnds = [Int]()
for (index, char) in string.enumerated() where char == "\n" {
lineEnds.append(index)
}
return lineEnds
/* Even slower functional approach:
return string.enumerated()
.filter { $0.1 == "\n" }
.map { $0.0 }
*/
}
do {
let t1 = CACurrentMediaTime()
for _ in 0..<1_000_000 {
_ = makeLineEndArray(string: "πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦\n1\n2\n")
}
let t2 = CACurrentMediaTime()
print("Unicode unsafe time for 1M strings: \(t2-t1)")
}
do {
let t1 = CACurrentMediaTime()
for _ in 0..<1_000_000 {
_ = makeUnicodeSafeLineEndArray(string: "πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦\n1\n2\n")
}
let t2 = CACurrentMediaTime()
print("Unicode safe time for 1M strings: \(t2-t1)")
}
print("Unicode unsafe result: \(makeLineEndArray(string: "πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦\n1\n2\n"))")
print("Unicode safe result: \(makeUnicodeSafeLineEndArray(string: "πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦\n1\n2\n"))")
import Foundation
import QuartzCore
// Not as Fast unicode unsafe implementation
// (I don't know why this is slower than an optimized build using NSString.character(at:) --AG)
public func makeLineEndArray(string: NSString) -> [Int] {
var lineEnds = [Int]()
let startPtr:UnsafePointer<Int8> = string.cString(using: String.Encoding.utf16.rawValue)!
// this is certainly unsafe, but I believe it is correct
let p = unsafeBitCast(startPtr, to: UnsafePointer<Int16>.self)
for i in 0..<string.length where p.advanced(by: i).pointee == 10
{
lineEnds.append(i)
}
return lineEnds
}
// Slow unicode-safe implementation
public func makeUnicodeSafeLineEndArray(string: String) -> [Int] {
var lineEnds = [Int]()
for (index, char) in string.enumerated() where char == "\n" {
lineEnds.append(index)
}
return lineEnds
/* Even slower functional approach:
return string.enumerated()
.filter { $0.1 == "\n" }
.map { $0.0 }
*/
}
do {
let t1 = CACurrentMediaTime()
for _ in 0..<1_000_000 {
_ = makeLineEndArray(string: "πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦\n1\n2\n")
}
let t2 = CACurrentMediaTime()
print("Unicode unsafe time for 1M strings: \(t2-t1)")
}
do {
let t1 = CACurrentMediaTime()
for _ in 0..<1_000_000 {
_ = makeUnicodeSafeLineEndArray(string: "πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦\n1\n2\n")
}
let t2 = CACurrentMediaTime()
print("Unicode safe time for 1M strings: \(t2-t1)")
}
print("Unicode unsafe result: \(makeLineEndArray(string: "πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦\n1\n2\n"))")
print("Unicode safe result: \(makeUnicodeSafeLineEndArray(string: "πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦\n1\n2\n"))")
import Foundation
import QuartzCore
// Not as Fast unicode unsafe implementation
public func makeLineEndArray(string s: String) -> [Int] {
precondition(!s.isEmpty)
var lineEnds = [Int]()
for i in 0..<s.utf16.count {
if s.utf16[s.utf16.index(s.utf16.startIndex, offsetBy: i)] == 10 {
lineEnds.append(i)
}
}
return lineEnds
}
// Slow unicode-safe implementation
public func makeUnicodeSafeLineEndArray(string: String) -> [Int] {
var lineEnds = [Int]()
for (index, char) in string.enumerated() where char == "\n" {
lineEnds.append(index)
}
return lineEnds
/* Even slower functional approach:
return string.enumerated()
.filter { $0.1 == "\n" }
.map { $0.0 }
*/
}
do {
let t1 = CACurrentMediaTime()
for _ in 0..<1_000_000 {
_ = makeLineEndArray(string: "πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦\n1\n2\n")
}
let t2 = CACurrentMediaTime()
print("Unicode unsafe time for 1M strings: \(t2-t1)")
}
do {
let t1 = CACurrentMediaTime()
for _ in 0..<1_000_000 {
_ = makeUnicodeSafeLineEndArray(string: "πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦\n1\n2\n")
}
let t2 = CACurrentMediaTime()
print("Unicode safe time for 1M strings: \(t2-t1)")
}
print("Unicode unsafe result: \(makeLineEndArray(string: "πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦\n1\n2\n"))")
print("Unicode safe result: \(makeUnicodeSafeLineEndArray(string: "πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦\n1\n2\n"))")

Meaningless benchmarks are fun!

Run on my MacBook Pro 15-inch late 2016

Unoptimized swift build

Unicode unsafe time for 1M strings: 1.04036423703656 Unicode safe time for 1M strings: 3.48763609700836 Unicode unsafe result: [11, 13, 15] Unicode safe result: [1, 3, 5]

Optimized swift build with NSString.characterAt

Unicode unsafe time for 1M strings: 0.923268710030243 Unicode safe time for 1M strings: 1.92451678798534 Unicode unsafe result: [11, 13, 15] Unicode safe result: [1, 3, 5]

Optimized swift build with NSString.cString(using:) and safely rebound raw pointer manipulation

Unicode unsafe time for 1M strings: 1.1308168740361 Unicode safe time for 1M strings: 1.81629342702217 Unicode unsafe result: [11, 13, 15] Unicode safe result: [1, 3, 5]

Optimized swift build with NSString.cString(using:) and unsafely cast raw pointer manipulation

Unicode unsafe time for 1M strings: 1.0765609620139 Unicode safe time for 1M strings: 1.83420633501373 Unicode unsafe result: [11, 13, 15] Unicode safe result: [1, 3, 5]

Optimized swift build using String.utf16

Unicode unsafe time for 1M strings: 0.395662263967097 Unicode safe time for 1M strings: 1.80865901499055 Unicode unsafe result: [11, 13, 15] Unicode safe result: [1, 3, 5]

C++ build

Unicode unsafe time for 1M strings: 0.936712 Unicode unsafe result: 11 13 15

#!/bin/sh
echo "# Meaningless benchmarks are fun!"
echo
echo "Run on my MacBook Pro 15-inch late 2016"
echo
echo "### Unoptimized swift build"
swiftc ./LineEndsFind.swift && ./LineEndsFind
echo
echo "### Optimized swift build with NSString.characterAt"
swiftc -Ounchecked ./LineEndsFind.swift && ./LineEndsFind
echo
echo "### Optimized swift build with NSString.cString(using:) and safely rebound raw pointer manipulation"
swiftc -Ounchecked ./LineEndsFind3.swift && ./LineEndsFind3
echo
echo "### Optimized swift build with NSString.cString(using:) and unsafely cast raw pointer manipulation"
swiftc -Ounchecked ./LineEndsFind4.swift && ./LineEndsFind4
echo
echo "### Optimized swift build using String.utf16"
swiftc -Ounchecked ./LineEndsFind5.swift && ./LineEndsFind5
echo
echo "### C++ build"
clang++ LineEndsFind.mm -ObjC++ -std=c++14 -fobjc-arc -framework QuartzCore -o main && ./main
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment