Skip to content

Instantly share code, notes, and snippets.

@jpsim
Forked from iosdevzone/LineEndsFind.mm
Created April 22, 2018 13:20
Show Gist options
  • Save jpsim/77f2c47ac83a5e87422ca91727ea6d3c to your computer and use it in GitHub Desktop.
Save jpsim/77f2c47ac83a5e87422ca91727ea6d3c to your computer and use it in GitHub Desktop.
A function to find the offsets of newlines ('\n') in a UTF-16 encoded string. Try as I might, I cannot get a Swift version within an order of magnitude of the C++ version. Both routines must return arrays of the same size and with equal elements.

With "👨‍👩‍👧‍👦\n1\n2\n" as input:

$ swiftc LineEndsFind.swift && ./LineEndsFind
Unicode unsafe time for 1M strings: 1.12317534297472
Unicode safe time for 1M strings: 3.4386172790546
Unicode unsafe result: [11, 13, 15]
Unicode safe result: [1, 3, 5]
$ clang++ LineEndsFind.mm -ObjC++ -std=c++14 -fobjc-arc -framework QuartzCore -o main && ./main
Unicode unsafe time for 1M strings: 0.876858
Unicode unsafe result: 11 13 15
#import <Foundation/Foundation.h>
#import <QuartzCore/QuartzCore.h>
#import <iostream>
#import <vector> // Needed for gist to compile.
#pragma mark - Pure Implementation Functions
const static unichar kUTF16Newline = (unichar)'\n'; // old naming habits die hard!
/**
 * Calculates an array of line end "positions" for a given string.
 * The equivalent Swift function was `(String) -> [Int]` or `(NSString) -> [Int]`.
 *
 * In this context a "position" is the zero-based index of a newline
 * character in the string as if it were an array of UTF-16 code units.
 *
 * The scan is bounded by the string's length rather than by a terminating
 * zero code unit, so a string containing U+0000 is scanned to its end.
 *
 * @param s the string. Asserted non-nil in debug builds; when asserts are
 *          compiled out, a nil string yields an empty result.
 * @return an array of newline positions, in ascending order.
 */
std::vector<size_t> LineEndsFind(NSString* s) {
    assert(s);
    std::vector<size_t> lineEnds;
    // Messaging nil returns 0, so a nil string falls through to the empty result.
    const NSUInteger length = [s length];
    if (length == 0) {
        return lineEnds;
    }
    // Copy the UTF-16 code units into a buffer we own, instead of relying on a
    // possibly-NULL, zero-terminated C string from -cStringUsingEncoding:.
    std::vector<unichar> units(length);
    [s getCharacters:units.data() range:NSMakeRange(0, length)];
    for (NSUInteger i = 0; i < length; ++i) {
        if (units[i] == kUTF16Newline) {
            lineEnds.push_back(i);
        }
    }
    return lineEnds;
}
/**
 * Benchmark driver: times one million calls to LineEndsFind on a fixed sample
 * string, then prints the elapsed time and the positions found.
 *
 * @return 0 always.
 */
int main() {
    // Foundation calls may hand back autoreleased memory; a command-line main
    // has no pool unless it creates one.
    @autoreleasepool {
        // Family emoji (man, woman, girl, boy joined by zero-width joiners)
        // followed by three short lines. Spelled with universal character
        // names so the literal survives any source-encoding round trip.
        NSString *const sample =
            @"\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\n1\n2\n";

        const auto start = CACurrentMediaTime();
        for (int i = 0; i < 1'000'000; ++i) {
            LineEndsFind(sample);
        }
        const auto finish = CACurrentMediaTime();
        const auto duration = finish - start;

        std::cout << "Unicode unsafe time for 1M strings: " << duration << '\n';
        std::cout << "Unicode unsafe result: ";
        for (const auto & pos : LineEndsFind(sample)) {
            std::cout << pos << ' ';
        }
        std::cout << '\n';
    }
    return 0;
}
import Foundation
import QuartzCore
/// Fast, Unicode-unsafe scan for line feeds.
///
/// Positions are offsets into the string's UTF-16 code units, so a character
/// that occupies several code units advances the offset by more than one.
///
/// - Parameter string: The text to scan.
/// - Returns: The UTF-16 offsets of every line feed (code unit 10), ascending.
public func makeLineEndArray(string: NSString) -> [Int] {
    let lineFeed: unichar = 10
    let end = string.length
    var offsets: [Int] = []
    var position = 0
    while position < end {
        if string.character(at: position) == lineFeed {
            offsets.append(position)
        }
        position += 1
    }
    return offsets
}
/// Slow, Unicode-safe scan for line ends.
///
/// Positions are counted in `Character`s (grapheme clusters), so a
/// multi-scalar emoji sequence counts as a single position.
///
/// A CRLF pair is one `Character` ("\r\n") that does not compare equal to
/// "\n", so it is matched explicitly; otherwise CRLF line ends would be
/// skipped while a UTF-16 scan of the same text would still report them.
///
/// An equivalent `enumerated().filter { }.map { }` chain was tried by the
/// original author and noted as even slower than this loop.
///
/// - Parameter string: The text to scan.
/// - Returns: The `Character` offsets of every "\n" or "\r\n", ascending.
public func makeUnicodeSafeLineEndArray(string: String) -> [Int] {
    var lineEnds: [Int] = []
    for (index, char) in string.enumerated() where char == "\n" || char == "\r\n" {
        lineEnds.append(index)
    }
    return lineEnds
}
// MARK: - Benchmark driver

// Sample input: the family emoji (man, woman, girl, boy joined by zero-width
// joiners) followed by three short lines. Spelled with Unicode escapes so the
// literal survives any source-encoding round trip.
let sample = "\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}\n1\n2\n"
// Bridged once up front so the timed loop measures the scan itself.
let bridgedSample = sample as NSString

// Time the UTF-16 (Unicode-unsafe) scanner.
do {
    let t1 = CACurrentMediaTime()
    for _ in 0..<1_000_000 {
        _ = makeLineEndArray(string: bridgedSample)
    }
    let t2 = CACurrentMediaTime()
    print("Unicode unsafe time for 1M strings: \(t2-t1)")
}

// Time the Character-based (Unicode-safe) scanner.
do {
    let t1 = CACurrentMediaTime()
    for _ in 0..<1_000_000 {
        _ = makeUnicodeSafeLineEndArray(string: sample)
    }
    let t2 = CACurrentMediaTime()
    print("Unicode safe time for 1M strings: \(t2-t1)")
}

// Show what each scanner reports for the sample.
print("Unicode unsafe result: \(makeLineEndArray(string: bridgedSample))")
print("Unicode safe result: \(makeUnicodeSafeLineEndArray(string: sample))")
/// Second, more Swifty attempt: scans the string's UTF-16 view.
///
/// Deliberately returns nothing (kept that way for testing); it only prints
/// how many line feeds were found.
///
/// - Parameter string: The text to scan. Must not be empty (enforced with
///   `precondition`).
public class func makeLineEndArray2(string: String) {
    precondition(!string.isEmpty)
    var lineEnds = [Int]()
    // Walk the UTF-16 view once, pairing each code unit with its offset,
    // rather than building a `String.UTF16View.Index` from an `Int` for every
    // position (that initializer is not available in current Swift).
    for (offset, codeUnit) in string.utf16.enumerated() where codeUnit == 10 {
        lineEnds.append(offset)
    }
    print("From swift: lineEndsCount = \(lineEnds.count)")
}
@dmcyk
Copy link

dmcyk commented Apr 23, 2018

hey, guess it might be better to make this comparison when compiled with optimisations.

On my MBP 13':

swiftc LineEndsFind.swift; ./LineEndsFind
----
Unicode unsafe time for 1M strings: 1.29247910302365
Unicode safe time for 1M strings: 4.18399031000445
Unicode unsafe result: [11, 13, 15]
Unicode safe result: [1, 3, 5]

while

swiftc -O LineEndsFind.swift; ./LineEndsFind
----
Unicode unsafe time for 1M strings: 1.1233036009944
Unicode safe time for 1M strings: 2.22174877097132
Unicode unsafe result: [11, 13, 15]
Unicode safe result: [1, 3, 5]

and C++:

clang++ LineEndsFind.mm -ObjC++ -std=c++14 -fobjc-arc -framework QuartzCore -o main; ./main
----
Unicode unsafe time for 1M strings: 1.06488
Unicode unsafe result: 11 13 15 
clang++ LineEndsFind.mm -ObjC++ -std=c++14 -fobjc-arc -O3 -framework QuartzCore -o main; ./main
---- 
Unicode unsafe time for 1M strings: 0.705442
Unicode unsafe result: 11 13 15 

Quite a difference there for the unicode safe solution

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment