Skip to content

Instantly share code, notes, and snippets.

@tempelmann
Created April 28, 2019 16:44
Show Gist options
  • Save tempelmann/165be6f21280caafc8dd0b344e413966 to your computer and use it in GitHub Desktop.
Save tempelmann/165be6f21280caafc8dd0b344e413966 to your computer and use it in GitHub Desktop.
Search unicode (UTF-8) text in any normalization form, case-insensitive
//
// Created by Thomas Tempelmann on 28.04.19.
// See also: https://stackoverflow.com/q/55884928/43615
//
// This code relies on string functions only available on macOS, sorry.
//
#include <Foundation/Foundation.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <regex.h>
#include <assert.h>
static NSMutableArray* addAlternatives (NSArray<NSArray<NSString*>*> *chars, NSInteger index, NSArray<NSString*> *variations) {
NSMutableArray<NSString*> *result = [NSMutableArray array];
NSArray<NSString*> *alternativeChars = chars[index];
for (NSString *ch in alternativeChars) {
for (NSString *variation in variations) {
[result addObject:[variation stringByAppendingString:ch]];
}
}
if (++index < chars.count) {
result = addAlternatives (chars, index, result);
}
return result;
}
static NSString *regExCombinations (NSString *text) {
// While regex can search case insensitively, it only does that for ASCII letters (A-Z), but not
// for further international latin characters. It also does not account for variations in
// normalization of Unicode characters.
// This function therefore builds a regex search string that includes all the alternatives not
// covered by regex.
// It assumes that if there are more than one character that has different normalization represenations,
// the same normalization will be used on all these characters. This is based on the assumption that
// text in a file will use the same normalization throughout, instead of mixing them, at least within
// the short area of the text we're looking for.
// We need to precompose the string first. If we do not do that, a decomposed "ä" appears as two chars ("a" and "¨"),
// and our test that skips regular A-Z chars would prevent us from running into the upper/lowercase test.
text = text.precomposedStringWithCanonicalMapping;
// Split up the search text into an array of characters, for which we then each can have multiple alternatives
NSMutableArray<NSArray<NSString*>*> *charsOfText = [NSMutableArray arrayWithCapacity:text.length];
for (NSInteger index = 0; index < text.length; ) {
NSRange range = [text rangeOfComposedCharacterSequenceAtIndex:index];
NSString *ch = [text substringWithRange:range];
NSArray<NSString*> *alternatives = @[ch]; // default: just the original char
if (range.length == 1 && [ch characterAtIndex:0] >= 128) {
NSString *ch1 = [ch lowercaseString];
NSString *ch2 = [ch uppercaseString];
if (![ch1 isEqualToString:ch2]) {
// this is a char that has different representation and which is probably not handled by regex, so let's create two alternatives of this
alternatives = @[ch1, ch2];
}
}
[charsOfText addObject:alternatives];
index += range.length;
}
// Assemble the created character alternatives back into full strings.
NSMutableArray<NSString*> *variations = addAlternatives (charsOfText, 0, @[@""]);
// Add variations of various normalization forms as well
SEL methods[] = {
@selector(decomposedStringWithCanonicalMapping),
@selector(precomposedStringWithCanonicalMapping),
@selector(decomposedStringWithCompatibilityMapping),
@selector(precomposedStringWithCompatibilityMapping)
};
for (int methodIndex = 0; methodIndex < 4; ++methodIndex) {
for (int varIndex = 0; varIndex < variations.count; ++varIndex) {
NSString *variation = variations[varIndex];
SEL selector = methods[methodIndex];
NSString *str = ((id (*)(id, SEL))[variation methodForSelector:selector])(variation, selector); // https://stackoverflow.com/a/20058585/43615
if (![variations containsObject:str]) {
[variations addObject:str];
}
}
}
// Put them all together into a single regular expression
NSMutableArray *result = [NSMutableArray arrayWithCapacity:variations.count];
[variations enumerateObjectsUsingBlock:^(NSString * _Nonnull str, NSUInteger idx, BOOL * _Nonnull stop) {
// We need to escape any special chars in the string so that regex does not interpret them
NSMutableString *escaped = [NSMutableString string];
for (NSInteger index = 0; index < str.length; ) {
NSRange range = [str rangeOfComposedCharacterSequenceAtIndex:index];
NSString *ch = [str substringWithRange:range];
[escaped appendString:@"\\"];
[escaped appendString:ch];
index += range.length;
}
[result addObject:escaped];
}];
return [result componentsJoinedByString:@"|"];
}
void findIn (const char *what, const char *data, int whatPre, int dataPre) {
int found;
regex_t re;
NSString *what2 = regExCombinations ([NSString stringWithUTF8String:what]);
what = what2.UTF8String;
regcomp (&re, what, REG_ICASE | REG_EXTENDED);
found = regexec(&re, data, 0, NULL, 0) == 0;
printf ("Found %-20s (%s) in %s (%s): %s\n", what, whatPre?"pre":"dec", data, dataPre?"pre":"dec", found?"yes":"no");
}
void findInBoth (const char *what, int whatPre) {
char dataPre[] = { '<', 0xC3, 0xA4, '>', 0}; // precomposed
char dataDec[] = { '<', 0x61, 0xCC, 0x88, '>', 0}; // decomposed
findIn (what, dataPre, whatPre, 1);
findIn (what, dataDec, whatPre, 0);
}
int main(int argc, const char * argv[]) {
char a_pre[] = { 0xC3, 0xA4, 0}; // precomposed ä
char a_dec[] = { 0x61, 0xCC, 0x88, 0}; // decomposed ä
char A_pre[] = { 0xC3, 0x84, 0}; // precomposed Ä
char A_dec[] = { 0x41, 0xCC, 0x88, 0}; // decomposed Ä
findInBoth (a_pre, 1);
findInBoth (a_dec, 0);
findInBoth (A_pre, 1);
findInBoth (A_dec, 0);
return 0;
}
@tempelmann
Copy link
Author

Caveat: For long strings, e.g. above 8-10 chars, this algorithm can get very inefficient and slow because it has an exponential growth in the variations array.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment