Created
April 28, 2019 16:44
-
-
Save tempelmann/165be6f21280caafc8dd0b344e413966 to your computer and use it in GitHub Desktop.
Search unicode (UTF-8) text in any normalization form, case-insensitive
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// Created by Thomas Tempelmann on 28.04.19. | |
// See also: https://stackoverflow.com/q/55884928/43615 | |
// | |
// This code relies on string functions only available on macOS, sorry. | |
// | |
#include <Foundation/Foundation.h> | |
#include <string.h> | |
#include <stdlib.h> | |
#include <stdio.h> | |
#include <regex.h> | |
#include <assert.h> | |
static NSMutableArray* addAlternatives (NSArray<NSArray<NSString*>*> *chars, NSInteger index, NSArray<NSString*> *variations) { | |
NSMutableArray<NSString*> *result = [NSMutableArray array]; | |
NSArray<NSString*> *alternativeChars = chars[index]; | |
for (NSString *ch in alternativeChars) { | |
for (NSString *variation in variations) { | |
[result addObject:[variation stringByAppendingString:ch]]; | |
} | |
} | |
if (++index < chars.count) { | |
result = addAlternatives (chars, index, result); | |
} | |
return result; | |
} | |
static NSString *regExCombinations (NSString *text) { | |
// While regex can search case insensitively, it only does that for ASCII letters (A-Z), but not | |
// for further international latin characters. It also does not account for variations in | |
// normalization of Unicode characters. | |
// This function therefore builds a regex search string that includes all the alternatives not | |
// covered by regex. | |
// It assumes that if there are more than one character that has different normalization represenations, | |
// the same normalization will be used on all these characters. This is based on the assumption that | |
// text in a file will use the same normalization throughout, instead of mixing them, at least within | |
// the short area of the text we're looking for. | |
// We need to precompose the string first. If we do not do that, a decomposed "ä" appears as two chars ("a" and "¨"), | |
// and our test that skips regular A-Z chars would prevent us from running into the upper/lowercase test. | |
text = text.precomposedStringWithCanonicalMapping; | |
// Split up the search text into an array of characters, for which we then each can have multiple alternatives | |
NSMutableArray<NSArray<NSString*>*> *charsOfText = [NSMutableArray arrayWithCapacity:text.length]; | |
for (NSInteger index = 0; index < text.length; ) { | |
NSRange range = [text rangeOfComposedCharacterSequenceAtIndex:index]; | |
NSString *ch = [text substringWithRange:range]; | |
NSArray<NSString*> *alternatives = @[ch]; // default: just the original char | |
if (range.length == 1 && [ch characterAtIndex:0] >= 128) { | |
NSString *ch1 = [ch lowercaseString]; | |
NSString *ch2 = [ch uppercaseString]; | |
if (![ch1 isEqualToString:ch2]) { | |
// this is a char that has different representation and which is probably not handled by regex, so let's create two alternatives of this | |
alternatives = @[ch1, ch2]; | |
} | |
} | |
[charsOfText addObject:alternatives]; | |
index += range.length; | |
} | |
// Assemble the created character alternatives back into full strings. | |
NSMutableArray<NSString*> *variations = addAlternatives (charsOfText, 0, @[@""]); | |
// Add variations of various normalization forms as well | |
SEL methods[] = { | |
@selector(decomposedStringWithCanonicalMapping), | |
@selector(precomposedStringWithCanonicalMapping), | |
@selector(decomposedStringWithCompatibilityMapping), | |
@selector(precomposedStringWithCompatibilityMapping) | |
}; | |
for (int methodIndex = 0; methodIndex < 4; ++methodIndex) { | |
for (int varIndex = 0; varIndex < variations.count; ++varIndex) { | |
NSString *variation = variations[varIndex]; | |
SEL selector = methods[methodIndex]; | |
NSString *str = ((id (*)(id, SEL))[variation methodForSelector:selector])(variation, selector); // https://stackoverflow.com/a/20058585/43615 | |
if (![variations containsObject:str]) { | |
[variations addObject:str]; | |
} | |
} | |
} | |
// Put them all together into a single regular expression | |
NSMutableArray *result = [NSMutableArray arrayWithCapacity:variations.count]; | |
[variations enumerateObjectsUsingBlock:^(NSString * _Nonnull str, NSUInteger idx, BOOL * _Nonnull stop) { | |
// We need to escape any special chars in the string so that regex does not interpret them | |
NSMutableString *escaped = [NSMutableString string]; | |
for (NSInteger index = 0; index < str.length; ) { | |
NSRange range = [str rangeOfComposedCharacterSequenceAtIndex:index]; | |
NSString *ch = [str substringWithRange:range]; | |
[escaped appendString:@"\\"]; | |
[escaped appendString:ch]; | |
index += range.length; | |
} | |
[result addObject:escaped]; | |
}]; | |
return [result componentsJoinedByString:@"|"]; | |
} | |
void findIn (const char *what, const char *data, int whatPre, int dataPre) { | |
int found; | |
regex_t re; | |
NSString *what2 = regExCombinations ([NSString stringWithUTF8String:what]); | |
what = what2.UTF8String; | |
regcomp (&re, what, REG_ICASE | REG_EXTENDED); | |
found = regexec(&re, data, 0, NULL, 0) == 0; | |
printf ("Found %-20s (%s) in %s (%s): %s\n", what, whatPre?"pre":"dec", data, dataPre?"pre":"dec", found?"yes":"no"); | |
} | |
void findInBoth (const char *what, int whatPre) { | |
char dataPre[] = { '<', 0xC3, 0xA4, '>', 0}; // precomposed | |
char dataDec[] = { '<', 0x61, 0xCC, 0x88, '>', 0}; // decomposed | |
findIn (what, dataPre, whatPre, 1); | |
findIn (what, dataDec, whatPre, 0); | |
} | |
int main(int argc, const char * argv[]) { | |
char a_pre[] = { 0xC3, 0xA4, 0}; // precomposed ä | |
char a_dec[] = { 0x61, 0xCC, 0x88, 0}; // decomposed ä | |
char A_pre[] = { 0xC3, 0x84, 0}; // precomposed Ä | |
char A_dec[] = { 0x41, 0xCC, 0x88, 0}; // decomposed Ä | |
findInBoth (a_pre, 1); | |
findInBoth (a_dec, 0); | |
findInBoth (A_pre, 1); | |
findInBoth (A_dec, 0); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Caveat: For long strings, e.g. above 8-10 chars, this algorithm can get very inefficient and slow because it has an exponential growth in the
variations
array.