tempelmann · April 28, 2019 16:44 · tempelmann · Nov 5, 2019
diff --git a/regex with normalization variations.c b/regex with normalization variations.c
 //
 //  Created by Thomas Tempelmann on 28.04.19.
 //  See also: https://stackoverflow.com/q/55884928/43615
 //
 // This code relies on string functions only available on macOS, sorry.
 //

 #include <Foundation/Foundation.h>
 #include <string.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <regex.h>
 #include <assert.h>

 static NSMutableArray* addAlternatives (NSArray<NSArray<NSString*>*> *chars, NSInteger index, NSArray<NSString*> *variations) {
 	NSMutableArray<NSString*> *result = [NSMutableArray array];
 	NSArray<NSString*> *alternativeChars = chars[index];
 	for (NSString *ch in alternativeChars) {
 		for (NSString *variation in variations) {
 			[result addObject:[variation stringByAppendingString:ch]];
 		}
 	}
 	if (++index < chars.count) {
 		result = addAlternatives (chars, index, result);
 	}
 	return result;
 }

 static NSString *regExCombinations (NSString *text) {
 	// While regex can search case insensitively, it only does that for ASCII letters (A-Z), but not
 	// for further international latin characters. It also does not account for variations in
 	// normalization of Unicode characters.
 	// This function therefore builds a regex search string that includes all the alternatives not
 	// covered by regex.
 	// It assumes that if there are more than one character that has different normalization represenations,
 	// the same normalization will be used on all these characters. This is based on the assumption that
 	// text in a file will use the same normalization throughout, instead of mixing them, at least within
 	// the short area of the text we're looking for.
 	
 	// We need to precompose the string first. If we do not do that, a decomposed "ä" appears as two chars ("a" and "¨"),
 	// and our test that skips regular A-Z chars would prevent us from running into the upper/lowercase test. 
 	text = text.precomposedStringWithCanonicalMapping;
 	
 	// Split up the search text into an array of characters, for which we then each can have multiple alternatives
 	NSMutableArray<NSArray<NSString*>*> *charsOfText = [NSMutableArray arrayWithCapacity:text.length];
 	for (NSInteger index = 0; index < text.length; ) {
 		NSRange range = [text rangeOfComposedCharacterSequenceAtIndex:index];
 		NSString *ch = [text substringWithRange:range];
 		NSArray<NSString*> *alternatives = @[ch];	// default: just the original char
 		if (range.length == 1 && [ch characterAtIndex:0] >= 128) {
 			NSString *ch1 = [ch lowercaseString];
 			NSString *ch2 = [ch uppercaseString];
 			if (![ch1 isEqualToString:ch2]) {
 				// this is a char that has different representation and which is probably not handled by regex, so let's create two alternatives of this
 				alternatives = @[ch1, ch2];
 			}
 		}
 		[charsOfText addObject:alternatives];
 		index += range.length;
 	} 
 	
 	// Assemble the created character alternatives back into full strings.
 	NSMutableArray<NSString*> *variations = addAlternatives (charsOfText, 0, @[@""]);
 	
 	// Add variations of various normalization forms as well
 	SEL methods[] = {
 		@selector(decomposedStringWithCanonicalMapping),
 		@selector(precomposedStringWithCanonicalMapping),
 		@selector(decomposedStringWithCompatibilityMapping),
 		@selector(precomposedStringWithCompatibilityMapping)
 	};
 	for (int methodIndex = 0; methodIndex < 4; ++methodIndex) {
 		for (int varIndex = 0; varIndex < variations.count; ++varIndex) {
 			NSString *variation = variations[varIndex];
 			SEL selector = methods[methodIndex];
 			NSString *str = ((id (*)(id, SEL))[variation methodForSelector:selector])(variation, selector);	// https://stackoverflow.com/a/20058585/43615
 			if (![variations containsObject:str]) {
 				[variations addObject:str];
 			}
 		}
 	}

 	// Put them all together into a single regular expression
 	NSMutableArray *result = [NSMutableArray arrayWithCapacity:variations.count];
 	[variations enumerateObjectsUsingBlock:^(NSString * _Nonnull str, NSUInteger idx, BOOL * _Nonnull stop) {
 		// We need to escape any special chars in the string so that regex does not interpret them
 		NSMutableString *escaped = [NSMutableString string];
 		for (NSInteger index = 0; index < str.length; ) {
 			NSRange range = [str rangeOfComposedCharacterSequenceAtIndex:index];
 			NSString *ch = [str substringWithRange:range];
 			[escaped appendString:@"\\"];
 			[escaped appendString:ch];
 			index += range.length;
 		}
 		[result addObject:escaped];
 	}];
 	return [result componentsJoinedByString:@"|"];
 }

 void findIn (const char *what, const char *data, int whatPre, int dataPre) {
 	int found;
 	regex_t re;
 	NSString *what2 = regExCombinations ([NSString stringWithUTF8String:what]);
 	what = what2.UTF8String;
 	regcomp (&re, what, REG_ICASE | REG_EXTENDED);
 	found = regexec(&re, data, 0, NULL, 0) == 0;
 	printf ("Found %-20s (%s) in %s (%s): %s\n", what, whatPre?"pre":"dec", data, dataPre?"pre":"dec", found?"yes":"no");
 }

 void findInBoth (const char *what, int whatPre) {
 	char dataPre[] = { '<', 0xC3, 0xA4, '>', 0};		// precomposed
 	char dataDec[] = { '<', 0x61, 0xCC, 0x88, '>', 0};	// decomposed
 	findIn (what, dataPre, whatPre, 1);
 	findIn (what, dataDec, whatPre, 0);
 }

 int main(int argc, const char * argv[]) {
 	
 	char a_pre[] = { 0xC3, 0xA4, 0};		// precomposed ä
 	char a_dec[] = { 0x61, 0xCC, 0x88, 0};	// decomposed ä
 	char A_pre[] = { 0xC3, 0x84, 0};		// precomposed Ä
 	char A_dec[] = { 0x41, 0xCC, 0x88, 0};	// decomposed Ä
 	
 	findInBoth (a_pre, 1);
 	findInBoth (a_dec, 0);
 	findInBoth (A_pre, 1);
 	findInBoth (A_dec, 0);

 	return 0;
 }
	//
	// Created by Thomas Tempelmann on 28.04.19.
	// See also: https://stackoverflow.com/q/55884928/43615
	//
	// This code relies on string functions only available on macOS, sorry.
	//

	#include <Foundation/Foundation.h>
	#include <string.h>
	#include <stdlib.h>
	#include <stdio.h>
	#include <regex.h>
	#include <assert.h>

	static NSMutableArray* addAlternatives (NSArray<NSArray<NSString>> chars, NSInteger index, NSArray<NSString> *variations) {
	NSMutableArray<NSString> result = [NSMutableArray array];
	NSArray<NSString> alternativeChars = chars[index];
	for (NSString *ch in alternativeChars) {
	for (NSString *variation in variations) {
	[result addObject:[variation stringByAppendingString:ch]];
	}
	}
	if (++index < chars.count) {
	result = addAlternatives (chars, index, result);
	}
	return result;
	}

	static NSString regExCombinations (NSString text) {
	// While regex can search case insensitively, it only does that for ASCII letters (A-Z), but not
	// for further international latin characters. It also does not account for variations in
	// normalization of Unicode characters.
	// This function therefore builds a regex search string that includes all the alternatives not
	// covered by regex.
	// It assumes that if there are more than one character that has different normalization represenations,
	// the same normalization will be used on all these characters. This is based on the assumption that
	// text in a file will use the same normalization throughout, instead of mixing them, at least within
	// the short area of the text we're looking for.

	// We need to precompose the string first. If we do not do that, a decomposed "ä" appears as two chars ("a" and "¨"),
	// and our test that skips regular A-Z chars would prevent us from running into the upper/lowercase test.
	text = text.precomposedStringWithCanonicalMapping;

	// Split up the search text into an array of characters, for which we then each can have multiple alternatives
	NSMutableArray<NSArray<NSString>> *charsOfText = [NSMutableArray arrayWithCapacity:text.length];
	for (NSInteger index = 0; index < text.length; ) {
	NSRange range = [text rangeOfComposedCharacterSequenceAtIndex:index];
	NSString *ch = [text substringWithRange:range];
	NSArray<NSString> alternatives = @[ch]; // default: just the original char
	if (range.length == 1 && [ch characterAtIndex:0] >= 128) {
	NSString *ch1 = [ch lowercaseString];
	NSString *ch2 = [ch uppercaseString];
	if (![ch1 isEqualToString:ch2]) {
	// this is a char that has different representation and which is probably not handled by regex, so let's create two alternatives of this
	alternatives = @[ch1, ch2];
	}
	}
	[charsOfText addObject:alternatives];
	index += range.length;
	}

	// Assemble the created character alternatives back into full strings.
	NSMutableArray<NSString> variations = addAlternatives (charsOfText, 0, @[@""]);

	// Add variations of various normalization forms as well
	SEL methods[] = {
	@selector(decomposedStringWithCanonicalMapping),
	@selector(precomposedStringWithCanonicalMapping),
	@selector(decomposedStringWithCompatibilityMapping),
	@selector(precomposedStringWithCompatibilityMapping)
	};
	for (int methodIndex = 0; methodIndex < 4; ++methodIndex) {
	for (int varIndex = 0; varIndex < variations.count; ++varIndex) {
	NSString *variation = variations[varIndex];
	SEL selector = methods[methodIndex];
	NSString str = ((id ()(id, SEL))[variation methodForSelector:selector])(variation, selector); // https://stackoverflow.com/a/20058585/43615
	if (![variations containsObject:str]) {
	[variations addObject:str];
	}
	}
	}

	// Put them all together into a single regular expression
	NSMutableArray *result = [NSMutableArray arrayWithCapacity:variations.count];
	[variations enumerateObjectsUsingBlock:^(NSString * _Nonnull str, NSUInteger idx, BOOL * _Nonnull stop) {
	// We need to escape any special chars in the string so that regex does not interpret them
	NSMutableString *escaped = [NSMutableString string];
	for (NSInteger index = 0; index < str.length; ) {
	NSRange range = [str rangeOfComposedCharacterSequenceAtIndex:index];
	NSString *ch = [str substringWithRange:range];
	[escaped appendString:@"\\"];
	[escaped appendString:ch];
	index += range.length;
	}
	[result addObject:escaped];
	}];
	return [result componentsJoinedByString:@"\|"];
	}

	void findIn (const char what, const char data, int whatPre, int dataPre) {
	int found;
	regex_t re;
	NSString *what2 = regExCombinations ([NSString stringWithUTF8String:what]);
	what = what2.UTF8String;
	regcomp (&re, what, REG_ICASE \| REG_EXTENDED);
	found = regexec(&re, data, 0, NULL, 0) == 0;
	printf ("Found %-20s (%s) in %s (%s): %s\n", what, whatPre?"pre":"dec", data, dataPre?"pre":"dec", found?"yes":"no");
	}

	void findInBoth (const char *what, int whatPre) {
	char dataPre[] = { '<', 0xC3, 0xA4, '>', 0}; // precomposed
	char dataDec[] = { '<', 0x61, 0xCC, 0x88, '>', 0}; // decomposed
	findIn (what, dataPre, whatPre, 1);
	findIn (what, dataDec, whatPre, 0);
	}

	int main(int argc, const char * argv[]) {

	char a_pre[] = { 0xC3, 0xA4, 0}; // precomposed ä
	char a_dec[] = { 0x61, 0xCC, 0x88, 0}; // decomposed ä
	char A_pre[] = { 0xC3, 0x84, 0}; // precomposed Ä
	char A_dec[] = { 0x41, 0xCC, 0x88, 0}; // decomposed Ä

	findInBoth (a_pre, 1);
	findInBoth (a_dec, 0);
	findInBoth (A_pre, 1);
	findInBoth (A_dec, 0);

	return 0;
	}