Created
January 16, 2011 13:18
-
-
Save oleganza/781772 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#import "NSData+OADataHelpers.h" | |
@implementation NSData (OADataHelpers)

// Scans a single UTF-8 sequence starting at bytes[0].
//
// Well-formedness follows RFC 3629 / Unicode Table 3-7: at most 4 bytes per
// code point, no overlong encodings, no UTF-16 surrogates (U+D800..U+DFFF)
// and nothing above U+10FFFF.
//
// bytes       Pointer to the first byte to examine.
// available   Number of readable bytes at `bytes`; must be >= 1.
// wellFormed  Out-parameter: YES if a complete, well-formed sequence starts
//             at bytes[0], NO otherwise.
//
// Returns the number of bytes to consume (always >= 1). For a well-formed
// sequence this is its length (1..4). For an ill-formed one it is the length
// of the longest prefix that could still have started a valid sequence, so
// the caller resumes exactly at the first offending byte and never swallows
// data that is valid on its own.
static NSUInteger OAScanUTF8Sequence(const unsigned char *bytes, NSUInteger available, BOOL *wellFormed)
{
    const unsigned char lead = bytes[0];
    NSUInteger expectedLength;

    // Allowed range of the SECOND byte. It is narrower than 0x80..0xBF for a
    // few lead bytes in order to reject overlongs, surrogates and > U+10FFFF.
    unsigned char secondMin = 0x80;
    unsigned char secondMax = 0xBF;

    if (lead <= 0x7F) // 0xxxxxxx
    {
        *wellFormed = YES;
        return 1;
    }
    else if (lead >= 0xC2 && lead <= 0xDF) // 110xxxxx 10xxxxxx (0xC0/0xC1 are overlong)
    {
        expectedLength = 2;
    }
    else if (lead >= 0xE0 && lead <= 0xEF) // 1110xxxx 10xxxxxx 10xxxxxx
    {
        expectedLength = 3;
        if (lead == 0xE0) secondMin = 0xA0;      // below 0xA0 would be an overlong encoding
        else if (lead == 0xED) secondMax = 0x9F; // above 0x9F would encode a surrogate
    }
    else if (lead >= 0xF0 && lead <= 0xF4) // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    {
        expectedLength = 4;
        if (lead == 0xF0) secondMin = 0x90;      // below 0x90 would be an overlong encoding
        else if (lead == 0xF4) secondMax = 0x8F; // above 0x8F would exceed U+10FFFF
    }
    else
    {
        // Stray continuation byte (0x80..0xBF), overlong lead (0xC0, 0xC1),
        // or a lead byte that is never valid in UTF-8 (0xF5..0xFF), which
        // includes the obsolete 5- and 6-byte forms.
        *wellFormed = NO;
        return 1;
    }

    NSUInteger matched = 1;
    while (matched < expectedLength && matched < available)
    {
        const unsigned char next = bytes[matched];
        const unsigned char lowest = (matched == 1) ? secondMin : 0x80;
        const unsigned char highest = (matched == 1) ? secondMax : 0xBF;
        if (next < lowest || next > highest)
        {
            break;
        }
        matched++;
    }

    // matched < expectedLength means either a bad continuation byte or input
    // that ended in the middle of the sequence.
    *wellFormed = (matched == expectedLength);
    return matched;
}

// Decodes the receiver as UTF-8 after healing it with -dataByHealingUTF8Stream,
// so malformed input produces U+FFFD characters instead of a nil string.
//
// Returns an autoreleased NSString.
//
// NOTE: NSString declares a selector with the same name that returns
// `const char *`; prefer a statically typed NSData receiver when calling this.
- (NSString*) UTF8String
{
    // This file is written for manual retain/release, hence the -autorelease.
    NSString *string = [[NSString alloc] initWithData:[self dataByHealingUTF8Stream]
                                             encoding:NSUTF8StringEncoding];
    return [string autorelease];
}

// Returns a copy of the receiver in which every ill-formed UTF-8 subsequence
// is replaced by U+FFFD REPLACEMENT CHARACTER (bytes EF BF BD), so the result
// is always valid UTF-8.
//
// - Well-formed sequences are copied through untouched.
// - Each maximal ill-formed subsequence (including a sequence cut off by the
//   end of the data) becomes exactly one U+FFFD.
// - Bytes following a broken lead byte are re-examined on their own, so valid
//   characters next to the damage are preserved.
// - An empty receiver yields empty data.
- (NSData*) dataByHealingUTF8Stream
{
    // Spelled out as bytes so the output does not depend on how this source
    // file itself is encoded.
    static const unsigned char replacementBytes[] = { 0xEF, 0xBF, 0xBD };

    const unsigned char *bytes = (const unsigned char *)[self bytes];
    const NSUInteger length = [self length];
    NSMutableData *healed = [NSMutableData dataWithCapacity:length];

    NSUInteger index = 0;
    NSUInteger validRunStart = 0; // start of the pending run of valid bytes not yet copied

    while (index < length)
    {
        BOOL wellFormed = NO;
        const NSUInteger consumed = OAScanUTF8Sequence(bytes + index, length - index, &wellFormed);

        if (!wellFormed)
        {
            // Copy the valid run that precedes the damage in one append, then
            // substitute the damaged bytes.
            if (index > validRunStart)
            {
                [healed appendBytes:bytes + validRunStart length:index - validRunStart];
            }
            [healed appendBytes:replacementBytes length:sizeof(replacementBytes)];
            validRunStart = index + consumed;
        }

        index += consumed;
    }

    // Copy whatever valid bytes remain after the last replacement.
    if (length > validRunStart)
    {
        [healed appendBytes:bytes + validRunStart length:length - validRunStart];
    }

    return healed;
}

@end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment