Created
May 15, 2015 02:07
-
-
Save d6u/b1863b9c169d31719186 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // | |
| // NSString+HTML.m | |
| // MWFeedParser | |
| // | |
| // Copyright (c) 2010 Michael Waterfall | |
| // | |
| // Permission is hereby granted, free of charge, to any person obtaining a copy | |
| // of this software and associated documentation files (the "Software"), to deal | |
| // in the Software without restriction, including without limitation the rights | |
| // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| // copies of the Software, and to permit persons to whom the Software is | |
| // furnished to do so, subject to the following conditions: | |
| // | |
| // 1. The above copyright notice and this permission notice shall be included | |
| // in all copies or substantial portions of the Software. | |
| // | |
| // 2. This Software cannot be used to archive or collect data such as (but not | |
| // limited to) that of events, news, experiences and activities, for the | |
| // purpose of any concept relating to diary/journal keeping. | |
| // | |
| // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
| // THE SOFTWARE. | |
| // | |
| #import "NSString+HTML.h" | |
| #import "GTMNSString+HTML.h" | |
| @implementation NSString (HTML) | |
| #pragma mark - Instance Methods | |
| - (NSString *)stringByConvertingHTMLToPlainText { | |
| @autoreleasepool { | |
| // Character sets | |
| NSCharacterSet *stopCharacters = [NSCharacterSet characterSetWithCharactersInString:[NSString stringWithFormat:@"< \t\n\r%C%C%C%C", (unichar)0x0085, (unichar)0x000C, (unichar)0x2028, (unichar)0x2029]]; | |
| NSCharacterSet *newLineAndWhitespaceCharacters = [NSCharacterSet characterSetWithCharactersInString:[NSString stringWithFormat:@" \t\n\r%C%C%C%C", (unichar)0x0085, (unichar)0x000C, (unichar)0x2028, (unichar)0x2029]]; | |
| NSCharacterSet *tagNameCharacters = [NSCharacterSet characterSetWithCharactersInString:@"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"]; | |
| // Scan and find all tags | |
| NSMutableString *result = [[NSMutableString alloc] initWithCapacity:self.length]; | |
| NSScanner *scanner = [[NSScanner alloc] initWithString:self]; | |
| [scanner setCharactersToBeSkipped:nil]; | |
| [scanner setCaseSensitive:YES]; | |
| NSString *str = nil, *tagName = nil; | |
| BOOL dontReplaceTagWithSpace = NO; | |
| do { | |
| // Scan up to the start of a tag or whitespace | |
| if ([scanner scanUpToCharactersFromSet:stopCharacters intoString:&str]) { | |
| [result appendString:str]; | |
| str = nil; // reset | |
| } | |
| // Check if we've stopped at a tag/comment or whitespace | |
| if ([scanner scanString:@"<" intoString:NULL]) { | |
| // Stopped at a comment, script tag, or other tag | |
| if ([scanner scanString:@"!--" intoString:NULL]) { | |
| // Comment | |
| [scanner scanUpToString:@"-->" intoString:NULL]; | |
| [scanner scanString:@"-->" intoString:NULL]; | |
| } else if ([scanner scanString:@"script" intoString:NULL]) { | |
| // Script tag where things don't need escaping! | |
| [scanner scanUpToString:@"</script>" intoString:NULL]; | |
| [scanner scanString:@"</script>" intoString:NULL]; | |
| } else { | |
| // Tag - remove and replace with space unless it's | |
| // a closing inline tag then dont replace with a space | |
| if ([scanner scanString:@"/" intoString:NULL]) { | |
| // Closing tag - replace with space unless it's inline | |
| tagName = nil; dontReplaceTagWithSpace = NO; | |
| if ([scanner scanCharactersFromSet:tagNameCharacters intoString:&tagName]) { | |
| tagName = [tagName lowercaseString]; | |
| dontReplaceTagWithSpace = ([tagName isEqualToString:@"a"] || | |
| [tagName isEqualToString:@"b"] || | |
| [tagName isEqualToString:@"i"] || | |
| [tagName isEqualToString:@"q"] || | |
| [tagName isEqualToString:@"span"] || | |
| [tagName isEqualToString:@"em"] || | |
| [tagName isEqualToString:@"strong"] || | |
| [tagName isEqualToString:@"cite"] || | |
| [tagName isEqualToString:@"abbr"] || | |
| [tagName isEqualToString:@"acronym"] || | |
| [tagName isEqualToString:@"label"]); | |
| } | |
| // Replace tag with string unless it was an inline | |
| if (!dontReplaceTagWithSpace && result.length > 0 && ![scanner isAtEnd]) [result appendString:@" "]; | |
| } | |
| // Scan past tag | |
| [scanner scanUpToString:@">" intoString:NULL]; | |
| [scanner scanString:@">" intoString:NULL]; | |
| } | |
| } else { | |
| // Stopped at whitespace - replace all whitespace and newlines with a space | |
| if ([scanner scanCharactersFromSet:newLineAndWhitespaceCharacters intoString:NULL]) { | |
| if (result.length > 0 && ![scanner isAtEnd]) [result appendString:@" "]; // Dont append space to beginning or end of result | |
| } | |
| } | |
| } while (![scanner isAtEnd]); | |
| // Cleanup | |
| // Decode HTML entities and return | |
| NSString *retString = [result stringByDecodingHTMLEntities]; | |
| // Return | |
| return retString; | |
| } | |
| } | |
| - (NSString *)stringByDecodingHTMLEntities { | |
| // Can return self so create new string if we're a mutable string | |
| return [NSString stringWithString:[self gtm_stringByUnescapingFromHTML]]; | |
| } | |
| - (NSString *)stringByEncodingHTMLEntities { | |
| // Can return self so create new string if we're a mutable string | |
| return [NSString stringWithString:[self gtm_stringByEscapingForAsciiHTML]]; | |
| } | |
| - (NSString *)stringByEncodingHTMLEntities:(BOOL)isUnicode { | |
| // Can return self so create new string if we're a mutable string | |
| return [NSString stringWithString:(isUnicode ? [self gtm_stringByEscapingForHTML] : [self gtm_stringByEscapingForAsciiHTML])]; | |
| } | |
| - (NSString *)stringWithNewLinesAsBRs { | |
| @autoreleasepool { | |
| // Strange New lines: | |
| // Next Line, U+0085 | |
| // Form Feed, U+000C | |
| // Line Separator, U+2028 | |
| // Paragraph Separator, U+2029 | |
| // Scanner | |
| NSScanner *scanner = [[NSScanner alloc] initWithString:self]; | |
| [scanner setCharactersToBeSkipped:nil]; | |
| NSMutableString *result = [[NSMutableString alloc] init]; | |
| NSString *temp; | |
| NSCharacterSet *newLineCharacters = [NSCharacterSet characterSetWithCharactersInString: | |
| [NSString stringWithFormat:@"\n\r%C%C%C%C", (unichar)0x0085, (unichar)0x000C, (unichar)0x2028, (unichar)0x2029]]; | |
| // Scan | |
| do { | |
| // Get non new line characters | |
| temp = nil; | |
| [scanner scanUpToCharactersFromSet:newLineCharacters intoString:&temp]; | |
| if (temp) [result appendString:temp]; | |
| temp = nil; | |
| // Add <br /> s | |
| if ([scanner scanString:@"\r\n" intoString:nil]) { | |
| // Combine \r\n into just 1 <br /> | |
| [result appendString:@"<br />"]; | |
| } else if ([scanner scanCharactersFromSet:newLineCharacters intoString:&temp]) { | |
| // Scan other new line characters and add <br /> s | |
| if (temp) { | |
| for (NSUInteger i = 0; i < temp.length; i++) { | |
| [result appendString:@"<br />"]; | |
| } | |
| } | |
| } | |
| } while (![scanner isAtEnd]); | |
| // Cleanup & return | |
| NSString *retString = [NSString stringWithString:result]; | |
| // Return | |
| return retString; | |
| } | |
| } | |
| - (NSString *)stringByRemovingNewLinesAndWhitespace { | |
| @autoreleasepool { | |
| // Strange New lines: | |
| // Next Line, U+0085 | |
| // Form Feed, U+000C | |
| // Line Separator, U+2028 | |
| // Paragraph Separator, U+2029 | |
| // Scanner | |
| NSScanner *scanner = [[NSScanner alloc] initWithString:self]; | |
| [scanner setCharactersToBeSkipped:nil]; | |
| NSMutableString *result = [[NSMutableString alloc] init]; | |
| NSString *temp; | |
| NSCharacterSet *newLineAndWhitespaceCharacters = [NSCharacterSet characterSetWithCharactersInString: | |
| [NSString stringWithFormat:@" \t\n\r%C%C%C%C", (unichar)0x0085, (unichar)0x000C, (unichar)0x2028, (unichar)0x2029]]; | |
| // Scan | |
| while (![scanner isAtEnd]) { | |
| // Get non new line or whitespace characters | |
| temp = nil; | |
| [scanner scanUpToCharactersFromSet:newLineAndWhitespaceCharacters intoString:&temp]; | |
| if (temp) [result appendString:temp]; | |
| // Replace with a space | |
| if ([scanner scanCharactersFromSet:newLineAndWhitespaceCharacters intoString:NULL]) { | |
| if (result.length > 0 && ![scanner isAtEnd]) // Dont append space to beginning or end of result | |
| [result appendString:@" "]; | |
| } | |
| } | |
| // Cleanup | |
| // Return | |
| NSString *retString = [NSString stringWithString:result]; | |
| // Return | |
| return retString; | |
| } | |
| } | |
| - (NSString *)stringByLinkifyingURLs { | |
| if (!NSClassFromString(@"NSRegularExpression")) return self; | |
| @autoreleasepool { | |
| NSString *pattern = @"(?<!=\")\\b((http|https):\\/\\/[\\w\\-_]+(\\.[\\w\\-_]+)+([\\w\\-\\.,@?^=%%&:/~\\+#]*[\\w\\-\\@?^=%%&/~\\+#])?)"; | |
| NSRegularExpression *regex = [NSRegularExpression regularExpressionWithPattern:pattern options:0 error:nil]; | |
| NSString *modifiedString = [regex stringByReplacingMatchesInString:self options:0 range:NSMakeRange(0, [self length]) | |
| withTemplate:@"<a href=\"$1\" class=\"linkified\">$1</a>"]; | |
| return modifiedString; | |
| } | |
| } | |
| - (NSString *)stringByStrippingTags { | |
| @autoreleasepool { | |
| // Find first & and short-cut if we can | |
| NSUInteger ampIndex = [self rangeOfString:@"<" options:NSLiteralSearch].location; | |
| if (ampIndex == NSNotFound) { | |
| return [NSString stringWithString:self]; // return copy of string as no tags found | |
| } | |
| // Scan and find all tags | |
| NSScanner *scanner = [NSScanner scannerWithString:self]; | |
| [scanner setCharactersToBeSkipped:nil]; | |
| NSMutableSet *tags = [[NSMutableSet alloc] init]; | |
| NSString *tag; | |
| do { | |
| // Scan up to < | |
| tag = nil; | |
| [scanner scanUpToString:@"<" intoString:NULL]; | |
| [scanner scanUpToString:@">" intoString:&tag]; | |
| // Add to set | |
| if (tag) { | |
| NSString *t = [[NSString alloc] initWithFormat:@"%@>", tag]; | |
| [tags addObject:t]; | |
| } | |
| } while (![scanner isAtEnd]); | |
| // Strings | |
| NSMutableString *result = [[NSMutableString alloc] initWithString:self]; | |
| NSString *finalString; | |
| // Replace tags | |
| NSString *replacement; | |
| for (NSString *t in tags) { | |
| // Replace tag with space unless it's an inline element | |
| replacement = @" "; | |
| if ([t isEqualToString:@"<a>"] || | |
| [t isEqualToString:@"</a>"] || | |
| [t isEqualToString:@"<span>"] || | |
| [t isEqualToString:@"</span>"] || | |
| [t isEqualToString:@"<strong>"] || | |
| [t isEqualToString:@"</strong>"] || | |
| [t isEqualToString:@"<em>"] || | |
| [t isEqualToString:@"</em>"]) { | |
| replacement = @""; | |
| } | |
| // Replace | |
| [result replaceOccurrencesOfString:t | |
| withString:replacement | |
| options:NSLiteralSearch | |
| range:NSMakeRange(0, result.length)]; | |
| } | |
| // Remove multi-spaces and line breaks | |
| finalString = [result stringByRemovingNewLinesAndWhitespace]; | |
| // Cleanup | |
| // Return | |
| return finalString; | |
| } | |
| } | |
| @end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment