Created
April 15, 2010 02:00
-
-
Save gidili/366599 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Kind of content a single tweet section (one space-delimited word) holds.
/// </summary>
/// <remarks>
/// The numeric values are spelled out to match the values the members
/// receive implicitly from their declaration order.
/// </remarks>
public enum TweetSectionType
{
    /// <summary>Plain text with no special meaning.</summary>
    NormalText = 0,

    /// <summary>A word recognised as a URL.</summary>
    Url = 1,

    /// <summary>A word containing an @name mention.</summary>
    AtName = 2,

    /// <summary>A word containing a #hashtag.</summary>
    HashCode = 3
}
/// <summary>
/// One piece of a tweet: the raw text of the piece plus the kind of
/// content it was classified as.
/// </summary>
public class TweetSection
{
    /// <summary>Classification assigned to <see cref="Text"/>.</summary>
    public TweetSectionType SectionType;

    /// <summary>The raw text of this section.</summary>
    public string Text;
}
/// <summary>
/// Splits a tweet into space-delimited words and tags each word as normal
/// text, a URL, an @name mention or a #hashtag.
/// </summary>
public class TweetDecoder
{
    #region regex patterns

    /// <summary>
    /// Matches '@' followed by one or more letters, digits or underscores.
    /// The pattern is not anchored, so it matches anywhere inside a word.
    /// </summary>
    public const string atRegexPattern = @"@([A-Za-z0-9_]+)";

    /// <summary>
    /// Matches '#' followed by one or more letters, digits or underscores.
    /// The pattern is not anchored, so it matches anywhere inside a word.
    /// </summary>
    public const string hashRegexPattern = @"#([A-Za-z0-9_]+)";

    /// <summary>
    /// URL pattern (here be dragons). It is anchored with ^ and $, so it only
    /// matches when the entire input is a URL.
    /// </summary>
    public const string urlRegexDragonPattern = @"^(?#Protocol)(?:(?:ht|f)tp(?:s?)\:\/\/|~\/|\/)?(?#Username:Password)(?:\w+:\w+@)?((?#Subdomains)(?:(?:[-\w\d{1-3}]+\.)+(?#TopLevel Domains)(?:com|org|net|gov|mil|biz|info|mobi|name|aero|jobs|edu|co\.uk|ac\.uk|it|fr|tv|museum|asia|local|travel|[a-z]{2})?)|(?#IP)((\b25[0-5]\b|\b[2][0-4][0-9]\b|\b[0-1]?[0-9]?[0-9]\b)(\.(\b25[0-5]\b|\b[2][0-4][0-9]\b|\b[0-1]?[0-9]?[0-9]\b)){3}))(?#Port)(?::[\d]{1,5})?(?#Directories)(?:(?:(?:\/(?:[-\w~!$+|.,=]|%[a-f\d]{2})+)+|\/)+|\?|#)?(?#Query)(?:(?:\?(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)(?:&(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)*)*(?#Anchor)(?:#(?:[-\w~!$ |/.,*:;=]|%[a-f\d]{2})*)?$";

    #endregion

    #region regexes

    // The patterns never change, so each regex is built once, compiled, and
    // shared by every call.
    private static readonly Regex atRegex = new Regex(atRegexPattern, RegexOptions.Compiled);
    private static readonly Regex hashRegex = new Regex(hashRegexPattern, RegexOptions.Compiled);
    private static readonly Regex urlRegex = new Regex(urlRegexDragonPattern, RegexOptions.Compiled);

    #endregion

    /// <summary>
    /// Splits <paramref name="originalTweet"/> on single spaces and classifies
    /// each resulting word.
    /// </summary>
    /// <param name="originalTweet">The tweet text to decode.</param>
    /// <returns>
    /// One <see cref="TweetSection"/> per word, in the original order. Runs of
    /// consecutive spaces yield empty-string sections, so joining the section
    /// texts with a single space reproduces the input.
    /// </returns>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="originalTweet"/> is null.
    /// </exception>
    public static List<TweetSection> DecodeSections(string originalTweet)
    {
        if (originalTweet == null)
        {
            throw new System.ArgumentNullException("originalTweet");
        }

        var sections = new List<TweetSection>();

        foreach (var word in originalTweet.Split(' '))
        {
            sections.Add(new TweetSection { SectionType = ClassifyWord(word), Text = word });
        }

        return sections;
    }

    // Decides which kind of section a single word is.
    private static TweetSectionType ClassifyWord(string word)
    {
        // The URL test must run first: the URL pattern has to match the whole
        // word, whereas the @ and # patterns match anywhere inside it. Testing
        // them first would tag a URL carrying a "#fragment" or "user:pass@"
        // part as a hashtag or a mention.
        if (urlRegex.IsMatch(word))
        {
            return TweetSectionType.Url;
        }

        if (atRegex.IsMatch(word))
        {
            return TweetSectionType.AtName;
        }

        if (hashRegex.IsMatch(word))
        {
            return TweetSectionType.HashCode;
        }

        // Anything unrecognised is plain text.
        return TweetSectionType.NormalText;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment