Skip to content

Instantly share code, notes, and snippets.

@RyuaNerin
Created February 8, 2015 22:38
Show Gist options
  • Save RyuaNerin/a510cca0b3901903b7d9 to your computer and use it in GitHub Desktop.
Save RyuaNerin/a510cca0b3901903b7d9 to your computer and use it in GitHub Desktop.
트위터 공홈에 있는 방식 그대로 썼다가 망한거
// Ported as-is from the tweet-length counting approach used on Twitter's official site.
/// <summary>
/// A URL string paired with a two-element index array describing where it sits in a text.
/// </summary>
class TwitterStr
{
    /// <summary>The URL text.</summary>
    public string url;

    /// <summary>
    /// Two-element array holding the constructor's <c>i1</c> at position 0 and <c>i2</c> at
    /// position 1; the callers in this file use them as the start and end offsets of the URL.
    /// </summary>
    public int[] indices;

    /// <summary>Creates an entry for <paramref name="url"/> with the index pair (i1, i2).</summary>
    /// <param name="url">The URL text.</param>
    /// <param name="i1">Value stored in <c>indices[0]</c>.</param>
    /// <param name="i2">Value stored in <c>indices[1]</c>.</param>
    public TwitterStr(string url, int i1, int i2)
    {
        this.url = url;
        indices = new[] { i1, i2 };
    }
}
/// <summary>
/// Computes the length of <paramref name="a"/> in which every extracted URL is counted
/// as a fixed number of characters instead of its literal length.
/// </summary>
/// <param name="a">The text to measure. <c>null</c> or empty yields 0.</param>
/// <returns>
/// The text length reported by <c>getUnicodeTextLength</c>, minus the span of each URL
/// found by <c>extractUrlsWithIndices</c>, plus a fixed per-URL length.
/// </returns>
private int GetLength(string a)
{
    // Nothing to count; also avoids handing null to the regex helpers.
    if (string.IsNullOrEmpty(a))
        return 0;

    // NOTE(review): presumably Twitter's shortened-URL lengths for http and https links
    // at the time this was written - confirm against the current service configuration.
    const int shortUrlLength = 22;
    const int shortUrlLengthHttps = 23;

    int length = getUnicodeTextLength(a);
    List<TwitterStr> urls = extractUrlsWithIndices(a);
    modifyIndicesFromUTF16ToUnicode(a, urls);

    foreach (TwitterStr entry in urls)
    {
        // start - end is negative: this removes the URL's own span from the total...
        length += entry.indices[0] - entry.indices[1];

        // ...and the fixed length is added in its place. Ordinal, case-insensitive
        // comparison: the scheme is protocol text, not natural-language text.
        length += entry.url.StartsWith("https://", StringComparison.OrdinalIgnoreCase)
            ? shortUrlLengthHttps
            : shortUrlLength;
    }

    return length;
}
// Matches one UTF-16 surrogate pair: a high surrogate (U+D800-U+DBFF) immediately
// followed by a low surrogate (U+DC00-U+DFFF).
private Regex regUnicodeText = new Regex(@"[\uD800-\uDBFF][\uDC00-\uDFFF]", RegexOptions.Compiled | RegexOptions.Multiline);

/// <summary>
/// Returns the length of <paramref name="text"/> where each surrogate pair counts as
/// one character rather than two UTF-16 code units.
/// </summary>
/// <param name="text">The text to measure; must not be <c>null</c>.</param>
/// <returns>The number of UTF-16 code units, with every surrogate pair counted once.</returns>
private int getUnicodeTextLength(string text)
{
    // Collapse every pair to a single placeholder character. Replacing with an empty
    // string would make characters outside the BMP count as zero.
    return regUnicodeText.Replace(text, " ").Length;
}
private Regex regExtractUrl = new Regex(@"(((?:[^A-Za-z0-9@@$##]|^))((https?:\/\/)?((?:(?:(?:[^](?:[_-]|[^])*)?[^]\.)*(?:(?:[^](?:-|[^])*)?[^]\.)(?:(?:(?:abogado|academy|accountants|active|actor|aero|agency|airforce|allfinanz|alsace|archi|army|arpa|asia|associates|attorney|auction|audio|autos|axa|band|bar|bargains|bayern|beer|berlin|best|bid|bike|bio|biz|black|blackfriday|blue|bmw|bnpparibas|boo|boutique|brussels|budapest|build|builders|business|buzz|bzh|cab|cal|camera|camp|cancerresearch|capetown|capital|caravan|cards|care|career|careers|casa|cash|cat|catering|center|ceo|cern|channel|cheap|christmas|chrome|church|citic|city|claims|cleaning|click|clinic|clothing|club|codes|coffee|college|cologne|com|community|company|computer|condos|construction|consulting|contractors|cooking|cool|coop|country|credit|creditcard|crs|cruises|cuisinella|cymru|dad|dance|dating|day|deals|degree|democrat|dental|dentist|desi|diamonds|diet|digital|direct|directory|discount|dnp|domains|durban|dvag|eat|edu|education|email|emerck|engineer|engineering|enterprises|equipment|esq|estate|eus|events|exchange|expert|exposed|fail|farm|feedback|finance|financial|fish|fishing|fitness|flights|florist|flsmidth|fly|foo|forsale|foundation|frl|frogans|fund|furniture|futbol|gal|gallery|gbiz|gent|gift|gifts|gives|glass|gle|global|globo|gmail|gmo|gmx|google|gop|gov|graphics|gratis|green|gripe|guide|guitars|guru|hamburg|haus|healthcare|help|here|hiphop|hiv|holdings|holiday|homes|horse|host|hosting|house|how|ibm|immo|immobilien|industries|info|ing|ink|institute|insure|int|international|investments|jetzt|jobs|joburg|juegos|kaufen|kim|kitchen|kiwi|koeln|krd|kred|lacaixa|land|lawyer|lease|lgbt|life|lighting|limited|limo|link|loans|london|lotto|ltda|luxe|luxury|maison|management|mango|market|marketing|media|meet|melbourne|meme|menu|miami|mil|mini|mobi|moda|moe|monash|mortgage|moscow|motorcycles|mov|museum|nagoya|name|navy|net|network|neustar|new|nexus|ngo|nhk|ninja|nra|nrw|nyc|okinawa|ong|onl|ooo|org|organic|otsuka|o
vh|paris|partners|parts|pharmacy|photo|photography|photos|physio|pics|pictures|pink|pizza|place|plumbing|pohl|poker|post|praxi|press|pro|prod|productions|prof|properties|property|pub|qpon|quebec|realtor|recipes|red|rehab|reise|reisen|ren|rentals|repair|report|republican|rest|restaurant|reviews|rich|rio|rip|rocks|rodeo|rsvp|ruhr|ryukyu|saarland|sarl|sca|scb|schmidt|schule|scot|services|sexy|shiksha|shoes|singles|social|software|sohu|solar|solutions|soy|space|spiegel|supplies|supply|support|surf|surgery|suzuki|systems|taipei|tatar|tattoo|tax|technology|tel|tienda|tips|tirol|today|tokyo|tools|top|town|toys|trade|training|travel|tui|university|uno|uol|vacations|vegas|ventures|vermögensberater|vermögensberatung|versicherung|vet|viajes|villas|vision|vlaanderen|vodka|vote|voting|voto|voyage|wales|wang|watch|webcam|website|wed|wedding|whoswho|wien|wiki|williamhill|wme|work|works|world|wtc|wtf|xxx|xyz|yachts|yandex|yoga|yokohama|youtube|zip|zone|дети|москва|онлайн|орг|рус|сайт|بازار|شبكة|موقع|संगठन|みんな|世界|中信|中文网|企业|佛山|公司|公益|商城|商标|在线|广东|我爱你|手机|政务|机构|游戏|移动|组织机构|网址|网络|集团|삼성)(?=[^0-9a-zA-Z@]|$))|(?:(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bl|bm|bn|bo|bq|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cw|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mf|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw|бел|мкд|мон|рф|срб|укр|қаз|الاردن|الجزائر|السعودية|المغرب|امارات|ایران|بھارت|تونس|سودان|سورية|عراق|عمان|فلسطين|قطر|مصر|مليسيا|پاک
ستان|भारत|বাংলা|ভারত|ਭਾਰਤ|ભારત|இந்தியா|இலங்கை|சிங்கப்பூர்|భారత్|ලංකා|ไทย|გე|中国|中國|台湾|台灣|新加坡|香港|한국)(?=[^0-9a-zA-Z@]|$))|(?:xn--[0-9a-z]+))))(?::([0-9]+))?(\/(?:(?:[a-z0-9!\*';:=\+,\.\$\/%#\[\]\-_~@|&À-ÖØ-öø-ÿĀ-ɏɓ-ɔɖ-ɗəɛɣɨɯɲʉʋʻ̀-ͯḀ-ỿ]*(?:\((?:[a-z0-9!\*';:=\+,\.\$\/%#\[\]\-_~@|&À-ÖØ-öø-ÿĀ-ɏɓ-ɔɖ-ɗəɛɣɨɯɲʉʋʻ̀-ͯḀ-ỿ]+|(?:[a-z0-9!\*';:=\+,\.\$\/%#\[\]\-_~@|&À-ÖØ-öø-ÿĀ-ɏɓ-ɔɖ-ɗəɛɣɨɯɲʉʋʻ̀-ͯḀ-ỿ]*\([a-z0-9!\*';:=\+,\.\$\/%#\[\]\-_~@|&À-ÖØ-öø-ÿĀ-ɏɓ-ɔɖ-ɗəɛɣɨɯɲʉʋʻ̀-ͯḀ-ỿ]+\)[a-z0-9!\*';:=\+,\.\$\/%#\[\]\-_~@|&À-ÖØ-öø-ÿĀ-ɏɓ-ɔɖ-ɗəɛɣɨɯɲʉʋʻ̀-ͯḀ-ỿ]*))\)[a-z0-9!\*';:=\+,\.\$\/%#\[\]\-_~@|&À-ÖØ-öø-ÿĀ-ɏɓ-ɔɖ-ɗəɛɣɨɯɲʉʋʻ̀-ͯḀ-ỿ]*)*[\+\-a-z0-9=_#\/À-ÖØ-öø-ÿĀ-ɏɓ-ɔɖ-ɗəɛɣɨɯɲʉʋʻ̀-ͯḀ-ỿ]|(?:\((?:[a-z0-9!\*';:=\+,\.\$\/%#\[\]\-_~@|&À-ÖØ-öø-ÿĀ-ɏɓ-ɔɖ-ɗəɛɣɨɯɲʉʋʻ̀-ͯḀ-ỿ]+|(?:[a-z0-9!\*';:=\+,\.\$\/%#\[\]\-_~@|&À-ÖØ-öø-ÿĀ-ɏɓ-ɔɖ-ɗəɛɣɨɯɲʉʋʻ̀-ͯḀ-ỿ]*\([a-z0-9!\*';:=\+,\.\$\/%#\[\]\-_~@|&À-ÖØ-öø-ÿĀ-ɏɓ-ɔɖ-ɗəɛɣɨɯɲʉʋʻ̀-ͯḀ-ỿ]+\)[a-z0-9!\*';:=\+,\.\$\/%#\[\]\-_~@|&À-ÖØ-öø-ÿĀ-ɏɓ-ɔɖ-ɗəɛɣɨɯɲʉʋʻ̀-ͯḀ-ỿ]*))\)))|(?:@[a-z0-9!\*';:=\+,\.\$\/%#\[\]\-_~@|&À-ÖØ-öø-ÿĀ-ɏɓ-ɔɖ-ɗəɛɣɨɯɲʉʋʻ̀-ͯḀ-ỿ]+\/))*)?(\?[a-z0-9!?\*'@\(\);:&=\+\$\/%#\[\]\-_\.,~|]*[a-z0-9_&=#\/])?", RegexOptions.Compiled | RegexOptions.IgnoreCase);
// Matches when the input ends with '-', '_', '.' or '/'.
// extractUrlsWithIndices applies it to the text captured just before a URL candidate
// that has no protocol, and skips the candidate when it matches.
private Regex reInvalidUrlWithoutProtocolPrecedingChars = new Regex(@"[-_.\/]$", RegexOptions.Compiled);
// Matches, at the start of the input and ignoring case, "http://t.co/" or "https://t.co/"
// followed by one or more ASCII letters or digits. Anything after that run is not part
// of the match.
private Regex reValidTcoUrl = new Regex(@"^https?:\/\/t\.co\/[a-z0-9]+", RegexOptions.Compiled | RegexOptions.IgnoreCase);
private Regex reValidAsciiDomain = new Regex(@"(?:(?:[\-a-z0-9À-ÖØ-öø-ÿĀ-ɏɓ-ɔɖ-ɗəɛɣɨɯɲʉʋʻ̀-ͯḀ-ỿ]+)\.)+(?:(?:(?:abogado|academy|accountants|active|actor|aero|agency|airforce|allfinanz|alsace|archi|army|arpa|asia|associates|attorney|auction|audio|autos|axa|band|bar|bargains|bayern|beer|berlin|best|bid|bike|bio|biz|black|blackfriday|blue|bmw|bnpparibas|boo|boutique|brussels|budapest|build|builders|business|buzz|bzh|cab|cal|camera|camp|cancerresearch|capetown|capital|caravan|cards|care|career|careers|casa|cash|cat|catering|center|ceo|cern|channel|cheap|christmas|chrome|church|citic|city|claims|cleaning|click|clinic|clothing|club|codes|coffee|college|cologne|com|community|company|computer|condos|construction|consulting|contractors|cooking|cool|coop|country|credit|creditcard|crs|cruises|cuisinella|cymru|dad|dance|dating|day|deals|degree|democrat|dental|dentist|desi|diamonds|diet|digital|direct|directory|discount|dnp|domains|durban|dvag|eat|edu|education|email|emerck|engineer|engineering|enterprises|equipment|esq|estate|eus|events|exchange|expert|exposed|fail|farm|feedback|finance|financial|fish|fishing|fitness|flights|florist|flsmidth|fly|foo|forsale|foundation|frl|frogans|fund|furniture|futbol|gal|gallery|gbiz|gent|gift|gifts|gives|glass|gle|global|globo|gmail|gmo|gmx|google|gop|gov|graphics|gratis|green|gripe|guide|guitars|guru|hamburg|haus|healthcare|help|here|hiphop|hiv|holdings|holiday|homes|horse|host|hosting|house|how|ibm|immo|immobilien|industries|info|ing|ink|institute|insure|int|international|investments|jetzt|jobs|joburg|juegos|kaufen|kim|kitchen|kiwi|koeln|krd|kred|lacaixa|land|lawyer|lease|lgbt|life|lighting|limited|limo|link|loans|london|lotto|ltda|luxe|luxury|maison|management|mango|market|marketing|media|meet|melbourne|meme|menu|miami|mil|mini|mobi|moda|moe|monash|mortgage|moscow|motorcycles|mov|museum|nagoya|name|navy|net|network|neustar|new|nexus|ngo|nhk|ninja|nra|nrw|nyc|okinawa|ong|onl|ooo|org|organic|otsuka|ovh|paris|partners|parts|pharmacy|photo|phot
ography|photos|physio|pics|pictures|pink|pizza|place|plumbing|pohl|poker|post|praxi|press|pro|prod|productions|prof|properties|property|pub|qpon|quebec|realtor|recipes|red|rehab|reise|reisen|ren|rentals|repair|report|republican|rest|restaurant|reviews|rich|rio|rip|rocks|rodeo|rsvp|ruhr|ryukyu|saarland|sarl|sca|scb|schmidt|schule|scot|services|sexy|shiksha|shoes|singles|social|software|sohu|solar|solutions|soy|space|spiegel|supplies|supply|support|surf|surgery|suzuki|systems|taipei|tatar|tattoo|tax|technology|tel|tienda|tips|tirol|today|tokyo|tools|top|town|toys|trade|training|travel|tui|university|uno|uol|vacations|vegas|ventures|vermögensberater|vermögensberatung|versicherung|vet|viajes|villas|vision|vlaanderen|vodka|vote|voting|voto|voyage|wales|wang|watch|webcam|website|wed|wedding|whoswho|wien|wiki|williamhill|wme|work|works|world|wtc|wtf|xxx|xyz|yachts|yandex|yoga|yokohama|youtube|zip|zone|дети|москва|онлайн|орг|рус|сайт|بازار|شبكة|موقع|संगठन|みんな|世界|中信|中文网|企业|佛山|公司|公益|商城|商标|在线|广东|我爱你|手机|政务|机构|游戏|移动|组织机构|网址|网络|集团|삼성)(?=[^0-9a-zA-Z@]|$))|(?:(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bl|bm|bn|bo|bq|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cw|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mf|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw|бел|мкд|мон|рф|срб|укр|қаз|الاردن|الجزائر|السعودية|المغرب|امارات|ایران|بھارت|تونس|سودان|سورية|عراق|عمان|فلسطين|قطر|مصر|مليسيا|پاکستان|भारत|বাংলা|ভারত|ਭਾਰਤ|ભારત|இந்தியா|இலங்
கை|சிங்கப்பூர்|భారత్|ලංකා|ไทย|გე|中国|中國|台湾|台灣|新加坡|香港|한국)(?=[^0-9a-zA-Z@]|$))|(?:xn--[0-9a-z]+))", RegexOptions.Compiled | RegexOptions.IgnoreCase);
// Intended to recognise a short domain whose top-level part is "co" or "tv"
// (used by extractUrlsWithIndices on each ASCII domain it finds).
// NOTE(review): the pattern text looks damaged. It stops inside the "(?=" lookahead, so
// the groups opened by "(?:(?:co|tv)(?=" are never closed in the literal, and each "[^]"
// has no member characters, which makes the following ']' part of the class.
// Confirm against the upstream pattern before relying on this regex.
// NOTE(review): unlike the neighbouring domain regexes this one is not created with
// RegexOptions.IgnoreCase - verify whether that is intentional.
private Regex reValidSpecialShortDomain = new Regex(@"^(?:(?:[^](?:-|[^])*)?[^]\.)(?:(?:co|tv)(?=[^0-9a-zA-Z@]|$", RegexOptions.Compiled);
private Regex reInvalidShortDomain = new Regex(@"^(?:(?:[^](?:-|[^])*)?[^]\.)(?:(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bl|bm|bn|bo|bq|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cw|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mf|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw|бел|мкд|мон|рф|срб|укр|қаз|الاردن|الجزائر|السعودية|المغرب|امارات|ایران|بھارت|تونس|سودان|سورية|عراق|عمان|فلسطين|قطر|مصر|مليسيا|پاکستان|भारत|বাংলা|ভারত|ਭਾਰਤ|ભારત|இந்தியா|இலங்கை|சிங்கப்பூர்|భారత్|ලංකා|ไทย|გე|中国|中國|台湾|台灣|新加坡|香港|한국)(?=[^0-9a-zA-Z@]|$", RegexOptions.Compiled | RegexOptions.IgnoreCase);
/// <summary>
/// Scans <paramref name="text"/> with <c>regExtractUrl</c> and returns every URL found,
/// each with its start/end offsets (UTF-16 positions) inside the text.
/// </summary>
/// <param name="text">The text to scan. <c>null</c> or empty yields an empty list.</param>
/// <param name="extractUrlsWithoutProtocol">
/// When <c>true</c> (default), candidates without an http/https prefix are also considered;
/// when <c>false</c>, such candidates are skipped.
/// </param>
/// <returns>A new list of the URLs found, in match order; never <c>null</c>.</returns>
private List<TwitterStr> extractUrlsWithIndices(string text, bool extractUrlsWithoutProtocol = true)
{
    List<TwitterStr> urls = new List<TwitterStr>();
    if (string.IsNullOrEmpty(text))
        return urls;

    // Cheap early-out: every URL kept below contains a '.', and one with a protocol
    // contains a ':'. The test is containment, not equality with the whole text.
    string requiredMarker = extractUrlsWithoutProtocol ? "." : ":";
    if (text.IndexOf(requiredMarker, StringComparison.Ordinal) < 0)
        return urls;

    // Advancing in the for-header guarantees that 'continue' still moves to the next
    // match; advancing only at the bottom of a while loop would spin forever.
    for (Match match = regExtractUrl.Match(text); match.Success; match = match.NextMatch())
    {
        // Capture-group roles as used below: 2 = text before the URL, 3 = the URL,
        // 4 = protocol, 5 = domain, 7 = path.
        string before = match.Groups[2].Value;
        string url = match.Groups[3].Value;
        string protocol = match.Groups[4].Value;
        string domain = match.Groups[5].Value;
        string path = match.Groups[7].Value;

        int endPosition = match.Index + match.Length;
        int startPosition = endPosition - url.Length;

        if (string.IsNullOrEmpty(protocol))
        {
            if (!extractUrlsWithoutProtocol || reInvalidUrlWithoutProtocolPrecedingChars.IsMatch(before))
                continue;

            bool hasPath = !string.IsNullOrEmpty(path);
            TwitterStr lastUrl = null;

            // Without a protocol only the ASCII domains inside the domain capture count.
            foreach (Match ascii in reValidAsciiDomain.Matches(domain))
            {
                int asciiStart = startPosition + ascii.Index;
                lastUrl = new TwitterStr(ascii.Value, asciiStart, asciiStart + ascii.Length);

                // Keep the domain when a path follows it, when it is one of the special
                // short domains, or when it is not flagged as an invalid short domain.
                if (hasPath
                    || reValidSpecialShortDomain.IsMatch(ascii.Value)
                    || !reInvalidShortDomain.IsMatch(ascii.Value))
                {
                    urls.Add(lastUrl);
                }
            }

            // No ASCII domain at all: skip this candidate entirely.
            if (lastUrl == null)
                continue;

            // lastUrl holds only the domain so far; extend it over the rest of the URL.
            if (hasPath)
            {
                // Swap just the first occurrence of the domain so that a path which
                // happens to repeat the domain text is left alone.
                int domainAt = url.IndexOf(domain, StringComparison.Ordinal);
                if (domainAt >= 0)
                {
                    lastUrl.url = url.Substring(0, domainAt)
                        + lastUrl.url
                        + url.Substring(domainAt + domain.Length);
                }
                lastUrl.indices[1] = endPosition;
            }
        }
        else
        {
            // For t.co links, cut the URL down to the part reValidTcoUrl accepts.
            Match tco = reValidTcoUrl.Match(url);
            if (tco.Success)
            {
                url = tco.Value;
                endPosition = startPosition + url.Length;
            }

            // Every URL with a protocol is reported, not only t.co ones.
            urls.Add(new TwitterStr(url, startPosition, endPosition));
        }
    }

    return urls;
}
/// <summary>
/// Forwards to <c>convertUnicodeIndices</c> with its flag set to <c>true</c>, the mode in
/// which the stored start indices are compared against UTF-16 char positions.
/// </summary>
/// <param name="a">The text the indices refer to.</param>
/// <param name="c">The entries whose indices are rewritten in place.</param>
private void modifyIndicesFromUTF16ToUnicode(string a, List<TwitterStr> c)
{
    const bool indicesAreUtf16 = true;
    convertUnicodeIndices(a, c, indicesAreUtf16);
}
/// <summary>
/// Rewrites the index pairs in <paramref name="b"/> between UTF-16 char positions and
/// code-point positions of <paramref name="a"/>, where a surrogate pair counts as one.
/// </summary>
/// <param name="a">The text the indices refer to.</param>
/// <param name="b">Entries to convert; sorted by start index and modified in place.</param>
/// <param name="c">
/// <c>true</c>: stored indices are UTF-16 positions and are replaced by code-point positions.
/// <c>false</c>: the reverse direction.
/// </param>
private void convertUnicodeIndices(string a, List<TwitterStr> b, bool c)
{
    if (b.Count == 0)
        return;

    int charIndex = 0;       // position in UTF-16 code units
    int codePointIndex = 0;  // position in code points

    // A single forward walk over the text requires ascending start indices.
    b.Sort((x, y) => x.indices[0].CompareTo(y.indices[0]));

    int entityIndex = 0;
    TwitterStr entity = b[0];
    while (charIndex < a.Length)
    {
        if (entity.indices[0] == (c ? charIndex : codePointIndex))
        {
            // Width is end - start; it is carried over unchanged to the converted start.
            int span = entity.indices[1] - entity.indices[0];
            entity.indices[0] = c ? codePointIndex : charIndex;
            entity.indices[1] = entity.indices[0] + span;

            entityIndex++;
            if (entityIndex == b.Count)
                break;
            entity = b[entityIndex];
        }

        // A high surrogate followed by a low surrogate occupies two chars but is one
        // code point, so skip the extra char.
        if (char.IsSurrogatePair(a, charIndex))
            charIndex++;

        // Both counters advance for every character; leaving these inside the surrogate
        // branch would stall the loop on any ordinary character.
        codePointIndex++;
        charIndex++;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment