Last active
August 29, 2015 14:04
-
-
Save mitsu-ksgr/7c20b4a71e1ab5a465e2 to your computer and use it in GitHub Desktop.
【C++】文字列中に指定文字列がいくつ出現するかカウントする関数。UTF-8の文字列の長さを取得する関数。
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream>
#include <string>
/**
 * @brief Return the number of occurrences of substring 'sub' in string 'src'.
 *
 * Overlapping occurrences are counted: after each match the search resumes
 * one character past the start of that match, so "aa" occurs twice in "aaa".
 *
 * @param src search target.
 * @param sub substring to look for. An empty substring is reported as
 *            0 occurrences.
 * @return the number of (possibly overlapping) occurrences of 'sub' in 'src'.
 */
int countSubStr(const std::string &src, const std::string &sub)
{
    // An empty needle would "match" at every position; report no occurrences
    // instead of a number that merely mirrors the length of 'src'.
    if (sub.empty())
        return 0;

    int count = 0;
    // Keep the position as size_type so the comparison with npos is exact and
    // no narrowing to int happens on large strings.
    std::string::size_type pos = src.find(sub);
    while (pos != std::string::npos) {
        ++count;
        // Restart one character after the match start to allow overlaps.
        // find() returns npos by itself when pos + 1 is past the end.
        pos = src.find(sub, pos + 1);
    }
    return count;
}
/**
 * @brief Return the length (number of characters) of a UTF-8 string.
 *
 * The string is scanned up to its NUL terminator. Each lead byte counts as one
 * character and the bytes belonging to its sequence are skipped. Continuation
 * bytes are not validated. Bytes that cannot start a sequence (stray
 * continuation bytes 10xxxxxx, or 11111xxx) are skipped without being counted.
 * A sequence truncated by the terminator still counts as one character, and
 * the scan never moves past the terminator.
 *
 * @param str NUL-terminated UTF-8 string; a null pointer yields 0.
 * @return the number of characters in the string.
 */
int getLengthUTF8String(const char *str)
{
    // Note: http://ja.wikipedia.org/wiki/UTF-8
    //
    // 1byte: 0xxxxxxx
    // 2byte: 110xxxxx
    // 3byte: 1110xxxx
    // 4byte: 11110xxx
    if (str == nullptr)
        return 0;

    int count = 0;
    // Inspect bytes as unsigned so the bit masks do not depend on whether
    // plain char is signed on the target platform.
    const unsigned char *cur = reinterpret_cast<const unsigned char *>(str);
    while (*cur != 0) {
        const unsigned char lead = *cur;
        int seqLen = 0;
        if ((lead & 0x80) == 0x00) {
            seqLen = 1;             // 1 byte character
        } else if ((lead & 0xE0) == 0xC0) {
            seqLen = 2;             // 2 byte character
        } else if ((lead & 0xF0) == 0xE0) {
            seqLen = 3;             // 3 byte character
        } else if ((lead & 0xF8) == 0xF0) {
            seqLen = 4;             // 4 byte character
        } else {
            // Not the first byte of a character: skip it uncounted.
            ++cur;
            continue;
        }

        ++count;
        // Skip the whole sequence, but stop at the terminator so a truncated
        // sequence cannot push the cursor beyond the end of the buffer.
        for (int i = 0; i < seqLen && *cur != 0; ++i)
            ++cur;
    }
    return count;
}
/**
 * @brief Return the length (number of characters) of a UTF-8 std::string.
 *
 * Convenience overload that forwards to the 'const char *' version through
 * c_str(); that version scans until the first NUL byte, so anything after an
 * embedded NUL is not counted.
 *
 * @param str UTF-8 string; taken by const reference so const objects and
 *            temporaries are accepted as well.
 * @return the number of characters in the string.
 */
int getLengthUTF8String(const std::string &str)
{
    return getLengthUTF8String(str.c_str());
}
int main(int argc, const char **argv) | |
{ | |
std::string src, sub; | |
if(argc >= 3) { | |
src = argv[1]; | |
sub = argv[2]; | |
} else { | |
src = "abcabcabc"; | |
sub = "bc"; | |
} | |
std::cout << "src = " << src << std::endl; | |
std::cout << "sub = " << sub << std::endl; | |
// Test. | |
int cnt = countSubStr(src, sub); | |
std::cout << "cnt = " << cnt << std::endl; | |
// Test2. | |
std::cout << "------------------------" << std::endl; | |
std::string utf8 = "ABCDEΑΒΓΔΕあいうえお𥼣𥽜𥿠𥿔𨷻"; | |
int len = getLengthUTF8String(utf8); | |
std::cout << "utf8 = " << utf8 << std::endl; | |
std::cout << "length = " << utf8.length() << std::endl; | |
std::cout << "getLengthUTF8String = " << len << std::endl; | |
return 0; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment