Skip to content

Instantly share code, notes, and snippets.

@senarukana
Last active August 29, 2015 14:22
Show Gist options
  • Save senarukana/3d7335eda17b1e041fc1 to your computer and use it in GitHub Desktop.
Save senarukana/3d7335eda17b1e041fc1 to your computer and use it in GitHub Desktop.
UTF8 encode
#include <iostream>
using namespace std;
/*
0xxxxxxx A single-byte US-ASCII character (one of the first 128 code points, 0-127)
110xxxxx One more byte follows
1110xxxx Two more bytes follow
11110xxx Three more bytes follow
10xxxxxx A continuation of one of the multi-byte characters
*/
/*
 * Returns true when the NUL-terminated byte string `s` is well-formed UTF-8.
 *
 * A sequence is rejected when:
 *   - a lead byte is a stray continuation byte (10xxxxxx) or announces more
 *     than three continuation bytes (11111xxx),
 *   - a multi-byte sequence is truncated or contains a non-continuation byte,
 *   - the encoding is overlong (the code point fits in a shorter sequence),
 *   - the decoded code point is a UTF-16 surrogate (U+D800..U+DFFF),
 *   - the decoded code point is above U+10FFFF.
 *
 * The empty string is valid. A null pointer is reported as invalid.
 */
bool validUTF8(const char *s) {
    if (!s) return false;

    // Work on unsigned bytes so comparisons and masks do not depend on
    // whether plain `char` is signed on this platform.
    const unsigned char *p = reinterpret_cast<const unsigned char *>(s);

    while (*p != 0) {
        const unsigned char lead = *p++;

        if (lead < 0x80) continue;  // 0xxxxxxx: single-byte ASCII

        int extra;            // continuation bytes still expected
        unsigned long cp;     // code point being decoded
        unsigned long lowest; // smallest code point that needs this length

        if ((lead & 0xE0) == 0xC0) {         // 110xxxxx
            extra = 1; cp = lead & 0x1F; lowest = 0x80;
        } else if ((lead & 0xF0) == 0xE0) {  // 1110xxxx
            extra = 2; cp = lead & 0x0F; lowest = 0x800;
        } else if ((lead & 0xF8) == 0xF0) {  // 11110xxx
            extra = 3; cp = lead & 0x07; lowest = 0x10000;
        } else {
            return false;  // stray continuation byte or 11111xxx lead
        }

        for (; extra > 0; --extra, ++p) {
            // The NUL terminator is not a continuation byte, so a truncated
            // sequence fails here and we never read past the end.
            if ((*p & 0xC0) != 0x80) return false;
            cp = (cp << 6) | (*p & 0x3F);
        }

        if (cp < lowest) return false;                   // overlong encoding
        if (cp >= 0xD800 && cp <= 0xDFFF) return false;  // UTF-16 surrogate
        if (cp > 0x10FFFF) return false;                 // beyond Unicode range
    }
    return true;
}
// 11010000
// 10011000
// Demo driver: runs validUTF8 on a sample literal mixing ASCII and CJK text
// and prints the boolean result (1 or 0) followed by a newline.
// NOTE(review): the bytes of the literal depend on the source/execution
// character set used at compile time; presumably UTF-8 — confirm.
int main() {
    const char *sample = "Hello, 李者";
    const bool wellFormed = validUTF8(sample);
    std::cout << wellFormed << std::endl;
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment