Skip to content

Instantly share code, notes, and snippets.

@suchi
Created May 25, 2011 12:47
Show Gist options
  • Save suchi/990898 to your computer and use it in GitHub Desktop.
Save suchi/990898 to your computer and use it in GitHub Desktop.
日本語テキストを0x20区切りの分かち書きに簡易変換します。「プログラミング作法」 "The Practice of Programming" のMarkovChainを日本語でも楽しめるようにするためのもの。
#include <stdio.h>
#include <ctype.h>
typedef unsigned char uchar;

/// A character held as a pair of bytes (c1 = first byte, c2 = second byte).
///
/// nextchar() fills both bytes with 0 for single-byte (< 0x80) input, which
/// gettype() classifies as OTHERS.
///
/// NOTE(review): the byte values tested in gettype() look like Shift_JIS code
/// points (e.g. 0x8141 / 0x8142 for the ideographic comma / full stop) --
/// confirm the expected input encoding.
struct wc {
    /// Character classes used to decide where word boundaries go.
    enum wctype {
        KANJI,    // 0
        HIRAKATA, // 1
        PUNKT,    // 2
        OTHERS    // 3
    };

    uchar c1; ///< first byte of the pair
    uchar c2; ///< second byte of the pair

    /// Classifies the byte pair.
    ///
    /// The tests run in order and the first match wins, so the kana and
    /// 0x815b checks take precedence over the broader KANJI lead-byte range.
    ///
    /// @return HIRAKATA, KANJI or PUNKT for the ranges below, otherwise OTHERS.
    wctype gettype() const {
        // 0x815b is grouped with kana so that it never splits a kana run.
        if (c1 == 0x81 && c2 == 0x5b) {
            return HIRAKATA;
        }
        // 0x829f.. on lead byte 0x82, and ..0x8396 on lead byte 0x83.
        if ((c1 == 0x82 && c2 >= 0x9f) || (c1 == 0x83 && c2 <= 0x96)) {
            return HIRAKATA;
        }
        // 0x889f .. 0xfc4b: whole lead-byte rows 0x89..0xfb, plus the partial
        // first row (0x88) and partial last row (0xfc).
        if (c1 > 0x88 && c1 < 0xfc) {
            return KANJI;
        }
        if (c1 == 0x88 && c2 >= 0x9f) {
            return KANJI;
        }
        if (c1 == 0xfc && c2 <= 0x4b) {
            return KANJI;
        }
        // 0x8141 and 0x8142 are treated as sentence/clause punctuation.
        if (c1 == 0x81 && (c2 == 0x41 || c2 == 0x42)) {
            return PUNKT;
        }
        return OTHERS;
    }

    /// Writes both bytes, first byte first, to @p fp.
    void write(FILE* fp) const {
        fputc(c1, fp);
        fputc(c2, fp);
    }
};
/// Strict ordering of byte pairs: the first byte decides, and the second
/// byte breaks ties.
bool operator<(const wc& x, const wc& y)
{
    return (x.c1 != y.c1) ? (x.c1 < y.c1) : (x.c2 < y.c2);
}
/// Reads the next character from a NUL-terminated byte string.
///
/// @param cp  [in/out] Current read position; advanced past the bytes consumed.
/// @param w   [out] Receives the character. A single-byte character
///            (< 0x80) is reported as {0, 0}; its value is not kept.
///            Any byte >= 0x80 is taken as the first byte of a two-byte
///            character and is paired with the byte that follows it.
/// @return true if a character was read; false at the end of the string, or
///         when a first byte is directly followed by the terminating NUL
///         (in that case w.c1 holds the first byte and cp has moved past it).
///
/// NOTE(review): every byte >= 0x80 is treated as a lead byte. If the input is
/// Shift_JIS, single-byte half-width katakana (0xa1..0xdf) would be mis-paired
/// -- confirm whether such input needs to be supported.
bool nextchar(uchar*& cp, wc& w) // [io]
{
    // Bytes are compared against the character value 0, not the NULL macro:
    // NULL is a null *pointer* constant and need not convert to an integer.
    if (*cp == '\0') {
        return false;
    }

    if (*cp < 0x80) {
        ++cp;
        w.c1 = w.c2 = 0;
        return true;
    }

    w.c1 = *cp++;
    if (*cp == '\0') {
        return false; // truncated two-byte character
    }
    w.c2 = *cp++;
    return true;
}
/// Writes the two-byte characters of a NUL-terminated buffer to stdout,
/// inserting 0x20 separators at simple word boundaries.
///
/// Rules, per character class reported by wc::gettype():
///  - KANJI:    preceded by a space when it starts a word or follows kana.
///  - HIRAKATA: appended to the current word (starting one if necessary).
///  - PUNKT:    emitted as " <char>\n" and ends the current word.
///  - OTHERS:   not emitted; ends the current word with a space if one was
///              open.
void putstr(uchar* buf)
{
    bool inWord = false;    // a word is currently being emitted
    bool afterKana = false; // the last character emitted in this word was kana

    wc ch;
    for (uchar* pos = buf; nextchar(pos, ch); ) {
        switch (ch.gettype()) {
        case wc::KANJI:
            // afterKana can only be set while inWord is set, so one test
            // covers both "start of a word" and "kanji following kana".
            if (!inWord || afterKana) {
                fputc(' ', stdout);
            }
            inWord = true;
            afterKana = false;
            ch.write(stdout);
            break;

        case wc::HIRAKATA:
            inWord = true;
            afterKana = true;
            ch.write(stdout);
            break;

        case wc::PUNKT:
            fputc(' ', stdout);
            ch.write(stdout);
            fputc('\n', stdout);
            inWord = false;
            afterKana = false;
            break;

        default:
            if (inWord) {
                fputc(' ', stdout);
            }
            inWord = false;
            afterKana = false;
            break;
        }
    }
}
/// Reads stdin chunk by chunk and passes each chunk to putstr().
///
/// A line longer than the buffer is delivered by fgets in several chunks. To
/// keep a two-byte character from being split across two chunks, a first byte
/// left dangling at the end of a full chunk is pushed back onto stdin so that
/// it is re-read together with its second byte.
int main(int argc, char** argv)
{
    (void)argc; // unused
    (void)argv; // unused

    uchar buf[256];
    // fgets stores at most sizeof(buf) - 1 bytes followed by a NUL.
    const size_t maxLen = sizeof(buf) - 1;

    while (fgets(reinterpret_cast<char*>(buf), static_cast<int>(sizeof(buf)), stdin) != NULL) {
        // Walk the chunk with the same pairing rule nextchar() applies
        // (a byte >= 0x80 starts a two-byte character) and stop at a first
        // byte that has no second byte.
        size_t i = 0;
        while (buf[i] != 0) {
            if (buf[i] < 0x80) {
                ++i;
            } else if (buf[i + 1] == 0) {
                break;
            } else {
                i += 2;
            }
        }

        // Push back only when the chunk filled the buffer: otherwise the
        // input itself ended there and re-reading the byte would never make
        // progress. One byte of pushback is guaranteed by ungetc.
        if (buf[i] != 0 && i + 1 == maxLen) {
            ungetc(buf[i], stdin);
            buf[i] = 0;
        }

        putstr(buf);
    }
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment