Created
May 25, 2011 12:47
-
-
Save suchi/990898 to your computer and use it in GitHub Desktop.
日本語テキストを0x20区切りの分かち書きに簡易変換します。「プログラミング作法」 "The Practice of Programming" のMarkovChainを日本語でも楽しめるようにするためのもの。
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <ctype.h>
#include <stdio.h>

#include <string>
typedef unsigned char uchar;

/// One double-byte character, stored as its two raw bytes.
///
/// The byte ranges tested below are those of Shift_JIS (CP932) text.
struct wc {
    /// Coarse character classes used to decide where words are split.
    enum wctype {
        KANJI,    // 0: ideographs (0x889F .. 0xFC4B)
        HIRAKATA, // 1: hiragana, full-width katakana and the prolonged sound mark
        PUNKT,    // 2: the two sentence punctuation marks 0x8141 and 0x8142
        OTHERS,   // 3: anything else
    };

    uchar c1; // first (lead) byte
    uchar c2; // second (trail) byte

    /// Classifies the byte pair held in c1/c2.
    ///
    /// A pair whose first byte cannot start a double-byte character
    /// (anything outside 0x81-0x9F and 0xE0-0xFC, e.g. the single-byte
    /// half-width katakana 0xA1-0xDF) is never reported as KANJI; it falls
    /// through to OTHERS.
    wctype gettype() const {
        // 0x815B is the prolonged sound mark; it is kept with kana so that
        // it does not break a kana word apart.
        if (c1 == 0x81 && c2 == 0x5b) {
            return HIRAKATA;
        }
        // Hiragana live in row 0x82 from 0x9F upwards, katakana in row 0x83
        // up to 0x96.
        if ((c1 == 0x82 && c2 >= 0x9f) || (c1 == 0x83 && c2 <= 0x96)) {
            return HIRAKATA;
        }

        // Only real lead bytes may be classified as kanji. Without this
        // guard the open range 0x89..0xFB would also accept 0xA0-0xDF.
        const bool validLead =
            (0x81 <= c1 && c1 <= 0x9f) || (0xe0 <= c1 && c1 <= 0xfc);
        if (validLead) {
            if (0x88 < c1 && c1 < 0xfc) {
                return KANJI;
            }
            if (c1 == 0x88 && c2 >= 0x9f) { // first kanji row starts at 0x889F
                return KANJI;
            }
            if (c1 == 0xfc && c2 <= 0x4b) { // last kanji row ends at 0xFC4B
                return KANJI;
            }
        }

        if (c1 == 0x81 && (c2 == 0x41 || c2 == 0x42)) {
            return PUNKT;
        }
        return OTHERS;
    }

    /// Writes both bytes, lead byte first, to fp.
    void write(FILE* fp) const {
        fputc(c1, fp);
        fputc(c2, fp);
    }
};
bool operator<(const wc& x, const wc& y) | |
{ | |
if (x.c1 == y.c1) { | |
return x.c2 < y.c2; | |
} | |
return x.c1 < y.c1; | |
} | |
/// Consumes the next character of a NUL-terminated Shift_JIS (CP932) string.
///
/// @param cp [in,out] Read position; advanced past every byte consumed.
/// @param w  [out]    For a double-byte character: its lead and trail byte.
///                    For a single-byte character both fields are set to 0,
///                    i.e. the byte itself is not preserved.
///                    Not meaningful when the function returns false.
/// @return true when a character was consumed; false at the terminator, or
///         when a lead byte is immediately followed by the terminator
///         (truncated character; cp is then left on the terminator).
bool nextchar(uchar*& cp, wc& w) // [io]
{
    const uchar first = *cp;
    if (first == '\0') {
        return false;
    }
    ++cp;

    // Only 0x81-0x9F and 0xE0-0xFC start a double-byte character. Every other
    // byte -- ASCII, but also the half-width katakana 0xA1-0xDF -- stands on
    // its own. Treating those as lead bytes would swallow the following byte
    // and desynchronise the rest of the string.
    const bool isLeadByte =
        (0x81 <= first && first <= 0x9f) || (0xe0 <= first && first <= 0xfc);
    if (!isLeadByte) {
        w.c1 = w.c2 = 0;
        return true;
    }

    w.c1 = first;
    if (*cp == '\0') {
        return false; // lead byte without a trail byte
    }
    w.c2 = *cp++;
    return true;
}
/// Writes the words found in buf to stdout, separated by single spaces.
///
/// Rules, per character class returned by wc::gettype():
///  - KANJI    starts a new word (preceded by a space) unless it directly
///             continues a run of kanji;
///  - HIRAKATA is appended to the current word, or starts one without a
///             leading space;
///  - PUNKT    is written as its own token followed by a newline;
///  - anything else is dropped, and closes the current word with a space.
///
/// @param buf NUL-terminated input, decoded with nextchar().
void putstr(uchar* buf)
{
    bool inWord = false;    // a word is currently being written
    bool afterKana = false; // the current word already contains kana
    wc ch;

    for (uchar* cursor = buf; nextchar(cursor, ch); ) {
        switch (ch.gettype()) {
        case wc::KANJI:
            // afterKana is only ever true while inWord is true, so this one
            // test covers both "no word open" and "kana seen in this word".
            if (!inWord || afterKana) {
                fputc(' ', stdout);
            }
            inWord = true;
            afterKana = false;
            ch.write(stdout);
            break;

        case wc::HIRAKATA:
            inWord = true;
            afterKana = true;
            ch.write(stdout);
            break;

        case wc::PUNKT:
            fputc(' ', stdout);
            ch.write(stdout);
            fputc('\n', stdout);
            inWord = false;
            afterKana = false;
            break;

        default:
            if (inWord) {
                fputc(' ', stdout);
            }
            inWord = false;
            afterKana = false;
            break;
        }
    }
}
int main(int argc, char** argv) | |
{ | |
uchar buf[256]; | |
while (fgets((char*)buf, 255, stdin) != NULL) { | |
putstr(buf); | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment