Created
May 14, 2009 09:40
-
-
Save tsupo/111587 to your computer and use it in GitHub Desktop.
sort in Japanese text order
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| * jsort.c | |
| * ひらがな文字列のソート(辞書の順による) | |
| * | |
| * by H. Tsujimura 11 Jan 1995 / 15 May 2009 | |
| * | |
| * 使用方法: | |
| * jsort [-t区切り文字] [+位置] [ファイル名] | |
| * 指定された区切り文字で区切られた、指定位置の文字列を | |
| * 基準にソートを行なう | |
| * | |
| * 例) ファイルの中身が | |
| * aaa,てすと,950110,いち | |
| * bbb,しけん,950110,に | |
| * ccc,けんさ,950111,さん | |
| * のような file1 に対して | |
| * jsort -t, +1 file1 | |
| * を実行すると | |
| * ccc,けんさ,950111,さん | |
| * bbb,しけん,950110,に | |
| * aaa,てすと,950110,いち | |
| * のようにソートされ、 | |
| * jsort -t, +4 file1 | |
| * を実行すると | |
| * aaa,てすと,950110,いち | |
| * ccc,けんさ,950111,さん | |
| * bbb,しけん,950110,に | |
| * のようにソートされる | |
| */ | |
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <string.h> | |
| #define NUL '\0' | |
| int jstrcmp( const unsigned char *s, const unsigned char *t ); | |
| int readTarget( FILE *fp, int delm, int elem ); | |
| int jsort( int delm, int elem ); | |
| #ifdef _MSC_VER | |
| #pragma warning ( disable : 4996 ) | |
| #endif | |
| int | |
| main( int argc, char *argv[] ) | |
| { | |
| int i, ret = 255; | |
| int delimeter = ' '; /* 区切り文字 */ | |
| int element = 0; /* 位置 */ | |
| FILE *fp; | |
| char filename[128]; | |
| /* コマンドライン引数を解釈する */ | |
| filename[0] = NUL; | |
| for ( i = 1; i < argc; i++ ) { | |
| if ( argv[i][0] == '-' ) { | |
| switch ( argv[i][1] ) { | |
| case 't': | |
| if ( argv[i][2] != NUL ) | |
| delimeter = argv[i][2]; | |
| break; | |
| } | |
| continue; | |
| } | |
| if ( argv[i][0] == '+' ) { | |
| if ( (argv[i][1] >= '0') && (argv[i][1] <= '9') ) { | |
| element = atoi( &argv[i][1] ); | |
| if ( element < 0 ) | |
| element = 0; | |
| } | |
| continue; | |
| } | |
| strcpy( filename, argv[i] ); | |
| } | |
| /* バッファにソート対象文字列を読み込む */ | |
| if ( filename[0] == NUL ) | |
| ret = readTarget( stdin, delimeter, element ); | |
| else if ( ( fp = fopen( filename, "r" ) ) != NULL ) { | |
| ret = readTarget( fp, delimeter, element ); | |
| fclose( fp ); | |
| } | |
| /* ソートを行なう */ | |
| if ( ret == 0 ) | |
| ret = jsort( delimeter, element ); | |
| return ( ret ); | |
| } | |
| /* | |
| * ソート対象文字列を格納するためのキュー | |
| */ | |
| struct queue { | |
| char *object; /* 格納された文字列へのポインタ */ | |
| struct queue *prev; /* 直前の要素へのポインタ */ | |
| struct queue *next; /* 直後の要素へのポインタ */ | |
| }; | |
| struct queue *target = NULL; /* キューの先頭へのポインタ */ | |
| struct queue *tail = NULL; /* キューの最後尾へのポインタ */ | |
| int targetSize = 0; /* キューに読み込んだ文字列の数 */ | |
| /* | |
| * 文字列を格納する領域を確保し、キューにつなぐ | |
| */ | |
| int | |
| enqueue( const char *p ) | |
| { | |
| struct queue *qp; | |
| /* キューの領域を確保する */ | |
| if ( (qp = (struct queue *)malloc( sizeof(struct queue) )) == NULL ) { | |
| fprintf( stderr, "jsort: memory exhausted\n" ); | |
| return ( 0 ); | |
| } | |
| /* 文字列を格納しておく領域を確保する */ | |
| if ( (qp->object = (char *)malloc( strlen(p) + 1 ) ) == NULL ) { | |
| fprintf( stderr, "jsort: memory exhausted\n" ); | |
| free( (char *)qp ); | |
| return ( 0 ); | |
| } | |
| strcpy( qp->object, p ); /* 文字列を格納する */ | |
| qp->next = NULL; | |
| /* キューの最後尾につなぐ */ | |
| if ( target == NULL ) { | |
| target = tail = qp; | |
| qp->prev = NULL; | |
| } | |
| else { | |
| qp->prev = tail; | |
| tail->next = qp; | |
| tail = qp; | |
| } | |
| targetSize++; | |
| return ( 1 ); | |
| } | |
| /* | |
| * ソート対象文字列をキューに読み込む | |
| */ | |
| int | |
| readTarget( FILE *fp, int delm, int elem ) | |
| { | |
| char *p, buf[BUFSIZ+1]; | |
| int cnt = 0; | |
| while ( ( p = fgets( buf, BUFSIZ - 1, fp ) ) != NULL ) { | |
| /* 行頭のwhite spaceを捨てる */ | |
| while ( (*p == ' ') || (*p == '\t') ) | |
| p++; | |
| /* `#' または `%' で始まる行はソート対象外(コメント行)とする */ | |
| if ( (*p == '#') || (*p == '%') ) | |
| continue; | |
| /* 行末の改行コードを捨てる */ | |
| if ( *(p + strlen(p) - 1) == '\n' ) | |
| *(p + strlen(p) - 1) = NUL; | |
| /* 空行はソート対象から外す */ | |
| if ( !(*p) ) | |
| continue; | |
| /* 位置の指定が 0 でない場合、区切り文字が含まれない行はソート */ | |
| /* 対象外とする */ | |
| if ( (elem > 0) && (strchr( p, delm ) == NULL) ) | |
| continue; | |
| /* ソート対象の文字列をキューにつなぐ */ | |
| cnt += enqueue( p ); | |
| } | |
| return ( cnt > 0 ? 0 : 1 ); | |
| } | |
| /* | |
| * キューにつながれている要素を入れ換える | |
| */ | |
| void | |
| swapqueue( struct queue *p, struct queue *q ) | |
| { | |
| struct queue *r, *s; | |
| if (p->prev) | |
| p->prev->next = q; | |
| if (q->next) | |
| q->next->prev = p; | |
| if ( p->next == q ) { /* p と q が隣合っている場合 */ | |
| r = p->prev; | |
| p->prev = q; | |
| p->next = q->next; | |
| q->prev = r; | |
| q->next = p; | |
| } | |
| else { | |
| if (p->next) | |
| p->next->prev = q; | |
| if (q->prev) | |
| q->prev->next = p; | |
| r = p->prev; | |
| s = p->next; | |
| p->prev = q->prev; | |
| p->next = q->next; | |
| q->prev = r; | |
| q->next = s; | |
| } | |
| /* キューの先頭、最後尾を示すポインタを新しい状況に合わせる */ | |
| if (target == p) | |
| target = q; | |
| else if (target == q) | |
| target = p; | |
| if (tail == p) | |
| tail = q; | |
| else if (tail == q) | |
| tail = p; | |
| } | |
| /* | |
| * キューにつながれている文字列のソートを行なう | |
| */ | |
| int | |
| jsort( int delm, int elem ) | |
| { | |
| int ret = 0, i, cnt1, cnt2; | |
| struct queue *qp, *qq; | |
| char s[BUFSIZ], t[BUFSIZ], *p; | |
| if ( targetSize == 0 ) | |
| return ( ret ); | |
| if ( targetSize == 1 ) { | |
| /* ソート対象が1行のみなら、ソートする必要はない */ | |
| printf( "%s\n", target->object ); | |
| free( target->object ); | |
| free( (char *)target ); | |
| return ( ret ); | |
| } | |
| /* ソートを行なう */ | |
| for ( qp = target, cnt1 = 0; qp && qp->next; qp = qp->next, cnt1++ ) { | |
| for ( qq = qp->next, cnt2 = 0; qq; qq = qq->next, cnt2++ ) { | |
| /* ソートの基準となる部分の文字列を抽出する */ | |
| strcpy(s, qp->object); | |
| strcpy(t, qq->object); | |
| for ( i = 0; i <= elem; i++ ) { | |
| if ( (p = strchr(s, delm)) != NULL ) { | |
| if ( i < elem ) { | |
| p++; | |
| while ((*p == ' ') || (*p == '\t')) | |
| p++; | |
| strcpy(s, p); | |
| } | |
| else { | |
| *p = NUL; | |
| break; | |
| } | |
| } | |
| } | |
| for ( i = 0; i <= elem; i++ ) { | |
| if ( (p = strchr(t, delm)) != NULL ) { | |
| if ( i < elem ) { | |
| p++; | |
| while ((*p == ' ') || (*p == '\t')) | |
| p++; | |
| strcpy(t, p); | |
| } | |
| else { | |
| *p = NUL; | |
| break; | |
| } | |
| } | |
| } | |
| /* 抽出した文字列を比較し、ソートする */ | |
| if ( jstrcmp(s, t) > 0 ) { | |
| swapqueue( qp, qq ); /* qp と qq を入れ換える */ | |
| /* ポインタを再設定する */ | |
| qp = target; | |
| for (i = 0; i < cnt1; i++) | |
| qp = qp->next; | |
| qq = qp->next; | |
| for (i = 0; i < cnt2; i++) | |
| qq = qq->next; | |
| } | |
| } | |
| } | |
| /* ソートされた結果を標準出力に書き出す */ | |
| for ( qp = target; qp; qp = qp->next ) { | |
| printf( "%s\n", qp->object ); | |
| } | |
| /* キューを解放する */ | |
| for ( qp = tail; qp; qp = qq ) { | |
| qq = qp->prev; | |
| free( qp->object ); | |
| free( (char *)qp ); | |
| } | |
| return ( ret ); | |
| } | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| * jstrcmp.c | |
| * ひらがな文字列の比較(辞書の順による) | |
| * | |
| * by H. Tsujimura 11 Jan 1995 / 15 May 2009 | |
| */ | |
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <string.h> | |
| static struct orderTable { | |
| unsigned char *ch; | |
| int force; | |
| } order[] = { | |
| { "ぁ", 1 }, { "あ", 1 }, | |
| { "ぃ", 2 }, { "い", 2 }, | |
| { "ぅ", 3 }, { "う", 3 }, | |
| { "ぇ", 4 }, { "え", 4 }, | |
| { "ぉ", 5 }, { "お", 5 }, | |
| { "か", 11 }, { "が", 11 }, { "ヵ", 11 }, { "ヶ", 11 }, | |
| { "き", 12 }, { "ぎ", 12 }, | |
| { "く", 13 }, { "ぐ", 13 }, | |
| { "け", 14 }, { "げ", 14 }, | |
| { "こ", 15 }, { "ご", 15 }, | |
| { "さ", 21 }, { "ざ", 21 }, | |
| { "し", 22 }, { "じ", 22 }, | |
| { "す", 23 }, { "ず", 23 }, | |
| { "せ", 24 }, { "ぜ", 24 }, | |
| { "そ", 25 }, { "ぞ", 25 }, | |
| { "た", 31 }, { "だ", 31 }, | |
| { "ち", 32 }, { "ぢ", 32 }, | |
| { "っ", 33 }, { "つ", 33 }, { "づ", 33 }, | |
| { "て", 34 }, { "で", 34 }, | |
| { "と", 35 }, { "ど", 35 }, | |
| { "な", 41 }, | |
| { "に", 42 }, | |
| { "ぬ", 43 }, | |
| { "ね", 44 }, | |
| { "の", 45 }, | |
| { "は", 51 }, { "ば", 51 }, { "ぱ", 51 }, | |
| { "ひ", 52 }, { "び", 52 }, { "ぴ", 52 }, | |
| { "ふ", 53 }, { "ぶ", 53 }, { "ヴ", 53 }, { "ぷ", 53 }, | |
| { "へ", 54 }, { "べ", 54 }, { "ぺ", 54 }, | |
| { "ほ", 55 }, { "ぼ", 55 }, { "ぽ", 55 }, | |
| { "ま", 61 }, | |
| { "み", 62 }, | |
| { "む", 63 }, | |
| { "め", 64 }, | |
| { "も", 65 }, | |
| { "ゃ", 71 }, { "や", 71 }, | |
| { "ゅ", 73 }, { "ゆ", 73 }, | |
| { "ょ", 75 }, { "よ", 75 }, | |
| { "ら", 81 }, | |
| { "り", 82 }, | |
| { "る", 83 }, | |
| { "れ", 84 }, | |
| { "ろ", 85 }, | |
| { "ゎ", 91 }, { "わ", 91 }, | |
| { "ゐ", 92 }, | |
| { "ゑ", 94 }, | |
| { "を", 95 }, | |
| { "ん", 100 }, | |
| }; | |
| int | |
| jstrcmp( const unsigned char *s, const unsigned char *t ) | |
| { | |
| int len1 = (int)strlen(s); | |
| int len2 = (int)strlen(t); | |
| int len, i, order1, order2; | |
| int preVal = 0; | |
| len = (len1 <= len2) ? len1 : len2; | |
| for ( i = 0; i < len; i += 2 ) { | |
| if ( (*(s+i) == *(t+i)) && (*(s+i+1) == *(t+i+1)) ) { | |
| if ( len1 != len2 ) | |
| preVal = 0; | |
| continue; | |
| } | |
| for ( order1 = 0; order1 < sizeof (order) / sizeof (struct orderTable); | |
| order1++ ) { | |
| if ( (*(s+i) == *(order[order1].ch)) && | |
| (*(s+i+1) == *(order[order1].ch + 1)) ) | |
| break; | |
| } | |
| for ( order2 = 0; order2 < sizeof (order) / sizeof (struct orderTable); | |
| order2++ ) { | |
| if ( (*(t+i) == *(order[order2].ch)) && | |
| (*(t+i+1) == *(order[order2].ch + 1)) ) | |
| break; | |
| } | |
| if (order[order1].force == order[order2].force) { | |
| if ( preVal == 0 ) | |
| preVal = order1 - order2; | |
| continue; | |
| } | |
| else | |
| return ( order[order1].force - order[order2].force ); | |
| } | |
| return ( preVal != 0 ? preVal : len1 - len2 ); | |
| } | |
| #ifdef TEST | |
| unsigned char target1[] = "しっけん"; | |
| unsigned char target2[] = "じっけん"; | |
| unsigned char target3[] = "しつげん"; | |
| unsigned char target4[] = "じつげん"; | |
| unsigned char target5[] = "しつもん"; | |
| unsigned char target6[] = "じっこん"; | |
| unsigned char target7[] = "しつけ"; | |
| unsigned char target8[] = "しっけ"; | |
| unsigned char target9[] = "じっき"; | |
| unsigned char targetA[] = "しつぎ"; | |
| unsigned char targetB[] = "じつぎ"; | |
| unsigned char targetC[] = "じっけんしつ"; | |
| unsigned char targetD[] = "じゃぐ"; | |
| unsigned char targetE[] = "じゃく"; | |
| unsigned char targetF[] = "しやく"; | |
| unsigned char targetG[] = "しゃく"; | |
| int | |
| main() | |
| { | |
| #if 0 | |
| printf( "しっけ:しつけ = %d\n", jstrcmp("しっけ","しつけ") ); | |
| printf( "じっけん:しっけん = %d\n", jstrcmp("じっけん","しっけん") ); | |
| printf( "しつげん:しっけん = %d\n", jstrcmp("しつげん","しっけん") ); | |
| printf( "しつげん:じっけん = %d\n", jstrcmp("しつげん","じっけん") ); | |
| printf( "あいうえお:かきくけこ = %d\n", | |
| jstrcmp("あいうえお","かきくけこ") ); | |
| #endif | |
| { | |
| unsigned char *p[16], *tmp; | |
| int i, j; | |
| p[0] = target1; | |
| p[1] = target2; | |
| p[2] = target3; | |
| p[3] = target4; | |
| p[4] = target5; | |
| p[5] = target6; | |
| p[6] = target7; | |
| p[7] = target8; | |
| p[8] = target9; | |
| p[9] = targetA; | |
| p[10] = targetB; | |
| p[11] = targetC; | |
| p[12] = targetD; | |
| p[13] = targetE; | |
| p[14] = targetF; | |
| p[15] = targetG; | |
| for ( i = 0; i < sizeof (p) / sizeof (unsigned char *) - 1; i++ ) { | |
| for ( j = i + 1; j < sizeof (p) / sizeof (unsigned char *); j++ ) { | |
| if ( jstrcmp( p[i], p[j] ) > 0 ) { | |
| tmp = p[i]; | |
| p[i] = p[j]; | |
| p[j] = tmp; | |
| } | |
| } | |
| } | |
| for ( i = 0; i < sizeof (p) / sizeof (unsigned char *); i++ ) | |
| printf( "p[%2d] = %s\n", i, p[i] ); | |
| } | |
| return ( 0 ); | |
| } | |
| #endif | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment