Skip to content

Instantly share code, notes, and snippets.

@tsupo
Created May 14, 2009 09:40
Show Gist options
  • Select an option

  • Save tsupo/111587 to your computer and use it in GitHub Desktop.

Select an option

Save tsupo/111587 to your computer and use it in GitHub Desktop.
sort in Japanese text order
/*
* jsort.c
* ひらがな文字列のソート(辞書の順による)
*
* by H. Tsujimura 11 Jan 1995 / 15 May 2009
*
* 使用方法:
* jsort [-t区切り文字] [+位置] [ファイル名]
* 指定された区切り文字で区切られた、指定位置の文字列を
* 基準にソートを行なう
*
* 例) ファイルの中身が
* aaa,てすと,950110,いち
* bbb,しけん,950110,に
* ccc,けんさ,950111,さん
* のような file1 に対して
* jsort -t, +1 file1
* を実行すると
* ccc,けんさ,950111,さん
* bbb,しけん,950110,に
* aaa,てすと,950110,いち
* のようにソートされ、
* jsort -t, +4 file1
* を実行すると
* aaa,てすと,950110,いち
* ccc,けんさ,950111,さん
* bbb,しけん,950110,に
* のようにソートされる
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define NUL '\0'
int jstrcmp( const unsigned char *s, const unsigned char *t );
int readTarget( FILE *fp, int delm, int elem );
int jsort( int delm, int elem );
#ifdef _MSC_VER
#pragma warning ( disable : 4996 )
#endif
int
main( int argc, char *argv[] )
{
int i, ret = 255;
int delimeter = ' '; /* 区切り文字 */
int element = 0; /* 位置 */
FILE *fp;
char filename[128];
/* コマンドライン引数を解釈する */
filename[0] = NUL;
for ( i = 1; i < argc; i++ ) {
if ( argv[i][0] == '-' ) {
switch ( argv[i][1] ) {
case 't':
if ( argv[i][2] != NUL )
delimeter = argv[i][2];
break;
}
continue;
}
if ( argv[i][0] == '+' ) {
if ( (argv[i][1] >= '0') && (argv[i][1] <= '9') ) {
element = atoi( &argv[i][1] );
if ( element < 0 )
element = 0;
}
continue;
}
strcpy( filename, argv[i] );
}
/* バッファにソート対象文字列を読み込む */
if ( filename[0] == NUL )
ret = readTarget( stdin, delimeter, element );
else if ( ( fp = fopen( filename, "r" ) ) != NULL ) {
ret = readTarget( fp, delimeter, element );
fclose( fp );
}
/* ソートを行なう */
if ( ret == 0 )
ret = jsort( delimeter, element );
return ( ret );
}
/*
* ソート対象文字列を格納するためのキュー
*/
struct queue {
char *object; /* 格納された文字列へのポインタ */
struct queue *prev; /* 直前の要素へのポインタ */
struct queue *next; /* 直後の要素へのポインタ */
};
struct queue *target = NULL; /* キューの先頭へのポインタ */
struct queue *tail = NULL; /* キューの最後尾へのポインタ */
int targetSize = 0; /* キューに読み込んだ文字列の数 */
/*
* 文字列を格納する領域を確保し、キューにつなぐ
*/
int
enqueue( const char *p )
{
struct queue *qp;
/* キューの領域を確保する */
if ( (qp = (struct queue *)malloc( sizeof(struct queue) )) == NULL ) {
fprintf( stderr, "jsort: memory exhausted\n" );
return ( 0 );
}
/* 文字列を格納しておく領域を確保する */
if ( (qp->object = (char *)malloc( strlen(p) + 1 ) ) == NULL ) {
fprintf( stderr, "jsort: memory exhausted\n" );
free( (char *)qp );
return ( 0 );
}
strcpy( qp->object, p ); /* 文字列を格納する */
qp->next = NULL;
/* キューの最後尾につなぐ */
if ( target == NULL ) {
target = tail = qp;
qp->prev = NULL;
}
else {
qp->prev = tail;
tail->next = qp;
tail = qp;
}
targetSize++;
return ( 1 );
}
/*
* ソート対象文字列をキューに読み込む
*/
int
readTarget( FILE *fp, int delm, int elem )
{
char *p, buf[BUFSIZ+1];
int cnt = 0;
while ( ( p = fgets( buf, BUFSIZ - 1, fp ) ) != NULL ) {
/* 行頭のwhite spaceを捨てる */
while ( (*p == ' ') || (*p == '\t') )
p++;
/* `#' または `%' で始まる行はソート対象外(コメント行)とする */
if ( (*p == '#') || (*p == '%') )
continue;
/* 行末の改行コードを捨てる */
if ( *(p + strlen(p) - 1) == '\n' )
*(p + strlen(p) - 1) = NUL;
/* 空行はソート対象から外す */
if ( !(*p) )
continue;
/* 位置の指定が 0 でない場合、区切り文字が含まれない行はソート */
/* 対象外とする */
if ( (elem > 0) && (strchr( p, delm ) == NULL) )
continue;
/* ソート対象の文字列をキューにつなぐ */
cnt += enqueue( p );
}
return ( cnt > 0 ? 0 : 1 );
}
/*
* キューにつながれている要素を入れ換える
*/
void
swapqueue( struct queue *p, struct queue *q )
{
struct queue *r, *s;
if (p->prev)
p->prev->next = q;
if (q->next)
q->next->prev = p;
if ( p->next == q ) { /* p と q が隣合っている場合 */
r = p->prev;
p->prev = q;
p->next = q->next;
q->prev = r;
q->next = p;
}
else {
if (p->next)
p->next->prev = q;
if (q->prev)
q->prev->next = p;
r = p->prev;
s = p->next;
p->prev = q->prev;
p->next = q->next;
q->prev = r;
q->next = s;
}
/* キューの先頭、最後尾を示すポインタを新しい状況に合わせる */
if (target == p)
target = q;
else if (target == q)
target = p;
if (tail == p)
tail = q;
else if (tail == q)
tail = p;
}
/*
* キューにつながれている文字列のソートを行なう
*/
int
jsort( int delm, int elem )
{
int ret = 0, i, cnt1, cnt2;
struct queue *qp, *qq;
char s[BUFSIZ], t[BUFSIZ], *p;
if ( targetSize == 0 )
return ( ret );
if ( targetSize == 1 ) {
/* ソート対象が1行のみなら、ソートする必要はない */
printf( "%s\n", target->object );
free( target->object );
free( (char *)target );
return ( ret );
}
/* ソートを行なう */
for ( qp = target, cnt1 = 0; qp && qp->next; qp = qp->next, cnt1++ ) {
for ( qq = qp->next, cnt2 = 0; qq; qq = qq->next, cnt2++ ) {
/* ソートの基準となる部分の文字列を抽出する */
strcpy(s, qp->object);
strcpy(t, qq->object);
for ( i = 0; i <= elem; i++ ) {
if ( (p = strchr(s, delm)) != NULL ) {
if ( i < elem ) {
p++;
while ((*p == ' ') || (*p == '\t'))
p++;
strcpy(s, p);
}
else {
*p = NUL;
break;
}
}
}
for ( i = 0; i <= elem; i++ ) {
if ( (p = strchr(t, delm)) != NULL ) {
if ( i < elem ) {
p++;
while ((*p == ' ') || (*p == '\t'))
p++;
strcpy(t, p);
}
else {
*p = NUL;
break;
}
}
}
/* 抽出した文字列を比較し、ソートする */
if ( jstrcmp(s, t) > 0 ) {
swapqueue( qp, qq ); /* qp と qq を入れ換える */
/* ポインタを再設定する */
qp = target;
for (i = 0; i < cnt1; i++)
qp = qp->next;
qq = qp->next;
for (i = 0; i < cnt2; i++)
qq = qq->next;
}
}
}
/* ソートされた結果を標準出力に書き出す */
for ( qp = target; qp; qp = qp->next ) {
printf( "%s\n", qp->object );
}
/* キューを解放する */
for ( qp = tail; qp; qp = qq ) {
qq = qp->prev;
free( qp->object );
free( (char *)qp );
}
return ( ret );
}
/*
* jstrcmp.c
* ひらがな文字列の比較(辞書の順による)
*
* by H. Tsujimura 11 Jan 1995 / 15 May 2009
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
static struct orderTable {
unsigned char *ch;
int force;
} order[] = {
{ "ぁ", 1 }, { "あ", 1 },
{ "ぃ", 2 }, { "い", 2 },
{ "ぅ", 3 }, { "う", 3 },
{ "ぇ", 4 }, { "え", 4 },
{ "ぉ", 5 }, { "お", 5 },
{ "か", 11 }, { "が", 11 }, { "ヵ", 11 }, { "ヶ", 11 },
{ "き", 12 }, { "ぎ", 12 },
{ "く", 13 }, { "ぐ", 13 },
{ "け", 14 }, { "げ", 14 },
{ "こ", 15 }, { "ご", 15 },
{ "さ", 21 }, { "ざ", 21 },
{ "し", 22 }, { "じ", 22 },
{ "す", 23 }, { "ず", 23 },
{ "せ", 24 }, { "ぜ", 24 },
{ "そ", 25 }, { "ぞ", 25 },
{ "た", 31 }, { "だ", 31 },
{ "ち", 32 }, { "ぢ", 32 },
{ "っ", 33 }, { "つ", 33 }, { "づ", 33 },
{ "て", 34 }, { "で", 34 },
{ "と", 35 }, { "ど", 35 },
{ "な", 41 },
{ "に", 42 },
{ "ぬ", 43 },
{ "ね", 44 },
{ "の", 45 },
{ "は", 51 }, { "ば", 51 }, { "ぱ", 51 },
{ "ひ", 52 }, { "び", 52 }, { "ぴ", 52 },
{ "ふ", 53 }, { "ぶ", 53 }, { "ヴ", 53 }, { "ぷ", 53 },
{ "へ", 54 }, { "べ", 54 }, { "ぺ", 54 },
{ "ほ", 55 }, { "ぼ", 55 }, { "ぽ", 55 },
{ "ま", 61 },
{ "み", 62 },
{ "む", 63 },
{ "め", 64 },
{ "も", 65 },
{ "ゃ", 71 }, { "や", 71 },
{ "ゅ", 73 }, { "ゆ", 73 },
{ "ょ", 75 }, { "よ", 75 },
{ "ら", 81 },
{ "り", 82 },
{ "る", 83 },
{ "れ", 84 },
{ "ろ", 85 },
{ "ゎ", 91 }, { "わ", 91 },
{ "ゐ", 92 },
{ "ゑ", 94 },
{ "を", 95 },
{ "ん", 100 },
};
int
jstrcmp( const unsigned char *s, const unsigned char *t )
{
int len1 = (int)strlen(s);
int len2 = (int)strlen(t);
int len, i, order1, order2;
int preVal = 0;
len = (len1 <= len2) ? len1 : len2;
for ( i = 0; i < len; i += 2 ) {
if ( (*(s+i) == *(t+i)) && (*(s+i+1) == *(t+i+1)) ) {
if ( len1 != len2 )
preVal = 0;
continue;
}
for ( order1 = 0; order1 < sizeof (order) / sizeof (struct orderTable);
order1++ ) {
if ( (*(s+i) == *(order[order1].ch)) &&
(*(s+i+1) == *(order[order1].ch + 1)) )
break;
}
for ( order2 = 0; order2 < sizeof (order) / sizeof (struct orderTable);
order2++ ) {
if ( (*(t+i) == *(order[order2].ch)) &&
(*(t+i+1) == *(order[order2].ch + 1)) )
break;
}
if (order[order1].force == order[order2].force) {
if ( preVal == 0 )
preVal = order1 - order2;
continue;
}
else
return ( order[order1].force - order[order2].force );
}
return ( preVal != 0 ? preVal : len1 - len2 );
}
#ifdef TEST
unsigned char target1[] = "しっけん";
unsigned char target2[] = "じっけん";
unsigned char target3[] = "しつげん";
unsigned char target4[] = "じつげん";
unsigned char target5[] = "しつもん";
unsigned char target6[] = "じっこん";
unsigned char target7[] = "しつけ";
unsigned char target8[] = "しっけ";
unsigned char target9[] = "じっき";
unsigned char targetA[] = "しつぎ";
unsigned char targetB[] = "じつぎ";
unsigned char targetC[] = "じっけんしつ";
unsigned char targetD[] = "じゃぐ";
unsigned char targetE[] = "じゃく";
unsigned char targetF[] = "しやく";
unsigned char targetG[] = "しゃく";
int
main()
{
#if 0
printf( "しっけ:しつけ = %d\n", jstrcmp("しっけ","しつけ") );
printf( "じっけん:しっけん = %d\n", jstrcmp("じっけん","しっけん") );
printf( "しつげん:しっけん = %d\n", jstrcmp("しつげん","しっけん") );
printf( "しつげん:じっけん = %d\n", jstrcmp("しつげん","じっけん") );
printf( "あいうえお:かきくけこ = %d\n",
jstrcmp("あいうえお","かきくけこ") );
#endif
{
unsigned char *p[16], *tmp;
int i, j;
p[0] = target1;
p[1] = target2;
p[2] = target3;
p[3] = target4;
p[4] = target5;
p[5] = target6;
p[6] = target7;
p[7] = target8;
p[8] = target9;
p[9] = targetA;
p[10] = targetB;
p[11] = targetC;
p[12] = targetD;
p[13] = targetE;
p[14] = targetF;
p[15] = targetG;
for ( i = 0; i < sizeof (p) / sizeof (unsigned char *) - 1; i++ ) {
for ( j = i + 1; j < sizeof (p) / sizeof (unsigned char *); j++ ) {
if ( jstrcmp( p[i], p[j] ) > 0 ) {
tmp = p[i];
p[i] = p[j];
p[j] = tmp;
}
}
}
for ( i = 0; i < sizeof (p) / sizeof (unsigned char *); i++ )
printf( "p[%2d] = %s\n", i, p[i] );
}
return ( 0 );
}
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment