Created
August 27, 2009 01:48
-
-
Save kaichen/176016 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# From: http://www.javaeye.com/topic/456482?page=1 | |
#include <stdio.h> | |
#include <string.h> | |
#include <sys/types.h> | |
#include <dirent.h> | |
#include <sys/stat.h> | |
#include <time.h> | |
#include <stdlib.h> | |
#define STEP 10 | |
int count = 0;//文档个数 | |
char* str = NULL;//一个大的字符串,存储所有文档的内容 | |
int* ends;//文档的结束点集合 | |
int ends_len = 0, ends_mem_len = 10;//文档结束点的内存参数(当前长度,内存长度) | |
int str_len = 0, str_mem_len = 10, str_unicode_len=0;//字符串的内存参数(字符串长度,字符串内存长度, 字符串unicode长度:即一个汉字占一个长度时的长度) | |
struct id_map{//一个文档在内存中的映射位置 | |
int id;//文档id | |
int start;//字符串中的开始位置 | |
int end;//字符串中的结束位置 | |
}; | |
struct id_map * idmaps=NULL;//文档在内存中的映射地址 | |
int idmaps_len = 0, idmaps_mem_len=0;//文档映射参数 | |
//添加一个文档映射参数 | |
void addIdMap(struct id_map map){ | |
if(idmaps==NULL){//如果数组还没有建立,就建立一个数组来进行存储 | |
idmaps = (struct id_map *)malloc(sizeof(struct id_map)*10); | |
} | |
//如果当前的文档数已经到达了上一次建立的内存长度,则扩展内存,步长为10 | |
if(idmaps_len==idmaps_mem_len){ | |
idmaps_mem_len += STEP; | |
idmaps = (struct id_map *)realloc(idmaps, sizeof(struct id_map)*idmaps_mem_len); | |
if(idmaps==NULL){ | |
printf("内存不足"); | |
return; | |
} | |
} | |
*(idmaps+idmaps_len) = map; | |
idmaps_len++; | |
} | |
//读取一个文本文件 | |
char* readTextFile(char* path){ | |
char ch;//当前的字符 | |
FILE *fp;//文件指针 | |
int result; | |
fp = fopen(path, "rb"); | |
if(fp!=NULL){//如果文档读取成功 | |
if(str==NULL){ | |
//初始化str,ends的内存。这两个的增长步长均为10 | |
ends = (int *)malloc( sizeof(int) * 10); | |
str = (char *)malloc(10); | |
} | |
if(!str){ | |
printf("内存不足"); | |
fclose(fp); | |
return NULL; | |
} | |
int unicode_ = 0; | |
while((ch=fgetc(fp))!=EOF){//读取文件,一直读到最后,将内容放到str中。 | |
if(str_len == str_mem_len){ | |
str_mem_len += STEP; | |
str = (char *)realloc(str, str_mem_len); | |
if(str == NULL){ | |
printf("内存不足"); | |
fclose(fp); | |
return NULL; | |
} | |
} | |
if(unicode_ == 0){//如果上一个字符不是Unicode字符,则判断如果当前字符为unicode字符,则进入unicode计数。 | |
if(ch>=0 && ch<127){ | |
str_unicode_len++; | |
}else{ | |
unicode_ = 1; | |
} | |
}else if(unicode_ == 1){ | |
unicode_ =2; | |
}else if(unicode_ == 2){//按照utf-8编码进行计算,每个汉字占三个字符。 | |
unicode_ = 0; | |
str_unicode_len++; | |
} | |
*(str+str_len)=ch; | |
str_len++; | |
} | |
//记录结束点 | |
if(ends_len == ends_mem_len){ | |
ends_mem_len += STEP; | |
ends = (int *)realloc(ends, sizeof(int) * ends_mem_len); | |
if(ends == NULL){ | |
printf("内存不足"); | |
fclose(fp); | |
return NULL; | |
} | |
} | |
//printf("---%d,%d,%d\n", ends_len,ends_mem_len,str_unicode_len); | |
//*(ends+ends_len) = str_unicode_len; | |
*(ends+ends_len) = str_unicode_len; | |
ends_len++; | |
str = (char *)realloc(str, str_len); | |
//*(str+len)='\0'; | |
fclose(fp); | |
return str; | |
} | |
return NULL; | |
} | |
//读入一个文件夹内的所有文件 | |
int init_search_dir(char *path) | |
{ | |
DIR *dir; | |
struct dirent *s_dir; | |
struct stat file_stat; | |
char currfile[1024]={0}; | |
int len = strlen(path); | |
printf("%s\n",path); | |
if( (dir=opendir(path)) == NULL) | |
{ | |
printf("opendir(path) error.\n"); | |
return -1; | |
} | |
while((s_dir=readdir(dir))!=NULL) | |
{ | |
if((strcmp(s_dir->d_name,".")==0)||(strcmp(s_dir->d_name,"..")==0)) | |
continue; | |
sprintf(currfile,"%s%s",path,s_dir->d_name); | |
stat(currfile,&file_stat); | |
if(S_ISDIR(file_stat.st_mode)){//如果是文件夹,则递归读取 | |
init_search_dir(currfile); | |
}else{ | |
printf("%-32s\tOK",currfile); | |
//设置一个文档与 str的映射,并读取文档的内容 | |
struct id_map map; | |
map.id=atoi(s_dir->d_name); | |
map.start = str_unicode_len; | |
readTextFile(currfile); | |
map.end = str_unicode_len; | |
addIdMap(map); | |
printf("\t%d\n", str_unicode_len); | |
} | |
count++; | |
} | |
closedir(dir); | |
ends = (int *)realloc(ends, sizeof(int) * ends_len); | |
return 0; | |
} | |
//计算一个utf-8字符串的长度(汉字占一个长度) | |
int utf8_str_len(char* utf8_str){ | |
int length = 0, unicode_ = 0, i=0; | |
for(;i<strlen(utf8_str);i++){ | |
if(unicode_ == 0){ | |
if(utf8_str[i]>=0 && utf8_str[i]<127){ | |
length++; | |
}else{ | |
unicode_ = 1; | |
} | |
}else if(unicode_ == 1){ | |
unicode_ =2; | |
}else if(unicode_ == 2){ | |
unicode_ = 0; | |
length++; | |
} | |
} | |
return length; | |
} | |
//查找该结束点是否存在(2分查找) | |
int find_ends(int num){ | |
if(num>ends[ends_len-1]||num<ends[0]){ | |
return -1; | |
} | |
int end = ends_len; | |
int start = 0; | |
int index=ends_len / 2; | |
while(1){ | |
if(ends[index]==num){ | |
return index; | |
} | |
if(start == end || index == start || index == end){ | |
return -1; | |
} | |
if(ends[index] > num){ | |
end = index; | |
}else{ | |
start = index; | |
} | |
index = start + ((end-start) / 2); | |
} | |
} | |
//主要函数。搜索所有文档中所有存在于该字符串相似的文档,算法出处及JAVA实现参见:http://www.blogjava.net/phyeas/archive/2009/02/15/254743.html | |
void search(char* key){ | |
int key_len = utf8_str_len(key);//计算key的长度 | |
int i=0, j=0, j_ = 0, i_ = 0; | |
//char barr[key_len][str_unicode_len]; | |
char* barr[key_len];// | |
//char narr[key_len][str_unicode_len]; | |
char* narr[key_len]; | |
//char darr[key_len][str_unicode_len]; | |
char* darr[key_len]; | |
//一个按照最大匹配度排序的文档序列。最大匹配度不可能大于key的长度+1,所以声明一个key_len+1长度的数组进行保存即可。数据格式类似:[[],[2,3],[5],[]] | |
int* max_id_maps[key_len + 1];//该数组的第n个下标表示最大匹配度为n的文档有哪些 | |
int max_id_maps_lens[key_len + 1], max_id_maps_mem_lens[key_len + 1]; | |
int key_ascii_len = strlen(key); | |
struct timeval tpstart,tpend; | |
float timeuse; | |
gettimeofday(&tpstart,NULL); | |
//初始化三个数组。i_,j_表示当前的坐标,i,j表示当前左右的字符串中的字符位置 | |
for(i_=key_len-1, i=key_ascii_len-1;i>=0 && i_>=0;i--,i_--){ | |
barr[i_] = (char*) malloc(str_unicode_len);//动态申请内存是为了解决c语言函数内声明数组的长度有限制 | |
narr[i_] = (char*) malloc(str_unicode_len); | |
darr[i_] = (char*) malloc(str_unicode_len); | |
int is_left_ascii = key[i]<0 || key[i] >= 127 ? 0 : 1; | |
for(j=str_len-1, j_=str_unicode_len-1;j>=0&&j_>=0;j--,j_--){ | |
int is_right_ascii = str[j] < 0 || str[j] >= 127 ? 0 : 1; | |
barr[i_][j_] = 0; | |
if(!is_left_ascii || !is_right_ascii){ | |
if(!is_left_ascii && !is_right_ascii){ | |
int k = 2, eq=1; | |
for(;k>=0;k--){ | |
if(i-k >= 0 && j-k>=0 && key[i-k] != str[j-k]){ | |
eq = 0; | |
break; | |
} | |
} | |
barr[i_][j_] = eq; | |
}else{ | |
barr[i_][j_] = 0; | |
} | |
}else{ | |
barr[i_][j_] = str[j] == key[i] || tolower(str[j]) == tolower(key[i]) ? 1 : 0; | |
} | |
darr[i_][j_] = 0; | |
narr[i_][j_] = 0; | |
int indexOfEnds = find_ends(j_); | |
int n_right = 0, n_down = 0, n_rightdown = 0, d_right = 0, d_down = 0, d_rightdown = 0; | |
if(indexOfEnds == -1 && j_!=str_unicode_len - 1){ | |
n_right = narr[i_][j_ + 1]; | |
d_right = darr[i_][j_ + 1]; | |
} | |
if(i_!=key_len -1){ | |
n_down = narr[i_ + 1][j_]; | |
d_down = darr[i_ + 1][j_]; | |
} | |
if(indexOfEnds == -1 && j_!=str_unicode_len - 1 && i_!=key_len -1){ | |
n_rightdown = narr[i_ + 1][j_ + 1]; | |
d_rightdown = darr[i_ + 1][j_ + 1]; | |
} | |
n_rightdown += barr[i_][j_]; | |
narr[i_][j_] = n_right > n_down ? (n_right > n_rightdown ? n_right : n_rightdown) : (n_down > n_rightdown ? n_down : n_rightdown); | |
if(barr[i_][j_]){ | |
darr[i_][j_] = d_rightdown + 1; | |
}else if(n_right >= n_down){ | |
darr[i_][j_] = d_right; | |
}else{ | |
darr[i_][j_] = d_down + 1; | |
} | |
if(!is_right_ascii){ | |
j-=2; | |
} | |
//printf("%d\t", narr[i_][j_]); | |
} | |
//printf("\n"); | |
//max_id_maps[i] = (int *)malloc(sizeof(int)*10); | |
max_id_maps_mem_lens[i_] = 0; | |
max_id_maps_lens[i_] = 0; | |
if(!is_left_ascii){ | |
i-=2; | |
} | |
} | |
//max_id_maps[key_len] = (int *)malloc(sizeof(int)*10); | |
max_id_maps_mem_lens[key_len] = 0; | |
max_id_maps_lens[key_len] = 0; | |
int k=0; | |
//计算最大匹配度和最优匹配路径长度。并将其放到如到max_id_maps中 | |
for(k=0;k<idmaps_len;k++){ | |
int end=idmaps[k].end, j=idmaps[k].start, end_i = key_len, max_ = 0, min_ = -1; | |
while(j<end){ | |
int temp_end_i = -1; | |
for(i=0;i<end_i;i++){ | |
if(barr[i][j]){ | |
if(temp_end_i==-1){ | |
temp_end_i = i; | |
} | |
if(narr[i][j] > max_){ | |
max_ = narr[i][j]; | |
} | |
if(min_ == -1 || darr[i][j] < min_){ | |
min_ = darr[i][j]; | |
} | |
} | |
} | |
if(temp_end_i != -1){ | |
end_i = temp_end_i; | |
} | |
j++; | |
} | |
if(max_ != 0){ | |
if(max_id_maps_mem_lens[max_] == 0){ | |
max_id_maps[max_] = (int *)malloc(sizeof(int)*10); | |
max_id_maps_mem_lens[max_] = 10; | |
}else if(max_id_maps_mem_lens[max_] == max_id_maps_lens[max_]){ | |
max_id_maps_mem_lens[max_] += STEP; | |
max_id_maps[max_] = (int *)realloc(max_id_maps[max_], sizeof(int)*max_id_maps_mem_lens[max_]); | |
} | |
*(max_id_maps[max_] + max_id_maps_lens[max_]) = idmaps[k].id; | |
max_id_maps_lens[max_]++; | |
} | |
} | |
//-----------------计时,计算性能 | |
gettimeofday(&tpend,NULL); | |
timeuse=1000000*(tpend.tv_sec-tpstart.tv_sec)+tpend.tv_usec-tpstart.tv_usec; | |
timeuse/=1000000; | |
printf("Used Time:%f\n",timeuse); | |
for(i=0;i<=key_len;i++){ | |
printf("%d -- ",i); | |
for(j=0;j<max_id_maps_lens[i];j++){ | |
printf("%d\t", max_id_maps[i][j]); | |
} | |
printf("\n"); | |
} | |
//--------------计时结束 | |
//释放在这个函数中申请的动态内存。 | |
for(i=0;i<=key_len;i++){ | |
if(max_id_maps_mem_lens[i]>0){ | |
//printf("%d,",max_id_maps_mem_lens[i]); | |
free(max_id_maps[i]); | |
} | |
if(i!=key_len){ | |
free(barr[i]); | |
free(narr[i]); | |
free(darr[i]); | |
} | |
} | |
//testPrint(&narr, key_len, str_unicode_len); | |
} | |
//释放程序中申请的动态内存 | |
void freeMemory(){ | |
free(ends); | |
free(idmaps); | |
free(str); | |
} | |
int main(){ | |
init_search_dir("/home/phyeas/test/"); | |
search("Java云计算"); | |
//search("BCXCADFESBABCACA"); | |
//init_search_dir("/home/phyeas/test/test2/"); | |
//int i=0; | |
freeMemory(); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment