cvanweelden · March 6, 2012 14:07 · cvanweelden · Mar 6, 2012
diff --git a/code Makefile b/code Makefile
 # Builds the OpenMP-enabled MEX files segm_overlap_mex and segm_intersection_mex.
 # MATLABDIR, LDIRS and MEXEXT describe one MATLAB installation/architecture and
 # have to be changed together (e.g. MEXEXT=mexa64 for a build against a
 # different MATLAB library directory).

 # C compiler (alternative: gcc-4.2)
 CC = gcc
 MATLABDIR = /Applications/MATLAB_R2011a.app
 INCLUDES = -I$(MATLABDIR)/extern/include
 LDIRS = -L$(MATLABDIR)/bin/maci64
 # Suffix of the MEX files written by the link rules below.
 MEXEXT = mexmaci64
 EXE_TARGETS = segm_overlap_mex.$(MEXEXT) segm_intersection_mex.$(MEXEXT)

 # all and clean name actions, not files
 .PHONY: all clean

 all: $(EXE_TARGETS)

 overlap.o: overlap.c
 	$(CC) -D__MAIN__ -O3 -fPIC -c $(INCLUDES) -fopenmp -o $@ $<

 intersection.o: intersection.c
 	$(CC) -D__MAIN__ -O3 -fPIC -c $(INCLUDES) -fopenmp -o $@ $<

 segm_overlap_mex.o: segm_overlap_mex.c
 	$(CC) -O3 -c $(INCLUDES) -o $@ $< -fPIC

 segm_intersection_mex.o: segm_intersection_mex.c
 	$(CC) -O3 -c $(INCLUDES) -o $@ $< -fPIC


 # Each link rule writes exactly the file named by its target ($@), so make can
 # tell when the MEX file is up to date and clean removes what was built.
 segm_overlap_mex.$(MEXEXT): segm_overlap_mex.o overlap.o
 	$(CC) segm_overlap_mex.o $(LDIRS) -lmx -lmex -fopenmp -shared -o $@ overlap.o

 segm_intersection_mex.$(MEXEXT): segm_intersection_mex.o intersection.o
 	$(CC) segm_intersection_mex.o $(LDIRS) -lmx -lmex -fopenmp -shared -o $@ intersection.o

 clean:
 	rm -f *.o $(EXE_TARGETS)
diff --git a/compile.m b/compile.m
 %
 % MEX files for 32 and 64 bits are provided for chi2_mex, together with
 % Makefiles, and the same holds for segm_overlap_mex.c.
 % Before recompiling them, set the right path to matlab in those Makefiles.
 %
 % These files come with makefiles because they use multiple cores with omp.
 %

 % everything else is compiled directly with mex
 mex('-O', 'code/cartprod_mex.c', '-o', 'code/cartprod_mex')
 cd('./code/')
 !make
 cd('..')
 mex('-O', 'code/int_hist.c', '-o', 'code/int_hist')
 mex('-O', 'code/intens_pixel_diff_mex.c', '-o', 'code/intens_pixel_diff_mex')
 cd('./external_code/paraFmex/')
 make_pseudo()
 cd('../..')

 % these two files contributed by andreas mueller
 % (my_phog_desc_mex requires boost development files)
 mex('-cxx', '-I/usr/local/include', '-O', 'external_code/my_phog_desc_mex.cpp', '-o', 'external_code/my_phog_desc_mex')
 mex('-O', 'external_code/overlap_care.c', '-o', 'external_code/overlap_care')
diff --git a/cpmc_example.m b/cpmc_example.m
 function cpmc_example()
 % CPMC_EXAMPLE  Demo: run cpmc on one image and display its best segments.
 %
 %   Puts the code and external_code directories on the MATLAB path, makes
 %   sure a matlabpool with N_THREADS workers is open, runs cpmc on one image
 %   below ./data/ and then scores and displays the segments twice: once for
 %   all returned masks and once for the first (at most) 200 of them.
 %   Side effect: saves Q (the SvmSegm_segment_quality result for the full
 %   pool) to 'duh_32.mat' in the current directory.
     addpath('./code/');
     addpath('./external_code/');
     addpath('./external_code/paraFmex/');
     addpath('./external_code/imrender/vgg/');
     addpath('./external_code/immerge/');
     addpath('./external_code/color_sift/');
     addpath('./external_code/vlfeats/toolbox/');
     vl_setup;
     addpath('./external_code/globalPb/lib/');
     addpath('./external_code/mpi-chi2-v1_5/');

     % create multiple threads (set how many you have)
     N_THREADS = 2;
     pool_size = matlabpool('size');
     if(pool_size ~= N_THREADS)
         if(pool_size > 0)
             % a pool of another size is already open; it has to be closed
             % before a new one can be opened
             matlabpool('close');
         end
         matlabpool('open', N_THREADS);
     end

     exp_dir = './data/';
     %img_name = '2010_000238'; % airplane and people
     img_name = '2007_009084'; % dogs, motorbike, chairs, people
     %img_name = '2010_002868'; % buses
     %img_name = '2010_003781'; % cat, bottle, potted plants

     % scores is not used below; only the masks are evaluated
     [masks, scores] = cpmc(exp_dir, img_name);

     I = imread([exp_dir '/JPEGImages/' img_name '.jpg']);

     % visualization and ground truth score for whole pool
     % (explicit format string, so the count is not interpreted as a format)
     fprintf('Best segments from initial pool of %d\n', size(masks,3));
     Q = SvmSegm_segment_quality(img_name, exp_dir, masks, 'overlap');
     save('duh_32.mat', 'Q');
     avg_best_overlap = mean(max([Q.q]))
     SvmSegm_show_best_segments(I,Q,masks);

     % visualization and ground truth score for top 200 segments
     % (fewer if cpmc returned fewer than 200 masks)
     n_top = min(200, size(masks,3));
     top_masks = masks(:,:,1:n_top);
     figure;
     disp(['Best ' int2str(n_top) ' segments after filtering']);
     Q = SvmSegm_segment_quality(img_name, exp_dir, top_masks, 'overlap');
     avg_best_overlap = mean(max([Q.q]))
     SvmSegm_show_best_segments(I,Q,top_masks);
     fprintf('Best among top %d after filtering\n\n', n_top);
 end
diff --git a/external_code mpi-chi2-v1_5 chi2double.c b/external_code mpi-chi2-v1_5 chi2double.c
 // fast chi-squared distance function in x86 compiler intrinsics
 // (C) 2007-2008 Christoph Lampert <[email protected]>

 #include <stdio.h>
 #include <limits.h>
 #include <float.h>	// for FLT_MIN
 /* We calculate chi2=(a-b)**2/(a+b+DBL_MIN) to avoid division-by-zero:
   If a+b != 0, then (a+b+DBL_MIN)==(a+b) and nothing changed.
   If a+b == 0, then the numerator is 0 as well, and we don't divide by 0. 
 */


 /* Using compiler intrinsics (for SSE >=2) can have a huge speedup effect: 
   8x for float and 3.5x for double on Intel Core2.
   You have to compile with the right CPU setting, e.g. gcc -march=k8 or -march=nocona */
 #ifdef __SSE2__
 #include <emmintrin.h> // for float
 #endif

 /* OpenMP allows to achieve almost linear speedup on multiCore CPUs: use gcc-4.2 -fopenmp */
 /*#ifdef _OPENMP*/
 #include <omp.h>
 /*#endif*/

 /* Portable reference implementation: returns the sum over the n entries of
    (x[i]-y[i])^2 / (x[i]+y[i]+DBL_MIN). */
 static inline double chi2_baseline_double(const int n, const double* const x, const double* const y) {
    double total = 0.f;
    const double* px = x;
    const double* py = y;
    int remaining;
    /* walk both vectors in lock-step; nothing is read when n <= 0 */
    for (remaining = n; remaining > 0; --remaining, ++px, ++py) {
        const double diff = *px - *py;
        const double inv_sum = 1./(*px + *py + DBL_MIN);
        total += diff*diff*inv_sum;
    }
    return total;
 }


 #ifdef __SSE2__
 /* use compiler intrinsics for 2x parallel processing.
    Returns the same sum as chi2_baseline_double, handling two doubles per
    loop iteration; a remaining odd entry is passed to chi2_baseline_double.
    Only compiled when __SSE2__ is defined, because <emmintrin.h> (which
    provides __m128d) is only included in that case. */
 static inline double chi2_intrinsic_double(int n, const double* x, const double* y) {
    double result=0;
    const __m128d eps = _mm_set1_pd(DBL_MIN);
    __m128d chi2 = _mm_setzero_pd();

    for ( ; n>1; n-=2) {
        /* unaligned loads: x and y need no particular alignment */
        const __m128d a = _mm_loadu_pd(x);
        const __m128d b = _mm_loadu_pd(y);
        x+=2;
        y+=2;
        const __m128d a_plus_b = _mm_add_pd(a,b);
        const __m128d a_plus_b_plus_eps = _mm_add_pd(a_plus_b,eps);
        const __m128d a_minus_b = _mm_sub_pd(a,b);
        const __m128d a_minus_b_sq = _mm_mul_pd(a_minus_b, a_minus_b);
        const __m128d quotient = _mm_div_pd(a_minus_b_sq, a_plus_b_plus_eps);
        chi2 = _mm_add_pd(chi2, quotient);
    }
    /* horizontal add: swap the two lanes and add, so the low lane holds the total */
    const __m128d shuffle = _mm_shuffle_pd(chi2, chi2, _MM_SHUFFLE2(0,1));
    const __m128d sum = _mm_add_pd(chi2, shuffle);
 // with SSE3, we could use hadd_pd, but the difference is negligible 

    _mm_store_sd(&result,sum);
    /* no _mm_empty() here: it only resets MMX state, and this function
       uses no MMX (__m64) intrinsics */
    if (n)
        result += chi2_baseline_double(n, x, y); // remaining entries
    return result;
 }
 #endif /* __SSE2__ */


 /* chi2-distance between two vectors/histograms x and y of dim entries;
    the SSE2 implementation is used whenever the compiler defines __SSE2__ */
 double chi2_double(const int dim, const double* const x, const double* const y) {
 #ifdef __SSE2__
    return chi2_intrinsic_double(dim, x, y);
 #else
    return chi2_baseline_double(dim, x, y);
 #endif
 }

 /* calculate the chi2-measure between all pairs of one set of vectors/histograms.
    x holds nx vectors of dim entries each, stored one after another.
    K (nx*nx entries) receives the symmetric distance matrix with a zero
    diagonal.  Returns the sum of all entries of K divided by nx*nx. */
 double chi2sym_distance_double(const int dim, const int nx, const double* const x, 
                               double* const K) {
    double (*chi2_double)(const int, const double*, const double*) = chi2_baseline_double;
 #ifdef __SSE2__
    chi2_double = chi2_intrinsic_double;
 #endif
    /* size_t copies, so that offsets such as i*nx are not computed in int */
    const size_t n = (size_t)nx;
    const size_t d = (size_t)dim;

    double sumK=0.;
 #pragma omp parallel 
    {
        int i,j;
 #pragma omp for reduction (+:sumK) schedule (dynamic, 2)
        for (i=0;i<nx;i++) {
            const size_t row = (size_t)i;
            K[row*n+row]=0.;
            for (j=0;j<i;j++) {
                const size_t col = (size_t)j;
                const double chi2 = chi2_double(dim, &x[row*d], &x[col*d]);
                /* one evaluation fills both mirrored entries */
                K[row*n+col] = chi2;
                K[col*n+row] = chi2;
                sumK += 2*chi2;
            }
        }
    }
    /* divide in double: nx*nx as int can overflow, and a float divisor loses precision */
    return sumK/((double)nx*(double)nx); 
 }

 /* calculate the chi2-measure between two sets of vectors/histograms.
    x holds nx and y holds ny vectors of dim entries each, stored one after
    another.  K (nx*ny entries) receives the distances, with K[i*ny+j] being
    the distance between vector i of x and vector j of y.
    Returns the sum of all entries of K divided by nx*ny. */
 double chi2_distance_double(const int dim, const int nx, const double* const x, 
                                         const int ny, const double* const y, double* const K) {
    double (*chi2_double)(const int, const double*, const double*) = chi2_baseline_double;
 #ifdef __SSE2__
    chi2_double = chi2_intrinsic_double;
 #endif
    /* size_t copies, so that offsets such as i*ny are not computed in int */
    const size_t d = (size_t)dim;
    const size_t m = (size_t)ny;

    double sumK=0.;
 #pragma omp parallel 
    {
        int i,j;
 #pragma omp for reduction (+:sumK)
        for (i=0;i<nx;i++)
            for (j=0;j<ny;j++) {
                const double chi2 = chi2_double(dim, &x[(size_t)i*d], &y[(size_t)j*d]);
                K[(size_t)i*m+(size_t)j] = chi2;
                sumK += chi2;
            }
    }
    /* divide in double: nx*ny as int can overflow, and a float divisor loses precision */
    return sumK/((double)nx*(double)ny); 
 }


 #ifdef __MAIN__

 #include <stdlib.h>

 #include <time.h>
 /* Stand-alone timing test (compiled in with -D__MAIN__): fills two sets of
    vectors, computes their n1-by-n2 chi2 distance matrix with
    chi2_distance_double and prints the initialization time, the chi2 time
    and the returned mean.  Returns 1 if an allocation fails, 0 otherwise. */
 int main()
 {
    const int dim=3000;
    const int n1=1000;
    const int n2=2000;
    int i;

    /* test calculating a kernel with double entries.
       Plain malloc is sufficient: the SSE2 code path loads with
       _mm_loadu_pd, which has no alignment requirement. */
    double *data1 = (double*)malloc((size_t)dim*n1*sizeof(double));
    double *data2 = (double*)malloc((size_t)dim*n2*sizeof(double));
    double *K = (double*)malloc((size_t)n1*n2*sizeof(double));
    if ((!data1) || (!data2) || (!K)) {
       free(data1);
       free(data2);
       free(K);
       return 1;
    }

    const clock_t before_init=clock();
    for (i=0;i<n1*dim;i++)
        data1[i]=1./(double)(i+1.);
    for (i=0;i<n2*dim;i++)
        data2[i]=1./(double)(i+1.);
    const clock_t after_init=clock();
    printf("init time: %8.4f\n",(after_init-before_init)*1./CLOCKS_PER_SEC);

    const clock_t before_chi2=clock();
    const double mean_K = chi2_distance_double(dim, n1, data1, n2, data2, K);
    const clock_t after_chi2=clock();
    printf("chi2 time: %8.4f\n",(after_chi2-before_chi2)*1./CLOCKS_PER_SEC);

    printf("result: %e\n",mean_K);

    free(data1);
    free(data2);
    free(K);
    return 0;
 }

 #endif
diff --git a/external_code mpi-chi2-v1_5 chi2float.c b/external_code mpi-chi2-v1_5 chi2float.c
 // fast chi-squared distance function in x86 compiler intrinsics
 // (C) 2007-2008 Christoph Lampert <[email protected]>

 #include <stdio.h>
 #include <limits.h>
 #include <float.h>// for FLT_MIN
 /* We calculate chi2=(a-b)**2/(a+b+FLT_MIN) to avoid division-by-zero:
   If a+b != 0, then (a+b+FLT_MIN)==(a+b) and nothing changed.
   If a+b == 0, then the numerator is 0 as well, and we don't divide by 0. 
 */


 /* Using SSE compiler intrinsics can have a huge speedup effect: 
   8x for float and 3.5x for double on Intel Core2.
   You have to compile with the right CPU setting, e.g. gcc -march=k8 or -march=nocona */
 #ifdef __SSE__
 #include <xmmintrin.h> // for float
 #endif

 /* OpenMP allows to achieve almost linear speedup on multiCore CPUs: use gcc-4.2 -fopenmp */
 #ifdef _OPENMP
 #include <omp.h>
 #endif


 /* Portable reference implementation: returns the sum over the n entries of
    (x[i]-y[i])^2 / (x[i]+y[i]+FLT_MIN). */
 static inline float chi2_baseline_float(const int n, const float* x, const float* y) {
    float total = 0.f;
    const float* px = x;
    const float* py = y;
    int remaining;
    /* walk both vectors in lock-step; nothing is read when n <= 0 */
    for (remaining = n; remaining > 0; --remaining, ++px, ++py) {
        const float diff = *px - *py;
        const float inv_sum = 1./(*px + *py + FLT_MIN);
        total += diff*diff*inv_sum;
    }
    return total;
 }

 #ifdef __SSE__
 /* use compiler intrinsics for 4x parallel processing.
    Returns the same sum as chi2_baseline_float, handling four floats per
    loop iteration; the remaining 1-3 entries are passed to
    chi2_baseline_float.  Only compiled when __SSE__ is defined, because
    <xmmintrin.h> (which provides __m128) is only included in that case. */
 static inline float chi2_intrinsic_float(int n, const float* x, const float* y) {
    float result=0;
    const __m128 eps = _mm_set1_ps(FLT_MIN);
    __m128 chi2 = _mm_setzero_ps();
    
    for (; n>3; n-=4) {
        /* unaligned loads: x and y need no particular alignment */
        const __m128 a = _mm_loadu_ps(x);
        const __m128 b = _mm_loadu_ps(y);
        const __m128 a_plus_eps = _mm_add_ps(a,eps);
        const __m128 a_plus_b_plus_eps = _mm_add_ps(a_plus_eps,b);
        const __m128 a_minus_b = _mm_sub_ps(a,b);
        const __m128 a_minus_b_sq = _mm_mul_ps(a_minus_b, a_minus_b);
        const __m128 prod = _mm_div_ps(a_minus_b_sq, a_plus_b_plus_eps);
        chi2 = _mm_add_ps(chi2, prod);
        x+=4;
        y+=4;
    }
    /* horizontal sum of the four lanes: first add the swapped halves, then
       the swapped neighbours, so that lane 0 holds the total */
    const __m128 shuffle1 = _mm_shuffle_ps(chi2, chi2, _MM_SHUFFLE(1,0,3,2));
    const __m128 sum1 = _mm_add_ps(chi2, shuffle1);
    const __m128 shuffle2 = _mm_shuffle_ps(sum1, sum1, _MM_SHUFFLE(2,3,0,1));
    const __m128 sum2 = _mm_add_ps(sum1, shuffle2);
 // with SSE3, we could use hadd_ps, but the difference is negligible 

    _mm_store_ss(&result,sum2);
    /* no _mm_empty() here: it only resets MMX state, and this function
       uses no MMX (__m64) intrinsics */
    
    if (n)
        result += chi2_baseline_float(n, x, y);	// remaining 1-3 entries
    return result;
 }
 #endif /* __SSE__ */

 /* chi2-distance between two vectors/histograms x and y of dim entries;
    the SSE implementation is used whenever the compiler defines __SSE__ */
 float chi2_float(const int dim, const float* const x, const float* const y) {
 #ifdef __SSE__
    return chi2_intrinsic_float(dim, x, y);
 #else
    return chi2_baseline_float(dim, x, y);
 #endif
 }

 /* calculate the chi2-distance matrix between all pairs of a set of vectors/histograms.
    x holds nx vectors of dim entries each, stored one after another.
    K (nx*nx entries) receives the symmetric distance matrix with a zero
    diagonal.  Returns the sum of all entries of K divided by nx*nx. */
 float chi2sym_distance_float(const int dim, const int nx, const float* const x, 
                             float* const K) {
    float (*chi2_float)(const int, const float*, const float*) = chi2_baseline_float;
 #ifdef __SSE__
    chi2_float = chi2_intrinsic_float;
 #endif
    /* size_t copies, so that offsets such as i*nx are not computed in int */
    const size_t n = (size_t)nx;
    const size_t d = (size_t)dim;
 	
    float sumK=0.f;
 #pragma omp parallel
    {
        int i,j;
 #pragma omp for reduction (+:sumK) schedule (dynamic,2)
        for (i=0;i<nx;i++) {
            const size_t row = (size_t)i;
            K[row*n+row]=0.;
            for (j=0;j<i;j++) {
                const size_t col = (size_t)j;
                const float chi2 = (*chi2_float)(dim, &x[row*d], &x[col*d]); 
                /* one evaluation fills both mirrored entries */
                K[row*n+col] = chi2;
                K[col*n+row] = chi2;
                sumK += 2*chi2;
            }
        }
    }
    /* convert before multiplying: nx*nx as int can overflow */
    return sumK/((float)nx*(float)nx); 
 }

 /* calculate the chi2-distance matrix between two sets of vectors/histograms.
    x holds nx and y holds ny vectors of dim entries each, stored one after
    another.  K (nx*ny entries) receives the distances, with K[i*ny+j] being
    the distance between vector i of x and vector j of y.
    Returns the sum of all entries of K divided by nx*ny. */
 float chi2_distance_float(const int dim, const int nx, const float* const x, 
                          const int ny, const float* const y, float* const K) {
    float (*chi2_float)(const int, const float*, const float*) = chi2_baseline_float;
 #ifdef __SSE__
    chi2_float = chi2_intrinsic_float;
 #endif
    /* size_t copies, so that offsets such as i*ny are not computed in int */
    const size_t d = (size_t)dim;
    const size_t m = (size_t)ny;
 	
    float sumK=0.f;
 #pragma omp parallel
    {
        int i,j;
 #pragma omp for reduction (+:sumK) schedule (dynamic,2)
        for (i=0;i<nx;i++) {
            for (j=0;j<ny;j++) {
                float chi2 = (*chi2_float)(dim, &x[(size_t)i*d], &y[(size_t)j*d]); 
                K[(size_t)i*m+(size_t)j] = chi2;
                sumK += chi2;
            }
        }
    }
    /* convert before multiplying: nx*ny as int can overflow */
    return sumK/((float)nx*(float)ny); 
 }


 #ifdef __MAIN__

 #include <stdlib.h>

 #include <time.h>
 /* Stand-alone timing test (compiled in with -D__MAIN__): fills two sets of
    vectors, computes their n1-by-n2 chi2 distance matrix with
    chi2_distance_float and prints the initialization time, the chi2 time
    and the returned mean.  Returns 1 if an allocation fails, 0 otherwise. */
 int main()
 {
    const int dim=3000;
    const int n1=1000;
    const int n2=2000;
    int i;

    /* test calculating a kernel with float entries.
       Plain malloc is sufficient: the SSE code path loads with
       _mm_loadu_ps, which has no alignment requirement. */
    float *data1 = (float*)malloc((size_t)dim*n1*sizeof(float));
    float *data2 = (float*)malloc((size_t)dim*n2*sizeof(float));
    float *K = (float*)malloc((size_t)n1*n2*sizeof(float));
    if ((!data1) || (!data2) || (!K)) {
       free(data1);
       free(data2);
       free(K);
       return 1;
    }

    const clock_t before_init=clock();
    for (i=0;i<n1*dim;i++)
        data1[i]=1./(float)(i+1.);
    for (i=0;i<n2*dim;i++)
        data2[i]=1./(float)(i+1.);
    const clock_t after_init=clock();
    printf("init time: %8.4f\n",(after_init-before_init)*1./CLOCKS_PER_SEC);

    const clock_t before_chi2=clock();
    const float mean_K = chi2_distance_float(dim, n1, data1, n2, data2, K);
    const clock_t after_chi2=clock();
    printf("chi2 time: %8.4f\n",(after_chi2-before_chi2)*1./CLOCKS_PER_SEC);

    printf("result: %e\n",mean_K);

    free(data1);
    free(data2);
    free(K);
    return 0;
 }

 #endif
diff --git a/external_code mpi-chi2-v1_5 Makefile b/external_code mpi-chi2-v1_5 Makefile
 # Builds the chi2 timing programs, libchi2.a and the chi2_mex MEX file.
 CC = gcc-4.2
 CFLAGS =  -O3 -fPIC -march=nocona -ffast-math -fomit-frame-pointer 

 #-L/home/joao/matlab/bin/glnx86/
 #CC=icc
 #CFLAGS = -xP -fast -fPIC
 OMPFLAGS = -fopenmp

 MATLABDIR=/Applications/MATLAB_R2011a.app
 INCLUDES=-I$(MATLABDIR)/extern/include
 #LDIRS= -L$(MATLABDIR)/bin/glnx86
 LDIRS= -L$(MATLABDIR)/bin/maci64

 EXE_TARGETS = chi2float chi2double chi2_mex.mexmaci64
 #EXE_TARGETS = chi2float chi2double chi2_mex.mexglx
 LIB_TARGETS = libchi2.a

 # these targets name actions, not files
 .PHONY: all clean timing

 all:	$(EXE_TARGETS) $(LIB_TARGETS)

 # Stand-alone programs: -D__MAIN__ compiles in the main() of the source file.
 chi2float:	chi2float.c chi2float.h Makefile
 	$(CC) -D__MAIN__  $(CFLAGS)  $(OMPFLAGS) -o $@ chi2float.c

 chi2double: chi2double.c chi2double.h Makefile
 	$(CC) -D__MAIN__  $(CFLAGS)  $(OMPFLAGS) -o $@ chi2double.c

 # NOTE(review): despite the .a suffix this recipe links with -shared;
 # confirm the intended library type before changing it.
 libchi2.a:	chi2double.c chi2double.h chi2float.c chi2float.h Makefile
 	$(CC) $(CFLAGS) $(OMPFLAGS) -shared -fPIC chi2double.c chi2float.c -o $@

 # Objects are compiled without -D__MAIN__, so that no main() ends up in the
 # MEX file that links chi2float.o.
 chi2double.o : chi2double.c chi2double.h Makefile
 	$(CC) $(CFLAGS) -c $(OMPFLAGS) -o $@ chi2double.c

 chi2float.o: chi2float.c chi2float.h Makefile
 	$(CC) $(CFLAGS) -c $(OMPFLAGS) -o $@ chi2float.c

 chi2_mex.o: chi2_mex.c
 	$(CC) $(CFLAGS) -c $(INCLUDES) $(OMPFLAGS)  -o $@ chi2_mex.c

 chi2_mex.mexglx: 	chi2_mex.o chi2float.o
 	$(CC)  $(OMPFLAGS) chi2_mex.o  $(LDIRS) $(CFLAGS) -lmx -lmex  -shared -o $@ chi2float.o
 
 chi2_mex.mexmaci64: 	chi2_mex.o chi2float.o
 	$(CC)  $(OMPFLAGS) chi2_mex.o  $(LDIRS) $(CFLAGS) -lmx -lmex  -shared -o $@ chi2float.o

 # default installation of libomp cannot be opened using dlopen() as would be required e.g. for Python


 clean:
 	rm -f *.o $(EXE_TARGETS) $(LIB_TARGETS)

 timing:	$(EXE_TARGETS)
 	time ./chi2float
 	time ./chi2double
	# Builds the OpenMP-enabled MEX files segm_overlap_mex and segm_intersection_mex.
	# MATLABDIR, LDIRS and MEXEXT describe one MATLAB installation/architecture and
	# have to be changed together.

	# C compiler (alternative: gcc-4.2)
	CC = gcc
	MATLABDIR = /Applications/MATLAB_R2011a.app
	INCLUDES = -I$(MATLABDIR)/extern/include
	LDIRS = -L$(MATLABDIR)/bin/maci64
	# Suffix of the MEX files written by the link rules below.
	MEXEXT = mexmaci64
	EXE_TARGETS = segm_overlap_mex.$(MEXEXT) segm_intersection_mex.$(MEXEXT)

	# all and clean name actions, not files
	.PHONY: all clean

	all: $(EXE_TARGETS)

	overlap.o: overlap.c
		$(CC) -D__MAIN__ -O3 -fPIC -c $(INCLUDES) -fopenmp -o $@ $<

	intersection.o: intersection.c
		$(CC) -D__MAIN__ -O3 -fPIC -c $(INCLUDES) -fopenmp -o $@ $<

	segm_overlap_mex.o: segm_overlap_mex.c
		$(CC) -O3 -c $(INCLUDES) -o $@ $< -fPIC

	segm_intersection_mex.o: segm_intersection_mex.c
		$(CC) -O3 -c $(INCLUDES) -o $@ $< -fPIC


	# Each link rule writes exactly the file named by its target ($@).
	segm_overlap_mex.$(MEXEXT): segm_overlap_mex.o overlap.o
		$(CC) segm_overlap_mex.o $(LDIRS) -lmx -lmex -fopenmp -shared -o $@ overlap.o

	segm_intersection_mex.$(MEXEXT): segm_intersection_mex.o intersection.o
		$(CC) segm_intersection_mex.o $(LDIRS) -lmx -lmex -fopenmp -shared -o $@ intersection.o

	clean:
		rm -f *.o $(EXE_TARGETS)
	%
	% MEX files for 32 and 64 bits are provided for chi2_mex, together with
	% Makefiles, and the same holds for segm_overlap_mex.c.
	% Before recompiling them, set the right path to matlab in those Makefiles.
	%
	% These files come with makefiles because they use multiple cores with omp.
	%

	% everything else is compiled directly with mex
	mex('-O', 'code/cartprod_mex.c', '-o', 'code/cartprod_mex')
	cd('./code/')
	!make
	cd('..')
	mex('-O', 'code/int_hist.c', '-o', 'code/int_hist')
	mex('-O', 'code/intens_pixel_diff_mex.c', '-o', 'code/intens_pixel_diff_mex')
	cd('./external_code/paraFmex/')
	make_pseudo()
	cd('../..')

	% these two files contributed by andreas mueller
	% (my_phog_desc_mex requires boost development files)
	mex('-cxx', '-I/usr/local/include', '-O', 'external_code/my_phog_desc_mex.cpp', '-o', 'external_code/my_phog_desc_mex')
	mex('-O', 'external_code/overlap_care.c', '-o', 'external_code/overlap_care')
	function cpmc_example()
	% CPMC_EXAMPLE  Demo: run cpmc on one image and display its best segments.
	%
	%   Puts the code and external_code directories on the MATLAB path, makes
	%   sure a matlabpool with N_THREADS workers is open, runs cpmc on one image
	%   below ./data/ and then scores and displays the segments twice: once for
	%   all returned masks and once for the first (at most) 200 of them.
	%   Side effect: saves Q (the SvmSegm_segment_quality result for the full
	%   pool) to 'duh_32.mat' in the current directory.
	    addpath('./code/');
	    addpath('./external_code/');
	    addpath('./external_code/paraFmex/');
	    addpath('./external_code/imrender/vgg/');
	    addpath('./external_code/immerge/');
	    addpath('./external_code/color_sift/');
	    addpath('./external_code/vlfeats/toolbox/');
	    vl_setup;
	    addpath('./external_code/globalPb/lib/');
	    addpath('./external_code/mpi-chi2-v1_5/');

	    % create multiple threads (set how many you have)
	    N_THREADS = 2;
	    pool_size = matlabpool('size');
	    if(pool_size ~= N_THREADS)
	        if(pool_size > 0)
	            % a pool of another size is already open; it has to be closed
	            % before a new one can be opened
	            matlabpool('close');
	        end
	        matlabpool('open', N_THREADS);
	    end

	    exp_dir = './data/';
	    %img_name = '2010_000238'; % airplane and people
	    img_name = '2007_009084'; % dogs, motorbike, chairs, people
	    %img_name = '2010_002868'; % buses
	    %img_name = '2010_003781'; % cat, bottle, potted plants

	    % scores is not used below; only the masks are evaluated
	    [masks, scores] = cpmc(exp_dir, img_name);

	    I = imread([exp_dir '/JPEGImages/' img_name '.jpg']);

	    % visualization and ground truth score for whole pool
	    % (explicit format string, so the count is not interpreted as a format)
	    fprintf('Best segments from initial pool of %d\n', size(masks,3));
	    Q = SvmSegm_segment_quality(img_name, exp_dir, masks, 'overlap');
	    save('duh_32.mat', 'Q');
	    avg_best_overlap = mean(max([Q.q]))
	    SvmSegm_show_best_segments(I,Q,masks);

	    % visualization and ground truth score for top 200 segments
	    % (fewer if cpmc returned fewer than 200 masks)
	    n_top = min(200, size(masks,3));
	    top_masks = masks(:,:,1:n_top);
	    figure;
	    disp(['Best ' int2str(n_top) ' segments after filtering']);
	    Q = SvmSegm_segment_quality(img_name, exp_dir, top_masks, 'overlap');
	    avg_best_overlap = mean(max([Q.q]))
	    SvmSegm_show_best_segments(I,Q,top_masks);
	    fprintf('Best among top %d after filtering\n\n', n_top);
	end
	// fast chi-squared distance function in x86 compiler intrinsics
	// (C) 2007-2008 Christoph Lampert <[email protected]>

	#include <stdio.h>
	#include <limits.h>
	#include <float.h> // for FLT_MIN
	/* We calculate chi2=(a-b)**2/(a+b+DBL_MIN) to avoid division-by-zero:
	If a+b != 0, then (a+b+DBL_MIN)==(a+b) and nothing changed.
	If a+b == 0, then the numerator is 0 as well, and we don't divide by 0.
	*/


	/* Using compiler intrinsics (for SSE >=2) can have a huge speedup effect:
	8x for float and 3.5x for double on Intel Core2.
	You have to compile with the right CPU setting, e.g. gcc -march=k8 or -march=nocona */
	#ifdef __SSE2__
	#include <emmintrin.h> // for float
	#endif

	/* OpenMP allows to achieve almost linear speedup on multiCore CPUs: use gcc-4.2 -fopenmp */
	/*#ifdef _OPENMP*/
	#include <omp.h>
	/*#endif*/

	/* Portable reference implementation: returns the sum over the n entries of
	   (x[i]-y[i])^2 / (x[i]+y[i]+DBL_MIN). */
	static inline double chi2_baseline_double(const int n, const double* const x, const double* const y) {
	    double result = 0.f;
	    int i;
	    for (i=0; i<n; i++) {
	        const double num = x[i]-y[i];
	        const double denom = 1./(x[i]+y[i]+DBL_MIN);
	        /* squared difference times the reciprocal of the sum */
	        result += num*num*denom;
	    }
	    return result;
	}


	#ifdef __SSE2__
	/* use compiler intrinsics for 2x parallel processing.
	   Returns the same sum as chi2_baseline_double, handling two doubles per
	   loop iteration; a remaining odd entry is passed to chi2_baseline_double.
	   Only compiled when __SSE2__ is defined, because <emmintrin.h> (which
	   provides __m128d) is only included in that case. */
	static inline double chi2_intrinsic_double(int n, const double* x, const double* y) {
	    double result=0;
	    const __m128d eps = _mm_set1_pd(DBL_MIN);
	    __m128d chi2 = _mm_setzero_pd();

	    for ( ; n>1; n-=2) {
	        /* unaligned loads: x and y need no particular alignment */
	        const __m128d a = _mm_loadu_pd(x);
	        const __m128d b = _mm_loadu_pd(y);
	        x+=2;
	        y+=2;
	        const __m128d a_plus_b = _mm_add_pd(a,b);
	        const __m128d a_plus_b_plus_eps = _mm_add_pd(a_plus_b,eps);
	        const __m128d a_minus_b = _mm_sub_pd(a,b);
	        const __m128d a_minus_b_sq = _mm_mul_pd(a_minus_b, a_minus_b);
	        const __m128d quotient = _mm_div_pd(a_minus_b_sq, a_plus_b_plus_eps);
	        chi2 = _mm_add_pd(chi2, quotient);
	    }
	    /* horizontal add: swap the two lanes and add, so the low lane holds the total */
	    const __m128d shuffle = _mm_shuffle_pd(chi2, chi2, _MM_SHUFFLE2(0,1));
	    const __m128d sum = _mm_add_pd(chi2, shuffle);
	    // with SSE3, we could use hadd_pd, but the difference is negligible

	    _mm_store_sd(&result,sum);
	    /* no _mm_empty() here: it only resets MMX state, and this function
	       uses no MMX (__m64) intrinsics */
	    if (n)
	        result += chi2_baseline_double(n, x, y); // remaining entries
	    return result;
	}
	#endif /* __SSE2__ */


	/* calculate the chi2-distance between two vectors/histograms x and y of
	   dim entries; the SSE2 implementation is used whenever the compiler
	   defines __SSE2__, the plain C one otherwise */
	double chi2_double(const int dim, const double* const x, const double* const y) {
	#ifdef __SSE2__
	    return chi2_intrinsic_double(dim, x, y);
	#else
	    return chi2_baseline_double(dim, x, y);
	#endif
	}

	/* calculate the chi2-measure between all pairs of one set of vectors/histograms.
	   x holds nx vectors of dim entries each, stored one after another.
	   K (nx*nx entries) receives the symmetric distance matrix with a zero
	   diagonal.  Returns the sum of all entries of K divided by nx*nx. */
	double chi2sym_distance_double(const int dim, const int nx, const double* const x,
	                               double* const K) {
	    double (*chi2_double)(const int, const double*, const double*) = chi2_baseline_double;
	#ifdef __SSE2__
	    chi2_double = chi2_intrinsic_double;
	#endif
	    /* size_t copies, so that offsets such as i*nx are not computed in int */
	    const size_t n = (size_t)nx;
	    const size_t d = (size_t)dim;

	    double sumK=0.;
	#pragma omp parallel
	    {
	        int i,j;
	#pragma omp for reduction (+:sumK) schedule (dynamic, 2)
	        for (i=0;i<nx;i++) {
	            const size_t row = (size_t)i;
	            K[row*n+row]=0.;
	            for (j=0;j<i;j++) {
	                const size_t col = (size_t)j;
	                const double chi2 = chi2_double(dim, &x[row*d], &x[col*d]);
	                /* one evaluation fills both mirrored entries */
	                K[row*n+col] = chi2;
	                K[col*n+row] = chi2;
	                sumK += 2*chi2;
	            }
	        }
	    }
	    /* divide in double: nx*nx as int can overflow, and a float divisor loses precision */
	    return sumK/((double)nx*(double)nx);
	}

	/* calculate the chi2-measure between two sets of vectors/histograms.
	   x holds nx and y holds ny vectors of dim entries each, stored one after
	   another.  K (nx*ny entries) receives the distances, with K[i*ny+j] being
	   the distance between vector i of x and vector j of y.
	   Returns the sum of all entries of K divided by nx*ny. */
	double chi2_distance_double(const int dim, const int nx, const double* const x,
	                            const int ny, const double* const y, double* const K) {
	    double (*chi2_double)(const int, const double*, const double*) = chi2_baseline_double;
	#ifdef __SSE2__
	    chi2_double = chi2_intrinsic_double;
	#endif
	    /* size_t copies, so that offsets such as i*ny are not computed in int */
	    const size_t d = (size_t)dim;
	    const size_t m = (size_t)ny;

	    double sumK=0.;
	#pragma omp parallel
	    {
	        int i,j;
	#pragma omp for reduction (+:sumK)
	        for (i=0;i<nx;i++)
	            for (j=0;j<ny;j++) {
	                const double chi2 = chi2_double(dim, &x[(size_t)i*d], &y[(size_t)j*d]);
	                K[(size_t)i*m+(size_t)j] = chi2;
	                sumK += chi2;
	            }
	    }
	    /* divide in double: nx*ny as int can overflow, and a float divisor loses precision */
	    return sumK/((double)nx*(double)ny);
	}


	#ifdef __MAIN__

	#include <stdlib.h>

	#include <time.h>
	/* Stand-alone timing test (compiled in with -D__MAIN__): fills two sets of
	   vectors, computes their n1-by-n2 chi2 distance matrix with
	   chi2_distance_double and prints the initialization time, the chi2 time
	   and the returned mean.  Returns 1 if an allocation fails, 0 otherwise. */
	int main()
	{
	    const int dim=3000;
	    const int n1=1000;
	    const int n2=2000;
	    int i;

	    /* test calculating a kernel with double entries.
	       Plain malloc is sufficient: the SSE2 code path loads with
	       _mm_loadu_pd, which has no alignment requirement. */
	    double *data1 = (double*)malloc((size_t)dim*n1*sizeof(double));
	    double *data2 = (double*)malloc((size_t)dim*n2*sizeof(double));
	    double *K = (double*)malloc((size_t)n1*n2*sizeof(double));
	    if ((!data1) || (!data2) || (!K)) {
	        free(data1);
	        free(data2);
	        free(K);
	        return 1;
	    }

	    const clock_t before_init=clock();
	    for (i=0;i<n1*dim;i++)
	        data1[i]=1./(double)(i+1.);
	    for (i=0;i<n2*dim;i++)
	        data2[i]=1./(double)(i+1.);
	    const clock_t after_init=clock();
	    printf("init time: %8.4f\n",(after_init-before_init)*1./CLOCKS_PER_SEC);

	    const clock_t before_chi2=clock();
	    const double mean_K = chi2_distance_double(dim, n1, data1, n2, data2, K);
	    const clock_t after_chi2=clock();
	    printf("chi2 time: %8.4f\n",(after_chi2-before_chi2)*1./CLOCKS_PER_SEC);

	    printf("result: %e\n",mean_K);

	    free(data1);
	    free(data2);
	    free(K);
	    return 0;
	}

	#endif
	# Builds the chi2 timing programs, libchi2.a and the chi2_mex MEX file.
	CC = gcc-4.2
	CFLAGS = -O3 -fPIC -march=nocona -ffast-math -fomit-frame-pointer

	#-L/home/joao/matlab/bin/glnx86/
	#CC=icc
	#CFLAGS = -xP -fast -fPIC
	OMPFLAGS = -fopenmp

	MATLABDIR=/Applications/MATLAB_R2011a.app
	INCLUDES=-I$(MATLABDIR)/extern/include
	#LDIRS= -L$(MATLABDIR)/bin/glnx86
	LDIRS= -L$(MATLABDIR)/bin/maci64

	EXE_TARGETS = chi2float chi2double chi2_mex.mexmaci64
	#EXE_TARGETS = chi2float chi2double chi2_mex.mexglx
	LIB_TARGETS = libchi2.a

	# these targets name actions, not files
	.PHONY: all clean timing

	all: $(EXE_TARGETS) $(LIB_TARGETS)

	# Stand-alone programs: -D__MAIN__ compiles in the main() of the source file.
	chi2float: chi2float.c chi2float.h Makefile
		$(CC) -D__MAIN__ $(CFLAGS) $(OMPFLAGS) -o $@ chi2float.c

	chi2double: chi2double.c chi2double.h Makefile
		$(CC) -D__MAIN__ $(CFLAGS) $(OMPFLAGS) -o $@ chi2double.c

	# NOTE(review): despite the .a suffix this recipe links with -shared;
	# confirm the intended library type before changing it.
	libchi2.a: chi2double.c chi2double.h chi2float.c chi2float.h Makefile
		$(CC) $(CFLAGS) $(OMPFLAGS) -shared -fPIC chi2double.c chi2float.c -o $@

	# Objects are compiled without -D__MAIN__, so that no main() ends up in the
	# MEX file that links chi2float.o.
	chi2double.o : chi2double.c chi2double.h Makefile
		$(CC) $(CFLAGS) -c $(OMPFLAGS) -o $@ chi2double.c

	chi2float.o: chi2float.c chi2float.h Makefile
		$(CC) $(CFLAGS) -c $(OMPFLAGS) -o $@ chi2float.c

	chi2_mex.o: chi2_mex.c
		$(CC) $(CFLAGS) -c $(INCLUDES) $(OMPFLAGS) -o $@ chi2_mex.c

	chi2_mex.mexglx: chi2_mex.o chi2float.o
		$(CC) $(OMPFLAGS) chi2_mex.o $(LDIRS) $(CFLAGS) -lmx -lmex -shared -o $@ chi2float.o

	chi2_mex.mexmaci64: chi2_mex.o chi2float.o
		$(CC) $(OMPFLAGS) chi2_mex.o $(LDIRS) $(CFLAGS) -lmx -lmex -shared -o $@ chi2float.o

	# default installation of libomp cannot be opened using dlopen() as would be required e.g. for Python


	clean:
		rm -f *.o $(EXE_TARGETS) $(LIB_TARGETS)

	timing: $(EXE_TARGETS)
		time ./chi2float
		time ./chi2double