Files needed to get cpmc_release1 working on a 64-bit Intel Mac with MATLAB R2011a.
CC = gcc #gcc-4.2
MATLABDIR = /Applications/MATLAB_R2011a.app
INCLUDES = -I$(MATLABDIR)/extern/include
LDIRS = -L$(MATLABDIR)/bin/maci64
EXE_TARGETS = segm_overlap_mex.mexmaci64 segm_intersection_mex.mexmaci64

all: $(EXE_TARGETS)

overlap.o: overlap.c
	$(CC) -D__MAIN__ -O3 -fPIC -c $(INCLUDES) -fopenmp -o overlap.o overlap.c

intersection.o: intersection.c
	$(CC) -D__MAIN__ -O3 -fPIC -c $(INCLUDES) -fopenmp -o intersection.o intersection.c

segm_overlap_mex.o: segm_overlap_mex.c
	$(CC) -O3 -c $(INCLUDES) -fPIC -o segm_overlap_mex.o segm_overlap_mex.c

segm_intersection_mex.o: segm_intersection_mex.c
	$(CC) -O3 -c $(INCLUDES) -fPIC -o segm_intersection_mex.o segm_intersection_mex.c

# note: the link targets must use the Mac extension .mexmaci64 (not the Linux
# .mexa64), otherwise make never finds the files it just built
segm_overlap_mex.mexmaci64: segm_overlap_mex.o overlap.o
	$(CC) segm_overlap_mex.o overlap.o $(LDIRS) -lmx -lmex -fopenmp -shared -o segm_overlap_mex.mexmaci64

segm_intersection_mex.mexmaci64: segm_intersection_mex.o intersection.o
	$(CC) segm_intersection_mex.o intersection.o $(LDIRS) -lmx -lmex -fopenmp -shared -o segm_intersection_mex.mexmaci64

clean:
	rm -f *.o $(EXE_TARGETS)
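
A minimal rebuild sketch from within MATLAB (an assumption on layout: the Makefile above lives in ./code/; MATLABDIR can also be overridden on the make command line instead of editing the file):

% rebuild the segm_*_mex binaries, pointing make at your MATLAB install
cd ./code/
!make clean
!make MATLABDIR=/Applications/MATLAB_R2011a.app
cd ..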
%
% 32-bit and 64-bit MEX files and Makefiles are provided for chi2_mex,
% and likewise for segm_overlap_mex.c. To recompile them, you need to set
% the correct MATLAB path in the Makefiles.
%
% These files have Makefiles because they use multiple cores via OpenMP.
%
% For the other files:
mex -O code/cartprod_mex.c -o code/cartprod_mex
cd ./code/
!make
cd ..
mex -O code/int_hist.c -o code/int_hist
mex -O code/intens_pixel_diff_mex.c -o code/intens_pixel_diff_mex
cd ./external_code/paraFmex/
make_pseudo()
cd ../..
% these two files were contributed by Andreas Mueller
mex -cxx -I/usr/local/include -O external_code/my_phog_desc_mex.cpp -o external_code/my_phog_desc_mex % requires the Boost development headers
mex -O external_code/overlap_care.c -o external_code/overlap_care
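
A quick post-build sanity check (a sketch; exist(...) == 3 means MATLAB sees a compiled MEX file, and on a 64-bit Intel Mac mexext should report 'mexmaci64'):

assert(strcmp(mexext, 'mexmaci64'));
addpath('./code/'); addpath('./external_code/mpi-chi2-v1_5/');
assert(exist('cartprod_mex', 'file') == 3);
assert(exist('chi2_mex', 'file') == 3);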
function cpmc_example()
addpath('./code/');
addpath('./external_code/');
addpath('./external_code/paraFmex/');
addpath('./external_code/imrender/vgg/');
addpath('./external_code/immerge/');
addpath('./external_code/color_sift/');
addpath('./external_code/vlfeats/toolbox/');
vl_setup;
addpath('./external_code/globalPb/lib/');
addpath('./external_code/mpi-chi2-v1_5/');
% create multiple threads (set how many you have)
N_THREADS = 2;
if(matlabpool('size')~=N_THREADS)
matlabpool('open', N_THREADS);
end
exp_dir = './data/';
%img_name = '2010_000238'; % airplane and people
img_name = '2007_009084'; % dogs, motorbike, chairs, people
%img_name = '2010_002868'; % buses
%img_name = '2010_003781'; % cat, bottle, potted plants
[masks, scores] = cpmc(exp_dir, img_name);
I = imread([exp_dir '/JPEGImages/' img_name '.jpg']);
% visualization and ground truth score for whole pool
fprintf(['Best segments from initial pool of ' int2str(size(masks,3)) '\n']);
Q = SvmSegm_segment_quality(img_name, exp_dir, masks, 'overlap');
save('duh_32.mat', 'Q');
avg_best_overlap = mean(max([Q.q]))
SvmSegm_show_best_segments(I,Q,masks);
% visualization and ground truth score for top 200 segments
top_masks = masks(:,:,1:min(200, size(masks,3)));
figure;
disp('Best 200 segments after filtering');
Q = SvmSegm_segment_quality(img_name, exp_dir, top_masks, 'overlap');
avg_best_overlap = mean(max([Q.q]))
SvmSegm_show_best_segments(I,Q,top_masks);
fprintf('Best among top 200 after filtering\n\n');
end
// fast chi-squared distance function in x86 compiler intrinsics
// (C) 2007-2008 Christoph Lampert <[email protected]>
#include <stdio.h>
#include <limits.h>
#include <float.h> // for FLT_MIN
/* We calculate chi2=(a-b)**2/(a+b+DBL_MIN) to avoid division by zero:
   if a+b != 0, then (a+b+DBL_MIN)==(a+b) and nothing changes;
   if a+b == 0, then the numerator is 0 as well, and we don't divide by 0.
*/
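/* Worked example (illustrative, not part of the original source): for the
   normalized histograms x = (1, 0) and y = (0, 1),
   chi2 = (1-0)^2/(1+0) + (0-1)^2/(0+1) = 1 + 1 = 2,
   the maximum possible value; identical histograms give chi2 = 0. */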
/* Using compiler intrinsics (for SSE >=2) can have a huge speedup effect:
8x for float and 3.5x for double on Intel Core2.
You have to compile with the right CPU setting, e.g. gcc -march=k8 or -march=nocona */
#ifdef __SSE2__
#include <emmintrin.h> // for float
#endif
/* OpenMP allows achieving an almost linear speedup on multi-core CPUs: use gcc-4.2 -fopenmp */
#ifdef _OPENMP
#include <omp.h>
#endif
static inline double chi2_baseline_double(const int n, const double* const x, const double* const y) {
double result = 0.;
int i;
for (i=0; i<n; i++) {
const double num = x[i]-y[i];
const double denom = 1./(x[i]+y[i]+DBL_MIN);
result += num*num*denom;
}
return result;
}
#ifdef __SSE2__
/* use compiler intrinsics for 2x parallel processing */
static inline double chi2_intrinsic_double(int n, const double* x, const double* y) {
double result=0;
const __m128d eps = _mm_set1_pd(DBL_MIN);
const __m128d zero = _mm_setzero_pd();
__m128d chi2 = _mm_setzero_pd();
for ( ; n>1; n-=2) {
const __m128d a = _mm_loadu_pd(x);
const __m128d b = _mm_loadu_pd(y);
x+=2;
y+=2;
const __m128d a_plus_b = _mm_add_pd(a,b);
const __m128d a_plus_b_plus_eps = _mm_add_pd(a_plus_b,eps);
const __m128d a_minus_b = _mm_sub_pd(a,b);
const __m128d a_minus_b_sq = _mm_mul_pd(a_minus_b, a_minus_b);
const __m128d quotient = _mm_div_pd(a_minus_b_sq, a_plus_b_plus_eps);
chi2 = _mm_add_pd(chi2, quotient);
}
const __m128d shuffle = _mm_shuffle_pd(chi2, chi2, _MM_SHUFFLE2(0,1));
const __m128d sum = _mm_add_pd(chi2, shuffle);
// with SSE3, we could use hadd_pd, but the difference is negligible
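// SSE3 variant mentioned above (a sketch; would require <pmmintrin.h>):
//   const __m128d sum = _mm_hadd_pd(chi2, chi2);  // both lanes hold the total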
_mm_store_sd(&result,sum);
_mm_empty();
if (n)
result += chi2_baseline_double(n, x, y); // remaining entries
return result;
}
#endif
/* calculate the chi2-distance between two vectors/histograms */
double chi2_double(const int dim, const double* const x, const double* const y) {
double (*chi2_double)(const int, const double*, const double*) = chi2_baseline_double;
#ifdef __SSE2__
chi2_double = chi2_intrinsic_double;
#endif
return chi2_double(dim, x, y);
}
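/* Usage sketch (illustrative values, not from the original source):
   double x[3] = {0.2, 0.3, 0.5}, y[3] = {0.3, 0.3, 0.4};
   double d = chi2_double(3, x, y);  // 0.01/0.5 + 0 + 0.01/0.9 ~= 0.0311
*/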
/* calculate the symmetric chi2-measure within one set of vectors/histograms */
double chi2sym_distance_double(const int dim, const int nx, const double* const x,
double* const K) {
double (*chi2_double)(const int, const double*, const double*) = chi2_baseline_double;
#ifdef __SSE2__
chi2_double = chi2_intrinsic_double;
#endif
double sumK=0.;
#pragma omp parallel
{
int i,j;
#pragma omp for reduction (+:sumK) schedule (dynamic, 2)
for (i=0;i<nx;i++) {
K[i*nx+i]=0.;
for (j=0;j<i;j++) {
const double chi2 = chi2_double(dim, &x[i*dim], &x[j*dim]);
K[i*nx+j] = chi2;
K[j*nx+i] = chi2;
sumK += 2*chi2;
}
}
}
return sumK/((double)(nx*nx));
}
/* calculate the chi2-measure between two sets of vectors/histograms */
double chi2_distance_double(const int dim, const int nx, const double* const x,
const int ny, const double* const y, double* const K) {
double (*chi2_double)(const int, const double*, const double*) = chi2_baseline_double;
#ifdef __SSE2__
chi2_double = chi2_intrinsic_double;
#endif
double sumK=0.;
#pragma omp parallel
{
int i,j;
#pragma omp for reduction (+:sumK)
for (i=0;i<nx;i++)
for (j=0;j<ny;j++) {
const double chi2 = chi2_double(dim, &x[i*dim], &y[j*dim]);
K[i*ny+j] = chi2;
sumK += chi2;
}
}
return sumK/((double)(nx*ny));
}
#ifdef __MAIN__
#include <stdlib.h>
#include <time.h>
int main()
{
const int dim=3000;
const int n1=1000;
const int n2=2000;
int i,j;
/* test calculating a kernel with double entries
double *data1 = (double*)memalign(16,dim*n1*sizeof(double));
double *data2 = (double*)memalign(16,dim*n2*sizeof(double));
double *K = (double*)malloc(n1*n2*sizeof(double));
if ((!data1) || (!data2) || (!K)) {
free(data1);
free(data2);
free(K);
return 1;
}
const clock_t before_init=clock();
for (i=0;i<n1*dim;i++)
data1[i]=1./(double)(i+1.);
for (i=0;i<n2*dim;i++)
data2[i]=1./(double)(i+1.);
const clock_t after_init=clock();
printf("init time: %8.4f\n",(after_init-before_init)*1./CLOCKS_PER_SEC);
const clock_t before_chi2=clock();
const double mean_K = chi2_distance_double(dim, n1, data1, n2, data2, K);
const clock_t after_chi2=clock();
printf("chi2 time: %8.4f\n",(after_chi2-before_chi2)*1./CLOCKS_PER_SEC);
printf("result: %e\n",mean_K);
free(data1);
free(data2);
free(K);
*/
return 0;
}
#endif
// fast chi-squared distance function in x86 compiler intrinsics
// (C) 2007-2008 Christoph Lampert <[email protected]>
#include <stdio.h>
#include <limits.h>
#include <float.h> // for FLT_MIN
/* We calculate chi2=(a-b)**2/(a+b+FLT_MIN) to avoid division by zero:
   if a+b != 0, then (a+b+FLT_MIN)==(a+b) and nothing changes;
   if a+b == 0, then the numerator is 0 as well, and we don't divide by 0.
*/
/* Using SSE compiler intrinsics can have a huge speedup effect:
8x for float and 3.5x for double on Intel Core2.
You have to compile with the right CPU setting, e.g. gcc -march=k8 or -march=nocona */
#ifdef __SSE__
#include <xmmintrin.h> // for float
#endif
/* OpenMP allows achieving an almost linear speedup on multi-core CPUs: use gcc-4.2 -fopenmp */
#ifdef _OPENMP
#include <omp.h>
#endif
static inline float chi2_baseline_float(const int n, const float* x, const float* y) {
float result = 0.f;
int i;
for (i=0; i<n; i++) {
const float num = x[i]-y[i];
const float denom = 1./(x[i]+y[i]+FLT_MIN);
result += num*num*denom;
}
return result;
}
#ifdef __SSE__
/* use compiler intrinsics for 4x parallel processing */
static inline float chi2_intrinsic_float(int n, const float* x, const float* y) {
float result=0;
const __m128 eps = _mm_set1_ps(FLT_MIN);
const __m128 zero = _mm_setzero_ps();
__m128 chi2 = _mm_setzero_ps();
for (; n>3; n-=4) {
const __m128 a = _mm_loadu_ps(x);
const __m128 b = _mm_loadu_ps(y);
const __m128 a_plus_eps = _mm_add_ps(a,eps);
const __m128 a_plus_b_plus_eps = _mm_add_ps(a_plus_eps,b);
const __m128 a_minus_b = _mm_sub_ps(a,b);
const __m128 a_minus_b_sq = _mm_mul_ps(a_minus_b, a_minus_b);
const __m128 prod = _mm_div_ps(a_minus_b_sq, a_plus_b_plus_eps);
chi2 = _mm_add_ps(chi2, prod);
x+=4;
y+=4;
}
const __m128 shuffle1 = _mm_shuffle_ps(chi2, chi2, _MM_SHUFFLE(1,0,3,2));
const __m128 sum1 = _mm_add_ps(chi2, shuffle1);
const __m128 shuffle2 = _mm_shuffle_ps(sum1, sum1, _MM_SHUFFLE(2,3,0,1));
const __m128 sum2 = _mm_add_ps(sum1, shuffle2);
// with SSE3, we could use hadd_ps, but the difference is negligible
_mm_store_ss(&result,sum2);
_mm_empty();
if (n)
result += chi2_baseline_float(n, x, y); // remaining 1-3 entries
return result;
}
#endif
/* calculate the chi2-distance between two vectors/histograms */
float chi2_float(const int dim, const float* const x, const float* const y) {
float (*chi2_float)(const int, const float*, const float*) = chi2_baseline_float;
#ifdef __SSE__
chi2_float = chi2_intrinsic_float;
#endif
return chi2_float(dim, x, y);
}
/* calculate the symmetric chi2-distance matrix within one set of vectors/histograms */
float chi2sym_distance_float(const int dim, const int nx, const float* const x,
float* const K) {
float (*chi2_float)(const int, const float*, const float*) = chi2_baseline_float;
#ifdef __SSE__
chi2_float = chi2_intrinsic_float;
#endif
float sumK=0.f;
#pragma omp parallel
{
int i,j;
#pragma omp for reduction (+:sumK) schedule (dynamic,2)
for (i=0;i<nx;i++) {
K[i*nx+i]=0.;
for (j=0;j<i;j++) {
const float chi2 = (*chi2_float)(dim, &x[i*dim], &x[j*dim]);
K[i*nx+j] = chi2;
K[j*nx+i] = chi2;
sumK += 2*chi2;
}
}
}
return sumK/((float)(nx*nx));
}
/* calculate the chi2-distance matrix between two sets of vectors/histograms. */
float chi2_distance_float(const int dim, const int nx, const float* const x,
const int ny, const float* const y, float* const K) {
float (*chi2_float)(const int, const float*, const float*) = chi2_baseline_float;
#ifdef __SSE__
chi2_float = chi2_intrinsic_float;
#endif
float sumK=0.f;
#pragma omp parallel
{
int i,j;
#pragma omp for reduction (+:sumK) schedule (dynamic,2)
for (i=0;i<nx;i++) {
for (j=0;j<ny;j++) {
float chi2 = (*chi2_float)(dim, &x[i*dim], &y[j*dim]);
K[i*ny+j] = chi2;
sumK += chi2;
}
}
}
return sumK/((float)(nx*ny));
}
#ifdef __MAIN__
#include <stdlib.h>
#include <time.h>
int main()
{
const int dim=3000;
const int n1=1000;
const int n2=2000;
int i,j;
/* test calculating a kernel with float entries
float *data1 = (float*)memalign(16,dim*n1*sizeof(float));
float *data2 = (float*)memalign(16,dim*n2*sizeof(float));
float *K = (float*)malloc(n1*n2*sizeof(float));
if ((!data1) || (!data2) || (!K)) {
free(data1);
free(data2);
free(K);
return 1;
}
const clock_t before_init=clock();
for (i=0;i<n1*dim;i++)
data1[i]=1./(float)(i+1.);
for (i=0;i<n2*dim;i++)
data2[i]=1./(float)(i+1.);
const clock_t after_init=clock();
printf("init time: %8.4f\n",(after_init-before_init)*1./CLOCKS_PER_SEC);
const clock_t before_chi2=clock();
const float mean_K = chi2_distance_float(dim, n1, data1, n2, data2, K);
const clock_t after_chi2=clock();
printf("chi2 time: %8.4f\n",(after_chi2-before_chi2)*1./CLOCKS_PER_SEC);
printf("result: %e\n",mean_K);
free(data1);
free(data2);
free(K);
*/
return 0;
}
#endif
CC = gcc-4.2
CFLAGS = -O3 -fPIC -march=nocona -ffast-math -fomit-frame-pointer
#-L/home/joao/matlab/bin/glnx86/
#CC = icc
#CFLAGS = -xP -fast -fPIC
OMPFLAGS = -fopenmp
MATLABDIR = /Applications/MATLAB_R2011a.app
INCLUDES = -I$(MATLABDIR)/extern/include
#LDIRS = -L$(MATLABDIR)/bin/glnx86
LDIRS = -L$(MATLABDIR)/bin/maci64
EXE_TARGETS = chi2float chi2double chi2_mex.mexmaci64
#EXE_TARGETS = chi2float chi2double chi2_mex.mexglx
LIB_TARGETS = libchi2.a

all: $(EXE_TARGETS) $(LIB_TARGETS)

chi2float: chi2float.c chi2float.h Makefile
	$(CC) -D__MAIN__ $(CFLAGS) $(OMPFLAGS) -o chi2float chi2float.c

chi2double: chi2double.c chi2double.h Makefile
	$(CC) -D__MAIN__ $(CFLAGS) $(OMPFLAGS) -o chi2double chi2double.c

libchi2.a: chi2double.c chi2double.h chi2float.c chi2float.h Makefile
	$(CC) $(CFLAGS) $(OMPFLAGS) -shared -fPIC chi2double.c chi2float.c -o libchi2.a

chi2double.o: chi2double.c chi2double.h Makefile
	$(CC) -D__MAIN__ $(CFLAGS) -c $(OMPFLAGS) -o chi2double.o chi2double.c

chi2float.o: chi2float.c chi2float.h Makefile
	$(CC) -D__MAIN__ $(CFLAGS) -c $(OMPFLAGS) -o chi2float.o chi2float.c

chi2_mex.o: chi2_mex.c
	$(CC) $(CFLAGS) -c $(INCLUDES) $(OMPFLAGS) -o chi2_mex.o chi2_mex.c

chi2_mex.mexglx: chi2_mex.c chi2_mex.o chi2float.o
	$(CC) $(OMPFLAGS) chi2_mex.o chi2float.o $(LDIRS) $(CFLAGS) -lmx -lmex -shared -o chi2_mex.mexglx

chi2_mex.mexmaci64: chi2_mex.c chi2_mex.o chi2float.o
	$(CC) $(OMPFLAGS) chi2_mex.o chi2float.o $(LDIRS) $(CFLAGS) -lmx -lmex -shared -o chi2_mex.mexmaci64

# the default installation of libomp cannot be opened using dlopen(), as would
# be required e.g. for Python
clean:
	rm -f *.o $(EXE_TARGETS) $(LIB_TARGETS)

timing: $(EXE_TARGETS)
	time ./chi2float
	time ./chi2double
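
A hedged usage sketch for the resulting MEX file (the exact calling convention is defined in chi2_mex.c, which is not included in this gist; this assumes the usual mpi-chi2 interface of single-precision histograms stored column-wise):

X = single(rand(300, 10));   % 10 histograms of dimension 300
Y = single(rand(300, 20));   % 20 histograms of dimension 300
D = chi2_mex(X, Y);          % pairwise chi-squared distance matrix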
Note: put each file in the correct directory (replace the " " in the filenames with "/").
