// Copyright 2014 Hazy Research (http://i.stanford.edu/hazy)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#define NCLASS 2
//#define STEPSIZE 0.0001
//#define STEPSIZE 0.1
float STEPSIZE = 0.0005; // This step size works well for most classes
float LAMBDA = 0.1;
float DECAY = 0.99;
//float STEPSIZE = 0.0005;
//float LAMBDA = 0.3;
//float DECAY = 0.999;
#define BATCHSIZE 1
#define BATCHSIZE_TEST 1
#include <stdio.h>
#include <stdlib.h>
#include <cmath> // exp, log1p
#include <cassert> // assert
#include <iostream>
#include <fstream>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include "dimmwitted.h"
int small_class = -1;
int large_class = -1;
#define LOG_2 0.693147180559945
#define MINUS_LOG_THRESHOLD -18.42
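// LOG_2 is ln(2): when the two arguments to logadd are equal, the result is
// log(2 * exp(a)) = ln(2) + a. MINUS_LOG_THRESHOLD is roughly ln(1e-8): once the
// (negative) difference drops below it, log1p(exp(.)) is negligible in float precision.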
// Dense dot product of two length-N float vectors. The main loop is unrolled
// 8 ways into independent accumulators; the first (N % 8) elements are
// handled by a scalar tail loop.
float dot_dense(const float * const x,
const float * const y,
int N){
const int remainder = N % 8;
float rs[8] = {0,0,0,0,0,0,0,0};
for(int i=remainder;i<N;i+=8){
rs[0] += x[i] * y[i];
rs[1] += x[i+1] * y[i+1];
rs[2] += x[i+2] * y[i+2];
rs[3] += x[i+3] * y[i+3];
rs[4] += x[i+4] * y[i+4];
rs[5] += x[i+5] * y[i+5];
rs[6] += x[i+6] * y[i+6];
rs[7] += x[i+7] * y[i+7];
}
float toreturn = (rs[0]+rs[1]+rs[2]+rs[3]+rs[4]+rs[5]+rs[6]+rs[7]);
for(int i=0;i<remainder;i++){
toreturn += x[i] * y[i];
}
return toreturn;
}
inline bool fast_exact_is_equal(float a, float b){
return (a <= b && b <= a);
}
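// logadd returns log(exp(log_a) + exp(log_b)) without leaving log space,
// using the identity log(e^a + e^b) = a + log1p(e^(b - a)) with a >= b,
// so exp() is only ever evaluated on a non-positive argument.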
inline float logadd(float log_a, float log_b){
if (log_a < log_b)
{ // swap them
float tmp = log_a;
log_a = log_b;
log_b = tmp;
} else if (fast_exact_is_equal(log_a, log_b)) {
// Special case when log_a == log_b. In particular this works when both
// log_a and log_b are (+-) INFINITY: it will return (+-) INFINITY
// instead of NaN.
return LOG_2 + log_a;
}
float negative_absolute_difference = log_b - log_a;
if (negative_absolute_difference < MINUS_LOG_THRESHOLD)
return (log_a);
return (log_a + log1p(exp(negative_absolute_difference)));
}
/**
* \brief A model object. This model contains
* two elements: (1) p: the pointer to the
* parameters, and (2) n: the number of
* parameters that this model contains.
*
* Note that, to use the PerNode and PerCore
* strategies, this object needs to have a copy
* constructor.
*
*/
class GLMModelExample{
public:
float * const p;
int n;
GLMModelExample(int _n):
n(_n), p(new float[_n]){}
GLMModelExample( const GLMModelExample& other ) :
n(other.n), p(new float[other.n]){
for(int i=0;i<n;i++){
p[i] = other.p[i];
}
}
};
/**
* \brief The function that takes as input a set of models
* and updates one of them according to the others. You
* need to register this function if you want to use the
* PerNode or PerCore strategy. In this example it is left
* as a stub (assert(false)) because main() runs with the
* PerMachine replication strategy, which does not trigger
* model averaging.
*/
void f_lr_modelavg(GLMModelExample** const p_models, /**< set of models*/
int nreplicas, /**< number of models in the above set */
int ireplica /**< id of the model that needs updates*/
){
assert(false);
}
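/**
* \brief Top-1 accuracy on one test row: score each class with a dot product,
* normalize with a softmax in log space, and count a hit when the arg-max
* class (mapped back to small_class / large_class) equals the row's label.
*/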
double f_lr_top1acc(const DenseVector<float>* const ex, GLMModelExample* const p_model){
double rs = 0.0;
for(int ibatch=0;ibatch<BATCHSIZE_TEST;ibatch++){
float * model = p_model->p;
int nfeature = ex->n-1;
int label = ex->p[ibatch*(nfeature+1)];
//if(label != 0 && label != 1){
// continue;
//}
//assert(label == small_class || label == large_class);
float * features = &ex->p[ibatch*(nfeature+1) + 1];
float logprobs[NCLASS];
float dot = 0.0;
float logsum = -100000.0; // acts as log(0) for the running logadd
float max = -1000;
int imax = -1;
for(int iclass = 0; iclass < NCLASS ; iclass ++){
float * mi = &model[iclass * nfeature];
dot = dot_dense(features, mi, nfeature);
//std::cout << dot << std::endl;
//for(int i=0;i<nfeature;i++){
// dot += features[i] * mi[i];
//}
logprobs[iclass] = dot;
logsum = logadd(logsum, logprobs[iclass]);
}
for(int iclass = 0; iclass < NCLASS ; iclass ++){
float prob = exp(logprobs[iclass] - logsum);
if(prob >= max){
max = prob;
imax = iclass;
}
}
if(imax == 0) imax = small_class;
else if(imax == 1) imax = large_class;
std::cout << "RESULT " << label << " -> " << imax << std::endl;
if(imax == label){
rs += 1.0;
}else{
rs += 0.0;
}
}
return rs;
}
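/**
* \brief Top-5 accuracy on one test row. With NCLASS == 2 the five rounds of
* "pick the best remaining class" always reach the true label, so this metric
* is only informative once NCLASS is raised above 5.
*/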
double f_lr_top5acc(const DenseVector<float>* const ex, GLMModelExample* const p_model){
double rs = 0.0;
for(int ibatch=0;ibatch<BATCHSIZE_TEST;ibatch++){
float * model = p_model->p;
int nfeature = ex->n-1;
int label = ex->p[ibatch*(nfeature+1)];
if(label != 0 && label != 1){
continue;
}
float * features = &ex->p[ibatch*(nfeature+1) + 1];
float logprobs[NCLASS];
float dot = 0.0;
float logsum = -100000.0; // acts as log(0) for the running logadd
for(int iclass = 0; iclass < NCLASS ; iclass ++){
float * mi = &model[iclass * nfeature];
dot = dot_dense(features, mi, nfeature);
//std::cout << dot << std::endl;
//for(int i=0;i<nfeature;i++){
// dot += features[i] * mi[i];
//}
logprobs[iclass] = dot;
logsum = logadd(logsum, logprobs[iclass]);
}
for(int i=0;i<5;i++){
float max = -1000;
int imax = -1;
for(int iclass = 0; iclass < NCLASS ; iclass ++){
float prob = exp(logprobs[iclass] - logsum);
if(prob >= max){
max = prob;
imax = iclass;
}
}
//std::cout << imax << std::endl;
if(imax == label){
rs += 1.0;
break;
}
logprobs[imax] = -10000;
}
}
return rs;
}
/**
* \brief One example of a function that can be registered for
* row-wise access (DW_ROW). This function takes as input
* one row of the data (ex) and the current model, and
* returns the loss on that row.
*/
double f_lr_loss(const DenseVector<float>* const ex, GLMModelExample* const p_model){
float * model = p_model->p;
int label = ex->p[0];
float * features = &ex->p[1];
int nfeature = ex->n-1;
float logprobs[NCLASS];
//std::cout << label << std::endl;
float dot = 0.0;
float logsum = -100000.0; // acts as log(0) for the running logadd
for(int iclass = 0; iclass < NCLASS ; iclass ++){
float * mi = &model[iclass * nfeature];
dot = 0.0;
dot = dot_dense(features, mi, nfeature);
//std::cout << dot << std::endl;
//for(int i=0;i<nfeature;i++){
// dot += features[i] * mi[i];
//}
//std::cout << dot << std::endl;
logprobs[iclass] = dot;
logsum = logadd(logsum, logprobs[iclass]);
}
float logprob = logprobs[label] - logsum;
return -logprob;
}
/**
* \brief One example of a function that can be registered for
* row-wise access (DW_ROW). This function takes as input
* one row of the data (ex) and the current model (p_model),
* and updates the model with one SGD step on the gradient.
*
*/
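// For multinomial logistic regression with L2 regularization, the per-class
// gradient of the negative log-likelihood for one example (x, y) is
//   dL/dw_c = (softmax_c(w . x) - 1[c == y]) * x + LAMBDA * w_c,
// so the SGD step below applies
//   w_c += STEPSIZE * ((1[c == y] - softmax_c) * x - LAMBDA * w_c),
// with class indices 0/1 remapped to small_class / large_class when comparing to y.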
double f_lr_grad(const DenseVector<float>* const ex, GLMModelExample* const p_model){
float logprobs[BATCHSIZE][NCLASS];
float logsums[BATCHSIZE];
int nfeature = ex->n-1;
float oldloss = 0.0;
for(int ibatch=0;ibatch<BATCHSIZE;ibatch++){
float * model = p_model->p;
int label = ex->p[ibatch*(nfeature+1)];
//if(label != 0 && label != 1){
// continue;
//}
assert(label == small_class || label == large_class);
float * features = &ex->p[ibatch*(nfeature+1) + 1];
float dot = 0.0;
logsums[ibatch] = -100000.0;
for(int iclass = 0; iclass < NCLASS ; iclass ++){
float * mi = &model[iclass * nfeature];
dot = 0.0;
//for(int i=0;i<nfeature;i++){
// dot += features[i] * mi[i];
//}
dot = dot_dense(features, mi, nfeature);
logprobs[ibatch][iclass] = dot;
logsums[ibatch] = logadd(logsums[ibatch], logprobs[ibatch][iclass]);
//std::cout << dot << " " << logsums[ibatch] << std::endl;
}
float max = -100000;
int imax = -1;
for(int iclass = 0; iclass < NCLASS ; iclass ++){
float prob = exp(logprobs[ibatch][iclass] - logsums[ibatch]);
if(prob >= max){
max = prob;
imax = iclass;
}
}
if(imax == 0) imax = small_class;
else if(imax == 1) imax = large_class;
//oldloss += -(logprobs[ibatch][label] - logsums[ibatch]);
oldloss += (imax != label);
}
for(int ibatch=0;ibatch<BATCHSIZE;ibatch++){
for(int iclass = 0; iclass < NCLASS ; iclass ++){
float * model = p_model->p;
float * mi = &model[iclass * nfeature];
int label = ex->p[ibatch*(nfeature+1)];
float isequal;// = (iclass == label);
if(iclass == 0){
isequal = (small_class == label);
}else{
isequal = (large_class == label);
}
int nfeature = ex->n-1;
float * features = &ex->p[ibatch*(nfeature+1) + 1];
float prob_over_sum = exp(logprobs[ibatch][iclass] - logsums[ibatch]);
//std::cout << label << " " << features[0] << " " << features[1] << " " << features[2] << std::endl;
for(int i=0;i<nfeature;i++){
mi[i] += STEPSIZE * (isequal * features[i] - features[i] * prob_over_sum - LAMBDA * mi[i]);
}
}
}
return oldloss;
}
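/**
* \brief Same SGD update as f_lr_grad, specialized to a single example per
* row (BATCHSIZE == 1), so all the batch indexing collapses to index 0.
*/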
double f_lr_grad_no_batch(const DenseVector<float>* const ex, GLMModelExample* const p_model){
float logprobs[BATCHSIZE][NCLASS];
float logsums[BATCHSIZE];
int nfeature = ex->n-1;
float oldloss = 0.0;
float dot;
float * model = p_model->p;
int label = ex->p[0];
//std::cout << label << std::endl;
assert(label == small_class || label == large_class);
float * features = &ex->p[1];
logsums[0] = -100000.0; // initialize the running logadd; this was read uninitialized before
for(int iclass = 0; iclass < NCLASS ; iclass ++){
float * mi = &model[iclass * nfeature];
dot = 0.0;
dot = dot_dense(features, mi, nfeature);
logprobs[0][iclass] = dot;
logsums[0] = logadd(logsums[0], logprobs[0][iclass]);
}
float max = -100000;
int imax = -1;
for(int iclass = 0; iclass < NCLASS ; iclass ++){
float prob = exp(logprobs[0][iclass] - logsums[0]);
if(prob >= max){
max = prob;
imax = iclass;
}
}
if(imax == 0) imax = small_class;
else if(imax == 1) imax = large_class;
oldloss += (imax != label);
for(int iclass = 0; iclass < NCLASS ; iclass ++){
float * model = p_model->p;
float * mi = &model[iclass * nfeature];
int label = ex->p[0];
float isequal;
if(iclass == 0){
isequal = (small_class == label);
}else{
isequal = (large_class == label);
}
int nfeature = ex->n-1;
float * features = &ex->p[1];
float prob_over_sum = exp(logprobs[0][iclass] - logsums[0]);
for(int i=0;i<nfeature;i++){
mi[i] += STEPSIZE * (isequal * features[i] - features[i] * prob_over_sum - LAMBDA * mi[i]);
}
}
return oldloss;
}
/**
* \brief Older variant of f_lr_grad that assumes labels are already encoded
* as 0/1 class indices (rows with other labels are skipped) rather than
* being remapped through small_class / large_class.
*
*/
double f_lr_grad_old(const DenseVector<float>* const ex, GLMModelExample* const p_model){
float logprobs[BATCHSIZE][NCLASS];
float logsums[BATCHSIZE];
int nfeature = ex->n-1;
float oldloss = 0.0;
for(int ibatch=0;ibatch<BATCHSIZE;ibatch++){
float * model = p_model->p;
int label = ex->p[ibatch*(nfeature+1)];
if(label != 0 && label != 1){
continue;
}
float * features = &ex->p[ibatch*(nfeature+1) + 1];
float dot = 0.0;
logsums[ibatch] = -100000.0;
for(int iclass = 0; iclass < NCLASS ; iclass ++){
float * mi = &model[iclass * nfeature];
dot = 0.0;
//for(int i=0;i<nfeature;i++){
// dot += features[i] * mi[i];
//}
dot = dot_dense(features, mi, nfeature);
logprobs[ibatch][iclass] = dot;
logsums[ibatch] = logadd(logsums[ibatch], logprobs[ibatch][iclass]);
}
float max = -100000;
int imax = -1;
for(int iclass = 0; iclass < NCLASS ; iclass ++){
float prob = exp(logprobs[ibatch][iclass] - logsums[ibatch]);
if(prob >= max){
max = prob;
imax = iclass;
}
}
//oldloss += -(logprobs[ibatch][label] - logsums[ibatch]);
oldloss += (imax != label);
}
for(int ibatch=0;ibatch<BATCHSIZE;ibatch++){
for(int iclass = 0; iclass < NCLASS ; iclass ++){
float * model = p_model->p;
float * mi = &model[iclass * nfeature];
int label = ex->p[ibatch*(nfeature+1)];
float isequal = (iclass == label);
int nfeature = ex->n-1;
float * features = &ex->p[ibatch*(nfeature+1) + 1];
float prob_over_sum = exp(logprobs[ibatch][iclass] - logsums[ibatch]);
for(int i=0;i<nfeature;i++){
mi[i] += STEPSIZE * (isequal * features[i] - features[i] * prob_over_sum - LAMBDA * mi[i]);
}
}
}
return oldloss;
}
/**
* \brief One example of a main entry point showing how to use DimmWitted.
* The application is Stochastic Gradient Descent (SGD)
* with row-wise access.
*
* \tparam MODELREPL Model replication strategy.
* \tparam DATAREPL Data replication strategy.
*/
template<ModelReplType MODELREPL, DataReplType DATAREPL>
float test_glm_dense_sgd(long nfeat, long nexp, float * content, float * content2, long nexp_test, float * content_test){
long nbatches = 2 * nexp / BATCHSIZE;
std::cout << "NBATCHES = " << nbatches << std::endl;
float ** examples = new float* [nbatches]; // pointers to each row
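// Interleave rows from the two per-class training files: even rows point into
// content (small_class examples), odd rows into content2 (large_class
// examples); each row packs BATCHSIZE consecutive (label, features) records.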
for(long i=0;i<nbatches;i+=2){
examples[i] = &content[i*(nfeat+1)*BATCHSIZE/2];
examples[i+1] = &content2[i*(nfeat+1)*BATCHSIZE/2];
}
float ** examples_test = new float* [nexp_test];
for(int i=0;i<nexp_test;i++){
examples_test[i] = &content_test[i*(nfeat+1)];
}
GLMModelExample model(nfeat * NCLASS); // for each class, there is a vector
for(int i=0;i<model.n;i++){
model.p[i] = 0.0;
}
DenseDimmWitted<float, GLMModelExample, MODELREPL, DATAREPL, DW_ACCESS_ROW>
dw(examples, nbatches, nfeat+1, &model);
DenseDimmWitted<float, GLMModelExample, MODELREPL, DATAREPL, DW_ACCESS_ROW>
dw_test(examples_test, nexp_test, nfeat+1, &model);
unsigned int f_handle_grad = dw.register_row(f_lr_grad);
unsigned int f_handle_loss = dw.register_row(f_lr_loss);
unsigned int f_handle_acc = dw.register_row(f_lr_top1acc);
dw.register_model_avg(f_handle_grad, f_lr_modelavg);
dw.register_model_avg(f_handle_loss, f_lr_modelavg);
dw.register_model_avg(f_handle_acc, f_lr_modelavg);
dw.set_n_thread_per_node(1);
dw.set_n_numa_node(1);
dw_test.set_n_thread_per_node(1);
dw_test.set_n_numa_node(1);
unsigned int f_handle_acc_test = dw_test.register_row(f_lr_top1acc);
dw_test.register_model_avg(f_handle_acc_test, f_lr_modelavg);
unsigned int f_handle_acc5_test = dw_test.register_row(f_lr_top5acc);
dw_test.register_model_avg(f_handle_acc5_test, f_lr_modelavg);
float sum = 0.0;
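// Training loop: each epoch runs one SGD pass over the training rows, reports
// elapsed time and scan throughput, evaluates top-1 test accuracy every 10
// epochs, decays the step size, and stops early once the number of training
// misclassifications (oldloss) drops below 50.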
for(int i_epoch=0;i_epoch<5000;i_epoch++){
/*
if(i_epoch % 5 == 0){
//float loss = dw.exec(f_handle_loss)/nexp;
//float acctop1 = dw.exec(f_handle_acc)/nexp;
float acctop1_test = dw_test.exec(f_handle_acc_test)/nexp_test;
float acctop5_test = dw_test.exec(f_handle_acc5_test)/nexp_test;
std::cout << "loss=" << loss << " acc(top1-test)=" << acctop1_test << " acc(top5-test)=" << acctop5_test << std::endl;
}else{
float acctop1_test = dw_test.exec(f_handle_acc_test)/nexp_test;
float acctop5_test = dw_test.exec(f_handle_acc5_test)/nexp_test;
std::cout << "------------- acc(top1-test)=" << acctop1_test << " acc(top5-test)=" << acctop5_test << std::endl;
}
*/
std::cout << "EPOCH: " << i_epoch << " STEPSIZE " << STEPSIZE << std::endl;
Timer t;
float oldloss = dw.exec(f_handle_grad);
float data_byte = 1.0 * sizeof(float) * 2*nexp * nfeat;
float te = t.elapsed();
float throughput_gb = data_byte / te / 1024 / 1024 / 1024;
//float acctop1 = dw.exec(f_handle_acc)/nexp/2;
float acctop1_test;
if(i_epoch % 10 == 0){
acctop1_test = dw_test.exec(f_handle_acc_test)/nexp_test;
}else{
acctop1_test = -1;
}
//float acctop5_test = dw_test.exec(f_handle_acc5_test)/nexp_test;
//std::cout << "old loss=" << oldloss << " acc(train)=" << acctop1 << " acc(top1-test)=" << acctop1_test << " acc(top5-test)=" << acctop5_test << std::endl;
std::cout << "old loss=" << oldloss << " acc(top1-test)=" << acctop1_test << std::endl;
std::cout << "TIME=" << te << " secs" << " THROUGHPUT=" << throughput_gb << " GB/sec." << std::endl;
//std::ofstream fout((std::string("snapshot_") + std::to_string(i_epoch)).c_str());
//for(long i=0;i<nfeat * NCLASS;i++){
// fout << model.p[i] << std::endl;
//}
//fout.close();
STEPSIZE = STEPSIZE * DECAY;
if(oldloss < 50){
break;
}
}
dw_test.exec(f_handle_acc_test);
// NOTE: sum is never accumulated in this variant, so the return value is
// always 0.0; the interesting output is the accuracy and throughput logged
// each epoch.
return sum;
}
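/**
* \brief Memory-maps a binary file of packed (label, features) float records
* and reports the number of records in nelem. Kept for reference: the current
* main() reads the files with fread instead (the mmap_open calls are commented out).
*/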
float * mmap_open(int nfeature, char * filename, long &nelem){
int fdin;
float * src;
struct stat statbuf;
fdin = open (filename, O_RDONLY);
fstat (fdin,&statbuf);
nelem = statbuf.st_size / sizeof(float) / (nfeature + 1);
src = (float *) mmap (0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0);
return src;
}
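// Expected invocation (the binary name glm_sgd is illustrative, not fixed):
//   ./glm_sgd <nfeature> <train_class0.bin> <train_class1.bin> <test.bin>
// Each .bin file holds consecutive records of (nfeature + 1) floats: the label
// followed by the dense feature vector. train_nelem and test_nelem below are
// hard-coded to 1250 and 100 records respectively.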
int main(int argc, char** argv){
int nfeature = atoi(argv[1]);
char * train_filename = argv[2];
long train_nelem = 1250;
float * train;
char * train_filename2 = argv[3];
float * train2;
char * test_filename = argv[4];
long test_nelem = 100;
float * test;
//train = mmap_open(nfeature, train_filename, train_nelem);
//train2 = mmap_open(nfeature, train_filename2, train_nelem);
/*
for(long i=0;i<train_nelem;i++){
assert(train[i*(nfeature+1)] == small_class || train[i*(nfeature+1)] == large_class);
assert(train2[i*(nfeature+1)] == small_class || train2[i*(nfeature+1)] == large_class);
}
*/
/*
for(long i=0;i<train_nelem;i++){
std::cout << i*(nfeature+1) << std::endl;
train[i*(nfeature+1)] = 0;
train2[i*(nfeature+1)] = 1;
}
*/
float * train_mm_1 = new float[(nfeature+1)*train_nelem];
float * train_mm_2 = new float[(nfeature+1)*train_nelem];
float * test_mm = new float[(nfeature+1)*test_nelem];
FILE * f1 = fopen(train_filename, "rb");
FILE * f2 = fopen(train_filenme2, "rb");
FILE * f3 = fopen(test_filename, "rb");
//std::ifstream fin1("db_NoPreprocess_train1250_class_10_4_features_plain.txt");
//std::ifstream fin2("db_NoPreprocess_val50_class_10_4_features_plain.txt");
std::cout << (nfeature+1)*train_nelem << std::endl;
std::cout << (nfeature+1)*test_nelem << std::endl;
/*
float tmpp;
for(long i=0;i<train_nelem;i++){
fin1 >> train_mm_1[i*(nfeature+1)] >> tmpp;
for(long j=0;j<nfeature;j++) fin1 >> train_mm_1[i*(nfeature+1)+j+1];
}
for(long i=0;i<train_nelem;i++){
fin1 >> train_mm_2[i*(nfeature+1)] >> tmpp;
for(long j=0;j<nfeature;j++) fin1 >> train_mm_2[i*(nfeature+1)+j+1];
}
for(long i=0;i<test_nelem;i++){
fin2 >> test_mm[i*(nfeature+1)] >> tmpp;
for(long j=0;j<nfeature;j++) fin2 >> test_mm[i*(nfeature+1)+j+1];
}
*/
long a = fread((char*) train_mm_1, sizeof(float), (nfeature+1)*train_nelem, f1);
long b = fread((char*) train_mm_2, sizeof(float), (nfeature+1)*train_nelem, f2);
long c = fread((char*) test_mm, sizeof(float), (nfeature+1)*test_nelem, f3);
std::cout << a << std::endl;
std::cout << c << std::endl;
assert(a == (nfeature+1)*train_nelem);
assert(b == (nfeature+1)*train_nelem);
assert(c == (nfeature+1)*test_nelem);
fclose(f1);
fclose(f2);
fclose(f3);
//small_class = train_mm_1[0] < train_mm_2[0] ? train_mm_1[0] : train_mm_2[0];
//large_class = train_mm_1[0] < train_mm_2[0] ? train_mm_2[0] : train_mm_1[0];
small_class = 0;
large_class = 1;
const long NN = train_nelem*(nfeature+1);
//munmap(train, train_nelem*(nfeature+1)*sizeof(float));
//munmap(train2, train_nelem*(nfeature+1)*sizeof(float));
/*
for(long i=0;i<train_nelem;i++){
train_mm_1[i*(nfeature+1)] = 0;
train_mm_2[i*(nfeature+1)] = 1;
}
*/
//test = mmap_open(nfeature, test_filename, test_nelem);
std::cout << "NEW" << std::endl;
//train_nelem = 2600;
std::cout << "# examples = " << train_nelem << std::endl;
std::cout << "# examples (test) = " << test_nelem << std::endl;
std::cout << "# features = " << nfeature << std::endl;
std::cout << "# nclass = " << NCLASS << std::endl;
std::cout << "# small_class = " << small_class << std::endl;
std::cout << "# large_class = " << large_class << std::endl;
float rs = test_glm_dense_sgd<DW_MODELREPL_PERMACHINE, DW_DATAREPL_SHARDING>(
nfeature, train_nelem, train_mm_1, train_mm_2, test_nelem, test_mm);
return 0;
}