support weighted examples

root · root · commit 461ef32627aa · 2016-02-02T23:07:51.000Z
fix the loss reduction from weighted ranking
diff --git a/Config.cpp b/Config.cpp
@@ -44,13 +44,17 @@ bool Config::readConfig(const std::string& fileName) {
     cmpIdx_ = (it != cfg.items().end())
       ? columnIdx[it->second.asString()] : -1;
 
+    it = cfg.find("weight_column");
+    weightIdx_ = (it != cfg.items().end())
+      ? columnIdx[it->second.asString()] : -1;
+
     it = cfg.find("loss_function");
     if (it != cfg.items().end() && it->second.asString() == "logistic") {
       lossFunction_ = L2Logistic;
     } else {
       lossFunction_ = L2Regression;
     }
-    
+
     const dynamic& trainColumns = cfg["train_columns"];
     for (auto it = trainColumns.begin(); it != trainColumns.end(); ++it) {
       trainIdx_.push_back(columnIdx.at(it->asString()));
diff --git a/Config.h b/Config.h
@@ -10,7 +10,7 @@ enum LossFunction {
   L2Regression = 0,
   L2Logistic   = 1
 };
- 
+
 // Specifying the training parameters and data format
 struct Config {
 
@@ -46,6 +46,10 @@ struct Config {
     return targetIdx_;
   }
 
+  int getWeightIdx() const {
+    return weightIdx_;
+  }
+
   int getCompareIdx() const {
     return cmpIdx_;
   }
@@ -93,8 +97,10 @@ struct Config {
 
   int targetIdx_;
   int cmpIdx_;
+  int weightIdx_;
+
   LossFunction lossFunction_;
-  
+
   std::vector<int> trainIdx_;
   std::vector<int> weakIdx_;
   std::vector<int> evalIdx_;
diff --git a/DataSet.cpp b/DataSet.cpp
@@ -25,6 +25,10 @@ DataSet::DataSet(const Config& cfg, int bucketingThresh, int examplesThresh)
     features_[i].fvec.reset(new vector<double>());
     features_[i].encoding = DOUBLE;
   }
+
+  if (cfg_.getWeightIdx() != -1) {
+    weights_.reset(new vector<double>());
+  }
 }
 
 bool DataSet::getEvalColumns(const std::string& line,
@@ -41,6 +45,7 @@ bool DataSet::getEvalColumns(const std::string& line,
 
 bool DataSet::getRow(const string& line, double* target,
                      boost::scoped_array<double>& fvec,
+                     double* weight,
                      double* cmpValue) const {
   try {
     vector<folly::StringPiece> sv;
@@ -64,7 +69,9 @@ bool DataSet::getRow(const string& line, double* target,
     if (cfg_.getCompareIdx() != -1 && cmpValue != NULL) {
       *cmpValue = atof(sv[cfg_.getCompareIdx()].toString().c_str());
     }
-
+    if (cfg_.getWeightIdx() != -1 && weight != NULL) {
+      *weight = atof(sv[cfg_.getWeightIdx()].toString().c_str());
+    }
   } catch (...) {
     LOG(ERROR) << "fail to process line: " << line;
     return false;
@@ -100,7 +107,7 @@ double DataSet::getPrediction(TreeNode<uint16_t>* rt, int eid) const {
 }
 
 bool DataSet::addVector(const boost::scoped_array<double>& fvec,
-                        double target) {
+                        double target, double* weight) {
   if (examplesThresh_ != -1 && numExamples_ > examplesThresh_) {
     return false;
   }
@@ -128,6 +135,10 @@ bool DataSet::addVector(const boost::scoped_array<double>& fvec,
       }
     }
   }
+  if (weights_) {
+    weights_->push_back(*weight);
+  }
+
   targets_.push_back(target);
   numExamples_++;
 
diff --git a/DataSet.h b/DataSet.h
@@ -49,11 +49,13 @@ class DataSet {
  public:
   DataSet(const Config& cfg, int bucketingThresh, int examplesThresh=-1);
 
-  bool addVector(const boost::scoped_array<double>& fvec, double target);
+  bool addVector(const boost::scoped_array<double>& fvec,
+                 double target, double* weight = NULL);
 
   bool getRow(const std::string& line,
               double* target,
               boost::scoped_array<double>& fvec,
+              double* weight = NULL,
               double* cmpValue = NULL) const;
 
   bool getEvalColumns(const std::string& line,
@@ -63,6 +65,10 @@ class DataSet {
     return numExamples_;
   }
 
+  const std::unique_ptr<std::vector<double>>& getWeights() const {
+    return weights_;
+  }
+
   void getFeatureVec(const int eid, boost::scoped_array<uint16_t>& fvec) const {
     for (int i = 0; i < numFeatures_; i++) {
       if (features_[i].encoding == EMPTY) {
@@ -103,6 +109,7 @@ class DataSet {
 
   boost::scoped_array<FeatureData> features_;
   std::vector<double> targets_;
+  std::unique_ptr<std::vector<double>> weights_;
 
   friend class TreeRegressor;
   friend class Gbm;
@@ -126,4 +133,3 @@ template<class T> void split(const std::vector<int>& subset,
 }
 
 }
-
diff --git a/Gbm.cpp b/Gbm.cpp
@@ -49,7 +49,8 @@ class ParallelEval : public apache::thrift::concurrency::Runnable {
         //double score = weakModel_->eval(fvec);
         double score = ds_.getPrediction(weakModel_.get(), i);
         F_[i] += score;
-        subLoss_[workIdx_] += fun_.getExampleLoss(targets_[i], F_[i]);
+        const double* wt = ds_.getWeights() ? &((*ds_.getWeights())[i]) : NULL;
+        subLoss_[workIdx_] += fun_.getExampleLoss(targets_[i], F_[i], wt);
       }
     }
     monitor_.decrement();
@@ -78,14 +79,14 @@ void Gbm::getModel(
   boost::scoped_array<double> F(new double[numExamples]);
   boost::scoped_array<double> y(new double[numExamples]);
 
-  double f0 = fun_.getF0(ds_.targets_);
+  double f0 = fun_.getF0(ds_.targets_, ds_.getWeights().get());
   for (int i = 0; i < numExamples; i++) {
     F[i] = f0;
   }
 
   model->push_back(new LeafNode<double>(f0));
 
-  double initLoss = fun_.getInitLoss(ds_.targets_);
+  double initLoss = fun_.getInitLoss(ds_.targets_, ds_.getWeights().get());
 
   LOG(INFO) << "init avg loss " << initLoss / numExamples;
 
@@ -131,7 +132,8 @@ void Gbm::getModel(
         // double score = weakModel->eval(fvec);
         double score = ds_.getPrediction(weakModel.get(), i);
         F[i] += score;
-        newLoss += fun_.getExampleLoss(ds_.targets_[i], F[i]);
+        const double* wt = ds_.getWeights() ? &((*ds_.getWeights())[i]) : NULL;
+        newLoss += fun_.getExampleLoss(ds_.targets_[i], F[i], wt);
       }
     }
 
diff --git a/GbmFun.h b/GbmFun.h
@@ -12,19 +12,23 @@ namespace boosting {
 class GbmFun {
  public:
   virtual double getLeafVal(const std::vector<int>& subset,
-                            const boost::scoped_array<double>& y) const = 0;
+                            const boost::scoped_array<double>& y,
+                            const std::vector<double>* wts = NULL) const = 0;
 
-  virtual double getF0(const std::vector<double>& y) const = 0;
+  virtual double getF0(const std::vector<double>& y,
+                       const std::vector<double>* wts = NULL) const = 0;
 
   virtual void getGradient(const std::vector<double>& y,
                            const boost::scoped_array<double>& F,
-                           boost::scoped_array<double>& grad) const = 0;
+                           boost::scoped_array<double>& grad,
+                           const std::vector<double>* wts = NULL) const = 0;
 
-  virtual double getInitLoss(const std::vector<double>& y) const = 0;
+  virtual double getInitLoss(const std::vector<double>& y,
+                             const std::vector<double>* wts = NULL) const = 0;
 
-  virtual double getExampleLoss(const double y, const double f) const = 0;
+  virtual double getExampleLoss(const double y, const double f, const double* w = NULL) const = 0;
 
-  virtual void accumulateExampleLoss(const double y, const double f) = 0;
+  virtual void accumulateExampleLoss(const double y, const double f, const double* w = NULL) = 0;
 
   virtual double getReduction() const = 0;
 
@@ -36,63 +40,81 @@ class GbmFun {
 
 class LeastSquareFun : public GbmFun {
  public:
-  LeastSquareFun() : numExamples_(0), sumy_(0.0), sumy2_(0.0), l2_(0.0) {
+  LeastSquareFun() : numExamples_(0), sumy_(0.0), sumy2_(0.0), l2_(0.0), sumw_(0.0) {
   }
 
   double getLeafVal(const std::vector<int>& subset,
-                    const boost::scoped_array<double>& y) const {
-
-    double sum = 0;
+                    const boost::scoped_array<double>& y, const std::vector<double>* wts = NULL) const {
+    double sumwy = 0;
+    double sumw = 0;
     for (const auto& id : subset) {
-      sum += y[id];
+      double w = ((wts != NULL) ? (*wts)[id] : 1.0);
+      sumw += w;
+      sumwy += w * y[id];
     }
-    return sum/subset.size();
+    return sumwy/sumw;
   }
 
-  double getF0(const std::vector<double>& yvec) const {
-    double sum = 0.0;
-    for (const auto& y : yvec) {
-      sum += y;
+  double getF0(const std::vector<double>& yvec, const std::vector<double>* wts = NULL) const {
+    double sumwy = 0;
+    double sumw = 0;
+    for (int i = 0; i < yvec.size(); i++) {
+      double w = ((wts != NULL) ? (*wts)[i] : 1.0);
+      sumw += w;
+      sumwy += w * yvec[i];
     }
-    return sum/yvec.size();
+    return sumwy/sumw;
   }
 
   void getGradient(const std::vector<double>& y,
                    const boost::scoped_array<double>& F,
-                   boost::scoped_array<double>& grad) const {
+                   boost::scoped_array<double>& grad,
+                   const std::vector<double>* wts = NULL) const {
 
     int size = y.size();
 
     for (int i = 0; i < size; i++) {
-      grad[i] = y[i] - F[i];
+      double w = ((wts != NULL) ? (*wts)[i] : 1.0);
+      grad[i] = w * (y[i] - F[i]);
     }
   }
 
-  double getInitLoss(const std::vector<double>& yvec) const {
+  double getInitLoss(const std::vector<double>& yvec,
+                     const std::vector<double>* wts = NULL) const {
+
     double sumy = 0.0;
     double sumy2 = 0.0;
+    double sumw = 0.0;
 
-    for (const auto& y : yvec) {
-      sumy += y;
-      sumy2 += y*y;
+    for (int i = 0; i < yvec.size(); i++) {
+      double w = ((wts != NULL) ? (*wts)[i] : 1.0);
+      double y = yvec[i];
+
+      sumw += w;
+      sumy += w*y;
+      sumy2 += w*y*y;
     }
 
-    return sumy2 - sumy * sumy/yvec.size();
+    return sumy2 - sumy * sumy/sumw;
   }
 
-  double getExampleLoss(const double y, const double f) const {
-    return (y - f) * (y - f);
+  double getExampleLoss(const double y, const double f, const double* wt = NULL) const {
+    double w = ((wt != NULL) ? *wt : 1.0);
+    return w * (y - f) * (y - f);
   }
 
-  void accumulateExampleLoss(const double y, const double f) {
-    sumy_ += y;
+  void accumulateExampleLoss(const double y, const double f, const double *wt = NULL) {
+    double w = (wt != NULL) ? *wt : 1.0;
+    sumy_ += w * y;
     numExamples_ += 1;
-    sumy2_ += y * y;
-    l2_ += getExampleLoss(y, f);
+    sumw_ += w;
+    sumy2_ += w * y * y;
+
+    l2_ += getExampleLoss(y, f, wt);
   }
 
   double getReduction() const {
-    return 1.0 - l2_/(sumy2_ - sumy_ * sumy_/numExamples_);
+    return 1.0 - l2_/(sumy2_ - sumy_ * sumy_/sumw_);
   }
 
   int getNumExamples() const {
@@ -108,6 +130,7 @@ class LeastSquareFun : public GbmFun {
   double sumy_;
   double sumy2_;
   double l2_;
+  double sumw_;
 };
 
 }
diff --git a/LogisticFun.h b/LogisticFun.h
@@ -7,7 +7,8 @@ namespace boosting {
 class LogisticFun : public GbmFun {
  public:
   double getLeafVal(const std::vector<int>& subset,
-		    const boost::scoped_array<double>& y) const {
+		    const boost::scoped_array<double>& y,
+                    const std::vector<double>* wts = NULL) const {
     double wx = 0.0, wy = 0.0;
     for (const auto& id : subset) {
       double yi = y[id];
@@ -17,7 +18,8 @@ class LogisticFun : public GbmFun {
     return wy / wx;
   }
 
-  double getF0(const std::vector<double>& y) const {
+  double getF0(const std::vector<double>& y,
+               const std::vector<double>* wts = NULL) const {
     double sumy = 0.0;
     for (const auto yi  : y) {
       sumy += yi;
@@ -28,14 +30,15 @@ class LogisticFun : public GbmFun {
 
   void getGradient(const std::vector<double>& y,
 		   const boost::scoped_array<double>& F,
-		   boost::scoped_array<double>& grad) const {
+		   boost::scoped_array<double>& grad,
+                   const std::vector<double>* wts = NULL) const {
     int size = y.size();
     for (int i = 0; i < size; i++) {
       grad[i] = 2.0 * y[i]/(1.0 + exp(2.0 * y[i] * F[i]));
     }
   }
 
-  double getInitLoss(const std::vector<double>& y) const {
+  double getInitLoss(const std::vector<double>& y, const std::vector<double>* wts = NULL) const {
     int posCount = 0;
     for (const auto yi : y) {
       if (yi > 0) {
@@ -45,11 +48,11 @@ class LogisticFun : public GbmFun {
     return getEntropy(posCount, y.size()) * y.size();
   }
 
-  double getExampleLoss(const double y, const double f) const {
+  double getExampleLoss(const double y, const double f, const double* w = NULL) const {
     return log(1.0 + exp(-2.0 * y * f));
   }
 
-  void accumulateExampleLoss(const double y, const double f) {
+  void accumulateExampleLoss(const double y, const double f, const double* w = NULL) {
     numExamples_ += 1;
     if (y > 0) {
       posCount_ += 1;
@@ -75,7 +78,7 @@ class LogisticFun : public GbmFun {
     double posProb = double(posCount)/numExamples;
     return -(posProb * log(posProb) + (1 - posProb) * log(1.0 - posProb));
   }
-  
+
   int numExamples_;
   int posCount_;
   double logloss_;
diff --git a/Train.cpp b/Train.cpp
diff --git a/TreeRegressor.cpp b/TreeRegressor.cpp
diff --git a/TreeRegressor.h b/TreeRegressor.h

Original file line number	Diff line number	Diff line change
`@@ -25,6 +25,10 @@ DataSet::DataSet(const Config& cfg, int bucketingThresh, int examplesThresh)`
`25`	`25`	`features_[i].fvec.reset(new vector<double>());`
`26`	`26`	`features_[i].encoding = DOUBLE;`
`27`	`27`	`}`
	`28`	`+`
	`29`	`+ if (cfg_.getWeightIdx() != -1) {`
	`30`	`+ weights_.reset(new vector<double>());`
	`31`	`+ }`
`28`	`32`	`}`
`29`	`33`
`30`	`34`	`bool DataSet::getEvalColumns(const std::string& line,`
`@@ -41,6 +45,7 @@ bool DataSet::getEvalColumns(const std::string& line,`
`41`	`45`
`42`	`46`	`bool DataSet::getRow(const string& line, double* target,`
`43`	`47`	`boost::scoped_array<double>& fvec,`
	`48`	`+ double* weight,`
`44`	`49`	`double* cmpValue) const {`
`45`	`50`	`try {`
`46`	`51`	`vector<folly::StringPiece> sv;`
`@@ -64,7 +69,9 @@ bool DataSet::getRow(const string& line, double* target,`
`64`	`69`	`if (cfg_.getCompareIdx() != -1 && cmpValue != NULL) {`
`65`	`70`	`*cmpValue = atof(sv[cfg_.getCompareIdx()].toString().c_str());`
`66`	`71`	`}`
`67`		`-`
	`72`	`+ if (cfg_.getWeightIdx() != -1 && weight != NULL) {`
	`73`	`+ *weight = atof(sv[cfg_.getWeightIdx()].toString().c_str());`
	`74`	`+ }`
`68`	`75`	`} catch (...) {`
`69`	`76`	`LOG(ERROR) << "fail to process line: " << line;`
`70`	`77`	`return false;`
`@@ -100,7 +107,7 @@ double DataSet::getPrediction(TreeNode<uint16_t>* rt, int eid) const {`
`100`	`107`	`}`
`101`	`108`
`102`	`109`	`bool DataSet::addVector(const boost::scoped_array<double>& fvec,`
`103`		`- double target) {`
	`110`	`+ double target, double* weight) {`
`104`	`111`	`if (examplesThresh_ != -1 && numExamples_ > examplesThresh_) {`
`105`	`112`	`return false;`
`106`	`113`	`}`
`@@ -128,6 +135,10 @@ bool DataSet::addVector(const boost::scoped_array<double>& fvec,`
`128`	`135`	`}`
`129`	`136`	`}`
`130`	`137`	`}`
	`138`	`+ if (weights_) {`
	`139`	`+ weights_->push_back(*weight);`
	`140`	`+ }`
	`141`	`+`
`131`	`142`	`targets_.push_back(target);`
`132`	`143`	`numExamples_++;`
`133`	`144`
Original file line number	Diff line number	Diff line change
`@@ -49,7 +49,8 @@ class ParallelEval : public apache::thrift::concurrency::Runnable {`
`49`	`49`	`//double score = weakModel_->eval(fvec);`
`50`	`50`	`double score = ds_.getPrediction(weakModel_.get(), i);`
`51`	`51`	`F_[i] += score;`
`52`		`- subLoss_[workIdx_] += fun_.getExampleLoss(targets_[i], F_[i]);`
	`52`	`+ const double* wt = ds_.getWeights() ? &((*ds_.getWeights())[i]) : NULL;`
	`53`	`+ subLoss_[workIdx_] += fun_.getExampleLoss(targets_[i], F_[i], wt);`
`53`	`54`	`}`
`54`	`55`	`}`
`55`	`56`	`monitor_.decrement();`
`@@ -78,14 +79,14 @@ void Gbm::getModel(`
`78`	`79`	`boost::scoped_array<double> F(new double[numExamples]);`
`79`	`80`	`boost::scoped_array<double> y(new double[numExamples]);`
`80`	`81`
`81`		`- double f0 = fun_.getF0(ds_.targets_);`
	`82`	`+ double f0 = fun_.getF0(ds_.targets_, ds_.getWeights().get());`
`82`	`83`	`for (int i = 0; i < numExamples; i++) {`
`83`	`84`	`F[i] = f0;`
`84`	`85`	`}`
`85`	`86`
`86`	`87`	`model->push_back(new LeafNode<double>(f0));`
`87`	`88`
`88`		`- double initLoss = fun_.getInitLoss(ds_.targets_);`
	`89`	`+ double initLoss = fun_.getInitLoss(ds_.targets_, ds_.getWeights().get());`
`89`	`90`
`90`	`91`	`LOG(INFO) << "init avg loss " << initLoss / numExamples;`
`91`	`92`
`@@ -131,7 +132,8 @@ void Gbm::getModel(`
`131`	`132`	`// double score = weakModel->eval(fvec);`
`132`	`133`	`double score = ds_.getPrediction(weakModel.get(), i);`
`133`	`134`	`F[i] += score;`
`134`		`- newLoss += fun_.getExampleLoss(ds_.targets_[i], F[i]);`
	`135`	`+ const double* wt = ds_.getWeights() ? &((*ds_.getWeights())[i]) : NULL;`
	`136`	`+ newLoss += fun_.getExampleLoss(ds_.targets_[i], F[i], wt);`
`135`	`137`	`}`
`136`	`138`	`}`
`137`	`139`