diff --git a/Config.cpp b/Config.cpp
index 82d12fc..b5c4d81 100644
--- a/Config.cpp
+++ b/Config.cpp
@@ -44,13 +44,17 @@ bool Config::readConfig(const std::string& fileName) {
   cmpIdx_ = (it != cfg.items().end())
     ? columnIdx[it->second.asString()] : -1;
 
+  it = cfg.find("weight_column");
+  weightIdx_ = (it != cfg.items().end())
+    ? columnIdx[it->second.asString()] : -1;
+
   it = cfg.find("loss_function");
   if (it != cfg.items().end() && it->second.asString() == "logistic") {
     lossFunction_ = L2Logistic;
   } else {
     lossFunction_ = L2Regression;
   }
-
+
   const dynamic& trainColumns = cfg["train_columns"];
   for (auto it = trainColumns.begin(); it != trainColumns.end(); ++it) {
     trainIdx_.push_back(columnIdx.at(it->asString()));
diff --git a/Config.h b/Config.h
index de1229e..54a4d41 100644
--- a/Config.h
+++ b/Config.h
@@ -10,7 +10,7 @@
 enum LossFunction {
   L2Regression = 0,
   L2Logistic = 1
 };
-
+
 // Specifying the training parameters and data format
 struct Config {
@@ -46,6 +46,10 @@ struct Config {
     return targetIdx_;
   }
 
+  int getWeightIdx() const {
+    return weightIdx_;
+  }
+
   int getCompareIdx() const {
     return cmpIdx_;
   }
@@ -93,8 +97,10 @@ struct Config {
   int targetIdx_;
   int cmpIdx_;
 
+  int weightIdx_;
+
   LossFunction lossFunction_;
-
+
   std::vector<int> trainIdx_;
   std::vector<int> weakIdx_;
   std::vector<int> evalIdx_;
diff --git a/DataSet.cpp b/DataSet.cpp
index 149b7ac..bbf7dcf 100644
--- a/DataSet.cpp
+++ b/DataSet.cpp
@@ -25,6 +25,10 @@ DataSet::DataSet(const Config& cfg, int bucketingThresh, int examplesThresh)
     features_[i].fvec.reset(new vector<double>());
     features_[i].encoding = DOUBLE;
   }
+
+  if (cfg_.getWeightIdx() != -1) {
+    weights_.reset(new vector<double>());
+  }
 }
 
 bool DataSet::getEvalColumns(const std::string& line,
@@ -41,6 +45,7 @@
 bool DataSet::getRow(const string& line, double* target,
                      boost::scoped_array<double>& fvec,
+                     double* weight,
                      double* cmpValue) const {
   try {
     vector<folly::StringPiece> sv;
 
@@ -64,7 +69,11 @@ bool DataSet::getRow(const string& line, double* target,
     if (cfg_.getCompareIdx() != -1 && cmpValue != NULL) {
       *cmpValue = atof(sv[cfg_.getCompareIdx()].toString().c_str());
     }
-
+    if (cfg_.getWeightIdx() != -1 && weight != NULL) {
+      *weight = atof(sv[cfg_.getWeightIdx()].toString().c_str());
+    } else if (weight != NULL) {
+      *weight = 1.0;
+    }
   } catch (...) {
     LOG(ERROR) << "fail to process line: " << line;
     return false;
@@ -100,7 +109,7 @@ double DataSet::getPrediction(TreeNode<double>* rt, int eid) const {
 }
 
 bool DataSet::addVector(const boost::scoped_array<double>& fvec,
-                        double target) {
+                        double target, double weight) {
   if (examplesThresh_ != -1 && numExamples_ > examplesThresh_) {
     return false;
   }
@@ -128,6 +137,10 @@ bool DataSet::addVector(const boost::scoped_array<double>& fvec,
       }
     }
   }
+  if (weights_) {
+    weights_->push_back(weight);
+  }
+
   targets_.push_back(target);
   numExamples_++;
 
diff --git a/DataSet.h b/DataSet.h
index d4334bb..6aa032e 100644
--- a/DataSet.h
+++ b/DataSet.h
@@ -49,12 +49,14 @@ class DataSet {
  public:
   DataSet(const Config& cfg, int bucketingThresh, int examplesThresh=-1);
 
-  bool addVector(const boost::scoped_array<double>& fvec, double target);
+  bool addVector(const boost::scoped_array<double>& fvec,
+                 double target, double weight);
 
   bool getRow(const std::string& line, double* target,
               boost::scoped_array<double>& fvec,
-              double* cmpValue = NULL) const;
+              double* weight,
+              double* cmpValue) const;
 
   bool getEvalColumns(const std::string& line,
                       boost::scoped_array<std::string>& feval) const;
@@ -63,6 +65,10 @@ class DataSet {
     return numExamples_;
   }
 
+  const std::unique_ptr<std::vector<double>>& getWeights() const {
+    return weights_;
+  }
+
   void getFeatureVec(const int eid, boost::scoped_array<double>& fvec) const {
     for (int i = 0; i < numFeatures_; i++) {
       if (features_[i].encoding == EMPTY) {
@@ -103,6 +109,7 @@ class DataSet {
   boost::scoped_array<Feature> features_;
   std::vector<double> targets_;
+  std::unique_ptr<std::vector<double>> weights_;
 
   friend class TreeRegressor;
   friend class Gbm;
@@ -126,4 +133,3 @@ template <class T> void split(const std::vector<int>& subset,
   }
 }
 
-
diff --git a/Gbm.cpp b/Gbm.cpp
index 86ecc6a..5be65da 100644
--- a/Gbm.cpp
+++ b/Gbm.cpp
@@ -49,7 +49,8 @@ class ParallelEval : public apache::thrift::concurrency::Runnable {
       //double score = weakModel_->eval(fvec);
       double score = ds_.getPrediction(weakModel_.get(), i);
       F_[i] += score;
-      subLoss_[workIdx_] += fun_.getExampleLoss(targets_[i], F_[i]);
+      const double wt = ds_.getWeights() ? (*(ds_.getWeights()))[i] : 1.0;
+      subLoss_[workIdx_] += fun_.getExampleLoss(targets_[i], F_[i], wt);
     }
   }
   monitor_.decrement();
@@ -78,14 +79,14 @@ void Gbm::getModel(
 
   boost::scoped_array<double> F(new double[numExamples]);
   boost::scoped_array<double> y(new double[numExamples]);
 
-  double f0 = fun_.getF0(ds_.targets_);
+  double f0 = fun_.getF0(ds_.targets_, ds_.getWeights().get());
 
   for (int i = 0; i < numExamples; i++) {
     F[i] = f0;
   }
   model->push_back(new LeafNode<double>(f0));
 
-  double initLoss = fun_.getInitLoss(ds_.targets_);
+  double initLoss = fun_.getInitLoss(ds_.targets_, ds_.getWeights().get());
 
   LOG(INFO) << "init avg loss " << initLoss / numExamples;
@@ -93,7 +94,7 @@ void Gbm::getModel(
 
     LOG(INFO) << "------- iteration " << it << " -------";
 
-    fun_.getGradient(ds_.targets_, F, y);
+    fun_.getGradient(ds_.targets_, F, y, ds_.getWeights().get());
 
     TreeRegressor regressor(ds_, y, fun_);
     std::unique_ptr<TreeNode<double>> weakModel(
@@ -131,7 +132,8 @@ void Gbm::getModel(
         // double score = weakModel->eval(fvec);
         double score = ds_.getPrediction(weakModel.get(), i);
         F[i] += score;
-        newLoss += fun_.getExampleLoss(ds_.targets_[i], F[i]);
+        const double wt = ds_.getWeights() ? (*(ds_.getWeights()))[i] : 1.0;
+        newLoss += fun_.getExampleLoss(ds_.targets_[i], F[i], wt);
       }
     }
 
diff --git a/GbmFun.h b/GbmFun.h
index a02fd2d..0d79baa 100644
--- a/GbmFun.h
+++ b/GbmFun.h
@@ -2,6 +2,7 @@
 
 #include <vector>
 #include <boost/scoped_array.hpp>
+#include "glog/logging.h"
 
 namespace boosting {
 
@@ -12,19 +13,23 @@ namespace boosting {
 class GbmFun {
  public:
   virtual double getLeafVal(const std::vector<int>& subset,
-                            const boost::scoped_array<double>& y) const = 0;
+                            const boost::scoped_array<double>& y,
+                            const std::vector<double>* wts = NULL) const = 0;
 
-  virtual double getF0(const std::vector<double>& y) const = 0;
+  virtual double getF0(const std::vector<double>& y,
+                       const std::vector<double>* wts = NULL) const = 0;
 
   virtual void getGradient(const std::vector<double>& y,
                            const boost::scoped_array<double>& F,
-                           boost::scoped_array<double>& grad) const = 0;
+                           boost::scoped_array<double>& grad,
+                           const std::vector<double>* wts = NULL) const = 0;
 
-  virtual double getInitLoss(const std::vector<double>& y) const = 0;
+  virtual double getInitLoss(const std::vector<double>& y,
+                             const std::vector<double>* wts = NULL) const = 0;
 
-  virtual double getExampleLoss(const double y, const double f) const = 0;
+  virtual double getExampleLoss(const double y, const double f, const double w) const = 0;
 
-  virtual void accumulateExampleLoss(const double y, const double f) = 0;
+  virtual void accumulateExampleLoss(const double y, const double f, const double w) = 0;
 
   virtual double getReduction() const = 0;
 
@@ -36,30 +41,36 @@ class GbmFun {
 class LeastSquareFun : public GbmFun {
  public:
-  LeastSquareFun() : numExamples_(0), sumy_(0.0), sumy2_(0.0), l2_(0.0) {
+  LeastSquareFun() : numExamples_(0), sumy_(0.0), sumy2_(0.0), l2_(0.0), sumw_(0.0) {
   }
 
   double getLeafVal(const std::vector<int>& subset,
-                    const boost::scoped_array<double>& y) const {
-
-    double sum = 0;
+                    const boost::scoped_array<double>& y, const std::vector<double>* wts = NULL) const {
+    double sumwy = 0;
+    double sumw = 0;
     for (const auto& id : subset) {
-      sum += y[id];
+      double w = ((wts != NULL) ? (*wts)[id] : 1.0);
+      sumw += w;
+      sumwy += w * y[id];
     }
-    return sum/subset.size();
+    return sumwy/sumw;
   }
 
-  double getF0(const std::vector<double>& yvec) const {
-    double sum = 0.0;
-    for (const auto& y : yvec) {
-      sum += y;
+  double getF0(const std::vector<double>& yvec, const std::vector<double>* wts = NULL) const {
+    double sumwy = 0;
+    double sumw = 0;
+    for (int i = 0; i < yvec.size(); i++) {
+      double w = ((wts != NULL) ? (*wts)[i] : 1.0);
+      sumw += w;
+      sumwy += w * yvec[i];
    }
-    return sum/yvec.size();
+    return sumwy/sumw;
   }
 
   void getGradient(const std::vector<double>& y,
                    const boost::scoped_array<double>& F,
-                   boost::scoped_array<double>& grad) const {
+                   boost::scoped_array<double>& grad,
+                   const std::vector<double>* wts = NULL) const {
 
     int size = y.size();
 
@@ -68,31 +79,40 @@ class LeastSquareFun : public GbmFun {
     }
   }
 
-  double getInitLoss(const std::vector<double>& yvec) const {
+  double getInitLoss(const std::vector<double>& yvec,
+                     const std::vector<double>* wts = NULL) const {
+
     double sumy = 0.0;
     double sumy2 = 0.0;
+    double sumw = 0.0;
 
-    for (const auto& y : yvec) {
-      sumy += y;
-      sumy2 += y*y;
+    for (int i = 0; i < yvec.size(); i++) {
+      double w = ((wts != NULL) ? (*wts)[i] : 1.0);
+      double y = yvec[i];
+
+      sumw += w;
+      sumy += w*y;
+      sumy2 += w*y*y;
     }
-    return sumy2 - sumy * sumy/yvec.size();
+    return sumy2 - sumy * sumy/sumw;
   }
 
-  double getExampleLoss(const double y, const double f) const {
-    return (y - f) * (y - f);
+  double getExampleLoss(const double y, const double f, const double w) const {
+    return w * (y - f) * (y - f);
   }
 
-  void accumulateExampleLoss(const double y, const double f) {
-    sumy_ += y;
+  void accumulateExampleLoss(const double y, const double f, const double w) {
+    sumy_ += w * y;
     numExamples_ += 1;
-    sumy2_ += y * y;
-    l2_ += getExampleLoss(y, f);
+    sumw_ += w;
+    sumy2_ += w * y * y;
+
+    l2_ += getExampleLoss(y, f, w);
   }
 
   double getReduction() const {
-    return 1.0 - l2_/(sumy2_ - sumy_ * sumy_/numExamples_);
+    return 1.0 - l2_/(sumy2_ - sumy_ * sumy_/sumw_);
   }
 
   int getNumExamples() const {
@@ -108,6 +128,7 @@ class LeastSquareFun : public GbmFun {
   double sumy_;
   double sumy2_;
   double l2_;
+  double sumw_;
 };
 
 }
diff --git a/LogisticFun.h b/LogisticFun.h
index 94b4d00..3420833 100644
--- a/LogisticFun.h
+++ b/LogisticFun.h
@@ -7,7 +7,8 @@ namespace boosting {
 class LogisticFun : public GbmFun {
  public:
   double getLeafVal(const std::vector<int>& subset,
-                    const boost::scoped_array<double>& y) const {
+                    const boost::scoped_array<double>& y,
+                    const std::vector<double>* wts = NULL) const {
     double wx = 0.0, wy = 0.0;
     for (const auto& id : subset) {
       double yi = y[id];
@@ -17,7 +18,8 @@ class LogisticFun : public GbmFun {
     return wy / wx;
   }
 
-  double getF0(const std::vector<double>& y) const {
+  double getF0(const std::vector<double>& y,
+               const std::vector<double>* wts = NULL) const {
     double sumy = 0.0;
     for (const auto yi : y) {
       sumy += yi;
@@ -28,14 +30,15 @@ class LogisticFun : public GbmFun {
 
   void getGradient(const std::vector<double>& y,
                    const boost::scoped_array<double>& F,
-                   boost::scoped_array<double>& grad) const {
+                   boost::scoped_array<double>& grad,
+                   const std::vector<double>* wts = NULL) const {
     int size = y.size();
     for (int i = 0; i < size; i++) {
       grad[i] = 2.0 * y[i]/(1.0 + exp(2.0 * y[i] * F[i]));
     }
   }
 
-  double getInitLoss(const std::vector<double>& y) const {
+  double getInitLoss(const std::vector<double>& y, const std::vector<double>* wts = NULL) const {
     int posCount = 0;
     for (const auto yi : y) {
       if (yi > 0) {
@@ -45,16 +48,16 @@ class LogisticFun : public GbmFun {
     return getEntropy(posCount, y.size()) * y.size();
   }
 
-  double getExampleLoss(const double y, const double f) const {
+  double getExampleLoss(const double y, const double f, const double w) const {
     return log(1.0 + exp(-2.0 * y * f));
   }
 
-  void accumulateExampleLoss(const double y, const double f) {
+  void accumulateExampleLoss(const double y, const double f, const double w) {
     numExamples_ += 1;
     if (y > 0) {
       posCount_ += 1;
     }
-    logloss_ += getExampleLoss(y, f);
+    logloss_ += getExampleLoss(y, f, 1.0);
   }
 
   double getReduction() const {
@@ -75,7 +78,7 @@ class LogisticFun : public GbmFun {
     double posProb = double(posCount)/numExamples;
     return -(posProb * log(posProb) + (1 - posProb) * log(1.0 - posProb));
   }
-
+
   int numExamples_;
   int posCount_;
   double logloss_;
diff --git a/Train.cpp b/Train.cpp
index 0ae31b1..7ab20b3 100644
--- a/Train.cpp
+++ b/Train.cpp
@@ -67,7 +67,11 @@ class DataChunk : public apache::thrift::concurrency::Runnable {
   DataChunk(const Config& cfg,
             const DataSet& dataSet,
             CounterMonitor* monitorPtr = NULL) :
-    cfg_(cfg), dataSet_(dataSet), monitorPtr_(monitorPtr) {}
+    cfg_(cfg), dataSet_(dataSet), monitorPtr_(monitorPtr) {
+    if (cfg_.getWeightIdx() != -1) {
+      weights_.reset(new vector<double>());
+    }
+  }
 
   bool addLine(const string& s) {
     if (s.empty()) {
@@ -81,10 +85,14 @@ class DataChunk : public apache::thrift::concurrency::Runnable {
     featureVectors_.reserve(lines_.size());
     targets_.reserve(lines_.size());
     boost::scoped_array<double> farr(new double[cfg_.getNumFeatures()]);
-    double target;
+    double target, weight, cmpValue;
+
     for (const string& line : lines_) {
-      if (dataSet_.getRow(line, &target, farr)) {
+      if (dataSet_.getRow(line, &target, farr, &weight, &cmpValue)) {
         targets_.push_back(target);
+        if (weights_) {
+          weights_->push_back(weight);
+        }
         featureVectors_.emplace_back(farr.get(),
                                      farr.get() + cfg_.getNumFeatures());
       }
@@ -102,10 +110,6 @@ class DataChunk : public apache::thrift::concurrency::Runnable {
     return featureVectors_;
   }
 
-  const vector<double>& getTargets() const {
-    return targets_;
-  }
-
   size_t getLineBufferSize() const {
     return lines_.size();
   }
@@ -121,10 +125,12 @@ class DataChunk : public apache::thrift::concurrency::Runnable {
         << "featureVectors_ and targets_ vectors must be the same size";
     boost::scoped_array<double> farr(new double[cfg_.getNumFeatures()]);
     size_t size = featureVectors_.size();
+
     for (size_t i = 0; i < size; ++i) {
       const auto fvec = featureVectors_[i];
       copy(fvec.begin(), fvec.end(), farr.get());
-      if (!dataSet->addVector(farr, targets_[i])) {
+      double weight = weights_ ? (*weights_)[i] : 1.0;
+      if (!dataSet->addVector(farr, targets_[i], weight)) {
        return i;
       }
     }
@@ -139,7 +145,7 @@ class DataChunk : public apache::thrift::concurrency::Runnable {
   vector<string> lines_;
   vector<vector<double>> featureVectors_;
   vector<double> targets_;
-
+  unique_ptr<vector<double>> weights_;
 };
 
 // Divide training data file's lines into chunks,
@@ -237,6 +243,8 @@ int main(int argc, char **argv) {
   LOG(INFO) << "loading config";
   CHECK(cfg.readConfig(FLAGS_config_file));
 
+  LOG(INFO) << "Examples have weights: " << cfg.getWeightIdx();
+
   unique_ptr<GbmFun> pfun = getGbmFun(cfg.getLossFunction());
   GbmFun& fun = *pfun;
 
@@ -272,6 +280,9 @@ int main(int argc, char **argv) {
   ds.close();
 
+  LOG(INFO) << "weights: " << (*ds.getWeights())[0];
+  LOG(INFO) << "weights 1024: " << (*ds.getWeights())[1024];
+
   // Second, train the models
   Gbm engine(fun, ds, cfg);
   double* fimps = new double[cfg.getNumFeatures()];
@@ -313,7 +324,7 @@ int main(int argc, char **argv) {
   }
 
   // See how well the model performs on testing data
-  double target, score;
+  double target, score, wt, cmpValue;
   boost::scoped_array<double> fvec(new double[cfg.getNumFeatures()]);
   int numEvalColumns = cfg.getEvalIdx().size();
   boost::scoped_array<string> feval(new string[numEvalColumns]);
@@ -338,13 +349,14 @@ int main(int argc, char **argv) {
   }
   string line;
   vector<double> scores;
+
   while(getline(*is, line)) {
-    ds.getRow(line, &target, fvec, &score);
+    ds.getRow(line, &target, fvec, &wt, &cmpValue);
     double f;
     if (FLAGS_find_optimal_num_trees) {
       f = predict_vec(model, fvec, &scores);
       for (int i = 0; i < model.size(); i++) {
-        funs[i]->accumulateExampleLoss(target, scores[i]);
+        funs[i]->accumulateExampleLoss(target, scores[i], wt);
       }
       scores.clear();
     } else {
@@ -359,7 +371,7 @@ int main(int argc, char **argv) {
       (*os) << f << endl;
     }
 
-    fun.accumulateExampleLoss(target, f);
+    fun.accumulateExampleLoss(target, f, wt);
 
     if (fun.getNumExamples() % 1000 == 0) {
       LOG(INFO) << "test loss reduction: " << fun.getReduction()
@@ -388,4 +400,3 @@ int main(int argc, char **argv) {
              << " on num examples: " << fun.getNumExamples();
   }
 }
-
diff --git a/TreeRegressor.cpp b/TreeRegressor.cpp
index 4f2fd59..0d56246 100644
--- a/TreeRegressor.cpp
+++ b/TreeRegressor.cpp
@@ -77,12 +77,13 @@ void TreeRegressor::getBestSplitFromHistogram(
   // Since the first term (sum of squares of all y-values) is independent
   // of our choice of where to split, it makes no difference, so we ignore it
   // in calculating loss.
+  // with weights, loss = - sum(w*y) * sum(w*y) / sum(w)
 
   // loss function if we don't split at all
-  double lossBefore = -1.0 * hist.totalSum * hist.totalSum / hist.totalCnt;
+  double lossBefore = -1.0 * hist.totalWeightedSum * hist.totalWeightedSum / hist.totalWeight;
 
-  int cntLeft = 0;       // number of observations on or to left of idx
-  double sumLeft = 0.0;  // number of observations strictly to right of idx
+  double weightLeft = 0;         // sum of weights for observations on or to left of idx
+  double weightedSumLeft = 0.0;  // sum of (w*y) for observations on or to left of idx
 
   double bestGain = 0.0;
   int bestIdx = -1;  // everything strictly to right of idx
@@ -91,22 +92,22 @@ void TreeRegressor::getBestSplitFromHistogram(
 
   for (int i = 0; i < hist.num - 1; i++) {
-    cntLeft += hist.cnt[i];
-    sumLeft += hist.sumy[i];
+    weightLeft += hist.weight[i];
+    weightedSumLeft += hist.sumwy[i];
 
-    double sumRight = hist.totalSum - sumLeft;
-    int cntRight = hist.totalCnt - cntLeft;
+    double weightedSumRight = hist.totalWeightedSum - weightedSumLeft;
+    double weightRight = hist.totalWeight - weightLeft;
 
-    if (cntLeft < FLAGS_min_leaf_examples) {
+    if (weightLeft < FLAGS_min_leaf_examples) {
      continue;
     }
-    if (cntRight < FLAGS_min_leaf_examples) {
+    if (weightRight < FLAGS_min_leaf_examples) {
      break;
     }
 
     double lossAfter =
-      -1.0 * sumLeft * sumLeft / cntLeft
-      - 1.0 * sumRight * sumRight / cntRight;
+      -1.0 * weightedSumLeft * weightedSumLeft / weightLeft
+      - 1.0 * weightedSumRight * weightedSumRight / weightRight;
 
     double gain = lossBefore - lossAfter;
     if (gain > bestGain) {
@@ -139,10 +140,19 @@ TreeRegressor::getBestSplit(const vector<int>* subset,
   // return a valid but degenerate split
   double bestGain = 0.0;
 
-  double totalSum = 0.0;   // sum of all target values
-
-  for (auto& id : *subset) {
-    totalSum += y_[id];
+  double totalWeightedSum = 0.0;  // weighted sum of all target values
+  double totalWeights = 0.0;
+  const std::unique_ptr<std::vector<double>>& wv = ds_.getWeights();
+  if (wv) {
+    for (auto& id : *subset) {
+      totalWeights += (*wv)[id];
+      totalWeightedSum += (*wv)[id] * y_[id];
+    }
+  } else {
+    totalWeights = subset->size();
+    for (auto& id : *subset) {
+      totalWeightedSum += y_[id];
+    }
   }
 
   // For each of a random sampling of features, see if splitting on that
@@ -155,13 +165,13 @@ TreeRegressor::getBestSplit(const vector<int>* subset,
      continue;
     }
 
-    Histogram hist(f.transitions.size() + 1, subset->size(), totalSum);
+    Histogram hist(f.transitions.size() + 1, totalWeights, totalWeightedSum);
 
     if (f.encoding == BYTE) {
-      buildHistogram(*subset, *(f.bvec), hist);
+      buildHistogram(*subset, *(f.bvec), ds_.getWeights().get(), hist);
     } else {
       CHECK(f.encoding == SHORT);
-      buildHistogram(*subset, *(f.svec), hist);
+      buildHistogram(*subset, *(f.svec), ds_.getWeights().get(), hist);
     }
 
     int fv;
@@ -214,11 +224,9 @@ TreeNode<double>* TreeRegressor::getTreeHelper(
     return NULL;
   } else if (!split->selected) {
     // leaf of decision tree
-    double fvote = fun_.getLeafVal(*(split->subset), y_);
+    double fvote = fun_.getLeafVal(*(split->subset), y_, ds_.getWeights().get());
     LOG(INFO) << "leaf: " << fvote << ", #examples:" << split->subset->size();
 
-    CHECK(split->subset->size() >= FLAGS_min_leaf_examples);
-
     return new LeafNode<double>(fvote);
   } else {
     // internal node of decision tree
@@ -228,7 +236,7 @@ TreeNode<double>* TreeRegressor::getTreeHelper(
               << std::min(split->left->subset->size(), split->right->subset->size());
     fimps[split->fid] += split->gain;
-    double fvote = fun_.getLeafVal(*(split->subset), y_);
+    double fvote = fun_.getLeafVal(*(split->subset), y_, ds_.getWeights().get());
 
     PartitionNode<double>* node = new PartitionNode<double>(split->fid, split->fv);
     node->setLeft(getTreeHelper(split->left, fimps));
     node->setRight(getTreeHelper(split->right, fimps));
@@ -289,4 +297,3 @@ TreeRegressor::SplitNode* TreeRegressor::getBestSplits(
   }
 }
 
-
diff --git a/TreeRegressor.h b/TreeRegressor.h
index 34e9c11..a280cd3 100644
--- a/TreeRegressor.h
+++ b/TreeRegressor.h
@@ -57,24 +57,25 @@ class TreeRegressor {
   // observations (as in a basic histogram), but also the sum of y-values
   // of those observations.
   struct Histogram {
-    const int num;             // number of buckets
-    std::vector<int> cnt;      // number of observations in each bucket
-    std::vector<double> sumy;  // sum of y-values of those observations
-    const int totalCnt;
-    const double totalSum;
+    const int num;               // number of buckets
+    std::vector<double> weight;  // sum of weights of observations in each bucket
+    std::vector<double> sumwy;   // sum of w*y over those observations
+    const double totalWeight;
+    const double totalWeightedSum;
 
-    Histogram(int n, int cnt, double sum)
+    Histogram(int n, double sumw, double sumwy)
       : num(n),
-        cnt(num, 0),
-        sumy(num, 0.0),
-        totalCnt(cnt),
-        totalSum(sum) {
+        weight(num, 0.0),
+        sumwy(num, 0.0),
+        totalWeight(sumw),
+        totalWeightedSum(sumwy) {
     }
   };
 
   template <class T> void buildHistogram(const std::vector<int>& subset,
                                          const std::vector<T>& fvec,
+                                         const std::vector<double>* weights,
                                          Histogram& hist) const;
 
   // Choose the x-value such that, by splitting the data at that value, we
@@ -128,14 +129,24 @@ class TreeRegressor {
 template <class T>
 void TreeRegressor::buildHistogram(const std::vector<int>& subset,
                                    const std::vector<T>& fvec,
+                                   const std::vector<double>* weights,
                                    Histogram& hist) const {
-  for(auto id : subset) {
-    const T& v = fvec[id];
+  if (weights == NULL) {
+    for(auto id : subset) {
+      const T& v = fvec[id];
 
-    hist.cnt[v] += 1;
-    hist.sumy[v] += y_[id];
+      hist.weight[v] += 1.0;
+      hist.sumwy[v] += y_[id];
+    }
+  } else {
+    for(auto id : subset) {
+      const T& v = fvec[id];
+      double w = (*weights)[id];
+
+      hist.weight[v] += w;
+      hist.sumwy[v] += w * y_[id];
+    }
   }
 }
 
-
 }
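
The arithmetic behind the weighted split criterion can be checked in isolation. With per-example weights w, the least-squares leaf value becomes the weighted mean sum(w*y)/sum(w), and dropping the split-independent term sum(w*y*y) leaves the loss -(sum(w*y))^2/sum(w) that getBestSplitFromHistogram() compares before and after each candidate split; the weights themselves come from the column named by the new "weight_column" entry that Config::readConfig() now looks up. The standalone C++ sketch below is not part of the patch and shares no code with it (all names are illustrative); it only walks through that arithmetic for one toy split.

// Standalone sketch: weighted least-squares leaf value and split gain.
#include <cstdio>
#include <vector>

struct Stats {
  double sumW = 0.0;   // sum of weights
  double sumWY = 0.0;  // sum of weight * target
};

// Accumulate sum(w) and sum(w*y) over a subset of example ids.
Stats accumulate(const std::vector<double>& y,
                 const std::vector<double>& w,
                 const std::vector<int>& subset) {
  Stats s;
  for (int id : subset) {
    s.sumW += w[id];
    s.sumWY += w[id] * y[id];
  }
  return s;
}

// Loss of predicting the weighted mean, up to the constant sum(w*y*y).
double shiftedLoss(const Stats& s) {
  return -s.sumWY * s.sumWY / s.sumW;
}

int main() {
  std::vector<double> y = {1.0, 2.0, 10.0, 11.0};
  std::vector<double> w = {1.0, 1.0, 3.0, 3.0};  // heavier weight on large targets

  Stats all = accumulate(y, w, {0, 1, 2, 3});
  Stats left = accumulate(y, w, {0, 1});          // candidate split: {0,1} vs {2,3}
  Stats right = accumulate(y, w, {2, 3});

  double gain = shiftedLoss(all) - (shiftedLoss(left) + shiftedLoss(right));
  std::printf("leaf value (all examples): %f\n", all.sumWY / all.sumW);
  std::printf("split gain: %f\n", gain);
  return 0;
}

With these weights the heavier examples pull the overall leaf value toward the 10-11 range, and the candidate split separating them yields a strictly positive gain; this is the same quantity the patched Histogram accumulates per feature bucket, just computed directly.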