Skip to content

Commit 461ef32

Browse files
author
root
committed
support weighted examples
fix the loss reduction from weighted ranking
1 parent 834c9b1 commit 461ef32

File tree

10 files changed: +186 additions, -100 deletions (lines changed)

10 files changed: +186 additions, -100 deletions (lines changed)

Config.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,13 +44,17 @@ bool Config::readConfig(const std::string& fileName) {
4444
cmpIdx_ = (it != cfg.items().end())
4545
? columnIdx[it->second.asString()] : -1;
4646

47+
it = cfg.find("weight_column");
48+
weightIdx_ = (it != cfg.items().end())
49+
? columnIdx[it->second.asString()] : -1;
50+
4751
it = cfg.find("loss_function");
4852
if (it != cfg.items().end() && it->second.asString() == "logistic") {
4953
lossFunction_ = L2Logistic;
5054
} else {
5155
lossFunction_ = L2Regression;
5256
}
53-
57+
5458
const dynamic& trainColumns = cfg["train_columns"];
5559
for (auto it = trainColumns.begin(); it != trainColumns.end(); ++it) {
5660
trainIdx_.push_back(columnIdx.at(it->asString()));

Config.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ enum LossFunction {
1010
L2Regression = 0,
1111
L2Logistic = 1
1212
};
13-
13+
1414
// Specifying the training parameters and data format
1515
struct Config {
1616

@@ -46,6 +46,10 @@ struct Config {
4646
return targetIdx_;
4747
}
4848

49+
int getWeightIdx() const {
50+
return weightIdx_;
51+
}
52+
4953
int getCompareIdx() const {
5054
return cmpIdx_;
5155
}
@@ -93,8 +97,10 @@ struct Config {
9397

9498
int targetIdx_;
9599
int cmpIdx_;
100+
int weightIdx_;
101+
96102
LossFunction lossFunction_;
97-
103+
98104
std::vector<int> trainIdx_;
99105
std::vector<int> weakIdx_;
100106
std::vector<int> evalIdx_;

DataSet.cpp

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ DataSet::DataSet(const Config& cfg, int bucketingThresh, int examplesThresh)
2525
features_[i].fvec.reset(new vector<double>());
2626
features_[i].encoding = DOUBLE;
2727
}
28+
29+
if (cfg_.getWeightIdx() != -1) {
30+
weights_.reset(new vector<double>());
31+
}
2832
}
2933

3034
bool DataSet::getEvalColumns(const std::string& line,
@@ -41,6 +45,7 @@ bool DataSet::getEvalColumns(const std::string& line,
4145

4246
bool DataSet::getRow(const string& line, double* target,
4347
boost::scoped_array<double>& fvec,
48+
double* weight,
4449
double* cmpValue) const {
4550
try {
4651
vector<folly::StringPiece> sv;
@@ -64,7 +69,9 @@ bool DataSet::getRow(const string& line, double* target,
6469
if (cfg_.getCompareIdx() != -1 && cmpValue != NULL) {
6570
*cmpValue = atof(sv[cfg_.getCompareIdx()].toString().c_str());
6671
}
67-
72+
if (cfg_.getWeightIdx() != -1 && weight != NULL) {
73+
*weight = atof(sv[cfg_.getWeightIdx()].toString().c_str());
74+
}
6875
} catch (...) {
6976
LOG(ERROR) << "fail to process line: " << line;
7077
return false;
@@ -100,7 +107,7 @@ double DataSet::getPrediction(TreeNode<uint16_t>* rt, int eid) const {
100107
}
101108

102109
bool DataSet::addVector(const boost::scoped_array<double>& fvec,
103-
double target) {
110+
double target, double* weight) {
104111
if (examplesThresh_ != -1 && numExamples_ > examplesThresh_) {
105112
return false;
106113
}
@@ -128,6 +135,10 @@ bool DataSet::addVector(const boost::scoped_array<double>& fvec,
128135
}
129136
}
130137
}
138+
if (weights_) {
139+
weights_->push_back(*weight);
140+
}
141+
131142
targets_.push_back(target);
132143
numExamples_++;
133144

DataSet.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,11 +49,13 @@ class DataSet {
4949
public:
5050
DataSet(const Config& cfg, int bucketingThresh, int examplesThresh=-1);
5151

52-
bool addVector(const boost::scoped_array<double>& fvec, double target);
52+
bool addVector(const boost::scoped_array<double>& fvec,
53+
double target, double* weight = NULL);
5354

5455
bool getRow(const std::string& line,
5556
double* target,
5657
boost::scoped_array<double>& fvec,
58+
double* weight = NULL,
5759
double* cmpValue = NULL) const;
5860

5961
bool getEvalColumns(const std::string& line,
@@ -63,6 +65,10 @@ class DataSet {
6365
return numExamples_;
6466
}
6567

68+
const std::unique_ptr<std::vector<double>>& getWeights() const {
69+
return weights_;
70+
}
71+
6672
void getFeatureVec(const int eid, boost::scoped_array<uint16_t>& fvec) const {
6773
for (int i = 0; i < numFeatures_; i++) {
6874
if (features_[i].encoding == EMPTY) {
@@ -103,6 +109,7 @@ class DataSet {
103109

104110
boost::scoped_array<FeatureData> features_;
105111
std::vector<double> targets_;
112+
std::unique_ptr<std::vector<double>> weights_;
106113

107114
friend class TreeRegressor;
108115
friend class Gbm;
@@ -126,4 +133,3 @@ template<class T> void split(const std::vector<int>& subset,
126133
}
127134

128135
}
129-

Gbm.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,8 @@ class ParallelEval : public apache::thrift::concurrency::Runnable {
4949
//double score = weakModel_->eval(fvec);
5050
double score = ds_.getPrediction(weakModel_.get(), i);
5151
F_[i] += score;
52-
subLoss_[workIdx_] += fun_.getExampleLoss(targets_[i], F_[i]);
52+
const double* wt = ds_.getWeights() ? &((*ds_.getWeights())[i]) : NULL;
53+
subLoss_[workIdx_] += fun_.getExampleLoss(targets_[i], F_[i], wt);
5354
}
5455
}
5556
monitor_.decrement();
@@ -78,14 +79,14 @@ void Gbm::getModel(
7879
boost::scoped_array<double> F(new double[numExamples]);
7980
boost::scoped_array<double> y(new double[numExamples]);
8081

81-
double f0 = fun_.getF0(ds_.targets_);
82+
double f0 = fun_.getF0(ds_.targets_, ds_.getWeights().get());
8283
for (int i = 0; i < numExamples; i++) {
8384
F[i] = f0;
8485
}
8586

8687
model->push_back(new LeafNode<double>(f0));
8788

88-
double initLoss = fun_.getInitLoss(ds_.targets_);
89+
double initLoss = fun_.getInitLoss(ds_.targets_, ds_.getWeights().get());
8990

9091
LOG(INFO) << "init avg loss " << initLoss / numExamples;
9192

@@ -131,7 +132,8 @@ void Gbm::getModel(
131132
// double score = weakModel->eval(fvec);
132133
double score = ds_.getPrediction(weakModel.get(), i);
133134
F[i] += score;
134-
newLoss += fun_.getExampleLoss(ds_.targets_[i], F[i]);
135+
const double* wt = ds_.getWeights() ? &((*ds_.getWeights())[i]) : NULL;
136+
newLoss += fun_.getExampleLoss(ds_.targets_[i], F[i], wt);
135137
}
136138
}
137139

GbmFun.h

Lines changed: 54 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -12,19 +12,23 @@ namespace boosting {
1212
class GbmFun {
1313
public:
1414
virtual double getLeafVal(const std::vector<int>& subset,
15-
const boost::scoped_array<double>& y) const = 0;
15+
const boost::scoped_array<double>& y,
16+
const std::vector<double>* wts = NULL) const = 0;
1617

17-
virtual double getF0(const std::vector<double>& y) const = 0;
18+
virtual double getF0(const std::vector<double>& y,
19+
const std::vector<double>* wts = NULL) const = 0;
1820

1921
virtual void getGradient(const std::vector<double>& y,
2022
const boost::scoped_array<double>& F,
21-
boost::scoped_array<double>& grad) const = 0;
23+
boost::scoped_array<double>& grad,
24+
const std::vector<double>* wts = NULL) const = 0;
2225

23-
virtual double getInitLoss(const std::vector<double>& y) const = 0;
26+
virtual double getInitLoss(const std::vector<double>& y,
27+
const std::vector<double>* wts = NULL) const = 0;
2428

25-
virtual double getExampleLoss(const double y, const double f) const = 0;
29+
virtual double getExampleLoss(const double y, const double f, const double* w = NULL) const = 0;
2630

27-
virtual void accumulateExampleLoss(const double y, const double f) = 0;
31+
virtual void accumulateExampleLoss(const double y, const double f, const double* w = NULL) = 0;
2832

2933
virtual double getReduction() const = 0;
3034

@@ -36,63 +40,81 @@ class GbmFun {
3640

3741
class LeastSquareFun : public GbmFun {
3842
public:
39-
LeastSquareFun() : numExamples_(0), sumy_(0.0), sumy2_(0.0), l2_(0.0) {
43+
LeastSquareFun() : numExamples_(0), sumy_(0.0), sumy2_(0.0), l2_(0.0), sumw_(0.0) {
4044
}
4145

4246
double getLeafVal(const std::vector<int>& subset,
43-
const boost::scoped_array<double>& y) const {
44-
45-
double sum = 0;
47+
const boost::scoped_array<double>& y, const std::vector<double>* wts = NULL) const {
48+
double sumwy = 0;
49+
double sumw = 0;
4650
for (const auto& id : subset) {
47-
sum += y[id];
51+
double w = ((wts != NULL) ? (*wts)[id] : 1.0);
52+
sumw += w;
53+
sumwy += w * y[id];
4854
}
49-
return sum/subset.size();
55+
return sumwy/sumw;
5056
}
5157

52-
double getF0(const std::vector<double>& yvec) const {
53-
double sum = 0.0;
54-
for (const auto& y : yvec) {
55-
sum += y;
58+
double getF0(const std::vector<double>& yvec, const std::vector<double>* wts = NULL) const {
59+
double sumwy = 0;
60+
double sumw = 0;
61+
for (int i = 0; i < yvec.size(); i++) {
62+
double w = ((wts != NULL) ? (*wts)[i] : 1.0);
63+
sumw += w;
64+
sumwy += w * yvec[i];
5665
}
57-
return sum/yvec.size();
66+
return sumwy/sumw;
5867
}
5968

6069
void getGradient(const std::vector<double>& y,
6170
const boost::scoped_array<double>& F,
62-
boost::scoped_array<double>& grad) const {
71+
boost::scoped_array<double>& grad,
72+
const std::vector<double>* wts = NULL) const {
6373

6474
int size = y.size();
6575

6676
for (int i = 0; i < size; i++) {
67-
grad[i] = y[i] - F[i];
77+
double w = ((wts != NULL) ? (*wts)[i] : 1.0);
78+
grad[i] = w * (y[i] - F[i]);
6879
}
6980
}
7081

71-
double getInitLoss(const std::vector<double>& yvec) const {
82+
double getInitLoss(const std::vector<double>& yvec,
83+
const std::vector<double>* wts = NULL) const {
84+
7285
double sumy = 0.0;
7386
double sumy2 = 0.0;
87+
double sumw = 0.0;
7488

75-
for (const auto& y : yvec) {
76-
sumy += y;
77-
sumy2 += y*y;
89+
for (int i = 0; i < yvec.size(); i++) {
90+
double w = ((wts != NULL) ? (*wts)[i] : 1.0);
91+
double y = yvec[i];
92+
93+
sumw += w;
94+
sumy += w*y;
95+
sumy2 += w*y*y;
7896
}
7997

80-
return sumy2 - sumy * sumy/yvec.size();
98+
return sumy2 - sumy * sumy/sumw;
8199
}
82100

83-
double getExampleLoss(const double y, const double f) const {
84-
return (y - f) * (y - f);
101+
double getExampleLoss(const double y, const double f, const double* wt = NULL) const {
102+
double w = ((wt != NULL) ? *wt : 1.0);
103+
return w * (y - f) * (y - f);
85104
}
86105

87-
void accumulateExampleLoss(const double y, const double f) {
88-
sumy_ += y;
106+
void accumulateExampleLoss(const double y, const double f, const double *wt = NULL) {
107+
double w = (wt != NULL) ? *wt : 1.0;
108+
sumy_ += w * y;
89109
numExamples_ += 1;
90-
sumy2_ += y * y;
91-
l2_ += getExampleLoss(y, f);
110+
sumw_ += w;
111+
sumy2_ += w * y * y;
112+
113+
l2_ += getExampleLoss(y, f, wt);
92114
}
93115

94116
double getReduction() const {
95-
return 1.0 - l2_/(sumy2_ - sumy_ * sumy_/numExamples_);
117+
return 1.0 - l2_/(sumy2_ - sumy_ * sumy_/sumw_);
96118
}
97119

98120
int getNumExamples() const {
@@ -108,6 +130,7 @@ class LeastSquareFun : public GbmFun {
108130
double sumy_;
109131
double sumy2_;
110132
double l2_;
133+
double sumw_;
111134
};
112135

113136
}

LogisticFun.h

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@ namespace boosting {
77
class LogisticFun : public GbmFun {
88
public:
99
double getLeafVal(const std::vector<int>& subset,
10-
const boost::scoped_array<double>& y) const {
10+
const boost::scoped_array<double>& y,
11+
const std::vector<double>* wts = NULL) const {
1112
double wx = 0.0, wy = 0.0;
1213
for (const auto& id : subset) {
1314
double yi = y[id];
@@ -17,7 +18,8 @@ class LogisticFun : public GbmFun {
1718
return wy / wx;
1819
}
1920

20-
double getF0(const std::vector<double>& y) const {
21+
double getF0(const std::vector<double>& y,
22+
const std::vector<double>* wts = NULL) const {
2123
double sumy = 0.0;
2224
for (const auto yi : y) {
2325
sumy += yi;
@@ -28,14 +30,15 @@ class LogisticFun : public GbmFun {
2830

2931
void getGradient(const std::vector<double>& y,
3032
const boost::scoped_array<double>& F,
31-
boost::scoped_array<double>& grad) const {
33+
boost::scoped_array<double>& grad,
34+
const std::vector<double>* wts = NULL) const {
3235
int size = y.size();
3336
for (int i = 0; i < size; i++) {
3437
grad[i] = 2.0 * y[i]/(1.0 + exp(2.0 * y[i] * F[i]));
3538
}
3639
}
3740

38-
double getInitLoss(const std::vector<double>& y) const {
41+
double getInitLoss(const std::vector<double>& y, const std::vector<double>* wts = NULL) const {
3942
int posCount = 0;
4043
for (const auto yi : y) {
4144
if (yi > 0) {
@@ -45,11 +48,11 @@ class LogisticFun : public GbmFun {
4548
return getEntropy(posCount, y.size()) * y.size();
4649
}
4750

48-
double getExampleLoss(const double y, const double f) const {
51+
double getExampleLoss(const double y, const double f, const double* w = NULL) const {
4952
return log(1.0 + exp(-2.0 * y * f));
5053
}
5154

52-
void accumulateExampleLoss(const double y, const double f) {
55+
void accumulateExampleLoss(const double y, const double f, const double* w = NULL) {
5356
numExamples_ += 1;
5457
if (y > 0) {
5558
posCount_ += 1;
@@ -75,7 +78,7 @@ class LogisticFun : public GbmFun {
7578
double posProb = double(posCount)/numExamples;
7679
return -(posProb * log(posProb) + (1 - posProb) * log(1.0 - posProb));
7780
}
78-
81+
7982
int numExamples_;
8083
int posCount_;
8184
double logloss_;

0 commit comments

Comments
 (0)