Skip to content

Commit b27b51f

Browse files
committed
[PLUGIN] Add densify parser
1 parent 88e3627 commit b27b51f

File tree

5 files changed

+90
-2
lines changed

5 files changed

+90
-2
lines changed

dmlc-core

make/travis.mk

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,3 +31,4 @@ LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
3131
#
3232
XGB_PLUGINS += plugin/example/plugin.mk
3333
XGB_PLUGINS += plugin/lz4/plugin.mk
34+
# The dense libsvm plugin's makefile lives in plugin/dense_parser, next to dense_libsvm.cc.
XGB_PLUGINS += plugin/dense_parser/plugin.mk
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
/*!
2+
* Copyright 2015 by Contributors
3+
* \file dense_libsvm.cc
4+
* \brief Plugin to load in libsvm, but fill all the missing entries with zeros.
5+
* This plugin is mainly used for benchmark purposes and does not need to be included.
6+
*/
7+
#include <dmlc/data.h>
8+
#include <memory>
9+
10+
namespace dmlc {
11+
namespace data {
12+
13+
template<typename IndexType>
14+
class DensifyParser : public dmlc::Parser<IndexType> {
15+
public:
16+
DensifyParser(dmlc::Parser<IndexType>* parser, uint32_t num_col)
17+
: parser_(parser), num_col_(num_col) {
18+
}
19+
20+
void BeforeFirst() override {
21+
parser_->BeforeFirst();
22+
}
23+
24+
bool Next() override {
25+
if (!parser_->Next()) return false;
26+
const RowBlock<IndexType>& batch = parser_->Value();
27+
LOG(INFO) << batch.size;
28+
dense_index_.resize(num_col_ * batch.size);
29+
dense_value_.resize(num_col_ * batch.size);
30+
std::fill(dense_value_.begin(), dense_value_.end(), 0.0f);
31+
offset_.resize(batch.size + 1);
32+
offset_[0] = 0;
33+
34+
for (size_t i = 0; i < batch.size; ++i) {
35+
offset_[i + 1] = (i + 1) * num_col_;
36+
Row<IndexType> row = batch[i];
37+
for (uint32_t j = 0; j < num_col_; ++j) {
38+
dense_index_[i * num_col_ + j] = j;
39+
}
40+
for (unsigned k = 0; k < row.length; ++k) {
41+
uint32_t index = row.get_index(k);
42+
CHECK_LT(index, num_col_)
43+
<< "Featuere index larger than num_col";
44+
dense_value_[i * num_col_ + index] = row.get_value(k);
45+
}
46+
}
47+
out_ = batch;
48+
out_.index = dmlc::BeginPtr(dense_index_);
49+
out_.value = dmlc::BeginPtr(dense_value_);
50+
out_.offset = dmlc::BeginPtr(offset_);
51+
return true;
52+
}
53+
54+
const dmlc::RowBlock<IndexType>& Value() const override {
55+
return out_;
56+
}
57+
58+
size_t BytesRead() const override {
59+
return parser_->BytesRead();
60+
}
61+
62+
private:
63+
RowBlock<IndexType> out_;
64+
std::unique_ptr<Parser<IndexType> > parser_;
65+
uint32_t num_col_;
66+
std::vector<size_t> offset_;
67+
std::vector<IndexType> dense_index_;
68+
std::vector<float> dense_value_;
69+
};
70+
71+
template<typename IndexType>
72+
Parser<IndexType> *
73+
CreateDenseLibSVMParser(const std::string& path,
74+
const std::map<std::string, std::string>& args,
75+
unsigned part_index,
76+
unsigned num_parts) {
77+
CHECK_NE(args.count("num_col"), 0) << "expect num_col in dense_libsvm";
78+
return new DensifyParser<IndexType>(
79+
Parser<IndexType>::Create(path.c_str(), part_index, num_parts, "libsvm"),
80+
uint32_t(atoi(args.at("num_col").c_str())));
81+
}
82+
} // namespace data
83+
84+
// Register data::CreateDenseLibSVMParser<uint32_t> as the data parser named
// dense_libsvm for uint32_t feature indices.
DMLC_REGISTER_DATA_PARSER(uint32_t, dense_libsvm, data::CreateDenseLibSVMParser<uint32_t>);
85+
} // namespace dmlc

plugin/dense_parser/plugin.mk

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
PLUGIN_OBJS += build_plugin/dense_parser/dense_libsvm.o
2+
PLUGIN_LDFLAGS +=

src/data/data.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ DMatrix* DMatrix::Load(const std::string& uri,
181181
std::string ftype = file_format;
182182
if (file_format == "auto") ftype = "libsvm";
183183
std::unique_ptr<dmlc::Parser<uint32_t> > parser(
184-
dmlc::Parser<uint32_t>::Create(fname.c_str(), partid, npart, ftype.c_str()));
184+
dmlc::Parser<uint32_t>::Create(fname.c_str(), partid, npart, file_format.c_str()));
185185
DMatrix* dmat = DMatrix::Create(parser.get(), cache_file);
186186
if (!silent) {
187187
LOG(CONSOLE) << dmat->info().num_row << 'x' << dmat->info().num_col << " matrix with "

0 commit comments

Comments
 (0)