|
| 1 | +/*! |
| 2 | + * Copyright 2015 by Contributors |
| 3 | + * \file dense_libsvm.cc |
| 4 | + * \brief Plugin to load in libsvm, but fill all the missing entries with zeros. |
| 5 | + * This plugin is mainly used for benchmark purposes and does not need to be included. |
| 6 | + */ |
| 7 | +#include <dmlc/data.h> |
| 8 | +#include <memory> |
| 9 | + |
| 10 | +namespace dmlc { |
| 11 | +namespace data { |
| 12 | + |
| 13 | +template<typename IndexType> |
| 14 | +class DensifyParser : public dmlc::Parser<IndexType> { |
| 15 | + public: |
| 16 | + DensifyParser(dmlc::Parser<IndexType>* parser, uint32_t num_col) |
| 17 | + : parser_(parser), num_col_(num_col) { |
| 18 | + } |
| 19 | + |
| 20 | + void BeforeFirst() override { |
| 21 | + parser_->BeforeFirst(); |
| 22 | + } |
| 23 | + |
| 24 | + bool Next() override { |
| 25 | + if (!parser_->Next()) return false; |
| 26 | + const RowBlock<IndexType>& batch = parser_->Value(); |
| 27 | + LOG(INFO) << batch.size; |
| 28 | + dense_index_.resize(num_col_ * batch.size); |
| 29 | + dense_value_.resize(num_col_ * batch.size); |
| 30 | + std::fill(dense_value_.begin(), dense_value_.end(), 0.0f); |
| 31 | + offset_.resize(batch.size + 1); |
| 32 | + offset_[0] = 0; |
| 33 | + |
| 34 | + for (size_t i = 0; i < batch.size; ++i) { |
| 35 | + offset_[i + 1] = (i + 1) * num_col_; |
| 36 | + Row<IndexType> row = batch[i]; |
| 37 | + for (uint32_t j = 0; j < num_col_; ++j) { |
| 38 | + dense_index_[i * num_col_ + j] = j; |
| 39 | + } |
| 40 | + for (unsigned k = 0; k < row.length; ++k) { |
| 41 | + uint32_t index = row.get_index(k); |
| 42 | + CHECK_LT(index, num_col_) |
| 43 | + << "Featuere index larger than num_col"; |
| 44 | + dense_value_[i * num_col_ + index] = row.get_value(k); |
| 45 | + } |
| 46 | + } |
| 47 | + out_ = batch; |
| 48 | + out_.index = dmlc::BeginPtr(dense_index_); |
| 49 | + out_.value = dmlc::BeginPtr(dense_value_); |
| 50 | + out_.offset = dmlc::BeginPtr(offset_); |
| 51 | + return true; |
| 52 | + } |
| 53 | + |
| 54 | + const dmlc::RowBlock<IndexType>& Value() const override { |
| 55 | + return out_; |
| 56 | + } |
| 57 | + |
| 58 | + size_t BytesRead() const override { |
| 59 | + return parser_->BytesRead(); |
| 60 | + } |
| 61 | + |
| 62 | + private: |
| 63 | + RowBlock<IndexType> out_; |
| 64 | + std::unique_ptr<Parser<IndexType> > parser_; |
| 65 | + uint32_t num_col_; |
| 66 | + std::vector<size_t> offset_; |
| 67 | + std::vector<IndexType> dense_index_; |
| 68 | + std::vector<float> dense_value_; |
| 69 | +}; |
| 70 | + |
| 71 | +template<typename IndexType> |
| 72 | +Parser<IndexType> * |
| 73 | +CreateDenseLibSVMParser(const std::string& path, |
| 74 | + const std::map<std::string, std::string>& args, |
| 75 | + unsigned part_index, |
| 76 | + unsigned num_parts) { |
| 77 | + CHECK_NE(args.count("num_col"), 0) << "expect num_col in dense_libsvm"; |
| 78 | + return new DensifyParser<IndexType>( |
| 79 | + Parser<IndexType>::Create(path.c_str(), part_index, num_parts, "libsvm"), |
| 80 | + uint32_t(atoi(args.at("num_col").c_str()))); |
| 81 | +} |
| 82 | +} // namespace data |
| 83 | + |
| 84 | +DMLC_REGISTER_DATA_PARSER(uint32_t, dense_libsvm, data::CreateDenseLibSVMParser<uint32_t>); |
| 85 | +} // namespace dmlc |
0 commit comments