update ctr config

mli · mli · commit 695a7ee4806d · 2015-01-30T22:25:44.000-05:00
diff --git a/docker/local.sh b/docker/local.sh
@@ -37,7 +37,7 @@ shift
 
 port=8000
 bin="muli/parameter-server /build/ps"
-bin_v="-v /home/muli/work/ps/build:/build"
+# bin_v="-v /home/muli/work/ps/build:/build"
 app_v="-v $app:/app.conf"
 data_v="-v $data:/data -v $model:/model"
 mount="$bin_v $app_v $data_v"
diff --git a/docker/rm_local.sh b/docker/rm_local.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+docker rm -f $(docker ps -a -q)
diff --git a/example/linear/README.org b/example/linear/README.org
@@ -1,25 +1,26 @@
-Examples run applications on data RCV1.
+* Sample configurations for running the linear method
 
-First, download the dataset by =./rcv1_small.sh= or =./rcv1_large.sh=
+** Data
 
-Then we can run the system by either building the binary or downloading a docker images.
+Use a script such as =rcv1/download.sh= or =ctr/download.sh= to download the data.
 
-*Run in local*
+** Run on a local machine
+
+The system can be run either by building the binary or by downloading a docker image.
+
+*by building the binary*
 
 Use the binary you compiled to run l1-regularized logistic regression:
 
 #+BEGIN_SRC bash
-# block coordinate descent with 1 server and 4 workers:
-../../../script/local.sh ../../../build/ps 1 4 -app_file batch_l1lr.conf
-# online gradient descent with 1 server and 4 workers:
-../../../script/local.sh ../../../build/ps 1 4 -app_file online_l1lr.conf
-# test the trained models
-../../../script/local.sh ../../../build/ps 0 0 -app_file eval_batch.conf
-../../../script/local.sh ../../../build/ps 0 0 -app_file eval_online.conf
+# run block coordinate descent with 2 servers and 2 workers:
+../../script/local.sh ../../build/ps 2 2 -app_file ctr/batch_l1lr.conf
+# evaluate the model
+../../script/local.sh ../../build/ps 0 0 -app_file ctr/eval_batch.conf
 #+END_SRC
 
-Or run the same application by [[www.docker.com][docker:]]
+*by [[https://www.docker.com][docker]]*
 
 #+BEGIN_SRC bash
-sudo ../../../docker/local.sh 1 1 batch_l1lr.conf data model
+sudo ../../docker/local.sh 2 2 ctr/batch_l1lr.conf data model
 #+END_SRC
diff --git a/example/linear/ctr/batch_l1lr.conf b/example/linear/ctr/batch_l1lr.conf
@@ -1,28 +1,19 @@
-#j configuration to run l1-regularized logistic regression on the ctr dataset
 linear_method {
 
 training_data {
 format: TEXT
 text: SPARSE_BINARY
-# file: "data/train/part.*"
-file: "/home/muli/work/data/ctra/train/part.*"
-}
+file: "data/ctr/train/part.*"
 
-# training_data {
-# format: TEXT
-# text: ADFEA
-# max_num_files_per_worker: 10
-# # file: "/user/muli/ctrb/part.*"
-# # hdfs {
-# # # "which hadoop" returns /usr/bin/hadoop
-# # home: "/usr"
-# # }
-# file: "/home/muli/work/data/ctrd/part.*"
+# If the data is stored on HDFS and HADOOP_HOME="/usr", uncomment the following:
+# hdfs {
+# home: "/usr"
 # }
+}
 
 model_output {
 format: TEXT
-file: "model/ctr_batch_l1lr"
+file: "model/ctr_batch"
 }
 
 loss {
@@ -52,52 +43,57 @@ epsilon : 2e-5
 # blocks. A larger ratio often accelerate the convergence, however, it may slow
 # down the system performance because of the increased number of global barriers.
 feature_block_ratio : 4
+
 # The maximal number of blocks can be updating in parallel (bounded-delay
 # consistency). A larger delay may slow down the convergence rate, but improves
 # the system performance.
 max_block_delay: 8
 
 # important feature groups, update them earlier to get a better model
 # initialization.
-prior_fea_group: 127            # the bias feature (all one)
-prior_fea_group: 120            # the position rank feature
+prior_fea_group: 127
+prior_fea_group: 120
 
 # features which occurs <= *tail_feature_freq* will be filtered before
 # training. it save both memory and bandwidth.
-tail_feature_freq: 10
+tail_feature_freq: 4
+
 # It controls the countmin size. We filter the tail features by countmin, which
 # is more efficient than hash, but still is the memory bottleneck for servers. A
 # smaller ratio reduces the memory footprint, but may increase the size of
 # filtered feature.
+
 countmin_n_ratio: .66
 
 # In preprocessing, feature group is processed one by one. It is the main memory
 # bottleneck for workers. This number control how many feature groups can be in
 # memory at the same time. A smaller number reduce the workers' memory
 # footprint, but may slow down the preprocessing speed.
-max_num_parallel_groups_in_preprocessing: 1000
 
-# A random order accelerate the convergence. Turn it off only when debugging.
-random_feature_block_order : true
+# max_num_parallel_groups_in_preprocessing: 1000
 
 # During preprocessing, each (text) file is parsed and then write into the local
 # cache in binary format to save the memory. These data are then used by the
 # preprocessing stage, and also can be re-used when running next time.
 local_cache {
 format: BIN
-file: "/tmp/ctrc/"
+file: "data/cache/ctr_train_"
 }
 
 # Parameters used by the trust region method. The change of w_i (the i-th
 # parameter) is bouned by [-delta_i, delta_i], where delta_i is an adaptive
 # value according to the convergence. The initial value of delta_i is
 # *delta_init_value* and maximal value is *delta_max_value*. You can increase
 # these parameters for easy datasets.
-[PS.LM.delta_init_value] : 1
-[PS.LM.delta_max_value] : 5
+
+# [PS.LM.delta_init_value] : 1
+# [PS.LM.delta_max_value] : 5
+
 # This parameter controls the aggressiveness of the KKT filter.  Increasing this
 # number will decrease the effect of KKT filter. a very large number, such as
 # 1e20 will turn off the KKT filter.
-[PS.LM.kkt_filter_threshold_ratio] : 10
+
+# [PS.LM.kkt_filter_threshold_ratio] : 10
 }
+
 }
diff --git a/example/linear/ctr/eval_batch.conf b/example/linear/ctr/eval_batch.conf
@@ -3,13 +3,12 @@ linear_method {
 validation_data {
 format: TEXT
 text: SPARSE_BINARY
-# file: "data/train/part.*"
-file: "/home/muli/work/data/ctra/test/part.*"
+file: "data/ctr/test/part.*"
 }
 
 model_input {
 format: TEXT
-file: "model/ctr_batch_l1lr.*"
+file: "model/ctr_batch.*"
 }
 
 }
diff --git a/example/linear/ctr/eval_online.conf b/example/linear/ctr/eval_online.conf
@@ -2,15 +2,13 @@ linear_method {
 
 validation_data {
 format: TEXT
-text: ADFEA
-max_num_files_per_worker: 10
-file: "/home/muli/work/data/ctrd/part.*"
+text: SPARSE_BINARY
+file: "data/ctr/test/part.*"
 }
 
 model_input {
 format: TEXT
-# file: "../output/ctr_online.*"
-file: "/home/muli/work/ps_bak/output/ctr.*"
+file: "model/ctr_online.*"
 }
 
 }
diff --git a/example/linear/ctr/online_l1lr.conf b/example/linear/ctr/online_l1lr.conf
@@ -3,8 +3,7 @@ linear_method {
 training_data {
 format: TEXT
 text: SPARSE_BINARY
-# file: "data/train/part.*"
-file: "/home/muli/work/data/ctra/train/part.*"
+file: "data/ctr/train/part.*"
 ignore_feature_group: true
 }
 
@@ -17,13 +16,14 @@ loss {
 type: LOGIT
 }
 
-# coef * |w|_1
+# lambda_0 * |w|_1 + lambda_1 * |w|^2_2
 penalty {
 type: L1
 lambda: 10
 lambda: 1
 }
 
+# lr = alpha / (beta + x), where x depends on the progress
 learning_rate {
 type: DECAY
 alpha: .01
@@ -32,10 +32,22 @@ beta: 10
 
 async_sgd {
 algo: FTRL
+
+# The size of minibatch
 minibatch : 10000
+
+# The number of data passes
+num_data_pass: 10
+
+# Features which occur <= *tail_feature_freq* times will be filtered out before
+# training. This saves both memory and bandwidth.
 tail_feature_freq : 4
-countmin_n : 1e9
-report_interval: 1
+
+# Controls the countmin size. We filter the tail features with countmin, which
+# is more efficient than a hash, but is still the memory bottleneck for servers.
+# A smaller value reduces the memory footprint, but may increase the size of
+# the filtered feature set.
+countmin_n : 1e8
 }
 
 }
diff --git a/src/app/linear_method/async_sgd.h b/src/app/linear_method/async_sgd.h
@@ -197,7 +197,23 @@ class AsyncSGDWorker : public ISGDCompNode, public LinearMethod {
   void updateModel(const SGDCall& call) {
     const auto& sgd = conf_.async_sgd();
     MinibatchReader<V> reader;
-    reader.setReader(call.data(), sgd.minibatch(), sgd.data_buf());
+
+    // random shuffle the file order
+    int n = std::max(sgd.num_data_pass(), 1);
+    int m = call.data().file_size();
+    std::vector<int> idx(m);
+    for (int i = 0; i < m; ++i) idx[i] = i;
+
+    DataConfig data = call.data();
+    data.clear_file();
+    for (int i = 0; i < n; ++i) {
+      std::random_shuffle(idx.begin(), idx.end());
+      for (int j = 0; j < m; ++j) {
+        data.add_file(call.data().file(idx[j]));
+      }
+    }
+
+    reader.setReader(data, sgd.minibatch(), sgd.data_buf());
     reader.setFilter(sgd.countmin_n(), sgd.countmin_k(), sgd.tail_feature_freq());
     reader.start();
 
diff --git a/src/app/linear_method/proto/linear.pb.h b/src/app/linear_method/proto/linear.pb.h
diff --git a/src/app/linear_method/proto/linear.proto b/src/app/linear_method/proto/linear.proto

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+#!/bin/bash`
	`2`	`+docker rm -f $(docker ps -a -q)`
Original file line number	Diff line number	Diff line change
`@@ -3,13 +3,12 @@ linear_method {`
`3`	`3`	`validation_data {`
`4`	`4`	`format: TEXT`
`5`	`5`	`text: SPARSE_BINARY`
`6`		`-# file: "data/train/part.*"`
`7`		`-file: "/home/muli/work/data/ctra/test/part.*"`
	`6`	`+file: "data/ctr/test/part.*"`
`8`	`7`	`}`
`9`	`8`
`10`	`9`	`model_input {`
`11`	`10`	`format: TEXT`
`12`		`-file: "model/ctr_batch_l1lr.*"`
	`11`	`+file: "model/ctr_batch.*"`
`13`	`12`	`}`
`14`	`13`
`15`	`14`	`}`
Original file line number	Diff line number	Diff line change
`@@ -2,15 +2,13 @@ linear_method {`
`2`	`2`
`3`	`3`	`validation_data {`
`4`	`4`	`format: TEXT`
`5`		`-text: ADFEA`
`6`		`-max_num_files_per_worker: 10`
`7`		`-file: "/home/muli/work/data/ctrd/part.*"`
	`5`	`+text: SPARSE_BINARY`
	`6`	`+file: "data/ctr/test/part.*"`
`8`	`7`	`}`
`9`	`8`
`10`	`9`	`model_input {`
`11`	`10`	`format: TEXT`
`12`		`-# file: "../output/ctr_online.*"`
`13`		`-file: "/home/muli/work/ps_bak/output/ctr.*"`
	`11`	`+file: "model/ctr_online.*"`
`14`	`12`	`}`
`15`	`13`
`16`	`14`	`}`