Adding ARX model from pull #51 with fixed predict().

dmsuehir · dmsuehir · commit 48dc639760a3 · 2016-02-25T10:18:32.000-08:00
diff --git a/src/main/scala/com/cloudera/sparkts/Lag.scala b/src/main/scala/com/cloudera/sparkts/Lag.scala
@@ -97,4 +97,28 @@ private[sparkts] object Lag {
       }
     }
   }
+
+  /**
+   * Builds a lagged matrix from a matrix given in row-array form. Every column is lagged on its
+   * own and the resulting per-column lag matrices are then joined side by side. So for a matrix
+   * [a b c], where a/b/c are column vectors, a lag of 2 yields [a_-1 a_-2 b_-1 b_-2 c_-1 c_-2].
+   * `includeOriginal` is handed unchanged to the per-column lagging.
+   */
+  def lagMatTrimBoth(
+                      x: Array[Array[Double]],
+                      maxLag: Int,
+                      includeOriginal: Boolean): Array[Array[Double]] = {
+    // lag every column independently; each entry is the lag matrix of one input column
+    val laggedPerColumn = x.transpose.map { column =>
+      Lag.lagMatTrimBoth(column, maxLag, includeOriginal)
+    }
+    // regroup the per-column matrices by row, then glue the pieces of each row together
+    val piecesByRow = laggedPerColumn.transpose
+    piecesByRow.map(pieces => pieces.reduceLeft(_ ++ _))
+  }
+
+  // Convenience overload: same as the three-argument form with includeOriginal = false.
+  def lagMatTrimBoth(x: Array[Array[Double]], maxLag: Int): Array[Array[Double]] = {
+    lagMatTrimBoth(x, maxLag, includeOriginal = false)
+  }
 }
diff --git a/src/main/scala/com/cloudera/sparkts/MatrixUtil.scala b/src/main/scala/com/cloudera/sparkts/MatrixUtil.scala
@@ -36,6 +36,14 @@ private[sparkts] object MatrixUtil {
     arrs
   }
 
+  /** Copies every row of `mat` into its own array and returns those arrays in row order. */
+  def matToRowArrs(mat: Matrix[Double]): Array[Array[Double]] = {
+    val allCols = 0 to mat.cols - 1
+    Array.tabulate(mat.rows) { r =>
+      mat(r to r, allCols).toDenseMatrix.toArray
+    }
+  }
+
   def arrsToMat(arrs: Iterator[Array[Double]]): DenseMatrix[Double] = {
     vecArrsToMats(arrs, arrs.length).next()
   }
diff --git a/src/main/scala/com/cloudera/sparkts/models/AutoregressionX.scala b/src/main/scala/com/cloudera/sparkts/models/AutoregressionX.scala
@@ -0,0 +1,131 @@
+/**
+ * Copyright (c) 2016, Cloudera, Inc. All Rights Reserved.
+ *
+ * Cloudera, Inc. licenses this file to you under the Apache License,
+ * Version 2.0 (the "License"). You may not use this file except in
+ * compliance with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+ * CONDITIONS OF ANY KIND, either express or implied. See the License for
+ * the specific language governing permissions and limitations under the
+ * License.
+ */
+
+package com.cloudera.sparkts.models
+
+import breeze.linalg._
+import com.cloudera.sparkts.Lag
+import com.cloudera.sparkts.MatrixUtil.{matToRowArrs, toBreeze}
+import org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression
+
+/**
+ * Fits ARX models, in which a time series is modelled as a function of itself (autoregressive
+ * terms) and of exogenous variables, which are lagged up to degree xMaxLag.
+ */
+object AutoregressionX {
+  /**
+   * Fits an autoregressive model with additional exogenous variables by least squares. The
+   * model predicts the value at time t of a dependent variable, Y, as a function of previous
+   * values of Y, of previous values of the exogenous regressors X_i and, if requested, of the
+   * current values of those regressors. This generalizes an AR model, which is simply an ARX
+   * with no exogenous regressors. All lags up to the maximum lag are included: for the
+   * dependent variable the maximum lag is 'yMaxLag', while each column of the exogenous matrix
+   * is lagged up to 'xMaxLag'. The first max(yMaxLag, xMaxLag) values of y are dropped from
+   * the left-hand side of the regression.
+   * @param y the dependent variable, time series
+   * @param x a matrix of exogenous variables
+   * @param yMaxLag the maximum lag order for the dependent variable
+   * @param xMaxLag the maximum lag order for exogenous variables
+   * @param includeOriginalX a boolean flag indicating if the non-lagged exogenous variables should
+   *                         be included. Default is true
+   * @param noIntercept a boolean flag indicating if the intercept should be dropped. Default is
+   *                    false
+   * @return an ARXModel, which is an autoregressive model with exogenous variables
+   */
+  def fitModel(
+                y: Vector[Double],
+                x: Matrix[Double],
+                yMaxLag: Int,
+                xMaxLag: Int,
+                includeOriginalX: Boolean = true,
+                noIntercept: Boolean = false): ARXModel = {
+    val yValues = y.toArray
+    // left-hand side: y without its first max(yMaxLag, xMaxLag) terms
+    val response = yValues.drop(max(yMaxLag, xMaxLag))
+    // right-hand side: lagged y, lagged x and (optionally) current x terms
+    val design = assemblePredictors(yValues, matToRowArrs(x), yMaxLag, xMaxLag, includeOriginalX)
+
+    val ols = new OLSMultipleLinearRegression()
+    ols.setNoIntercept(noIntercept) // when true, the regression is fitted without an intercept
+    ols.newSampleData(response, design)
+    val estimates = ols.estimateRegressionParameters()
+    // with an intercept, the first estimate is taken as c and the rest as the coefficients
+    if (noIntercept) new ARXModel(0.0, estimates, yMaxLag, xMaxLag, includeOriginalX)
+    else new ARXModel(estimates.head, estimates.tail, yMaxLag, xMaxLag, includeOriginalX)
+  }
+
+  /**
+   * Assembles the predictor matrix, one array per row. Within a row the lagged terms of y come
+   * first, followed by the lagged terms of the x columns and, when includeOriginalX is set, by
+   * the non-lagged x values of that row.
+   */
+  private[sparkts] def assemblePredictors(
+                                           y: Array[Double],
+                                           x: Array[Array[Double]],
+                                           yMaxLag: Int,
+                                           xMaxLag: Int,
+                                           includeOriginalX: Boolean = true): Array[Array[Double]] = {
+    val longestLag = max(yMaxLag, xMaxLag)
+    // The two lag matrices are built with different lag orders; drop the leading rows of the
+    // one built with the smaller order so that the autoregressive and exogenous parts match up.
+    val yLags = Lag.lagMatTrimBoth(y, yMaxLag).drop(longestLag - yMaxLag)
+    val xLags = Lag.lagMatTrimBoth(x, xMaxLag).drop(longestLag - xMaxLag)
+    // non-lagged exogenous values, starting at the same row as the lagged parts
+    val currentX = if (includeOriginalX) x.drop(longestLag) else Array[Array[Double]]()
+
+    // group the pieces that belong to the same row, then join them into one predictor row
+    val piecesByRow = Array(yLags, xLags, currentX).transpose
+    piecesByRow.map(_.reduceLeft(_ ++ _))
+  }
+}
+
+// Jose note: not extending TimeSeriesModel, since this seems to me to be a different type of
+// model: addTimeDependent... etc. wouldn't apply here with the original signature, since we
+// need exogenous variables provided
+/**
+ * An autoregressive model with exogenous variables
+ * @param c an intercept term, zero if none desired
+ * @param coefficients the coefficients for the various terms. The order of coefficients is as
+ *                     follows:
+ *                     - Autoregressive terms for the dependent variable, in increasing order of lag
+ *                     - For each column in the exogenous matrix (in their original order), the
+ *                     lagged terms in increasing order of lag (excluding the non-lagged versions).
+ *                     - The coefficients associated with the non-lagged exogenous matrix
+ * @param yMaxLag the maximum lag order for the dependent variable
+ * @param xMaxLag the maximum lag order for exogenous variables
+ * @param includesOriginalX a boolean flag indicating if the non-lagged exogenous variables should
+ *                         be included
+ */
+class ARXModel(
+                val c: Double,
+                val coefficients: Array[Double],
+                val yMaxLag: Int,
+                val xMaxLag: Int,
+                includesOriginalX: Boolean) {
+  /**
+   * Returns, for every predictor row built from y and x, c plus the coefficient-weighted sum.
+   */
+  def predict(y: Vector[Double], x: Matrix[Double]): Vector[Double] = {
+    val predictors = AutoregressionX.assemblePredictors(y.toArray, matToRowArrs(x), yMaxLag, xMaxLag, includesOriginalX)
+    val results = DenseVector.zeros[Double](predictors.length)
+    for ((rowArray, rowIndex) <- predictors.zipWithIndex) {
+      results(rowIndex) = c // start from the intercept, which would otherwise be left out
+      for ((value, colIndex) <- rowArray.zipWithIndex) {
+        results(rowIndex) += value * coefficients(colIndex)
+      }
+    }
+    results
+  }
+}
diff --git a/src/test/scala/com/cloudera/sparkts/models/AutoregressionXSuite.scala b/src/test/scala/com/cloudera/sparkts/models/AutoregressionXSuite.scala
@@ -0,0 +1,133 @@
+/**
+ * Copyright (c) 2016, Cloudera, Inc. All Rights Reserved.
+ *
+ * Cloudera, Inc. licenses this file to you under the Apache License,
+ * Version 2.0 (the "License"). You may not use this file except in
+ * compliance with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+ * CONDITIONS OF ANY KIND, either express or implied. See the License for
+ * the specific language governing permissions and limitations under the
+ * License.
+ */
+
+package com.cloudera.sparkts.models
+
+import breeze.linalg._
+
+import org.apache.commons.math3.random.MersenneTwister
+import com.cloudera.sparkts.Lag
+import org.scalatest.FunSuite
+import org.scalatest.Matchers._
+
+class AutoregressionXSuite extends FunSuite {
+  val rand = new MersenneTwister(10L) // fixed seed, so every run draws the same data
+  val nRows = 1000
+  val nCols = 2
+  val X = Array.fill(nRows, nCols)(rand.nextGaussian()) // exogenous matrix shared by the fit tests
+  val intercept = rand.nextGaussian * 10 // true intercept that each fit test expects to recover
+
+  // tests an autoregressive model where the exogenous variables are not lagged
+  test("fit ARX(1, 0, true)") {
+    val xCoeffs = Array(0.8, 0.2)
+    val rawY = X.map(_.zip(xCoeffs).map { case (b, v) => b * v }.sum + intercept)
+    val arCoeff = 0.4
+    val y = rawY.scanLeft(0.0) { case (priorY, currY) => currY + priorY * arCoeff }.tail
+    val dy = new DenseVector(y)
+    val dx = new DenseMatrix(rows = X.length, cols = X.head.length, data = X.transpose.flatten)
+    val model = AutoregressionX.fitModel(dy, dx, 1, 0, includeOriginalX = true)
+    val combinedCoeffs = Array(arCoeff) ++ xCoeffs
+
+    model.c should be (intercept +- 1e-4)
+    for (i <- combinedCoeffs.indices) {
+      model.coefficients(i) should be (combinedCoeffs(i) +- 1e-4)
+    }
+  }
+
+  // tests a model with no autoregressive term but with lagged exogenous variables
+  test("fit ARX(0, 1, false) model") {
+    val xCoeffs = Array(0.4, 0.15)
+    val xLagged = Lag.lagMatTrimBoth(X, 1)
+    val y = xLagged.map(_.zip(xCoeffs).map { case (b, v) => b * v }.sum + intercept)
+    val dy = new DenseVector(Array(0.0) ++ y)
+    // note that we provide the original (non-lagged) X matrix to the fitting function
+    val dx = new DenseMatrix(rows = X.length, cols = X.head.length, data = X.transpose.flatten)
+    val model = AutoregressionX.fitModel(dy, dx, 0, 1, includeOriginalX = false)
+
+    model.c should be (intercept +- 1e-4)
+    for (i <- xCoeffs.indices) {
+      model.coefficients(i) should be (xCoeffs(i) +- 1e-4)
+    }
+  }
+
+  // this test simply reduces to a normal regression model
+  test("fit ARX(0, 0, true) model") {
+    // note that with no lags at all, y is a plain linear function of the current X row
+    val xCoeffs = Array(0.8, 0.2)
+    val y = X.map(_.zip(xCoeffs).map { case (b, v) => b * v }.sum + intercept)
+    val dy = new DenseVector(y)
+    val dx = new DenseMatrix(rows = X.length, cols = X.head.length, data = X.transpose.flatten)
+    val model = AutoregressionX.fitModel(dy, dx, 0, 0, includeOriginalX = true)
+
+    model.c should be (intercept +- 1e-4)
+    for (i <- xCoeffs.indices) {
+      model.coefficients(i) should be (xCoeffs(i) +- 1e-4)
+    }
+  }
+
+  // tests a model with no autoregressive term but with lagged exogenous variables
+  // of order 2 and inclusive of the original X values
+  test("fit ARX(0, 2, true) model") {
+    val xLagCoeffs = Array(0.4, 0.15, 0.2, 0.7)
+    val xLagged = Lag.lagMatTrimBoth(X, 2)
+    val yLaggedPart = xLagged.map(_.zip(xLagCoeffs).map { case (b, v) => b * v }.sum )
+    val xNormalCoeffs = Array(0.3, 0.5)
+    val yNormalPart = X.map(_.zip(xNormalCoeffs).map { case (b, v) => b * v }.sum )
+    val y = yLaggedPart.zip(yNormalPart.drop(2)).map { case (l, n) => l + n + intercept }
+
+    val dy = new DenseVector(Array(0.0, 0.0) ++ y)
+    val dx = new DenseMatrix(rows = X.length, cols = X.head.length, data = X.transpose.flatten)
+    val model = AutoregressionX.fitModel(dy, dx, 0, 2, includeOriginalX = true)
+    val combinedCoeffs = xLagCoeffs ++ xNormalCoeffs
+
+    model.c should be (intercept +- 1e-4)
+    for (i <- combinedCoeffs.indices) {
+      model.coefficients(i) should be (combinedCoeffs(i) +- 1e-4)
+    }
+  }
+
+  test("fit ARX(1, 1, false) model") { // one AR term plus once-lagged exogenous variables
+    val xCoeffs = Array(0.8, 0.2)
+    val xLagged = Lag.lagMatTrimBoth(X, 1)
+    val rawY = xLagged.map(_.zip(xCoeffs).map { case (b, v) => b * v }.sum + intercept)
+    val arCoeff = 0.4
+    val y = rawY.scanLeft(0.0) { case (priorY, currY) => currY + priorY * arCoeff }.tail
+    val dy = new DenseVector(Array(0.0) ++ y)
+    val dx = new DenseMatrix(rows = X.length, cols = X.head.length, data = X.transpose.flatten)
+    val model = AutoregressionX.fitModel(dy, dx, 1, 1, includeOriginalX = false)
+    val combinedCoeffs = Array(arCoeff) ++ xCoeffs
+
+    model.c should be (intercept +- 1e-4)
+    for (i <- combinedCoeffs.indices) {
+      model.coefficients(i) should be (combinedCoeffs(i) +- 1e-4)
+    }
+  }
+
+  test("predict ARX model") { // hand-built model: no lags, only the non-lagged x values
+    val c = 0
+    val xCoeffs = Array(-1.136026484226831e-08, 8.637677568908233e-07, 15238.143039368977, -7.993535860373772e-09, -5.198597570089805e-07, 1.5691547009557947e-08, 7.409621376205488e-08)
+    val yMaxLag = 0
+    val xMaxLag = 0
+    val arxModel = new ARXModel(c, xCoeffs, yMaxLag, xMaxLag, includesOriginalX = true)
+
+    val y = new DenseVector(Array(100.0))
+    val x = new DenseMatrix(rows = 1, cols = 7, data = Array(465,1,0.006562479,24,1,0,51))
+
+    val results = arxModel.predict(y, x)
+    results.length should be (1)
+    results(0) should be (y(0) +- 1e-4)
+  }
+}
+

Original file line number	Diff line number	Diff line change
`@@ -36,6 +36,14 @@ private[sparkts] object MatrixUtil {`
`36`	`36`	`arrs`
`37`	`37`	`}`
`38`	`38`
	`39`	`+ def matToRowArrs(mat: Matrix[Double]): Array[Array[Double]] = {`
	`40`	`+ val arrs = new Array[Array[Double]](mat.rows)`
	`41`	`+ for (r <- 0 until mat.rows) {`
	`42`	`+ arrs(r) = mat(r to r, 0 to mat.cols - 1).toDenseMatrix.toArray`
	`43`	`+ }`
	`44`	`+ arrs`
	`45`	`+ }`
	`46`	`+`
`39`	`47`	`def arrsToMat(arrs: Iterator[Array[Double]]): DenseMatrix[Double] = {`
`40`	`48`	`vecArrsToMats(arrs, arrs.length).next()`
`41`	`49`	`}`