From e9ebb406720e226eb3a5aa616fdc3aca312f9b94 Mon Sep 17 00:00:00 2001
From: Rose Cersonsky <rosecersonsky@gmail.com>
Date: Wed, 11 Nov 2020 17:01:28 +0100
Subject: [PATCH] Removed all references to the utility classes and referenced
 the appropriate sklearn classes. Changed narrative to explain new syntax
 where necessary. Remove utility reference of MDS, because as written it is
 redundant with PCA and holds no operational value. Outstanding changes:
 reference skcosmo kernel scalers, when ready Check for naming mismatches with
 new sklearn referencing

---
 notebooks/1_LinearMethods.ipynb | 252 ++++++++++----------------------
 1 file changed, 79 insertions(+), 173 deletions(-)
diff --git a/notebooks/1_LinearMethods.ipynb b/notebooks/1_LinearMethods.ipynb
index e2a3a4b..18593e8 100644
--- a/notebooks/1_LinearMethods.ipynb
+++ b/notebooks/1_LinearMethods.ipynb
@@ -41,8 +41,7 @@
     "    get_cmaps, table_from_dict,\n",
     "    check_mirrors\n",
     ")\n",
-    "from utilities.kernels import linear_kernel, gaussian_kernel, center_kernel\n",
-    "from utilities.classes import PCA, LR\n",
+    "from utilities.kernels import center_kernel # to be replaced with scalers from skcosmo\n",
     "\n",
     "cmaps = get_cmaps()\n",
     "plt.style.use('../utilities/kernel_pcovr.mplstyle')\n",
@@ -677,9 +676,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "from sklearn.decomposition import PCA\n",
+    "\n",
     "T = np.matmul(X_test, PXT)\n",
     "\n",
-    "ref = PCA(n_PC=n_MDS)\n",
+    "ref = PCA(n_components=n_MDS)\n",
     "ref.fit(X_train)\n",
     "T_ref = ref.transform(X_test)"
    ]
@@ -857,8 +858,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "from sklearn.linear_model import Ridge\n",
     "regularizations = np.linspace(-10, 0, 24)\n",
-    "lrs = [LR(regularization=10**i) for i in regularizations]\n",
+    "lrs = [Ridge(alpha=10**i) for i in regularizations]\n",
     "\n",
     "for lr in lrs:\n",
     "    lr.fit(X_train, Y_train)"
@@ -870,7 +872,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "coeff_deter = np.array([lr.statistics(X_test, Y_test)['Coefficient of Determination<br>($R^2$)'] for lr in lrs])"
+    "ell = np.array([np.linalg.norm(Y_test - lr.predict(X_test)) ** 2.0 / np.linalg.norm(Y_test) **2.0 for lr in lrs])"
    ]
   },
   {
@@ -886,16 +888,16 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "plt.semilogx(10**regularizations, coeff_deter, marker='o')\n",
+    "plt.semilogx(10**regularizations, ell, marker='o')\n",
     "\n",
-    "best_regularization = 10**regularizations[coeff_deter.argmax()]\n",
+    "best_regularization = 10**regularizations[ell.argmin()]\n",
     "plt.scatter([best_regularization], \n",
-    "            [coeff_deter.max()], c='r', zorder=3)\n",
+    "            [ell.min()], c='r', zorder=3)\n",
     "\n",
     "print(f\"The ideal value of $\\lambda$ for this LR (with respect to $R^2$) is {round(best_regularization,4)}.\")\n",
     "\n",
     "plt.xlabel(r\"$\\lambda$\")\n",
-    "plt.ylabel(r\"$R^2$\")\n",
+    "plt.ylabel(r\"$\\ell$\")\n",
     "\n",
     "plt.title(r\"Effect of $\\lambda$ on $R^2$ for Linear Regression\")\n",
     "\n",
@@ -906,23 +908,24 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Next: Combining LR and PCA\n",
-    "\n",
-    "In the next notebook, we show how PCA and LR can be combined into a new model. [Continue on -->](2_PrincipalCovariatesRegression.ipynb)"
+    "## Principal Covariates Regression: Predicting Properties with our Low-Dimensional Projection"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# The Utility Classes"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Classes from the utility module enable computing PCA, MDS, and LR with a scikit.learn-like syntax. "
+    "It's possible to use the projection determined by PCA to regress properties in a manner similar to linear regression. For this, we determine a weight $\\mathbf{P}_{TY}$ which minimizes\n",
+    "\n",
+    "\\begin{equation}\n",
+    "\\ell = \\lVert\\mathbf{Y}-\\mathbf{T}\\mathbf{P}_{TY}\\rVert^2 + \\lambda \\lVert \\mathbf{P}_{TY} \\rVert^2\n",
+    "\\end{equation}\n",
+    "\n",
+    "Analogous to LR, we find this weight to be:\n",
+    "\n",
+    "\\begin{equation}\n",
+    "\\mathbf{P}_{TY} = (\\mathbf{T}^T \\mathbf{T} + \\lambda \\mathbf{I})^{-1} \\mathbf{T}^T \\mathbf{Y}.\n",
+    "\\end{equation}"
    ]
   },
   {
@@ -931,46 +934,55 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from utilities.classes import PCA, MDS, LR"
+    "pca = PCA(n_components=2)\n",
+    "pca.fit(X_train)\n",
+    "T_train = pca.transform(X_train)\n",
+    "T_test = pca.transform(X_test)\n",
+    "\n",
+    "PTY = np.matmul(T_train.T, T_train) + best_regularization*np.eye(T_train.shape[1])\n",
+    "PTY = np.linalg.pinv(PTY)\n",
+    "PTY = np.matmul(PTY, T_train.T)\n",
+    "PTY = np.matmul(PTY, Y_train)\n",
+    "\n",
+    "Y_pca = np.matmul(T_test, PTY)\n",
+    "plot_regression(Y_test[:,0], Y_pca[:,0], **cmaps)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## PCA"
+    "Look at how bad that is! As one can imagine, including more components gives a better prediction of the properties."
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "pca = PCA(n_PC=n_PC)"
+    "# Next: Combining LR and PCA\n",
+    "\n",
+    "In the next notebook, we show how PCA and LR can be combined into a new model. [Continue on -->](2_PrincipalCovariatesRegression.ipynb)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Calling `pca.fit(X)` computes the covariance $\\mathbf{C}$ of $\\mathbf{X}$ and internally stores the eigenvectors/values."
+    "# Implementation in `scikit-learn`"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "pca.fit(X_train)"
+    "The classes covered in this notebook are available through [scikit-learn](scikit-learn.org), more commonly referred to as `sklearn`. Here we'll review the basics of their usage; readers are encouraged to refer to sklearn documentation for further detail."
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Calling `pca.transform(X)` computes and returns the PCA projection $\\mathbf{T}$. We can also use the eigenvectors from the PCA fit to project new data that was not used to originally construct $\\mathbf{P}_{XT}$"
+    "## PCA"
    ]
   },
   {
@@ -979,8 +991,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "T_test = pca.transform(X_test)\n",
-    "T_train = pca.transform(X_train)"
+    "from sklearn.decomposition import PCA"
    ]
   },
   {
@@ -989,20 +1000,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "fig, axes = plt.subplots(1,2, figsize=dbl_fig)\n",
-    "\n",
-    "plot_projection(Y_train, T_train, title=\"Projection of Training Data\", fig=fig, ax=axes[0], **cmaps)\n",
-    "plot_projection(Y_test, T_test, title=\"Projection of Testing Data\", fig=fig, ax=axes[1], **cmaps)\n",
-    "\n",
-    "fig.tight_layout()\n",
-    "plt.show()"
+    "pca = PCA(n_components=n_PC)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Calling `pca.statistics(X)` outputs the statistics of the projection of $\\mathbf{X}$."
+    "Calling `pca.fit(X)` computes the covariance $\\mathbf{C}$ of $\\mathbf{X}$ and internally stores the eigenvectors/values."
    ]
   },
   {
@@ -1011,33 +1016,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "table_from_dict([pca.statistics(X_train), pca.statistics(X_test)], \n",
-    "                 headers = [\"Training\", \"Testing\"], \n",
-    "                 title=\"PCA\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Predicting Properties with our Low-Dimensional Projection"
+    "pca.fit(X_train)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "It's possible to use the projection determined by PCA to regress properties in a manner similar to linear regression. For this, we determine a weight $\\mathbf{P}_{TY}$ which minimizes\n",
-    "\n",
-    "\\begin{equation}\n",
-    "\\ell = \\lVert\\mathbf{Y}-\\mathbf{T}\\mathbf{P}_{TY}\\rVert^2 + \\lambda \\lVert \\mathbf{P}_{TY} \\rVert^2\n",
-    "\\end{equation}\n",
-    "\n",
-    "Analogous to LR, we find this weight to be:\n",
-    "\n",
-    "\\begin{equation}\n",
-    "\\mathbf{P}_{TY} = (\\mathbf{T}^T \\mathbf{T} + \\lambda \\mathbf{I})^{-1} \\mathbf{T}^T \\mathbf{Y}.\n",
-    "\\end{equation}"
+    "Calling `pca.transform(X)` computes and returns the PCA projection $\\mathbf{T}$. We can also use the eigenvectors from the PCA fit to project new data that was not used to originally construct $\\mathbf{P}_{XT}$"
    ]
   },
   {
@@ -1046,20 +1032,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "PTY = np.matmul(T_train.T, T_train) + best_regularization*np.eye(T_train.shape[1])\n",
-    "PTY = np.linalg.pinv(PTY)\n",
-    "PTY = np.matmul(PTY, T_train.T)\n",
-    "PTY = np.matmul(PTY, Y_train)\n",
-    "\n",
-    "Y_pca = np.matmul(T_test, PTY)\n",
-    "plot_regression(Y_test[:,0], Y_pca[:,0], **cmaps)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Look at how bad that is! As one can imagine, including more components gives a better prediction of the properties."
+    "T_test = pca.transform(X_test)\n",
+    "T_train = pca.transform(X_train)"
    ]
   },
   {
@@ -1068,77 +1042,20 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "larger_npca = [10, 50, 100, 200]\n",
-    "\n",
-    "all_stats = [get_stats(y=Y_test, yp=Y_pca)]\n",
-    "headers = [r\"$n_{PCA}$\"+\"={}\".format(n_PC)]\n",
-    "\n",
-    "for n in larger_npca:\n",
-    "    pca2 = PCA(n_PC=n)\n",
-    "    pca2.fit(X_train)\n",
-    "\n",
-    "    T_train2 = pca2.transform(X_train)\n",
-    "    T_test2 = pca2.transform(X_test)\n",
-    "\n",
-    "    PTY2 = np.matmul(T_train2.T, T_train2) + best_regularization*np.eye(T_train2.shape[1])\n",
-    "    PTY2 = np.linalg.pinv(PTY2)\n",
-    "    PTY2 = np.matmul(PTY2, T_train2.T)\n",
-    "    PTY2 = np.matmul(PTY2, Y_train)\n",
+    "fig, axes = plt.subplots(1,2, figsize=dbl_fig)\n",
     "\n",
-    "    Y_pca2 = np.matmul(T_test2, PTY2)\n",
-    "    \n",
-    "    all_stats.append(get_stats(y=Y_test, yp=Y_pca2))\n",
-    "    headers.append(r\"$n_{PCA}$\"+\"={}\".format(n))\n",
+    "plot_projection(Y_train, T_train, title=\"Projection of Training Data\", fig=fig, ax=axes[0], **cmaps)\n",
+    "plot_projection(Y_test, T_test, title=\"Projection of Testing Data\", fig=fig, ax=axes[1], **cmaps)\n",
     "\n",
-    "table_from_dict(all_stats,\n",
-    "                 headers = headers, \n",
-    "                 title=\"Regression Errors from PCA Projections\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## MDS"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "A helper class is provided to run MDS with a scikit-learn-like syntax"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "mds = MDS(n_MDS=n_MDS)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Calling `mds.fit(X)` computes the Gram matrix $\\mathbf{K}$ from $\\mathbf{X}$ and internally stores the eigenvectors/values."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "mds.fit(X_train)"
+    "fig.tight_layout()\n",
+    "plt.show()"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Calling `mds.transform(X)` computes and returns the MDS projection $\\mathbf{T}$. We also use the eigenvectors from the MDS fit to project new data that was not used to originally construct the MDS."
+    "Calling `pca.inverse_transform(X)` will provide the approximation of $\\mathbf{X}$ from the latent space projection."
    ]
   },
   {
@@ -1147,7 +1064,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "T = mds.transform(X_test)"
+    "Xr_train = pca.inverse_transform(T_train)\n",
+    "Xr_test = pca.inverse_transform(T_test)"
    ]
   },
   {
@@ -1156,17 +1074,21 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "fig, ax = plt.subplots(1,2, figsize=dbl_fig)\n",
-    "plot_projection(Y_test, check_mirrors(T, T_ref), fig=fig, ax=ax[0], title=\"Multidimensional Scaling with the Utility Class\", **cmaps)\n",
-    "plot_projection(Y_test, T_ref, fig=fig, ax=ax[1], title=\"PCA\", **cmaps)\n",
-    "fig.tight_layout()"
+    "table_from_dict([get_stats(x=X_train, y=Y_train, t=T_train, xr=Xr_train), \n",
+    "                 get_stats(x=X_test, y=Y_test, t=T_test, xr=Xr_test)], \n",
+    "                 headers = [\"Training\", \"Testing\"], \n",
+    "                 title=\"PCA\")"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Calling `mds.statistics(X)` outputs the statistics of the projection of $\\mathbf{X}$."
+    "## LR\n",
+    "\n",
+    "Unlike PCA, which takes the number of PCA components in the constructor, the linear regression class takes the regularization as a parameter.\n",
+    "\n",
+    "In `sklearn`, we can use either linear_models.LinearRegression or linear_models.Ridge to compute a linear regression. However, only Ridge will allow the inclusion of the regularization parameter, by definition."
    ]
   },
   {
@@ -1175,18 +1097,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "table_from_dict([mds.statistics(X_train), mds.statistics(X_test)], \n",
-    "                 headers = [\"Training\", \"Testing\"], \n",
-    "                 title=\"MDS\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## LR\n",
-    "\n",
-    "Unlike PCA, which takes the number of PCA components in the constructor, the linear regression class takes the regularization as a parameter."
+    "from sklearn.linear_model import LinearRegression as LR\n",
+    "from sklearn.linear_model import Ridge as Ridge"
    ]
   },
   {
@@ -1195,7 +1107,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "lr = LR(regularization=best_regularization)"
+    "lr = Ridge(alpha=best_regularization)"
    ]
   },
   {
@@ -1218,7 +1130,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Calling `lr.transform(X)` computes and returns the predicted $\\hat{\\mathbf{Y}}$ values from $\\hat{\\mathbf{Y}}_{LR} = \\mathbf{X}\\mathbf{P}_{XY}$."
+    "Calling `lr.predict(X)` computes and returns the predicted $\\hat{\\mathbf{Y}}$ values from $\\hat{\\mathbf{Y}}_{LR} = \\mathbf{X}\\mathbf{P}_{XY}$."
    ]
   },
   {
@@ -1227,8 +1139,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "Y_lr_train = lr.transform(X_train)\n",
-    "Y_lr_test = lr.transform(X_test)"
+    "Y_lr_train = lr.predict(X_train)\n",
+    "Y_lr_test = lr.predict(X_test)"
    ]
   },
   {
@@ -1246,20 +1158,14 @@
     "plt.show()"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Calling `lr.statistics(X,Y)` outputs the statistics of the regression of $\\mathbf{X}$ and $\\mathbf{Y}$."
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "table_from_dict([lr.statistics(X_train, Y_train), lr.statistics(X_test, Y_test)], \n",
+    "table_from_dict([get_stats(x=X_train, y=Y_train, yp = Y_lr_train), \n",
+    "                 get_stats(x=X_test, y=Y_test, yp = Y_lr_test)], \n",
     "                 headers = [\"Training\", \"Testing\"], \n",
     "                 title=\"Linear Regression\")"
    ]
@@ -1294,13 +1200,13 @@
     "frames = ase.io.read(\"../datasets/CSD-1000R.xyz\", \":10\")\n",
     "\n",
     "# We want to have prediction and projection for all the environments\n",
-    "lr = LR(regularization=best_regularization)\n",
+    "lr = Ridge(alpha=best_regularization)\n",
     "lr.fit(X_train, Y_train)\n",
-    "Y_lr_full_scaled = lr.transform(X)\n",
+    "Y_lr_full_scaled = lr.predict(X)\n",
     "# we have to rescale the properties back to be able to compare them to the actual values\n",
     "Y_lr_full = Y_scale * Y_lr_full_scaled + Y_center\n",
     "\n",
-    "pca = PCA(n_PC=n_PC)\n",
+    "pca = PCA(n_components=n_PC)\n",
     "pca.fit(X_train)\n",
     "X_pca_full = pca.transform(X)"
    ]