diff --git a/cpp/include/cones.h b/cpp/include/cones.h
index ddde2d8..f93230c 100644
--- a/cpp/include/cones.h
+++ b/cpp/include/cones.h
@@ -16,6 +16,20 @@ class Cone {
       : type(type), sizes(sizes){};
 };
 
+/* Project `x` onto the primal or dual cone.
+ *
+ *  Args:
+ *    x:     The point at which to evaluate the derivative
+ *    cones: A list of cones; the cone on which to project is the cartesian
+ *           product of these cones
+ *    dual:  whether to project onto the dual cone
+ *
+ *  Returns:
+ *    A Vector with the projection.
+ */
+Vector projection(const Vector &x, const std::vector<Cone> &cones,
+                  bool dual);
+
 /* Compute the derivative, at `x`, of a projection onto a cone.
  *
  *  Args:
diff --git a/cpp/include/deriv.h b/cpp/include/deriv.h
index 4a512c2..29f1742 100644
--- a/cpp/include/deriv.h
+++ b/cpp/include/deriv.h
@@ -3,6 +3,13 @@
 #include "cones.h"
 #include "eigen_includes.h"
 #include "linop.h"
+#include "lsqr.h"
+
+LinearOperator dpi(const Vector &u, const Vector &v, double w,
+                   const std::vector<Cone> &cones);
+
+Matrix dpi_dense(const Vector &u, const Vector &v, double w,
+                 const std::vector<Cone> &cones);
 
 LinearOperator M_operator(const SparseMatrix &Q, const std::vector<Cone> &cones,
                           const Vector &u, const Vector &v, double w);
@@ -10,6 +17,11 @@ LinearOperator M_operator(const SparseMatrix &Q, const std::vector<Cone> &cones,
 Matrix M_dense(const Matrix &Q, const std::vector<Cone> &cones, const Vector &u,
                const Vector &v, double w);
 
+LsqrResult _solve_adjoint_derivative_lsqr(
+    const SparseMatrix &Q, const std::vector<Cone> &cones,
+    const Vector &u, const Vector &v, double w, const Vector &dz);
+
+
 // this function releases the GIL.
 Vector _solve_derivative_dense(const Matrix &M, const Matrix &MT,
                                const Vector &rhs);
diff --git a/cpp/include/eigen_includes.h b/cpp/include/eigen_includes.h
index 9e6d095..d6ca6b2 100644
--- a/cpp/include/eigen_includes.h
+++ b/cpp/include/eigen_includes.h
@@ -4,6 +4,7 @@
 #include "Eigen/Sparse"
 
 using Vector = Eigen::VectorXd;
+using VectorRef = Eigen::Ref<Eigen::VectorXd>;
 using Array = Eigen::Array<double, Eigen::Dynamic, 1>;
 using Matrix = Eigen::MatrixXd;
 using MatrixRef = Eigen::Ref<Eigen::MatrixXd>;
diff --git a/cpp/include/residual.h b/cpp/include/residual.h
new file mode 100644
index 0000000..57e3482
--- /dev/null
+++ b/cpp/include/residual.h
@@ -0,0 +1,81 @@
+#pragma once
+
+#include "cones.h"
+#include "eigen_includes.h"
+#include "linop.h"
+#include "lsqr.h"
+#include "deriv.h"
+
+std::tuple<Vector, Vector> proj_pq(const std::vector<Cone> &cones,
+                                   const Vector &u, const Vector &v, double w);
+
+/**
+ * Refines the solution to a cone program by using LSQR to improve
+ * the normalized residual map:
+ * https://stanford.edu/~boyd/papers/pdf/cone_prog_refine.pdf
+ */
+Vector refine(const SparseMatrix &Q, const std::vector<Cone> &cones,
+              const Vector &u, const Vector &v, double w,
+              const int n_iter,
+              const double lsqr_lambda=1e-8,
+              const double lsqr_iter_lim=30,
+              const int alpha_K=10);
+
+
+/**
+ * Compute the value of the residual map at a point z = (u, v, w)
+ * for a cone program defined by a sparse Q.
+ */
+Vector residual(const SparseMatrix &Q, const std::vector<Cone> &cones,
+                const Vector &u, const Vector &v, double w);
+
+/**
+ * Compute the value of the normalized residual map at a point z = (u, v, w)
+ * for a cone program defined by a sparse Q.
+ */
+Vector N_residual(const SparseMatrix &Q, const std::vector<Cone> &cones,
+                  const Vector &u, const Vector &v, double w);
+
+/**
+ * Compute the value of the residual map at a point z = (u, v, w)
+ * for a cone program defined by a dense Q.
+ */
+Vector residual_dense(const Matrix &Q, const std::vector<Cone> &cones,
+                      const Vector &u, const Vector &v, double w);
+
+/**
+ * Compute the value of the normalized residual map at a point z = (u, v, w)
+ * for a cone program defined by a sparse Q.
+ */
+Vector N_residual_dense(const Matrix &Q, const std::vector<Cone> &cones,
+                        const Vector &u, const Vector &v, double w);
+
+/**
+ * The derivative of the normalized residual map at a point z = (u, v, w)
+ * for a cone program defined by a sparse Q.
+ */
+LinearOperator DN(const SparseMatrix &Q, const std::vector<Cone> &cones,
+                  const Vector &u, const Vector &v, double w);
+
+/**
+ * The derivative of the normalized residual map at a point z = (u, v, w)
+ * for a cone program defined by a dense Q.
+ */
+Matrix DN_dense(const Matrix &Q, const std::vector<Cone> &cones, const Vector &u,
+                const Vector &v, double w);
+
+/**
+ * Left-multiply a point dz = (du, dv, dw) by the derivative of the
+ * normalized residual map.
+ */
+Vector apply_DN_matvec(const SparseMatrix &Q, const std::vector<Cone> &cones,
+                       const Vector &u, const Vector &v, double w,
+                       const Vector &du, const Vector &dv, double dw);
+
+/**
+ * Right-multiply a point dz = (du, dv, dw) by the derivative of the
+ * normalized residual map.
+ */
+Vector apply_DN_rmatvec(const SparseMatrix &Q, const std::vector<Cone> &cones,
+                        const Vector &u, const Vector &v, double w,
+                        const Vector &du, const Vector &dv, double dw);
diff --git a/cpp/src/cones.cpp b/cpp/src/cones.cpp
index 25fbe9b..3ebed4e 100644
--- a/cpp/src/cones.cpp
+++ b/cpp/src/cones.cpp
@@ -14,6 +14,55 @@
 const double EulerConstant = std::exp(1.0);
 const double sqrt_two = std::sqrt(2.0);
 
+int vectorized_psd_size(int n) { return n * (n + 1) / 2; }
+
+
+bool in_exp(const Eigen::Vector3d &x) {
+  return (x[0] <= 0 && std::abs(x[1]) <= CONE_THRESH && x[2] >= 0) ||
+         (x[1] > 0 && x[1] * exp(x[0] / x[1]) - x[2] <= CONE_THRESH);
+}
+
+bool in_exp_dual(const Eigen::Vector3d &x) {
+  return (std::abs(x[0]) <= CONE_THRESH && x[1] >= 0 && x[2] >= 0) ||
+         (x[0] < 0 &&
+          -x[0] * exp(x[1] / x[0]) - EulerConstant * x[2] <= CONE_THRESH);
+}
+
+Vector lower_triangular_from_matrix(const Matrix &matrix) {
+  int n = matrix.rows();
+  Vector lower_tri = Vector::Zero(vectorized_psd_size(n));
+  int offset = 0;
+  for (int col = 0; col < n; ++col) {
+    for (int row = col; row < n; ++row) {
+      if (row != col) {
+        lower_tri[offset] = matrix(row, col) * sqrt_two;
+      } else {
+        lower_tri[offset] = matrix(row, col);
+      }
+      ++offset;
+    }
+  }
+  return lower_tri;
+}
+
+Matrix matrix_from_lower_triangular(const Vector &lower_tri) {
+  int n = static_cast<int>(std::sqrt(2 * lower_tri.size()));
+  Matrix matrix = Matrix::Zero(n, n);
+  int offset = 0;
+  for (int col = 0; col < n; ++col) {
+    for (int row = col; row < n; ++row) {
+      if (row != col) {
+        matrix(row, col) = lower_tri[offset] / sqrt_two;
+        matrix(col, row) = lower_tri[offset] / sqrt_two;
+      } else {
+        matrix(row, col) = lower_tri[offset];
+      }
+      ++offset;
+    }
+  }
+  return matrix;
+}
+
 double exp_newton_one_d(double rho, double y_hat, double z_hat) {
   double t = std::max(-z_hat, 1e-6);
   double f, fp;
@@ -115,52 +164,91 @@ Eigen::Vector3d project_exp_cone(const Eigen::Vector3d &x) {
   return projection;
 }
 
-bool in_exp(const Eigen::Vector3d &x) {
-  return (x[0] <= 0 && std::abs(x[1]) <= CONE_THRESH && x[2] >= 0) ||
-         (x[1] > 0 && x[1] * exp(x[0] / x[1]) - x[2] <= CONE_THRESH);
-}
 
-bool in_exp_dual(const Eigen::Vector3d &x) {
-  return (std::abs(x[0]) <= CONE_THRESH && x[1] >= 0 && x[2] >= 0) ||
-         (x[0] < 0 &&
-          -x[0] * exp(x[1] / x[0]) - EulerConstant * x[2] <= CONE_THRESH);
-}
-
-int vectorized_psd_size(int n) { return n * (n + 1) / 2; }
-
-Vector lower_triangular_from_matrix(const Matrix &matrix) {
-  int n = matrix.rows();
-  Vector lower_tri = Vector::Zero(vectorized_psd_size(n));
-  int offset = 0;
-  for (int col = 0; col < n; ++col) {
-    for (int row = col; row < n; ++row) {
-      if (row != col) {
-        lower_tri[offset] = matrix(row, col) * sqrt_two;
-      } else {
-        lower_tri[offset] = matrix(row, col);
-      }
-      ++offset;
+void _projection(VectorRef &y, const Vector &x,
+                 ConeType type, bool dual) {
+  if (type == ZERO) {
+    if (dual) {
+      y << x;
+    } else {
+      /* segment is already zero */
+    }
+  } else if (type == POS) {
+    y << x.cwiseMax(0.);
+  } else if (type == SOC) {
+    double t = x[0];
+    const Vector z = x.segment(1, x.rows()-1);
+    double norm_z = z.norm();
+    if (norm_z <= t) {
+      y<< x;
+    } else if (norm_z <= -t) {
+      /* segment is already zero */
+    } else {
+      float c = 0.5 * (1. + t/norm_z);
+      VectorRef y_z = y.segment(1, y.rows()-1);
+      y[0] = c * norm_z;
+      y_z << c * z;
     }
+  } else if (type == PSD) {
+    const Matrix &X = matrix_from_lower_triangular(x);
+    Eigen::SelfAdjointEigenSolver<Matrix> eigen_solver(X.rows());
+    eigen_solver.compute(X);
+    const Vector &eigenvalues = eigen_solver.eigenvalues();
+    const Matrix &Q = eigen_solver.eigenvectors();
+    const Vector &clamped_eigenvalues = eigenvalues.cwiseMax(0.);
+    Matrix result = Q * clamped_eigenvalues.asDiagonal() * Q.transpose();
+    y << lower_triangular_from_matrix(result);
+  } else if (type == EXP) {
+    int num_cones = x.size() / 3;
+    int offset = 0;
+    for (int i = 0; i < num_cones; ++i) {
+        Eigen::Vector3d x_i;
+        if (dual) {
+          x_i = -1. * x.segment(offset, 3);
+        } else {
+          x_i = x.segment(offset, 3);
+        }
+        y.segment(offset, 3) << project_exp_cone(x_i);
+        offset += 3;
+    }
+    // via Moreau: Pi_K*(x) = x + Pi_K(-x)
+    if (dual) {
+        y += x;
+    }
+  } else if (type == EXP_DUAL) {
+    _projection(y, x, EXP, !dual);
+  } else {
+    throw std::invalid_argument("Cone not supported.");
   }
-  return lower_tri;
 }
 
-Matrix matrix_from_lower_triangular(const Vector &lower_tri) {
-  int n = static_cast<int>(std::sqrt(2 * lower_tri.size()));
-  Matrix matrix = Matrix::Zero(n, n);
+
+Vector projection(const Vector &x, const std::vector<Cone> &cones,
+                  bool dual) {
+  Vector y = Vector::Zero(x.rows());
+
   int offset = 0;
-  for (int col = 0; col < n; ++col) {
-    for (int row = col; row < n; ++row) {
-      if (row != col) {
-        matrix(row, col) = lower_tri[offset] / sqrt_two;
-        matrix(col, row) = lower_tri[offset] / sqrt_two;
-      } else {
-        matrix(row, col) = lower_tri[offset];
+  for (const Cone &cone : cones) {
+    const ConeType &type = cone.type;
+    const std::vector<int> &sizes = cone.sizes;
+    if (std::accumulate(sizes.begin(), sizes.end(), 0) == 0) {
+      continue;
+    }
+    for (int size : sizes) {
+      if (type == PSD) {
+        size = vectorized_psd_size(size);
+      } else if (type == EXP) {
+        size *= 3;
+      } else if (type == EXP_DUAL) {
+        size *= 3;
       }
-      ++offset;
+      VectorRef y_segment = y.segment(offset, size);
+      _projection(y_segment, x.segment(offset, size), type, dual);
+      offset += size;
     }
   }
-  return matrix;
+
+  return y;
 }
 
 LinearOperator _dprojection_exp(const Vector &x, bool dual) {
diff --git a/cpp/src/deriv.cpp b/cpp/src/deriv.cpp
index b92cfeb..d168a88 100644
--- a/cpp/src/deriv.cpp
+++ b/cpp/src/deriv.cpp
@@ -50,12 +50,22 @@ Matrix M_dense(const Matrix &Q, const std::vector<Cone> &cones, const Vector &u,
   return (Q - eye) * dpi_dense(u, v, w, cones) + eye;
 }
 
+
 Vector _solve_derivative_dense(const Matrix &M, const Matrix &MT,
                                const Vector &rhs) {
   // TODO: Factorization could be cached to optimize multiple calls
   return (MT * M).ldlt().solve(MT * rhs);
 }
 
+LsqrResult _solve_adjoint_derivative_lsqr(
+    const SparseMatrix &Q, const std::vector<Cone> &cones,
+    const Vector &u, const Vector &v, double w, const Vector &dz) {
+  LinearOperator M = M_operator(Q, cones, u, v, w);
+  LinearOperator MT = M.transpose();
+  LsqrResult result = lsqr(MT, dz);
+  return result;
+}
+
 Vector _solve_adjoint_derivative_dense(const Matrix &M, const Matrix &MT,
                                        const Vector &dz) {
   // TODO: Factorization could be cached to optimize multiple calls
diff --git a/cpp/src/residual.cpp b/cpp/src/residual.cpp
new file mode 100644
index 0000000..ca01b3f
--- /dev/null
+++ b/cpp/src/residual.cpp
@@ -0,0 +1,170 @@
+#include "residual.h"
+#include "lsqr.h"
+#include "deriv.h"
+#include <tuple>
+#include <iostream>
+
+inline double gt(double x, double t) {
+  if (x >= t) {
+    return 1.0;
+  } else {
+    return 0.0;
+  }
+}
+
+std::tuple<Vector, Vector> proj_pq(const std::vector<Cone> &cones,
+          const Vector &u, const Vector &v, double w) {
+  int n = u.size();
+  int m = v.size();
+  int N = n + m + 1;
+  Vector z = Vector::Zero(N);
+  z << u, v, w;
+  Vector p = Vector(N);
+  p << u, projection(v, cones, true), std::max(w, 0.0);
+  Vector q = p - z;
+  return {p, q};
+}
+
+Vector residual(const SparseMatrix &Q, const std::vector<Cone> &cones,
+                const Vector &u, const Vector &v, double w) {
+  auto [p, q] = proj_pq(cones, u, v, w);
+  return Q*p - q;
+}
+
+Vector N_residual(const SparseMatrix &Q, const std::vector<Cone> &cones,
+                  const Vector &u, const Vector &v, double w) {
+  return residual(Q, cones, u, v, w) / w;
+}
+
+Vector residual_dense(const Matrix &Q, const std::vector<Cone> &cones,
+                      const Vector &u, const Vector &v, double w) {
+  auto [p, q] = proj_pq(cones, u, v, w);
+  return Q*p - q;
+}
+
+Vector N_residual_dense(const Matrix &Q, const std::vector<Cone> &cones,
+                        const Vector &u, const Vector &v, double w) {
+  return residual_dense(Q, cones, u, v, w) / w;
+}
+
+
+Vector apply_DN_matvec(const SparseMatrix &Q, const std::vector<Cone> &cones,
+                       const Vector &u, const Vector &v, double w,
+                       const Vector &du, const Vector &dv, double dw) {
+  int N = u.size() + v.size() + 1;
+  LinearOperator dpi_op = dpi(u, v, w, cones);
+
+  Vector dz = Vector::Zero(N);
+  dz << du, dv, dw;
+
+  Vector dpi_mv = dpi_op.matvec(dz);
+  Vector result = (Q*dpi_mv - dpi_mv + dz) / abs(w);
+  float dz_w = dz[N-1];
+  Vector last_row_offset = gt(w, 0.0) * residual(Q, cones, u, v, w) / (w*w);
+  result -= dz_w * last_row_offset;
+  return result;
+}
+
+Vector apply_DN_rmatvec(const SparseMatrix &Q, const std::vector<Cone> &cones,
+                        const Vector &u, const Vector &v, double w,
+                        const Vector &du, const Vector &dv, double dw) {
+  int N = u.size() + v.size() + 1;
+  LinearOperator dpi_op = dpi(u, v, w, cones);
+
+  Vector dz = Vector::Zero(N);
+  dz << du, dv, dw;
+
+  Vector t = Q.transpose() * dz - dz;
+  Vector dpi_t_rmv = dpi_op.rmatvec(t);
+  Vector result = (dpi_t_rmv + dz) / abs(w);
+  Vector last_row_offset = gt(w, 0.0) * residual(Q, cones, u, v, w) / (w*w);
+  result[N-1] -= last_row_offset.dot(dz);
+  return result;
+}
+
+
+LinearOperator DN(const SparseMatrix &Q, const std::vector<Cone> &cones,
+                  const Vector &u, const Vector &v, double w) {
+  int N = u.size() + v.size() + 1;
+  LinearOperator dpi_op = dpi(u, v, w, cones);
+
+  // TODO: Potentially remove duplicated code applying the matvec/rmatvec,
+  // but keeping for now to cache dpi_op.
+  const VecFn matvec = [Q,cones,u,v,w,N,dpi_op](const Vector &dz) -> Vector {
+    Vector dpi_mv = dpi_op.matvec(dz);
+    Vector result = (Q*dpi_mv - dpi_mv + dz) / abs(w);
+    float dz_w = dz[N-1];
+    Vector last_row_offset = gt(w, 0.0) * residual(Q, cones, u, v, w) / (w*w);
+    result -= dz_w * last_row_offset;
+    return result;
+  };
+  const VecFn rmatvec = [Q,cones,u,v,w,N,dpi_op](const Vector &dz) -> Vector {
+    Vector t = Q.transpose() * dz - dz;
+    Vector dpi_t_rmv = dpi_op.rmatvec(t);
+    Vector result = (dpi_t_rmv + dz) / abs(w);
+    Vector last_row_offset = gt(w, 0.0) * residual(Q, cones, u, v, w) / (w*w);
+    result[N-1] -= last_row_offset.dot(dz);
+    return result;
+  };
+  return LinearOperator(N, N, matvec, rmatvec);
+}
+
+Matrix DN_dense(const Matrix &Q, const std::vector<Cone> &cones,
+                const Vector &u, const Vector &v, double w) {
+  int n = u.size();
+  int m = v.size();
+  int N = n + m + 1;
+  Matrix eye = Matrix::Identity(N, N);
+  Matrix DN = (Q - eye) * dpi_dense(u, v, w, cones) + eye;
+  DN /= abs(w);
+
+  Vector last_row_offset = gt(w, 0.0) * residual_dense(Q, cones, u, v, w) / (w*w);
+  DN.col(N-1) -= last_row_offset;
+  return DN;
+}
+
+Vector refine(const SparseMatrix &Q, const std::vector<Cone> &cones,
+              const Vector &in_u, const Vector &in_v, double in_w,
+              const int n_iter,
+              const double lambda,
+              const double lsqr_iter_lim,
+              const int alpha_K) {
+  int n = in_u.size();
+  int m = in_v.size();
+  int N = n + m + 1;
+
+  Vector z = Vector(N);
+  z << in_u, in_v, in_w;
+
+  for (int i = 0; i < n_iter; ++i) {
+    Vector z_u = z.segment(0, n);
+    Vector z_v = z.segment(n, m);
+    double z_w = z[N-1];
+    Vector res = N_residual(Q, cones, z_u, z_v, z_w);
+    double res_norm = res.norm();
+
+    LinearOperator DN_op = DN(Q, cones, z_u, z_v, z_w);
+    double lsqr_atol = 1e-8;
+    double lsqr_btol = 1e-8;
+    LsqrResult result = lsqr(DN_op, res, lambda, lsqr_atol, lsqr_btol,
+                             1e8, lsqr_iter_lim);
+    Vector step = result.x;
+
+    int alpha_k = 0;
+    double next_norm = res_norm + 1.;
+    Vector next_z;
+    while ((next_norm > res_norm) & (alpha_k < alpha_K)) {
+      double alpha = 1./((double) (2 << alpha_k));
+      next_z = z - alpha * step;
+      Vector next_z_u = next_z.segment(0, n);
+      Vector next_z_v = next_z.segment(n, m);
+      double next_z_w = next_z[N-1];
+      Vector next_res = N_residual(Q, cones, next_z_u, next_z_v, next_z_w);
+      next_norm = next_res.norm();
+      alpha_k += 1;
+    }
+    z << next_z;
+  }
+
+  return z;
+}
diff --git a/cpp/src/wrapper.cpp b/cpp/src/wrapper.cpp
index cefc691..079748e 100644
--- a/cpp/src/wrapper.cpp
+++ b/cpp/src/wrapper.cpp
@@ -6,6 +6,7 @@
 #include "deriv.h"
 #include "linop.h"
 #include "lsqr.h"
+#include "residual.h"
 
 namespace py = pybind11;
 
@@ -46,10 +47,26 @@ PYBIND11_MODULE(_diffcp, m) {
   m.def("M_dense", &M_dense, py::call_guard<py::gil_scoped_release>());
   m.def("_solve_derivative_dense", &_solve_derivative_dense, py::call_guard<py::gil_scoped_release>());
   m.def("_solve_adjoint_derivative_dense", &_solve_adjoint_derivative_dense, py::call_guard<py::gil_scoped_release>());
+  m.def("_solve_adjoint_derivative_lsqr", &_solve_adjoint_derivative_lsqr, py::call_guard<py::gil_scoped_release>());
 
-  m.def("dprojection", &dprojection);
-  m.def("dprojection_dense", &dprojection_dense);
+  m.def("projection", &projection, py::call_guard<py::gil_scoped_release>());
+  m.def("dprojection", &dprojection, py::call_guard<py::gil_scoped_release>());
+  m.def("dprojection_dense", &dprojection_dense, py::call_guard<py::gil_scoped_release>());
+  m.def("dpi", &dpi, py::call_guard<py::gil_scoped_release>());
+  m.def("dpi_dense", &dpi_dense, py::call_guard<py::gil_scoped_release>());
   m.def("project_exp_cone", &project_exp_cone);
   m.def("in_exp", &in_exp);
   m.def("in_exp_dual", &in_exp_dual);
+
+  // residual.h
+  m.def("proj_pq", &proj_pq, py::call_guard<py::gil_scoped_release>());
+  m.def("residual", &residual, py::call_guard<py::gil_scoped_release>());
+  m.def("N_residual", &N_residual, py::call_guard<py::gil_scoped_release>());
+  m.def("residual_dense", &residual_dense, py::call_guard<py::gil_scoped_release>());
+  m.def("N_residual_dense", &N_residual_dense, py::call_guard<py::gil_scoped_release>());
+  m.def("DN", &DN, py::call_guard<py::gil_scoped_release>());
+  m.def("apply_DN_matvec", &apply_DN_matvec, py::call_guard<py::gil_scoped_release>());
+  m.def("apply_DN_rmatvec", &apply_DN_rmatvec, py::call_guard<py::gil_scoped_release>());
+  m.def("DN_dense", &DN_dense, py::call_guard<py::gil_scoped_release>());
+  m.def("refine", &refine, py::call_guard<py::gil_scoped_release>());
 }
diff --git a/diffcp/cone_program.py b/diffcp/cone_program.py
index ead7907..134f73c 100644
--- a/diffcp/cone_program.py
+++ b/diffcp/cone_program.py
@@ -217,7 +217,6 @@ def solve_and_derivative(A, b, c, cone_dict, warm_start=None, mode='lsqr', **kwa
     DT = result["DT"]
     return x, y, s, D, DT
 
-
 def solve_and_derivative_internal(A, b, c, cone_dict, warm_start=None,
                                   mode='lsqr', raise_on_error=True, **kwargs):
     if mode not in ["dense", "lsqr"]:
@@ -281,23 +280,6 @@ def solve_and_derivative_internal(A, b, c, cone_dict, warm_start=None,
     z = (x, y - s, np.array([1]))
     u, v, w = z
 
-    Q = sparse.bmat([
-        [None, A.T, np.expand_dims(c, - 1)],
-        [-A, None, np.expand_dims(b, -1)],
-        [-np.expand_dims(c, -1).T, -np.expand_dims(b, -1).T, None]
-    ])
-
-    D_proj_dual_cone = _diffcp.dprojection(v, cones_parsed, True)
-    if mode == "dense":
-        Q_dense = Q.todense()
-        M = _diffcp.M_dense(Q_dense, cones_parsed, u, v, w)
-        MT = M.T
-    else:
-        M = _diffcp.M_operator(Q, cones_parsed, u, v, w)
-        MT = M.transpose()
-
-    pi_z = pi(z, cones)
-
     def derivative(dA, db, dc, **kwargs):
         """Applies derivative at (A, b, c) to perturbations dA, db, dc
         Args:
@@ -309,6 +291,23 @@ def derivative(dA, db, dc, **kwargs):
            NumPy arrays dx, dy, ds, the result of applying the derivative
            to the perturbations.
         """
+        Q = sparse.bmat([
+            [None, A.T, np.expand_dims(c, - 1)],
+            [-A, None, np.expand_dims(b, -1)],
+            [-np.expand_dims(c, -1).T, -np.expand_dims(b, -1).T, None]
+        ])
+
+        D_proj_dual_cone = _diffcp.dprojection(v, cones_parsed, True)
+        if mode == "dense":
+            Q_dense = Q.todense()
+            M = _diffcp.M_dense(Q_dense, cones_parsed, u, v, w)
+            MT = M.T
+
+        pi_z = pi(z, cones)
+
+        M = _diffcp.M_operator(Q, cones_parsed, u, v, w)
+        MT = M.transpose()
+
         dQ = sparse.bmat([
             [None, dA.T, np.expand_dims(dc, - 1)],
             [-dA, None, np.expand_dims(db, -1)],
@@ -338,6 +337,20 @@ def adjoint_derivative(dx, dy, ds, **kwargs):
             (`dA`, `db`, `dc`), the result of applying the adjoint to the
             perturbations; the sparsity pattern of `dA` matches that of `A`.
         """
+        Q = sparse.bmat([
+            [None, A.T, np.expand_dims(c, - 1)],
+            [-A, None, np.expand_dims(b, -1)],
+            [-np.expand_dims(c, -1).T, -np.expand_dims(b, -1).T, None]
+        ])
+
+        D_proj_dual_cone = _diffcp.dprojection(v, cones_parsed, True)
+        if mode == "dense":
+            Q_dense = Q.todense()
+            M = _diffcp.M_dense(Q_dense, cones_parsed, u, v, w)
+            MT = M.T
+
+        pi_z = pi(z, cones)
+
         dw = -(x @ dx + y @ dy + s @ ds)
         dz = np.concatenate(
             [dx, D_proj_dual_cone.rmatvec(dy + ds) - ds, np.array([dw])])
@@ -347,7 +360,9 @@ def adjoint_derivative(dx, dy, ds, **kwargs):
         elif mode == "dense":
             r = _diffcp._solve_adjoint_derivative_dense(M, MT, dz)
         else:
-            r = _diffcp.lsqr(MT, dz).solution
+            lsqr_result = _diffcp._solve_adjoint_derivative_lsqr(
+                Q, cones_parsed, u, v, w, dz)
+            r = lsqr_result.solution
 
         values = pi_z[cols] * r[rows + n] - pi_z[n + rows] * r[cols]
         dA = sparse.csc_matrix((values, (rows, cols)), shape=A.shape)
diff --git a/diffcp/cones.py b/diffcp/cones.py
index 410b241..0457296 100644
--- a/diffcp/cones.py
+++ b/diffcp/cones.py
@@ -3,7 +3,7 @@
 import scipy.sparse.linalg as splinalg
 import warnings
 
-from _diffcp import dprojection, project_exp_cone, Cone, ConeType
+from _diffcp import projection, dprojection, project_exp_cone, Cone, ConeType
 
 ZERO = "f"
 POS = "l"
@@ -118,7 +118,7 @@ def _proj(x, cone, dual=False):
         raise NotImplementedError("%s not implemented" % cone)
 
 
-def pi(x, cones, dual=False):
+def pi_python(x, cones, dual=False):
     """Projects x onto product of cones (or their duals)
     Args:
         x: NumPy array (with PSD data formatted in SCS convention)
@@ -142,3 +142,16 @@ def pi(x, cones, dual=False):
                 x[offset:offset + dim], cone, dual=dual)
             offset += dim
     return projection
+
+def pi(x, cones, dual=False):
+    """Projects x onto product of cones (or their duals)
+    Args:
+        x: NumPy array (with PSD data formatted in SCS convention)
+        cones: list of (cone name, size)
+        dual: whether to project onto the dual cone
+    Returns:
+        NumPy array that is the projection of `x` onto the (dual) cones
+    """
+
+    cone_list_cpp = parse_cone_dict_cpp(cones)
+    return projection(x, cone_list_cpp, dual)
diff --git a/prof.py b/prof.py
index a71f7a8..92f32d5 100644
--- a/prof.py
+++ b/prof.py
@@ -8,33 +8,50 @@
 import diffcp.cones as cone_lib
 import diffcp.utils as utils
 
-
+modes = ['lsqr']
+# modes = ['lsqr', 'dense']
+n_batch = 256
 m = 100
 n = 50
+n_trial = 10
+eps = 1e-4
+
+data = []
+for i in range(n_batch):
+    data.append(utils.least_squares_eq_scs_data(m, n, seed=i))
+A, b, c, cone_dims = zip(*data)
+
+print('mode forward_time adjoint_time adjoint_derivative_time')
 
-A, b, c, cone_dims = utils.least_squares_eq_scs_data(m, n)
-for mode in ["lsqr", "dense"]:
-    x, y, s, derivative, adjoint_derivative = cone_prog.solve_and_derivative(
-        A, b, c, cone_dims, eps=1e-10, mode=mode)
+for mode in modes:
+    forward_time = 0.0
+    for i in range(n_trial):
+        tic = time.time()
+        x, y, s, derivative, adjoint_derivative = cone_prog.solve_and_derivative_batch(
+            A, b, c, cone_dims, eps=eps, mode=mode)
+        toc = time.time()
+        forward_time += (toc - tic) / float(n_trial)
 
-    dA = utils.get_random_like(
-        A, lambda n: np.random.normal(0, 1e-2, size=n))
-    db = np.random.normal(0, 1e-2, size=b.size)
-    dc = np.random.normal(0, 1e-2, size=c.size)
+    dA, db, dc = [], [], []
+    for i in range(n_batch):
+        dA.append(utils.get_random_like(
+            A[0], lambda n: np.random.normal(0, 1e-2, size=n)))
+        db.append(np.random.normal(0, 1e-2, size=b[0].size))
+        dc.append(np.random.normal(0, 1e-2, size=c[0].size))
 
     derivative_time = 0.0
-    for _ in range(10):
+    for _ in range(n_trial):
         tic = time.time()
         dx, dy, ds = derivative(dA, db, dc)
         toc = time.time()
-        derivative_time += (toc - tic) / 10
+        derivative_time += (toc - tic) / float(n_trial)
 
     adjoint_derivative_time = 0.0
-    for _ in range(10):
+    for _ in range(n_trial):
         tic = time.time()
         dA, db, dc = adjoint_derivative(
-            c, np.zeros(y.size), np.zeros(s.size))
+            c, [np.zeros(y[0].size)] * n_batch, [np.zeros(s[0].size)] * n_batch)
         toc = time.time()
-        adjoint_derivative_time += (toc - tic) / 10
+        adjoint_derivative_time += (toc - tic) / float(n_trial)
 
-    print(mode, derivative_time, adjoint_derivative_time)
+    print(mode, forward_time, derivative_time, adjoint_derivative_time)