diff --git a/include/gauxc/exceptions.hpp b/include/gauxc/exceptions.hpp
index 42b6b7fc..ac16bfbd 100644
--- a/include/gauxc/exceptions.hpp
+++ b/include/gauxc/exceptions.hpp
@@ -34,7 +34,7 @@ class magma_exception;
 class cutlass_exception;
 #endif
 
-/// C++ Excpetion for genertic GauXC errors
+/// C++ Exception for generic GauXC errors
 class generic_gauxc_exception : public std::exception {
 
   std::string file_;
diff --git a/src/xc_integrator/integrator_util/exx_screening.cxx b/src/xc_integrator/integrator_util/exx_screening.cxx
index dfbfea06..ccd716eb 100644
--- a/src/xc_integrator/integrator_util/exx_screening.cxx
+++ b/src/xc_integrator/integrator_util/exx_screening.cxx
@@ -255,7 +255,7 @@ void exx_ek_screening(
   device_data.reset_allocations();
   device_data.allocate_static_data_exx_ek_screening( ntasks, nbf, nshells, 
     shpairs.npairs(), basis_map.max_l() );
-  device_data.send_static_data_density_basis( P_abs, ldp, basis );
+  device_data.send_static_data_density_basis( P_abs, ldp, nullptr, 0, nullptr, 0, nullptr, 0,  basis );
   device_data.send_static_data_exx_ek_screening( V_shell_max, ldv, basis_map,
     shpairs );
 
diff --git a/src/xc_integrator/local_work_driver/device/common/device_blas.hpp b/src/xc_integrator/local_work_driver/device/common/device_blas.hpp
index 2ef12537..e590bb90 100644
--- a/src/xc_integrator/local_work_driver/device/common/device_blas.hpp
+++ b/src/xc_integrator/local_work_driver/device/common/device_blas.hpp
@@ -63,5 +63,6 @@ void syr2k( device_blas_handle handle,
             int M, int K, T ALPHA, 
             const T* A, int LDA, const T* B, int LDB,
             T BETA, T* C, int LDC );
+
 }
 
diff --git a/src/xc_integrator/local_work_driver/device/common/uvvars.hpp b/src/xc_integrator/local_work_driver/device/common/uvvars.hpp
index 3bc7a447..78ce5324 100644
--- a/src/xc_integrator/local_work_driver/device/common/uvvars.hpp
+++ b/src/xc_integrator/local_work_driver/device/common/uvvars.hpp
@@ -7,20 +7,23 @@
  */
 #pragma once
 #include "device/xc_device_task.hpp"
+#include "device/xc_device_data.hpp"
 #include "device/device_queue.hpp"
 
 namespace GauXC {
 
-void eval_uvvars_lda( size_t ntasks, int32_t nbe_max, int32_t npts_max,
+
+void eval_uvars_lda( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme,
   XCDeviceTask* device_tasks, device_queue queue );
 
-void eval_uvvars_gga( size_t ntasks, size_t npts_total, int32_t nbe_max, 
-  int32_t npts_max, XCDeviceTask* device_tasks, const double* denx, 
-  const double* deny, const double* denz, double* gamma, device_queue queue );
+void eval_uvars_gga( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme,
+  XCDeviceTask* device_tasks, device_queue queue );
 
-void eval_uvvars_mgga( size_t ntasks, size_t npts_total, int32_t nbe_max, 
-  int32_t npts_max, XCDeviceTask* device_tasks, const double* denx, 
-  const double* deny, const double* denz, double* gamma, bool do_lapl,
+void eval_uvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max, 
+  int32_t npts_max, bool do_lapl, XCDeviceTask* device_tasks, 
   device_queue queue );
 
+void eval_vvar( size_t ntasks, int32_t nbf_max, int32_t npts_max, bool do_grad, density_id den_select,
+  XCDeviceTask* device_tasks, device_queue queue );
+
 }
diff --git a/src/xc_integrator/local_work_driver/device/common/zmat_vxc.hpp b/src/xc_integrator/local_work_driver/device/common/zmat_vxc.hpp
index 2900f42b..514b6d38 100644
--- a/src/xc_integrator/local_work_driver/device/common/zmat_vxc.hpp
+++ b/src/xc_integrator/local_work_driver/device/common/zmat_vxc.hpp
@@ -6,6 +6,7 @@
  * See LICENSE.txt for details
  */
 #include "device/xc_device_task.hpp"
+#include "device/xc_device_data.hpp"
 #include "device/device_queue.hpp"
 
 namespace GauXC {
@@ -14,12 +15,16 @@ void zmat_lda_vxc( size_t        ntasks,
                    int32_t       max_nbf,
                    int32_t       max_npts,
                    XCDeviceTask* tasks_device,
+                   integrator_ks_scheme s,
+                   density_id sel,
                    device_queue queue );
 
 void zmat_gga_vxc( size_t        ntasks,
                    int32_t       max_nbf,
                    int32_t       max_npts,
                    XCDeviceTask* tasks_device,
+                   integrator_ks_scheme s,
+                   density_id sel,
                    device_queue queue );
 
 void zmat_mgga_vxc( size_t        ntasks,
diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_device_template.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_device_template.cu
index bdfc8724..62557401 100644
--- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_device_template.cu
+++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_device_template.cu
@@ -478,6 +478,88 @@ void eval_collocation_shell_to_task_hessian(
 }
 
 
+uint32_t max_threads_shell_to_task_collocation_laplacian( int32_t l, bool pure ) {
+  if( pure ) {
+    switch(l) {
+      case 0: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_0 );\
+      $for( L in range(1, L_max + 1) )
+      case $(L): return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_laplacian_$(L) );
+      $endfor
+      default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = $(L_max)");
+    }
+  } else {
+    switch(l) {\
+      $for( L in range(L_max + 1) )
+      case $(L): return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_$(L) );\
+      $endfor
+      default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = $(L_max)");
+    }
+  }
+  return 0;
+}
+
+
+
+
+
+template <typename... Args>
+void dispatch_shell_to_task_collocation_laplacian( cudaStream_t stream, int32_t l,
+  bool pure, uint32_t ntask_average, uint32_t nshells, Args&&... args ) {
+
+  dim3 threads = max_threads_shell_to_task_collocation(l,pure);
+  int nwarp_per_block = threads.x / cuda::warp_size;
+  int n_task_blocks = util::div_ceil( ntask_average, nwarp_per_block );
+  dim3 block(n_task_blocks, 1, nshells);
+
+  if( pure ) {
+    switch(l) {
+      case 0:
+        collocation_device_shell_to_task_kernel_cartesian_laplacian_0<<<block,threads,0,stream>>>( nshells, std::forward<Args>(args)... );
+        break;
+      $for( L in range(1, L_max + 1) )
+      case $(L):
+        collocation_device_shell_to_task_kernel_spherical_laplacian_$(L)<<<block,threads,0,stream>>>( nshells, std::forward<Args>(args)... );
+        break;\
+      $endfor
+      default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = $(L_max)");
+    }
+  } else {
+    switch(l) {\
+      $for( L in range(0, L_max + 1) )
+      case $(L):
+        collocation_device_shell_to_task_kernel_cartesian_laplacian_$(L)<<<block,threads,0,stream>>>( nshells, std::forward<Args>(args)... );
+        break;\
+      $endfor
+      default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = $(L_max)");
+    }
+  }
+
+}
+
+
+
+void eval_collocation_shell_to_task_laplacian(
+  uint32_t                    max_l,
+  AngularMomentumShellToTaskBatch* l_batched_shell_to_task,
+  XCDeviceTask*               device_tasks,
+  device_queue           queue
+) {
+
+  cudaStream_t stream = queue.queue_as<util::cuda_stream>() ;
+
+  for( auto l = 0u; l <= max_l; ++l ) {
+    auto pure = l_batched_shell_to_task[l].pure;
+    auto shell_to_task_device = l_batched_shell_to_task[l].shell_to_task_device;
+    auto nshells = l_batched_shell_to_task[l].nshells_in_batch;
+    auto ntask_average = std::max(1ul, l_batched_shell_to_task[l].ntask_average);
+    dispatch_shell_to_task_collocation_laplacian( stream, l, pure,
+      ntask_average, nshells, shell_to_task_device, device_tasks );
+    auto stat = cudaGetLastError();
+    GAUXC_CUDA_ERROR("LAP", stat);
+  }
+
+
+}
 
 
 
diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_shell_to_task_kernels_template.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_shell_to_task_kernels_template.hpp
index f1a62f95..544554b3 100644
--- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_shell_to_task_kernels_template.hpp
+++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_shell_to_task_kernels_template.hpp
@@ -19,6 +19,10 @@
 #include "collocation/collocation_shell_to_task_kernels_cartesian_l$(L)_hessian.hpp"\
 $endfor
 
+$for( L in range(L_max + 1))
+#include "collocation/collocation_shell_to_task_kernels_cartesian_l$(L)_laplacian.hpp"\
+$endfor
+
 $for( L in range(L_max + 1))
 #include "collocation/collocation_shell_to_task_kernels_spherical_l$(L).hpp"\
 $endfor
@@ -30,3 +34,7 @@
 $for( L in range(L_max + 1))
 #include "collocation/collocation_shell_to_task_kernels_spherical_l$(L)_hessian.hpp"\
 $endfor
+
+$for( L in range(L_max + 1))
+#include "collocation/collocation_shell_to_task_kernels_spherical_l$(L)_laplacian.hpp"\
+$endfor
diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/cublas_extensions.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/cublas_extensions.cu
index f421c4f3..ee7c7746 100644
--- a/src/xc_integrator/local_work_driver/device/cuda/kernels/cublas_extensions.cu
+++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/cublas_extensions.cu
@@ -181,5 +181,6 @@ void syr2k( device_blas_handle generic_handle,
 
 }
 
+
 }
 
diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/increment_exc_grad.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/increment_exc_grad.cu
index 242c73e4..4dd328bd 100644
--- a/src/xc_integrator/local_work_driver/device/cuda/kernels/increment_exc_grad.cu
+++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/increment_exc_grad.cu
@@ -165,9 +165,9 @@ __global__ __launch_bounds__(512,1) void increment_exc_grad_gga_kernel(
       const auto* __restrict__  vrho = task->vrho;
       const auto* __restrict__  vgamma = task->vgamma;
 
-      const auto* __restrict__ den_x = task->ddenx;
-      const auto* __restrict__ den_y = task->ddeny;
-      const auto* __restrict__ den_z = task->ddenz;
+      const auto* __restrict__ den_x = task->dden_sx;
+      const auto* __restrict__ den_y = task->dden_sy;
+      const auto* __restrict__ den_z = task->dden_sz;
 
       #pragma unroll 1
       for( uint32_t ipt = threadIdx.x % cuda::warp_size; 
diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu
index c42d2182..6aea5225 100644
--- a/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu
+++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu
@@ -10,10 +10,20 @@
 #include "device_specific/cuda_device_constants.hpp"
 #include <gauxc/util/div_ceil.hpp>
 #include "device_specific/cuda_util.hpp"
+#include "device/xc_device_data.hpp"
 
 namespace GauXC {
 
-__global__ void eval_uvars_lda_kernel( size_t        ntasks,
+#define VVAR_KERNEL_SM_BLOCK 32
+#define GGA_KERNEL_SM_WARPS 16
+#define MGGA_KERNEL_SM_BLOCK 32
+
+__global__ void eval_uvars_lda_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device) {
+  // eval_vvars populated uvar storage already in the case of LDA+RKS
+  return;
+}
+
+__global__ void eval_uvars_lda_uks_kernel( size_t        ntasks,
                                        XCDeviceTask* tasks_device ) {
 
   const int batch_idx = blockIdx.z;
@@ -22,131 +32,275 @@ __global__ void eval_uvars_lda_kernel( size_t        ntasks,
   auto& task = tasks_device[ batch_idx ];
 
   const auto npts            = task.npts;
-  const auto nbf             = task.bfn_screening.nbe;
 
-  auto* den_eval_device   = task.den;
+  auto* den_pos_eval_device   = task.den_s;
+  auto* den_neg_eval_device   = task.den_z;
 
-  const auto* basis_eval_device = task.bf;
 
-  const auto* den_basis_prod_device = task.zmat;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
-  const int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  const int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
 
-  register double den_reg = 0.;
+  if( tid < npts ) {
+    const auto ps = den_pos_eval_device[ tid ];
+    const auto pz = den_neg_eval_device[ tid ];
+    den_pos_eval_device[ tid ] = 0.5*(ps + pz);
+    den_neg_eval_device[ tid ] = 0.5*(ps - pz);
 
-  if( tid_x < nbf and tid_y < npts ) {
+  }
+}
 
-    const double* bf_col   = basis_eval_device     + tid_x*npts;
-    const double* db_col   = den_basis_prod_device + tid_x*npts;
+__global__ void eval_uvars_lda_gks_kernel( size_t        ntasks,
+                                       XCDeviceTask* tasks_device ) {
 
-    den_reg = bf_col[ tid_y ]   * db_col[ tid_y ];
+  const int batch_idx = blockIdx.z;
+  if( batch_idx >= ntasks ) return;
 
-  }
+  auto& task = tasks_device[ batch_idx ];
 
-  // Warp blocks are stored col major
-  constexpr auto warp_size = cuda::warp_size;
-  //constexpr auto max_warps_per_thread_block = cuda::max_warps_per_thread_block;
-  den_reg = cuda::warp_reduce_sum<warp_size>( den_reg );
+  const auto npts            = task.npts;
 
+  auto* den_z_eval_device   = task.den_s;
+  auto* den_s_eval_device   = task.den_z;
+  auto* den_y_eval_device   = task.den_y;
+  auto* den_x_eval_device   = task.den_x;
+  auto* K_z_eval_device     = task.K_z;
+  auto* K_y_eval_device     = task.K_y;
+  auto* K_x_eval_device     = task.K_x;
+  const double dtolsq = 1e-24;  // TODO: make variable
 
-  if( threadIdx.x == 0 and tid_y < npts ) {
-    atomicAdd( den_eval_device   + tid_y, den_reg );
-  }
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+
+  if( tid < npts ) {
+    const auto ps = den_s_eval_device[ tid ];
+    const auto pz = den_z_eval_device[ tid ];
+    const auto py = den_y_eval_device[ tid ];
+    const auto px = den_x_eval_device[ tid ];
+    const auto mtemp = pz*pz + px*px + py*py;
+    double mnorm = 0.;
   
+    if (mtemp > dtolsq) {
+      const double inv_mnorm = rsqrt(mtemp);
+      mnorm = 1./inv_mnorm;
+      K_z_eval_device[ tid ] = pz * inv_mnorm;
+      K_y_eval_device[ tid ] = py * inv_mnorm;
+      K_x_eval_device[ tid ] = px * inv_mnorm;
+    }
+    else {
+      mnorm = (1. / 3.) * (px + py + pz);
+      K_z_eval_device[ tid ] = 1. / 3.;
+      K_y_eval_device[ tid ] = 1. / 3.;
+      K_x_eval_device[ tid ] = 1. / 3.;
+    }
+
+    den_s_eval_device[ tid ] = 0.5*(ps + mnorm);
+    den_z_eval_device[ tid ] = 0.5*(ps - mnorm);
 
+  }
 }
 
 
+__global__ void eval_uvars_gga_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device) {
+  const int batch_idx = blockIdx.z;
+  if( batch_idx >= ntasks ) return;
+  
+  const auto& task = tasks_device[ batch_idx ];
+  const auto npts  = task.npts;
+  
+  const auto*   dden_sx_eval_device = task.dden_sx;
+  const auto*   dden_sy_eval_device = task.dden_sy;
+  const auto*   dden_sz_eval_device = task.dden_sz;
+  auto*         gamma_eval_device   = task.gamma;
 
-#define GGA_KERNEL_SM_BLOCK_Y 32
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
 
-__global__ void eval_uvars_gga_kernel( size_t           ntasks,
-                                       XCDeviceTask* tasks_device ) {
+  if( tid < npts ) {
+    const double dx = dden_sx_eval_device[ tid ];
+    const double dy = dden_sy_eval_device[ tid ];
+    const double dz = dden_sz_eval_device[ tid ];
 
-  constexpr auto warp_size = cuda::warp_size;
-  //constexpr auto max_warps_per_thread_block = cuda::max_warps_per_thread_block;
+    gamma_eval_device[ tid ] = dx*dx + dy*dy + dz*dz;
+
+  }
+
+}
+
+__global__ void eval_uvars_gga_uks_kernel( size_t ntasks, XCDeviceTask* tasks_device) {
 
   const int batch_idx = blockIdx.z;
   if( batch_idx >= ntasks ) return;
 
-  auto& task = tasks_device[ batch_idx ];
-
+  const auto& task = tasks_device[ batch_idx ];
   const auto npts            = task.npts;
-  const auto nbf             = task.bfn_screening.nbe;
 
-  auto* den_eval_device   = task.den;
-  auto* den_x_eval_device = task.ddenx;
-  auto* den_y_eval_device = task.ddeny;
-  auto* den_z_eval_device = task.ddenz;
+  auto*           den_pos_eval_device   = task.den_s;
+  const auto*     den_pos_x_eval_device = task.dden_sx;
+  const auto*     den_pos_y_eval_device = task.dden_sy;
+  const auto*     den_pos_z_eval_device = task.dden_sz;
 
-  const auto* basis_eval_device = task.bf;
-  const auto* dbasis_x_eval_device = task.dbfx;
-  const auto* dbasis_y_eval_device = task.dbfy;
-  const auto* dbasis_z_eval_device = task.dbfz;
+  auto*           den_neg_eval_device   = task.den_z;
+  const auto*     den_neg_x_eval_device = task.dden_zx;
+  const auto*     den_neg_y_eval_device = task.dden_zy;
+  const auto*     den_neg_z_eval_device = task.dden_zz;
 
-  const auto* den_basis_prod_device = task.zmat;
+  auto*     gamma_pp_eval_device  = task.gamma_pp;
+  auto*     gamma_pm_eval_device  = task.gamma_pm;
+  auto*     gamma_mm_eval_device  = task.gamma_mm;
 
-  __shared__ double den_shared[4][warp_size][GGA_KERNEL_SM_BLOCK_Y+1];
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
-  for ( int bid_x = blockIdx.x * blockDim.x; 
-        bid_x < nbf;
-        bid_x += blockDim.x * gridDim.x ) {
-    
-    for ( int bid_y = blockIdx.y * GGA_KERNEL_SM_BLOCK_Y; 
-          bid_y < npts;
-          bid_y += GGA_KERNEL_SM_BLOCK_Y * gridDim.y ) {
-        
-      for (int sm_y = threadIdx.y; sm_y < GGA_KERNEL_SM_BLOCK_Y; sm_y += blockDim.y) {
-        den_shared[0][threadIdx.x][sm_y] = 0.;
-        den_shared[1][threadIdx.x][sm_y] = 0.;
-        den_shared[2][threadIdx.x][sm_y] = 0.;
-        den_shared[3][threadIdx.x][sm_y] = 0.;
+  if( tid < npts ) {
+    const double ps     = den_pos_eval_device[ tid ];
+    const double pz     = den_neg_eval_device[ tid ];
+    const double dndx   = den_pos_x_eval_device[ tid ];
+    const double dndy   = den_pos_y_eval_device[ tid ];
+    const double dndz   = den_pos_z_eval_device[ tid ];
+    const double dMzdx  = den_neg_x_eval_device[ tid ];
+    const double dMzdy  = den_neg_y_eval_device[ tid ];
+    const double dMzdz  = den_neg_z_eval_device[ tid ];
+
+    // (del n).(del n)
+    const auto dn_sq  = dndx*dndx + dndy*dndy + dndz*dndz;
+    // (del Mz).(del Mz)
+    const auto dMz_sq = dMzdx*dMzdx + dMzdy*dMzdy + dMzdz*dMzdz;
+    // (del n).(del Mz)
+    const auto dn_dMz = dndx*dMzdx + dndy*dMzdy + dndz*dMzdz;
+
+    gamma_pp_eval_device[ tid ] = 0.25*(dn_sq + dMz_sq) + 0.5*dn_dMz;
+    gamma_pm_eval_device[ tid ] = 0.25*(dn_sq - dMz_sq);
+    gamma_mm_eval_device[ tid ] = 0.25*(dn_sq + dMz_sq) - 0.5*dn_dMz;
+
+    den_pos_eval_device[ tid ] = 0.5*(ps + pz);
+    den_neg_eval_device[ tid ] = 0.5*(ps - pz);
+  }
 
-        if (bid_y + threadIdx.x < npts and bid_x + sm_y < nbf) { 
-          const double* db_col   = den_basis_prod_device + (bid_x + sm_y)*npts;
-          const double* bf_col   = basis_eval_device     + (bid_x + sm_y)*npts;
-          const double* bf_x_col = dbasis_x_eval_device  + (bid_x + sm_y)*npts;
-          const double* bf_y_col = dbasis_y_eval_device  + (bid_x + sm_y)*npts;
-          const double* bf_z_col = dbasis_z_eval_device  + (bid_x + sm_y)*npts;
+}
 
-          den_shared[0][threadIdx.x][sm_y] = bf_col  [ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ];
-          den_shared[1][threadIdx.x][sm_y] = bf_x_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ];
-          den_shared[2][threadIdx.x][sm_y] = bf_y_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ];
-          den_shared[3][threadIdx.x][sm_y] = bf_z_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ];
-        }
-      }
-      __syncthreads();
+__global__ void eval_uvars_gga_gks_kernel( size_t ntasks, XCDeviceTask* tasks_device) {
 
+  const int batch_idx = blockIdx.z;
+  if( batch_idx >= ntasks ) return;
 
-      for (int sm_y = threadIdx.y; sm_y < GGA_KERNEL_SM_BLOCK_Y; sm_y += blockDim.y) {
-        const int tid_y = bid_y + sm_y;
-        register double den_reg = den_shared[0][sm_y][threadIdx.x];
-        register double dx_reg  = den_shared[1][sm_y][threadIdx.x];
-        register double dy_reg  = den_shared[2][sm_y][threadIdx.x];
-        register double dz_reg  = den_shared[3][sm_y][threadIdx.x];
+  const auto& task = tasks_device[ batch_idx ];
+  const auto npts            = task.npts;
 
-        // Warp blocks are stored col major
-        den_reg =     cuda::warp_reduce_sum<warp_size>( den_reg );
-        dx_reg  = 2 * cuda::warp_reduce_sum<warp_size>( dx_reg );
-        dy_reg  = 2 * cuda::warp_reduce_sum<warp_size>( dy_reg );
-        dz_reg  = 2 * cuda::warp_reduce_sum<warp_size>( dz_reg );
+        auto*     den_s_eval_device   = task.den_s;
+  const auto*     dden_sx_eval_device = task.dden_sx;
+  const auto*     dden_sy_eval_device = task.dden_sy;
+  const auto*     dden_sz_eval_device = task.dden_sz;
 
+        auto*     den_z_eval_device   = task.den_z;
+  const auto*     dden_zx_eval_device = task.dden_zx;
+  const auto*     dden_zy_eval_device = task.dden_zy;
+  const auto*     dden_zz_eval_device = task.dden_zz;
 
-        if( threadIdx.x == 0 and tid_y < npts ) {
-          atomicAdd( den_eval_device   + tid_y, den_reg );
-          atomicAdd( den_x_eval_device + tid_y, dx_reg  );
-          atomicAdd( den_y_eval_device + tid_y, dy_reg  );
-          atomicAdd( den_z_eval_device + tid_y, dz_reg  );
-        }
-      }
-      __syncthreads();
+  const auto*     den_y_eval_device   = task.den_y;
+  const auto*     dden_yx_eval_device = task.dden_yx;
+  const auto*     dden_yy_eval_device = task.dden_yy;
+  const auto*     dden_yz_eval_device = task.dden_yz;
+
+  const auto*     den_x_eval_device   = task.den_x;
+  const auto*     dden_xx_eval_device = task.dden_xx;
+  const auto*     dden_xy_eval_device = task.dden_xy;
+  const auto*     dden_xz_eval_device = task.dden_xz;
+
+  auto*     gamma_pp_eval_device  = task.gamma_pp;
+  auto*     gamma_pm_eval_device  = task.gamma_pm;
+  auto*     gamma_mm_eval_device  = task.gamma_mm;
+
+  auto*     H_z_eval_device = task.H_z;
+  auto*     H_y_eval_device = task.H_y;
+  auto*     H_x_eval_device = task.H_x;
+  auto*     K_z_eval_device = task.K_z;
+  auto*     K_y_eval_device = task.K_y;
+  auto*     K_x_eval_device = task.K_x;
+
+  const double dtolsq = 1e-24;  // TODO: make variable
+
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if( tid < npts ) {
+    const double dndz = dden_sz_eval_device[ tid ];
+    const double dndy = dden_sy_eval_device[ tid ];
+    const double dndx = dden_sx_eval_device[ tid ];
+
+    const double dMzdz = dden_zz_eval_device[ tid ];
+    const double dMzdy = dden_zy_eval_device[ tid ];
+    const double dMzdx = dden_zx_eval_device[ tid ];
+
+    const double dMydz = dden_yz_eval_device[ tid ];
+    const double dMydy = dden_yy_eval_device[ tid ];
+    const double dMydx = dden_yx_eval_device[ tid ];
+
+    const double dMxdz = dden_xz_eval_device[ tid ];
+    const double dMxdy = dden_xy_eval_device[ tid ];
+    const double dMxdx = dden_xx_eval_device[ tid ];
+
+    const auto ps = den_s_eval_device[ tid ];
+    const auto pz = den_z_eval_device[ tid ];
+    const auto py = den_y_eval_device[ tid ];
+    const auto px = den_x_eval_device[ tid ];
+
+    const auto mtemp = pz*pz + px*px + py*py;
+    double mnorm = 0.;
+
+    const auto dels_dot_dels = dndx * dndx + dndy * dndy + dndz * dndz;
+    const auto delz_dot_delz = dMzdx * dMzdx + dMzdy * dMzdy + dMzdz * dMzdz;
+    const auto delx_dot_delx = dMxdx * dMxdx + dMxdy * dMxdy + dMxdz * dMxdz;
+    const auto dely_dot_dely = dMydx * dMydx + dMydy * dMydy + dMydz * dMydz;
+
+    const auto dels_dot_delz = dndx * dMzdx + dndy * dMzdy + dndz * dMzdz;
+    const auto dels_dot_delx = dndx * dMxdx + dndy * dMxdy + dndz * dMxdz;
+    const auto dels_dot_dely = dndx * dMydx + dndy * dMydy + dndz * dMydz;
+
+    const auto sum = delz_dot_delz + delx_dot_delx + dely_dot_dely;
+    const auto s_sum =
+               dels_dot_delz * pz + dels_dot_delx * px + dels_dot_dely * py;
+
+    const auto inv_sqsum2 =
+        rsqrt(dels_dot_delz * dels_dot_delz + dels_dot_delx * dels_dot_delx +
+             dels_dot_dely * dels_dot_dely);
+    const auto sqsum2 = 1./inv_sqsum2;
+
+    double sign = 1.;
+    if( signbit(s_sum)) 
+      sign = -1.;
+
+
+    if (mtemp > dtolsq) {
+      const double inv_mnorm = rsqrt(mtemp);
+      mnorm = 1./inv_mnorm;
+      K_z_eval_device[ tid ] = pz * inv_mnorm;
+      K_y_eval_device[ tid ] = py * inv_mnorm;
+      K_x_eval_device[ tid ] = px * inv_mnorm;
+      H_z_eval_device[ tid ] = sign * dels_dot_delz * inv_sqsum2;
+      H_y_eval_device[ tid ] = sign * dels_dot_dely * inv_sqsum2;
+      H_x_eval_device[ tid ] = sign * dels_dot_delx * inv_sqsum2;
+    }
+    else {
+      mnorm = (1. / 3.) * (px + py + pz);
+      K_z_eval_device[ tid ] = 1. / 3.;
+      K_y_eval_device[ tid ] = 1. / 3.;
+      K_x_eval_device[ tid ] = 1. / 3.;
+
+      H_z_eval_device[ tid ] = sign / 3.;
+      H_y_eval_device[ tid ] = sign / 3.;
+      H_x_eval_device[ tid ] = sign / 3.;
     }
+
+    gamma_pp_eval_device[ tid ] = 0.25*(dels_dot_dels + sum) + 0.5*sign*sqsum2;
+    gamma_pm_eval_device[ tid ] = 0.25*(dels_dot_dels - sum);
+    gamma_mm_eval_device[ tid ] = 0.25*(dels_dot_dels + sum) - 0.5*sign*sqsum2;
+
+    den_s_eval_device[ tid ] = 0.5*(ps + mnorm);
+    den_z_eval_device[ tid ] = 0.5*(ps - mnorm);
+
   }
+
 }
 
 template <bool need_lapl>
-__global__ void eval_uvars_mgga_kernel( size_t           ntasks,
+__global__ void eval_uvars_mgga_rks_kernel( size_t           ntasks,
                                        XCDeviceTask* tasks_device ) {
 
   constexpr auto warp_size = cuda::warp_size;
@@ -184,17 +338,17 @@ __global__ void eval_uvars_mgga_kernel( size_t           ntasks,
     den_basis_prod_device = task.zmat;
   }
 
-  __shared__ double den_shared[3+!!need_lapl][warp_size][GGA_KERNEL_SM_BLOCK_Y+1];
+  __shared__ double den_shared[3+!!need_lapl][warp_size][MGGA_KERNEL_SM_BLOCK+1];
 
   for ( int bid_x = blockIdx.x * blockDim.x; 
         bid_x < nbf;
         bid_x += blockDim.x * gridDim.x ) {
     
-    for ( int bid_y = blockIdx.y * GGA_KERNEL_SM_BLOCK_Y; 
+    for ( int bid_y = blockIdx.y * MGGA_KERNEL_SM_BLOCK; 
           bid_y < npts;
-          bid_y += GGA_KERNEL_SM_BLOCK_Y * gridDim.y ) {
+          bid_y += MGGA_KERNEL_SM_BLOCK * gridDim.y ) {
         
-      for (int sm_y = threadIdx.y; sm_y < GGA_KERNEL_SM_BLOCK_Y; sm_y += blockDim.y) {
+      for (int sm_y = threadIdx.y; sm_y < MGGA_KERNEL_SM_BLOCK; sm_y += blockDim.y) {
         den_shared[0][threadIdx.x][sm_y] = 0.;
         den_shared[1][threadIdx.x][sm_y] = 0.;
         den_shared[2][threadIdx.x][sm_y] = 0.;
@@ -226,7 +380,7 @@ __global__ void eval_uvars_mgga_kernel( size_t           ntasks,
       __syncthreads();
 
 
-      for (int sm_y = threadIdx.y; sm_y < GGA_KERNEL_SM_BLOCK_Y; sm_y += blockDim.y) {
+      for (int sm_y = threadIdx.y; sm_y < MGGA_KERNEL_SM_BLOCK; sm_y += blockDim.y) {
         const int tid_y = bid_y + sm_y;
 
         register double tx_reg  = den_shared[0][sm_y][threadIdx.x];
@@ -258,93 +412,280 @@ __global__ void eval_uvars_mgga_kernel( size_t           ntasks,
 }
 
 
-__global__ void eval_vvars_gga_kernel( 
-  size_t   npts,
-  const double* den_x_eval_device,
-  const double* den_y_eval_device,
-  const double* den_z_eval_device,
-        double* gamma_eval_device
-) {
+#define EVAL_UVARS_KERNEL(xc_approx) \
+  cudaStream_t stream = queue.queue_as<util::cuda_stream>();  \
+  dim3 blocks( util::div_ceil( npts_max,  threads.x ),  \
+               1, \
+               ntasks ); \
+  switch ( ks_scheme ) { \
+    case RKS: \
+      eval_uvars_##xc_approx##_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); \
+      break; \
+    case UKS: \
+      eval_uvars_##xc_approx##_uks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); \
+      break; \
+    case GKS: \
+      eval_uvars_##xc_approx##_gks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); \
+      break; \
+    default: \
+      GAUXC_GENERIC_EXCEPTION( "Unexpected KS scheme when attempting to evaluate UV vars" ); \
+  } 
+
+void eval_uvars_lda( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme,
+  XCDeviceTask* device_tasks, device_queue queue ) {
+  dim3 threads( cuda::max_warps_per_thread_block * cuda::warp_size, 1, 1 );
+  EVAL_UVARS_KERNEL(lda);
+}
 
-  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  if( tid < npts ) {
 
-    const double dx = den_x_eval_device[ tid ];
-    const double dy = den_y_eval_device[ tid ];
-    const double dz = den_z_eval_device[ tid ];
 
-    gamma_eval_device[tid] = dx*dx + dy*dy + dz*dz;
+void eval_uvars_gga( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme,
+  XCDeviceTask* device_tasks, device_queue queue ) {
+  dim3 threads( GGA_KERNEL_SM_WARPS * cuda::warp_size, 1, 1 );
+  EVAL_UVARS_KERNEL(gga);
+}
+
+
 
+void eval_uvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max, 
+  int32_t npts_max, bool do_lapl, XCDeviceTask* device_tasks, 
+  device_queue queue ) {
+  // TODO: This interface should be unified with the lda/gga interfaces
+  cudaStream_t stream = queue.queue_as<util::cuda_stream>();
+
+  // U Variables
+  {
+  dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block / 2, 1 );
+  dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )),
+               std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )),
+               ntasks );
+  if(do_lapl)
+    eval_uvars_mgga_rks_kernel<true><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
+  else
+    eval_uvars_mgga_rks_kernel<false><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
   }
 
+  // V variables (GAMMA)
+  dim3 threads( cuda::max_threads_per_thread_block );
+  dim3 blocks( util::div_ceil( npts_total,  threads.x ),  
+               1, 
+               ntasks ); 
+  eval_uvars_gga_rks_kernel <<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
 }
 
 
 
 
-void eval_uvvars_lda( size_t ntasks, int32_t nbf_max, int32_t npts_max,
-  XCDeviceTask* device_tasks, device_queue queue ) {
 
-  cudaStream_t stream = queue.queue_as<util::cuda_stream>();
-  dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block, 1 );
-  dim3 blocks( util::div_ceil( nbf_max,  threads.x ),
-               util::div_ceil( npts_max, threads.y ),
-               ntasks );
 
-  eval_uvars_lda_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
+
+template <density_id den_select>
+__global__ void eval_vvar_grad_kern( size_t        ntasks,
+                                       XCDeviceTask* tasks_device ) {
+
+  const int batch_idx = blockIdx.z;
+  if( batch_idx >= ntasks ) return;
+
+  auto& task = tasks_device[ batch_idx ];
+
+  const auto npts            = task.npts;
+  const auto nbf             = task.bfn_screening.nbe;
+
+  double* den_eval_device   = nullptr;
+  double* den_x_eval_device = nullptr;
+  double* den_y_eval_device = nullptr;
+  double* den_z_eval_device = nullptr;
+
+  constexpr auto warp_size = cuda::warp_size;
+
+  if constexpr (den_select == DEN_S) {
+    den_eval_device   = task.den_s;
+    den_x_eval_device = task.dden_sx;
+    den_y_eval_device = task.dden_sy;
+    den_z_eval_device = task.dden_sz;
+  }
+  if constexpr (den_select == DEN_Z) {
+    den_eval_device   = task.den_z;
+    den_x_eval_device = task.dden_zx;
+    den_y_eval_device = task.dden_zy;
+    den_z_eval_device = task.dden_zz;
+  }
+  if constexpr (den_select == DEN_Y) {
+    den_eval_device   = task.den_y;
+    den_x_eval_device = task.dden_yx;
+    den_y_eval_device = task.dden_yy;
+    den_z_eval_device = task.dden_yz;
+  }
+  if constexpr (den_select == DEN_X) {
+    den_eval_device   = task.den_x;
+    den_x_eval_device = task.dden_xx;
+    den_y_eval_device = task.dden_xy;
+    den_z_eval_device = task.dden_xz;
+  }
+
+  const auto* basis_eval_device = task.bf;
+  const auto* dbasis_x_eval_device = task.dbfx;
+  const auto* dbasis_y_eval_device = task.dbfy;
+  const auto* dbasis_z_eval_device = task.dbfz;
+
+  const auto* den_basis_prod_device = task.zmat;
+  
+  __shared__ double den_shared[4][warp_size][VVAR_KERNEL_SM_BLOCK+1];
+
+  for ( int bid_x = blockIdx.x * blockDim.x; 
+        bid_x < nbf;
+        bid_x += blockDim.x * gridDim.x ) {
+    
+    for ( int bid_y = blockIdx.y * VVAR_KERNEL_SM_BLOCK; 
+          bid_y < npts;
+          bid_y += VVAR_KERNEL_SM_BLOCK * gridDim.y ) {
+        
+      for (int sm_y = threadIdx.y; sm_y < VVAR_KERNEL_SM_BLOCK; sm_y += blockDim.y) {
+        den_shared[0][threadIdx.x][sm_y] = 0.;
+        den_shared[1][threadIdx.x][sm_y] = 0.;
+        den_shared[2][threadIdx.x][sm_y] = 0.;
+        den_shared[3][threadIdx.x][sm_y] = 0.;
+
+        if (bid_y + threadIdx.x < npts and bid_x + sm_y < nbf) { 
+          const double* db_col   = den_basis_prod_device + (bid_x + sm_y)*npts;
+          const double* bf_col   = basis_eval_device     + (bid_x + sm_y)*npts;
+          const double* bf_x_col = dbasis_x_eval_device  + (bid_x + sm_y)*npts;
+          const double* bf_y_col = dbasis_y_eval_device  + (bid_x + sm_y)*npts;
+          const double* bf_z_col = dbasis_z_eval_device  + (bid_x + sm_y)*npts;
+
+          den_shared[0][threadIdx.x][sm_y] = bf_col  [ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ];
+          den_shared[1][threadIdx.x][sm_y] = bf_x_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ];
+          den_shared[2][threadIdx.x][sm_y] = bf_y_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ];
+          den_shared[3][threadIdx.x][sm_y] = bf_z_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ];
+        }
+      }
+      __syncthreads();
+
+
+      for (int sm_y = threadIdx.y; sm_y < VVAR_KERNEL_SM_BLOCK; sm_y += blockDim.y) {
+        const int tid_y = bid_y + sm_y;
+        register double den_reg = den_shared[0][sm_y][threadIdx.x];
+        register double dx_reg  = den_shared[1][sm_y][threadIdx.x];
+        register double dy_reg  = den_shared[2][sm_y][threadIdx.x];
+        register double dz_reg  = den_shared[3][sm_y][threadIdx.x];
+
+        // Warp blocks are stored col major
+        den_reg =     cuda::warp_reduce_sum<warp_size>( den_reg );
+        dx_reg  = 2. * cuda::warp_reduce_sum<warp_size>( dx_reg );
+        dy_reg  = 2. * cuda::warp_reduce_sum<warp_size>( dy_reg );
+        dz_reg  = 2. * cuda::warp_reduce_sum<warp_size>( dz_reg );
+
+
+        if( threadIdx.x == 0 and tid_y < npts ) {
+          atomicAdd( den_eval_device   + tid_y, den_reg );
+          atomicAdd( den_x_eval_device + tid_y, dx_reg  );
+          atomicAdd( den_y_eval_device + tid_y, dy_reg  );
+          atomicAdd( den_z_eval_device + tid_y, dz_reg  );
+        }
+      }
+      __syncthreads();
+    }
+  }
 
 }
 
 
-void eval_uvvars_gga( size_t ntasks, size_t npts_total, int32_t nbf_max, 
-  int32_t npts_max, XCDeviceTask* device_tasks, const double* denx, 
-  const double* deny, const double* denz, double* gamma, device_queue queue ) {
 
-  cudaStream_t stream = queue.queue_as<util::cuda_stream>();
+template <density_id den_select>
+__global__ void eval_vvar_kern( size_t        ntasks,
+                                       XCDeviceTask* tasks_device ) {
+
+  const int batch_idx = blockIdx.z;
+  if( batch_idx >= ntasks ) return;
+
+  auto& task = tasks_device[ batch_idx ];
+
+  const auto npts            = task.npts;
+  const auto nbf             = task.bfn_screening.nbe;
+
+  double* den_eval_device   = nullptr;
+  // use the "U" variable (+/- for UKS) even though at this point the density (S/Z) is stored
+  if constexpr (den_select == DEN_S) den_eval_device = task.den_s;
+  if constexpr (den_select == DEN_Z) den_eval_device = task.den_z;
+  if constexpr (den_select == DEN_Y) den_eval_device = task.den_y;
+  if constexpr (den_select == DEN_X) den_eval_device = task.den_x;
+
+  const auto* basis_eval_device = task.bf;
+
+  const auto* den_basis_prod_device = task.zmat;
+
+  const int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
+  const int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+
+  register double den_reg = 0.;
+
+  if( tid_x < nbf and tid_y < npts ) {
+
+    const double* bf_col   = basis_eval_device     + tid_x*npts;
+    const double* db_col   = den_basis_prod_device + tid_x*npts;
+
+    den_reg = bf_col[ tid_y ]   * db_col[ tid_y ];
 
-  // U Variables
-  {
-  dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block / 2, 1 );
-  dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )),
-               std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )),
-               ntasks );
-  eval_uvars_gga_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
   }
 
-  // V variables
-  dim3 threads( cuda::max_threads_per_thread_block );
-  dim3 blocks( util::div_ceil( npts_total, threads.x ) );
-  eval_vvars_gga_kernel<<< blocks, threads, 0, stream >>>(
-    npts_total, denx, deny, denz, gamma
-  );
+  // Warp blocks are stored col major
+  constexpr auto warp_size = cuda::warp_size;
+  //constexpr auto max_warps_per_thread_block = cuda::max_warps_per_thread_block;
+  den_reg = cuda::warp_reduce_sum<warp_size>( den_reg );
+
+
+  if( threadIdx.x == 0 and tid_y < npts ) {
+    atomicAdd( den_eval_device   + tid_y, den_reg );
+  }
+  
+
 }
 
-void eval_uvvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max, 
-  int32_t npts_max, XCDeviceTask* device_tasks, const double* denx, 
-  const double* deny, const double* denz, double* gamma, bool do_lapl,
-  device_queue queue ) {
 
-  cudaStream_t stream = queue.queue_as<util::cuda_stream>();
 
-  // U Variables
-  {
-  dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block / 2, 1 );
-  dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )),
-               std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )),
-               ntasks );
-  eval_uvars_gga_kernel <<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
-  if(do_lapl)
-    eval_uvars_mgga_kernel<true><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
-  else
-    eval_uvars_mgga_kernel<false><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
+
+void eval_vvar( size_t ntasks, int32_t nbf_max, int32_t npts_max, bool do_grad, density_id den_select,
+  XCDeviceTask* device_tasks, device_queue queue ) {
+
+  cudaStream_t stream = queue.queue_as<util::cuda_stream>();
+  dim3 threads;
+  dim3 blocks;
+  if( do_grad ) {
+    threads = dim3( cuda::warp_size, cuda::max_warps_per_thread_block / 2, 1 );
+    blocks = dim3( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )),
+            std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )),
+            ntasks );
+  } else {
+    threads = dim3( cuda::warp_size, cuda::max_warps_per_thread_block, 1 );
+    blocks = dim3( util::div_ceil( nbf_max,  threads.x ),
+            util::div_ceil( npts_max, threads.y ),
+            ntasks );
+  }
+  switch( den_select ) {
+    case DEN_S: 
+      if (do_grad)  eval_vvar_grad_kern<DEN_S><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
+      else          eval_vvar_kern<DEN_S><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
+      break;
+    case DEN_Z: 
+      if (do_grad)  eval_vvar_grad_kern<DEN_Z><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
+      else          eval_vvar_kern<DEN_Z><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
+      break;
+    case DEN_Y: 
+      if (do_grad)  eval_vvar_grad_kern<DEN_Y><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
+      else          eval_vvar_kern<DEN_Y><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
+      break;
+    case DEN_X: 
+      if (do_grad)  eval_vvar_grad_kern<DEN_X><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
+      else          eval_vvar_kern<DEN_X><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
+      break;
+    default:
+      GAUXC_GENERIC_EXCEPTION( "eval_vvar called with improper density selected" );
   }
 
-  // V variables (GAMMA)
-  dim3 threads( cuda::max_threads_per_thread_block );
-  dim3 blocks( util::div_ceil( npts_total, threads.x ) );
-  eval_vvars_gga_kernel<<< blocks, threads, 0, stream >>>(
-    npts_total, denx, deny, denz, gamma
-  );
 }
 
+
+
+
+
 }
diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/zmat_vxc.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/zmat_vxc.cu
index 5c7851e6..616e0bd5 100644
--- a/src/xc_integrator/local_work_driver/device/cuda/kernels/zmat_vxc.cu
+++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/zmat_vxc.cu
@@ -13,7 +13,7 @@
 namespace GauXC {
 
 
-__global__ void zmat_lda_vxc_kernel( size_t        ntasks,
+__global__ void zmat_lda_vxc_rks_kernel( size_t        ntasks,
                                      XCDeviceTask* tasks_device ) {
 
   const int batch_idx = blockIdx.z;
@@ -45,34 +45,93 @@ __global__ void zmat_lda_vxc_kernel( size_t        ntasks,
 
 
 
-void zmat_lda_vxc( size_t            ntasks,
-                   int32_t           max_nbf,
-                   int32_t           max_npts,
-                   XCDeviceTask*     tasks_device,
-                   device_queue queue ) {
 
-  cudaStream_t stream = queue.queue_as<util::cuda_stream>() ;
 
 
-  dim3 threads(cuda::warp_size,cuda::max_warps_per_thread_block,1);
-  dim3 blocks( util::div_ceil( max_npts, threads.x ),
-               util::div_ceil( max_nbf,  threads.y ),
-               ntasks );
 
-  zmat_lda_vxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device );
+
+
+
+
+
+template<density_id den_selector>
+__global__ void zmat_lda_vxc_uks_kernel( size_t        ntasks,
+                                     XCDeviceTask* tasks_device ) {
+
+  const int batch_idx = blockIdx.z;
+  if( batch_idx >= ntasks ) return;
+
+  auto& task = tasks_device[ batch_idx ];
+  const auto npts            = task.npts;
+  const auto nbf             = task.bfn_screening.nbe;
+  const double* vrho_pos_device    = task.vrho_pos;
+  const double* vrho_neg_device    = task.vrho_neg;
+
+
+  const auto* basis_eval_device = task.bf;
+
+
+  auto* z_matrix_device = task.zmat;
+
+  const int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
+  const int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+
+  if( tid_x < npts and tid_y < nbf ) {
+
+    const size_t ibfoff = tid_y * npts + tid_x;
+    const double factp = 0.5 * vrho_pos_device[tid_x];
+    const double factm = 0.5 * vrho_neg_device[tid_x];
+    double sign = 1.0;
+    if constexpr ( den_selector == DEN_Z )  sign = -1.0;
+    
+    z_matrix_device[ ibfoff ] = 0.5*(factp * basis_eval_device[ ibfoff ] + sign * factm * basis_eval_device[ ibfoff ]);
+  }
 
 }
 
+template<density_id den_selector>
+__global__ void zmat_lda_vxc_gks_kernel( size_t        ntasks,
+                                     XCDeviceTask* tasks_device ) {
 
+  const int batch_idx = blockIdx.z;
+  if( batch_idx >= ntasks ) return;
+
+  auto& task = tasks_device[ batch_idx ];
+  const auto npts            = task.npts;
+  const auto nbf             = task.bfn_screening.nbe;
+  const double* vrho_pos_device    = task.vrho_pos;
+  const double* vrho_neg_device    = task.vrho_neg;
 
+  double* K_device;
+  if constexpr ( den_selector == DEN_Z ) K_device = task.K_z;
+  if constexpr ( den_selector == DEN_Y ) K_device = task.K_y;
+  if constexpr ( den_selector == DEN_X ) K_device = task.K_x;
+  
 
 
+  const auto* basis_eval_device = task.bf;
 
 
+  auto* z_matrix_device = task.zmat;
 
+  const int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
+  const int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
 
+  if( tid_x < npts and tid_y < nbf ) {
+    const size_t ibfoff = tid_y * npts + tid_x;
+    const double factp = 0.5 * vrho_pos_device[tid_x];
+    const double factm = 0.5 * vrho_neg_device[tid_x];
 
+    if constexpr ( den_selector == DEN_S ) {
+      z_matrix_device[ ibfoff ] = 0.5*(factp * basis_eval_device[ ibfoff ] + factm * basis_eval_device[ ibfoff ]);
+    }
+    else {
+      const double factk = 0.5 * (factp - factm);
+      z_matrix_device[ ibfoff ] = K_device[ ibfoff ] * factk * basis_eval_device[ ibfoff ];
+    }
+  }
 
+}
 
 
 
@@ -82,7 +141,7 @@ void zmat_lda_vxc( size_t            ntasks,
 
 
 
-__global__ void zmat_gga_vxc_kernel( size_t        ntasks,
+__global__ void zmat_gga_vxc_rks_kernel( size_t        ntasks,
                                      XCDeviceTask* tasks_device ) {
 
   const int batch_idx = blockIdx.z;
@@ -93,9 +152,9 @@ __global__ void zmat_gga_vxc_kernel( size_t        ntasks,
   const auto nbf             = task.bfn_screening.nbe;
   const auto* vrho_device    = task.vrho;
   const auto* vgamma_device  = task.vgamma;
-  const auto* den_x_eval_device = task.ddenx;
-  const auto* den_y_eval_device = task.ddeny;
-  const auto* den_z_eval_device = task.ddenz;
+  const auto* den_x_eval_device = task.dden_sx;
+  const auto* den_y_eval_device = task.dden_sy;
+  const auto* den_z_eval_device = task.dden_sz;
 
   const auto* basis_eval_device = task.bf;
   const auto* dbasis_x_eval_device = task.dbfx;
@@ -123,28 +182,219 @@ __global__ void zmat_gga_vxc_kernel( size_t        ntasks,
   }
 }
 
-void zmat_gga_vxc( size_t            ntasks,
-                   int32_t           max_nbf,
-                   int32_t           max_npts,
-                   XCDeviceTask*     tasks_device,
-                   device_queue queue ) {
 
-  cudaStream_t stream = queue.queue_as<util::cuda_stream>() ;
 
 
-  dim3 threads(cuda::warp_size,cuda::max_warps_per_thread_block,1);
-  dim3 blocks( util::div_ceil( max_npts, threads.x ),
-               util::div_ceil( max_nbf,  threads.y ),
-               ntasks );
 
-  zmat_gga_vxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device );
 
+
+
+template<density_id den_selector>
+__global__ void zmat_gga_vxc_uks_kernel( size_t        ntasks,
+                                     XCDeviceTask* tasks_device ) {
+
+  const int batch_idx = blockIdx.z;
+  if( batch_idx >= ntasks ) return;
+
+  auto& task = tasks_device[ batch_idx ];
+  const auto npts            = task.npts;
+  const auto nbf             = task.bfn_screening.nbe;
+
+  const double* vrho_pos_device    = task.vrho_pos;
+  const double* vrho_neg_device    = task.vrho_neg;
+  const double* vgamma_pp_device   = task.vgamma_pp;
+  const double* vgamma_pm_device   = task.vgamma_pm;
+  const double* vgamma_mm_device   = task.vgamma_mm;
+
+  const auto* den_pos_x_eval_device = task.dden_sx;
+  const auto* den_pos_y_eval_device = task.dden_sy;
+  const auto* den_pos_z_eval_device = task.dden_sz;
+  const auto* den_neg_x_eval_device = task.dden_zx;
+  const auto* den_neg_y_eval_device = task.dden_zy;
+  const auto* den_neg_z_eval_device = task.dden_zz;
+
+
+  const auto* basis_eval_device = task.bf;
+  const auto* dbasis_x_eval_device = task.dbfx;
+  const auto* dbasis_y_eval_device = task.dbfy;
+  const auto* dbasis_z_eval_device = task.dbfz;
+
+  auto* z_matrix_device = task.zmat;
+
+  const int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
+  const int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+
+  if( tid_x < npts and tid_y < nbf ) {
+
+    const size_t ibfoff = tid_y * npts + tid_x;
+
+    const double factp = 0.25 * vrho_pos_device[tid_x];
+    const double factm = 0.25 * vrho_neg_device[tid_x];
+    
+    const auto gga_fact_pp  = vgamma_pp_device[tid_x];
+    const auto gga_fact_pm  = vgamma_pm_device[tid_x];
+    const auto gga_fact_mm  = vgamma_mm_device[tid_x];
+    
+    const auto gga_fact_1 = 0.5*(gga_fact_pp + gga_fact_pm + gga_fact_mm);
+    const auto gga_fact_2 = 0.5*(gga_fact_pp - gga_fact_mm);
+    const auto gga_fact_3 = 0.5*(gga_fact_pp - gga_fact_pm + gga_fact_mm);
+
+    double sign = 1.0;
+
+    double x_fact, y_fact, z_fact;
+
+    if constexpr ( den_selector == DEN_S ) {
+       x_fact = gga_fact_1 * den_pos_x_eval_device[ tid_x ] + gga_fact_2 * den_neg_x_eval_device[ tid_x ];
+       y_fact = gga_fact_1 * den_pos_y_eval_device[ tid_x ] + gga_fact_2 * den_neg_y_eval_device[ tid_x ];
+       z_fact = gga_fact_1 * den_pos_z_eval_device[ tid_x ] + gga_fact_2 * den_neg_z_eval_device[ tid_x ];
+      
+
+    }
+    if constexpr ( den_selector == DEN_Z ) {
+       sign = -1.0;
+       x_fact = gga_fact_3 * den_neg_x_eval_device[ tid_x ] + gga_fact_2 * den_pos_x_eval_device[ tid_x ];
+       y_fact = gga_fact_3 * den_neg_y_eval_device[ tid_x ] + gga_fact_2 * den_pos_y_eval_device[ tid_x ];
+       z_fact = gga_fact_3 * den_neg_z_eval_device[ tid_x ] + gga_fact_2 * den_pos_z_eval_device[ tid_x ];
+
+    }
+
+    z_matrix_device[ ibfoff ] =   x_fact * dbasis_x_eval_device[ ibfoff ]      
+                                + y_fact * dbasis_y_eval_device[ ibfoff ]
+                                + z_fact * dbasis_z_eval_device[ ibfoff ] 
+                                + (factp + sign * factm) * basis_eval_device[ ibfoff ];
+  }
 }
-              
 
 
+
+
+template<density_id den_selector>
+__global__ void zmat_gga_vxc_gks_kernel( size_t        ntasks,
+                                     XCDeviceTask* tasks_device ) {
+
+  const int batch_idx = blockIdx.z;
+  if( batch_idx >= ntasks ) return;
+
+  auto& task = tasks_device[ batch_idx ];
+  const auto npts            = task.npts;
+  const auto nbf             = task.bfn_screening.nbe;
+
+  const double* vrho_pos_device    = task.vrho_pos;
+  const double* vrho_neg_device    = task.vrho_neg;
+  const double* vgamma_pp_device   = task.vgamma_pp;
+  const double* vgamma_pm_device   = task.vgamma_pm;
+  const double* vgamma_mm_device   = task.vgamma_mm;
+
+  
+  // for non-DEN_S
+  double* K_device;
+  double* H_device;
+  if constexpr ( den_selector == DEN_Z ) { K_device = task.K_z; H_device = task.H_z; }
+  if constexpr ( den_selector == DEN_Y ) { K_device = task.K_y; H_device = task.H_y; }
+  if constexpr ( den_selector == DEN_X ) { K_device = task.K_x; H_device = task.H_x; }
+
+  const auto* dden_sx_eval_device = task.dden_sx;
+  const auto* dden_sy_eval_device = task.dden_sy;
+  const auto* dden_sz_eval_device = task.dden_sz;
+  const auto* dden_zx_eval_device = task.dden_zx;
+  const auto* dden_zy_eval_device = task.dden_zy;
+  const auto* dden_zz_eval_device = task.dden_zz;
+  const auto* dden_yx_eval_device = task.dden_yx;
+  const auto* dden_yy_eval_device = task.dden_yy;
+  const auto* dden_yz_eval_device = task.dden_yz;
+  const auto* dden_xx_eval_device = task.dden_xx;
+  const auto* dden_xy_eval_device = task.dden_xy;
+  const auto* dden_xz_eval_device = task.dden_xz;
+
+
+  const auto* basis_eval_device = task.bf;
+  const auto* dbasis_x_eval_device = task.dbfx;
+  const auto* dbasis_y_eval_device = task.dbfy;
+  const auto* dbasis_z_eval_device = task.dbfz;
+
+  auto* z_matrix_device = task.zmat;
+
+  const int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
+  const int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+
+  if( tid_x < npts and tid_y < nbf ) {
+
+    const size_t ibfoff = tid_y * npts + tid_x;
+
+    const double fact_p =  0.5*vrho_pos_device[tid_x];
+    const double fact_m =  0.5*vrho_neg_device[tid_x];
+    
+    const auto gga_fact_pp  = vgamma_pp_device[tid_x];
+    const auto gga_fact_pm  = vgamma_pm_device[tid_x];
+    const auto gga_fact_mm  = vgamma_mm_device[tid_x];
+    
+    const auto gga_fact_1 = 0.5*(gga_fact_pp + gga_fact_pm + gga_fact_mm);
+    const auto gga_fact_2 = 0.5*(gga_fact_pp - gga_fact_mm);
+    const auto gga_fact_3 = 0.5*(gga_fact_pp - gga_fact_pm + gga_fact_mm);
+
+    double s_fact, x_fact, y_fact, z_fact;
+
+    if constexpr ( den_selector == DEN_S ) {
+      const double* Hz_device          = task.H_z;
+      const double* Hy_device          = task.H_y;
+      const double* Hx_device          = task.H_x;
+      
+      s_fact = 0.5 * (fact_p + fact_m);
+
+      x_fact = gga_fact_1 * dden_sx_eval_device[ tid_x ]
+             + gga_fact_2 * (Hz_device[ tid_x ] * dden_zx_eval_device[ tid_x ]
+                          +  Hy_device[ tid_x ] * dden_yx_eval_device[ tid_x ]
+                          +  Hx_device[ tid_x ] * dden_xx_eval_device[ tid_x ] );
+      y_fact = gga_fact_1 * dden_sy_eval_device[ tid_x ]
+             + gga_fact_2 * (Hz_device[ tid_x ] * dden_zy_eval_device[ tid_x ]
+                          +  Hy_device[ tid_x ] * dden_yy_eval_device[ tid_x ]
+                          +  Hx_device[ tid_x ] * dden_xy_eval_device[ tid_x ] );
+      z_fact = gga_fact_1 * dden_sz_eval_device[ tid_x ]
+                        + gga_fact_2 * (Hz_device[ tid_x ] * dden_zz_eval_device[ tid_x ]
+                                     +  Hy_device[ tid_x ] * dden_yz_eval_device[ tid_x ]
+                                     +  Hx_device[ tid_x ] * dden_xz_eval_device[ tid_x ] );
+    }
+
+    if constexpr ( den_selector == DEN_Z ) {
+      s_fact  = K_device[ tid_x ] * 0.5 * (fact_p - fact_m);
+      x_fact  = gga_fact_3 * dden_zx_eval_device[ tid_x ]
+             +  gga_fact_2 * H_device[ tid_x ] * dden_sx_eval_device[ tid_x ];
+      y_fact  = gga_fact_3 * dden_zy_eval_device[ tid_x ]
+             +  gga_fact_2 * H_device[ tid_x ] * dden_sy_eval_device[ tid_x ];
+      z_fact  = gga_fact_3 * dden_zz_eval_device[ tid_x ]
+             +  gga_fact_2 * H_device[ tid_x ] * dden_sz_eval_device[ tid_x ];
+    }
+
+    if constexpr ( den_selector == DEN_Y ) {
+      s_fact  = K_device[ tid_x ] * 0.5 * (fact_p - fact_m);
+      x_fact  = gga_fact_3 * dden_yx_eval_device[ tid_x ]
+             +  gga_fact_2 * H_device[ tid_x ] * dden_sx_eval_device[ tid_x ];
+      y_fact  = gga_fact_3 * dden_yy_eval_device[ tid_x ]
+             +  gga_fact_2 * H_device[ tid_x ] * dden_sy_eval_device[ tid_x ];
+      z_fact  = gga_fact_3 * dden_yz_eval_device[ tid_x ]
+             +  gga_fact_2 * H_device[ tid_x ] * dden_sz_eval_device[ tid_x ];
+    }
+
+    if constexpr ( den_selector == DEN_X ) {
+      s_fact  = K_device[ tid_x ] * 0.5 * (fact_p - fact_m);
+      x_fact  = gga_fact_3 * dden_xx_eval_device[ tid_x ]
+             +  gga_fact_2 * H_device[ tid_x ] * dden_sx_eval_device[ tid_x ];
+      y_fact  = gga_fact_3 * dden_xy_eval_device[ tid_x ]
+             +  gga_fact_2 * H_device[ tid_x ] * dden_sy_eval_device[ tid_x ];
+      z_fact  = gga_fact_3 * dden_xz_eval_device[ tid_x ]
+             +  gga_fact_2 * H_device[ tid_x ] * dden_sz_eval_device[ tid_x ];
+    }
+
+    z_matrix_device[ ibfoff ] =   x_fact * dbasis_x_eval_device[ ibfoff ]      
+                                + y_fact * dbasis_y_eval_device[ ibfoff ]
+                                + z_fact * dbasis_z_eval_device[ ibfoff ] 
+                                + s_fact *  basis_eval_device[ ibfoff ];
+
+  }
+}
+
 template <bool need_lapl>
-__global__ void zmat_mgga_vxc_kernel( size_t        ntasks,
+__global__ void zmat_mgga_vxc_rks_kernel( size_t        ntasks,
                                      XCDeviceTask* tasks_device ) {
 
   const int batch_idx = blockIdx.z;
@@ -156,9 +406,9 @@ __global__ void zmat_mgga_vxc_kernel( size_t        ntasks,
   const auto* vrho_device    = task.vrho;
   const auto* vgamma_device  = task.vgamma;
   const double* vlapl_device = need_lapl ? task.vlapl : nullptr;
-  const auto* den_x_eval_device = task.ddenx;
-  const auto* den_y_eval_device = task.ddeny;
-  const auto* den_z_eval_device = task.ddenz;
+  const auto* den_x_eval_device = task.dden_sx;
+  const auto* den_y_eval_device = task.dden_sy;
+  const auto* den_z_eval_device = task.dden_sz;
 
   const auto* basis_eval_device = task.bf;
   const auto* dbasis_x_eval_device = task.dbfx;
@@ -194,6 +444,60 @@ __global__ void zmat_mgga_vxc_kernel( size_t        ntasks,
   }
 }
 
+
+
+#define ZMAT_VXC_KERN(xc_approx) \
+  cudaStream_t stream = queue.queue_as<util::cuda_stream>(); \
+  dim3 threads(cuda::warp_size,cuda::max_warps_per_thread_block,1); \
+  dim3 blocks( util::div_ceil( max_npts, threads.x ), \
+               util::div_ceil( max_nbf,  threads.y ), \
+               ntasks ); \
+  switch( scheme ) { \
+    case RKS: \
+      zmat_##xc_approx##_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \
+      break; \
+    case UKS: \
+      if ( sel == DEN_S )       zmat_##xc_approx##_vxc_uks_kernel<DEN_S><<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \
+      else if ( sel == DEN_Z )  zmat_##xc_approx##_vxc_uks_kernel<DEN_Z><<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \
+      else GAUXC_GENERIC_EXCEPTION( "zmat_##xc_approx##_vxc invalid density" ); \
+      break; \
+    case GKS: \
+      if ( sel == DEN_S )       zmat_##xc_approx##_vxc_gks_kernel<DEN_S><<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \
+      else if ( sel == DEN_Z )  zmat_##xc_approx##_vxc_gks_kernel<DEN_Z><<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \
+      else if ( sel == DEN_Y )  zmat_##xc_approx##_vxc_gks_kernel<DEN_Y><<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \
+      else if ( sel == DEN_X )  zmat_##xc_approx##_vxc_gks_kernel<DEN_X><<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \
+      else GAUXC_GENERIC_EXCEPTION( "zmat_##xc_approx##_vxc invalid density" ); \
+      break; \
+    default: \
+      GAUXC_GENERIC_EXCEPTION( "zmat_##xc_approx##_vxc invalid KS scheme" ); \
+  }
+
+
+
+void zmat_lda_vxc( size_t            ntasks,
+                   int32_t           max_nbf,
+                   int32_t           max_npts,
+                   XCDeviceTask*     tasks_device,
+                   integrator_ks_scheme scheme,
+                   density_id sel,
+                   device_queue queue ) {
+ZMAT_VXC_KERN(lda)
+}
+
+
+
+void zmat_gga_vxc( size_t            ntasks,
+                   int32_t           max_nbf,
+                   int32_t           max_npts,
+                   XCDeviceTask*     tasks_device,
+                   integrator_ks_scheme scheme,
+                   density_id sel,
+                   device_queue queue ) {
+ZMAT_VXC_KERN(gga)
+}
+
+
+
 void zmat_mgga_vxc( size_t            ntasks,
                     int32_t           max_nbf,
                     int32_t           max_npts,
@@ -210,9 +514,9 @@ void zmat_mgga_vxc( size_t            ntasks,
                ntasks );
 
   if(do_lapl)
-    zmat_mgga_vxc_kernel<true><<< blocks, threads, 0, stream >>>( ntasks, tasks_device );
+    zmat_mgga_vxc_rks_kernel<true><<< blocks, threads, 0, stream >>>( ntasks, tasks_device );
   else
-    zmat_mgga_vxc_kernel<false><<< blocks, threads, 0, stream >>>( ntasks, tasks_device );
+    zmat_mgga_vxc_rks_kernel<false><<< blocks, threads, 0, stream >>>( ntasks, tasks_device );
 
 }
 
@@ -232,7 +536,7 @@ void zmat_mgga_vxc( size_t            ntasks,
 
 
 template <bool need_lapl>
-__global__ void mmat_mgga_vxc_kernel( size_t        ntasks,
+__global__ void mmat_mgga_vxc_rks_kernel( size_t        ntasks,
                                      XCDeviceTask* tasks_device ) {
 
   const int batch_idx = blockIdx.z;
@@ -332,9 +636,9 @@ void mmat_mgga_vxc( size_t            ntasks,
                ntasks );
 
   if(do_lapl)
-    mmat_mgga_vxc_kernel<true><<< blocks, threads, 0, stream >>>( ntasks, tasks_device );
+    mmat_mgga_vxc_rks_kernel<true><<< blocks, threads, 0, stream >>>( ntasks, tasks_device );
   else
-    mmat_mgga_vxc_kernel<false><<< blocks, threads, 0, stream >>>( ntasks, tasks_device );
+    mmat_mgga_vxc_rks_kernel<false><<< blocks, threads, 0, stream >>>( ntasks, tasks_device );
 
   //print_zmat_stats<<<1,1,0,stream>>>(ntasks,tasks_device);
 }
diff --git a/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.cxx b/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.cxx
index 237f5647..3c11a5fe 100644
--- a/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.cxx
+++ b/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.cxx
@@ -16,9 +16,10 @@
 
 namespace GauXC {
 
-void AoSScheme1CUTLASSBase::eval_xmat(double fac, XCDeviceData* _data, bool do_grad ){
+void AoSScheme1CUTLASSBase::eval_xmat(double fac, XCDeviceData* _data, bool do_grad, density_id den_id ){
 
   if( do_grad ) GAUXC_GENERIC_EXCEPTION("CUTLASS + X Gradient NYI");
+  if( den_id != DEN_S ) GAUXC_GENERIC_EXCEPTION("CUTLASS + U/GKS NYI");
 
   auto* data = dynamic_cast<Data*>(_data);
   if( !data ) GAUXC_BAD_LWD_DATA_CAST();
@@ -33,7 +34,7 @@ void AoSScheme1CUTLASSBase::eval_xmat(double fac, XCDeviceData* _data, bool do_g
   const auto submat_block_size = data->get_submat_chunk_size( nbf, 0 );
   auto static_stack  = data->static_stack;
   auto aos_stack     = data->aos_stack;
-  sym_pack_submat( ntasks, aos_stack.device_tasks, static_stack.dmat_device, 
+  sym_pack_submat( ntasks, aos_stack.device_tasks, static_stack.dmat_s_device, 
     nbf, submat_block_size, data->device_backend_->queue() );
 
   auto cutlass_stack = data->cutlass_stack;
@@ -50,7 +51,7 @@ void AoSScheme1CUTLASSBase::eval_xmat(double fac, XCDeviceData* _data, bool do_g
   );
 }
 
-void AoSScheme1CUTLASSBase::inc_vxc( XCDeviceData* _data, bool do_m){
+void AoSScheme1CUTLASSBase::inc_vxc( XCDeviceData* _data, density_id den_id,  bool do_m){
 
   auto* data = dynamic_cast<Data*>(_data);
   if( !data ) GAUXC_BAD_LWD_DATA_CAST();
@@ -58,6 +59,7 @@ void AoSScheme1CUTLASSBase::inc_vxc( XCDeviceData* _data, bool do_m){
   if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND();
 
   if(do_m) GAUXC_GENERIC_EXCEPTION("CUTLASS + MGGA NYI");
+  if( den_id != DEN_S ) GAUXC_GENERIC_EXCEPTION("CUTLASS + U/GKS NYI");
 
   auto& tasks = data->host_device_tasks;
   const auto ntasks = tasks.size();
@@ -81,7 +83,7 @@ void AoSScheme1CUTLASSBase::inc_vxc( XCDeviceData* _data, bool do_m){
   auto static_stack  = data->static_stack;
   auto aos_stack     = data->aos_stack;
   sym_task_inc_potential( ntasks, aos_stack.device_tasks, 
-    static_stack.vxc_device, nbf, submat_block_size, 
+    static_stack.vxc_s_device, nbf, submat_block_size, 
     data->device_backend_->queue() );
 }
 
diff --git a/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.hpp b/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.hpp
index 8654a1e7..0f3ec69e 100644
--- a/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.hpp
+++ b/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.hpp
@@ -17,8 +17,8 @@ namespace GauXC {
 
 struct AoSScheme1CUTLASSBase : public AoSScheme1Base {
 
-  void eval_xmat(double fac, XCDeviceData*, bool do_grad ) override final;
-  void inc_vxc( XCDeviceData*, bool ) override final;
+  void eval_xmat(double fac, XCDeviceData*, bool do_grad, density_id ) override final;
+  void inc_vxc( XCDeviceData*, density_id, bool ) override final;
 
   struct Data;
 
diff --git a/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_data_base.cxx b/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_data_base.cxx
index 96fbc97d..6bf35c75 100644
--- a/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_data_base.cxx
+++ b/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_data_base.cxx
@@ -96,7 +96,7 @@ void AoSScheme1CUTLASSBase::Data::pack_and_send(
   std::vector<int64_t> ld64_dmat_host( ntask ), ld64_zmat_host( ntask ), 
                        ld64_vmat_host( ntask ), ld64_bf_host( ntask );
 
-  double* static_dmat = static_stack.dmat_device;
+  double* static_dmat = static_stack.dmat_s_device;
   const auto nbf = global_dims.nbf;
 
   // host_device_tasks should be populated by parent impl called at top
diff --git a/src/xc_integrator/local_work_driver/device/local_device_work_driver.cxx b/src/xc_integrator/local_work_driver/device/local_device_work_driver.cxx
index e4d288b8..2a83e76c 100644
--- a/src/xc_integrator/local_work_driver/device/local_device_work_driver.cxx
+++ b/src/xc_integrator/local_work_driver/device/local_device_work_driver.cxx
@@ -36,6 +36,28 @@ void LocalDeviceWorkDriver::NAME( XCDeviceData* device_data, bool b ) { \
   pimpl_->NAME(device_data, b);                                         \
 }
 
+#define FWD_TO_PIMPL_DEN_ID(NAME) \
+void LocalDeviceWorkDriver::NAME( XCDeviceData* device_data, density_id den ) { \
+  throw_if_invalid_pimpl(pimpl_);                               \
+  pimpl_->NAME(device_data, den);                               \
+}
+
+#define FWD_TO_PIMPL_DEN_ID_BOOL(NAME) \
+void LocalDeviceWorkDriver::NAME( XCDeviceData* device_data, density_id den, bool b ) { \
+  throw_if_invalid_pimpl(pimpl_);                               \
+  pimpl_->NAME(device_data, den, b);                               \
+}
+
+#define FWD_TO_PIMPL_KS_SCHEME(NAME) \
+void LocalDeviceWorkDriver::NAME( XCDeviceData* device_data, integrator_ks_scheme track ) { \
+  throw_if_invalid_pimpl(pimpl_);                               \
+  pimpl_->NAME(device_data, track);                               \
+}
+#define FWD_TO_PIMPL_KS_SCHEME_DEN_ID(NAME) \
+void LocalDeviceWorkDriver::NAME( XCDeviceData* device_data, integrator_ks_scheme track, density_id den ) { \
+  throw_if_invalid_pimpl(pimpl_);                               \
+  pimpl_->NAME(device_data, track, den);                               \
+}
 
 FWD_TO_PIMPL(partition_weights)         // Partition weights
 
@@ -44,63 +66,42 @@ FWD_TO_PIMPL(eval_collocation_gradient) // Collocation Gradient
 FWD_TO_PIMPL(eval_collocation_hessian)  // Collocation Hessian
 FWD_TO_PIMPL(eval_collocation_laplacian)  // Collocation Laplacian
 
-FWD_TO_PIMPL(eval_uvvar_lda_rks)            // U/VVar LDA (density)
-FWD_TO_PIMPL(eval_uvvar_gga_rks)            // U/VVar GGA (density + grad, gamma)
-
-FWD_TO_PIMPL(eval_uvvar_lda_uks)            // U/VVar LDA (density)
-FWD_TO_PIMPL(eval_uvvar_gga_uks)            // U/VVar GGA (density + grad, gamma)
 
-FWD_TO_PIMPL(eval_uvvar_lda_gks)            // U/VVar LDA (density)
-FWD_TO_PIMPL(eval_uvvar_gga_gks)            // U/VVar GGA (density + grad, gamma)
+FWD_TO_PIMPL_KS_SCHEME(eval_uvars_lda)            // U variables LDA (rho)
+FWD_TO_PIMPL_KS_SCHEME(eval_uvars_gga)            // U variables GGA (gamma)
+FWD_TO_PIMPL_BOOL(eval_uvars_mgga)                // U variables MGGA (tau, lapl)
+FWD_TO_PIMPL_DEN_ID_BOOL(eval_vvar)               // V variable (density + grad)
 
-FWD_TO_PIMPL(eval_zmat_lda_vxc_rks)         // Eval Z Matrix LDA VXC
-FWD_TO_PIMPL(eval_zmat_gga_vxc_rks)         // Eval Z Matrix GGA VXC
-
-FWD_TO_PIMPL(eval_zmat_lda_vxc_uks)         // Eval Z Matrix LDA VXC
-FWD_TO_PIMPL(eval_zmat_gga_vxc_uks)         // Eval Z Matrix GGA VXC
-
-FWD_TO_PIMPL(eval_zmat_lda_vxc_gks)         // Eval Z Matrix LDA VXC
-FWD_TO_PIMPL(eval_zmat_gga_vxc_gks)         // Eval Z Matrix GGA VXC
-
-
-FWD_TO_PIMPL_BOOL(eval_uvvar_mgga_rks)           // U/VVar MGGA (density + grad, gamma, tau, lapl)
-FWD_TO_PIMPL_BOOL(eval_uvvar_mgga_uks)           // U/VVar MGGA (density + grad, gamma, tau, lapl)
-FWD_TO_PIMPL_BOOL(eval_uvvar_mgga_gks)           // U/VVar MGGA (density + grad, gamma, tau, lapl)
-
-FWD_TO_PIMPL_BOOL(eval_zmat_mgga_vxc_rks)        // Eval Z Matrix MGGA VXC
-FWD_TO_PIMPL_BOOL(eval_zmat_mgga_vxc_uks)        // Eval Z Matrix MGGA VXC
-FWD_TO_PIMPL_BOOL(eval_zmat_mgga_vxc_gks)        // Eval Z Matrix MGGA VXC
-
-FWD_TO_PIMPL_BOOL(eval_mmat_mgga_vxc_rks)        // Eval M Matrix MGGA VXC
-FWD_TO_PIMPL_BOOL(eval_mmat_mgga_vxc_uks)        // Eval M Matrix MGGA VXC
-FWD_TO_PIMPL_BOOL(eval_mmat_mgga_vxc_gks)        // Eval M Matrix MGGA VXC
+FWD_TO_PIMPL_KS_SCHEME_DEN_ID(eval_zmat_lda_vxc)         // Eval Z Matrix LDA VXC
+FWD_TO_PIMPL_KS_SCHEME_DEN_ID(eval_zmat_gga_vxc)         // Eval Z Matrix GGA VXC
+FWD_TO_PIMPL_BOOL(eval_zmat_mgga_vxc)                    // Eval Z Matrix mGGA VXC
+FWD_TO_PIMPL_BOOL(eval_mmat_mgga_vxc)                    // Eval M Matrix mGGA VXC
 
 FWD_TO_PIMPL(eval_exx_fmat)             // Eval EXX F Matrix
 //FWD_TO_PIMPL(eval_exx_gmat)             // Eval EXX G Matrix
 
+
 FWD_TO_PIMPL(inc_exc)
 FWD_TO_PIMPL(inc_nel)
+FWD_TO_PIMPL_DEN_ID_BOOL(inc_vxc)            // Increment VXC_I by Z
+
 FWD_TO_PIMPL(inc_exx_k)     
 FWD_TO_PIMPL(inc_exc_grad_lda)
 FWD_TO_PIMPL(inc_exc_grad_gga)
 
-FWD_TO_PIMPL(symmetrize_vxc)
+FWD_TO_PIMPL_DEN_ID(symmetrize_vxc)
 FWD_TO_PIMPL(symmetrize_exx_k)
 FWD_TO_PIMPL(eval_exx_ek_screening_bfn_stats)
 
 // X     = fac * P * B
 // dX/dx = fac * P * dB/dx (do_grad)
-void LocalDeviceWorkDriver::eval_xmat( double fac, XCDeviceData* device_data, bool do_grad ) {
-  throw_if_invalid_pimpl(pimpl_);
-  pimpl_->eval_xmat(fac, device_data, do_grad);
-}
 
-// Increment VXC by Z (+ M{x,y,z} for MMGA)
-void LocalDeviceWorkDriver::inc_vxc( XCDeviceData* device_data, bool do_m ) {
+void LocalDeviceWorkDriver::eval_xmat( double fac, XCDeviceData* device_data, bool do_grad, density_id den ) {
   throw_if_invalid_pimpl(pimpl_);
-  pimpl_->inc_vxc(device_data, do_m);
+  pimpl_->eval_xmat(fac, device_data, do_grad, den);
 }
 
+
 void LocalDeviceWorkDriver::eval_exx_gmat( XCDeviceData* device_data, 
   const BasisSetMap& basis_map) {
   throw_if_invalid_pimpl(pimpl_);
diff --git a/src/xc_integrator/local_work_driver/device/local_device_work_driver.hpp b/src/xc_integrator/local_work_driver/device/local_device_work_driver.hpp
index d02c8810..604f0739 100644
--- a/src/xc_integrator/local_work_driver/device/local_device_work_driver.hpp
+++ b/src/xc_integrator/local_work_driver/device/local_device_work_driver.hpp
@@ -61,47 +61,30 @@ class LocalDeviceWorkDriver : public LocalWorkDriver {
   void eval_collocation_gradient( XCDeviceData* );
   void eval_collocation_hessian( XCDeviceData* );
   void eval_collocation_laplacian( XCDeviceData* );
+  void eval_xmat( double fac, XCDeviceData*, bool do_grad, density_id den );
+  
+  void eval_uvars_lda( XCDeviceData*, integrator_ks_scheme ) ;
+  void eval_uvars_gga( XCDeviceData*, integrator_ks_scheme ) ;
+  void eval_uvars_mgga( XCDeviceData*, bool ) ;
+  void eval_vvar( XCDeviceData*, density_id, bool ) ;
 
-  void eval_xmat( double fac, XCDeviceData*, bool do_grad = false );
-
-  void eval_uvvar_lda_rks( XCDeviceData* );
-  void eval_uvvar_gga_rks( XCDeviceData* );
-  void eval_uvvar_mgga_rks( XCDeviceData*, bool );
-
-  void eval_uvvar_lda_uks( XCDeviceData* );
-  void eval_uvvar_gga_uks( XCDeviceData* );
-  void eval_uvvar_mgga_uks( XCDeviceData*, bool );
-
-  void eval_uvvar_lda_gks( XCDeviceData* );
-  void eval_uvvar_gga_gks( XCDeviceData* );
-  void eval_uvvar_mgga_gks( XCDeviceData*, bool );
 
   void eval_kern_exc_vxc_lda( const functional_type&, XCDeviceData* );
   void eval_kern_exc_vxc_gga( const functional_type&, XCDeviceData* );
   void eval_kern_exc_vxc_mgga( const functional_type&, XCDeviceData* );
 
-  void eval_zmat_lda_vxc_rks( XCDeviceData* );
-  void eval_zmat_gga_vxc_rks( XCDeviceData* );
-  void eval_zmat_mgga_vxc_rks( XCDeviceData*, bool );
-
-  void eval_zmat_lda_vxc_uks( XCDeviceData* );
-  void eval_zmat_gga_vxc_uks( XCDeviceData* );
-  void eval_zmat_mgga_vxc_uks( XCDeviceData*, bool );
-
-  void eval_zmat_lda_vxc_gks( XCDeviceData* );
-  void eval_zmat_gga_vxc_gks( XCDeviceData* );
-  void eval_zmat_mgga_vxc_gks( XCDeviceData*, bool );
 
-  void eval_mmat_mgga_vxc_rks( XCDeviceData*, bool );
-  void eval_mmat_mgga_vxc_uks( XCDeviceData*, bool );
-  void eval_mmat_mgga_vxc_gks( XCDeviceData*, bool );
+  void eval_zmat_lda_vxc( XCDeviceData*, integrator_ks_scheme, density_id ) ;
+  void eval_zmat_gga_vxc( XCDeviceData*, integrator_ks_scheme, density_id ) ;
+  void eval_zmat_mgga_vxc( XCDeviceData*, bool ) ;
+  void eval_mmat_mgga_vxc( XCDeviceData*, bool );
 
   void eval_exx_fmat( XCDeviceData* );
   void eval_exx_gmat( XCDeviceData*, const BasisSetMap& );
 
   void inc_exc( XCDeviceData* );
   void inc_nel( XCDeviceData* );
-  void inc_vxc( XCDeviceData*, bool do_m = false );
+  void inc_vxc( XCDeviceData*, density_id, bool do_m = false );
   void inc_exc_grad_lda( XCDeviceData* );
   void inc_exc_grad_gga( XCDeviceData* );
   void inc_exx_k( XCDeviceData* );
@@ -110,7 +93,7 @@ class LocalDeviceWorkDriver : public LocalWorkDriver {
   void exx_ek_shellpair_collision( double eps_E, double eps_K, XCDeviceData*, 
     host_task_iterator, host_task_iterator, const ShellPairCollection<double>& );
 
-  void symmetrize_vxc( XCDeviceData* );
+  void symmetrize_vxc( XCDeviceData*, density_id );
   void symmetrize_exx_k( XCDeviceData* );
 
   std::unique_ptr<XCDeviceData> create_device_data(const DeviceRuntimeEnvironment&);
diff --git a/src/xc_integrator/local_work_driver/device/local_device_work_driver_pimpl.hpp b/src/xc_integrator/local_work_driver/device/local_device_work_driver_pimpl.hpp
index 7a31b0e3..f43dd12c 100644
--- a/src/xc_integrator/local_work_driver/device/local_device_work_driver_pimpl.hpp
+++ b/src/xc_integrator/local_work_driver/device/local_device_work_driver_pimpl.hpp
@@ -34,41 +34,28 @@ struct LocalDeviceWorkDriverPIMPL {
   virtual void eval_collocation_gradient( XCDeviceData* ) = 0;
   virtual void eval_collocation_hessian( XCDeviceData* ) = 0;
   virtual void eval_collocation_laplacian( XCDeviceData* ) = 0;
-  virtual void eval_xmat( double fac, XCDeviceData*, bool do_grad ) = 0;
+  virtual void eval_xmat( double fac, XCDeviceData*, bool do_grad, density_id den ) = 0;
   virtual void eval_exx_fmat( XCDeviceData* ) = 0;
   //virtual void eval_exx_gmat( XCDeviceData* ) = 0;
   virtual void eval_exx_gmat( XCDeviceData*, const BasisSetMap& ) = 0;
-  virtual void eval_uvvar_lda_rks( XCDeviceData* ) = 0;
-  virtual void eval_uvvar_gga_rks( XCDeviceData* ) = 0;
-  virtual void eval_uvvar_mgga_rks( XCDeviceData*, bool ) = 0;
-  virtual void eval_uvvar_lda_uks( XCDeviceData* ) = 0;
-  virtual void eval_uvvar_gga_uks( XCDeviceData* ) = 0;
-  virtual void eval_uvvar_mgga_uks( XCDeviceData*, bool ) = 0;
-  virtual void eval_uvvar_lda_gks( XCDeviceData* ) = 0;  
-  virtual void eval_uvvar_gga_gks( XCDeviceData* ) = 0;
-  virtual void eval_uvvar_mgga_gks( XCDeviceData*, bool ) = 0;
+  virtual void eval_uvars_lda( XCDeviceData*, integrator_ks_scheme ) = 0;
+  virtual void eval_uvars_gga( XCDeviceData*, integrator_ks_scheme ) = 0;
+  virtual void eval_uvars_mgga( XCDeviceData*, bool ) = 0;
+  virtual void eval_vvar( XCDeviceData*,  density_id, bool ) = 0;
   virtual void eval_kern_exc_vxc_lda( const functional_type&, XCDeviceData* ) = 0;
   virtual void eval_kern_exc_vxc_gga( const functional_type&, XCDeviceData* ) = 0;
   virtual void eval_kern_exc_vxc_mgga( const functional_type&, XCDeviceData* ) = 0;
-  virtual void eval_zmat_lda_vxc_rks( XCDeviceData* ) = 0;
-  virtual void eval_zmat_gga_vxc_rks( XCDeviceData* ) = 0;
-  virtual void eval_zmat_mgga_vxc_rks( XCDeviceData*, bool ) = 0;
-  virtual void eval_zmat_lda_vxc_uks( XCDeviceData* ) = 0;
-  virtual void eval_zmat_gga_vxc_uks( XCDeviceData* ) = 0;
-  virtual void eval_zmat_mgga_vxc_uks( XCDeviceData*, bool ) = 0;
-  virtual void eval_zmat_lda_vxc_gks( XCDeviceData* ) = 0;
-  virtual void eval_zmat_gga_vxc_gks( XCDeviceData* ) = 0;
-  virtual void eval_zmat_mgga_vxc_gks( XCDeviceData*, bool ) = 0;
-  virtual void eval_mmat_mgga_vxc_rks( XCDeviceData*, bool ) = 0;
-  virtual void eval_mmat_mgga_vxc_uks( XCDeviceData*, bool ) = 0;
-  virtual void eval_mmat_mgga_vxc_gks( XCDeviceData*, bool ) = 0;
+  virtual void eval_zmat_lda_vxc( XCDeviceData*, integrator_ks_scheme, density_id ) = 0;
+  virtual void eval_zmat_gga_vxc( XCDeviceData*, integrator_ks_scheme, density_id ) = 0;
+  virtual void eval_zmat_mgga_vxc( XCDeviceData*, bool ) = 0;
+  virtual void eval_mmat_mgga_vxc( XCDeviceData*, bool ) = 0;
   virtual void inc_exc( XCDeviceData* ) = 0;
   virtual void inc_nel( XCDeviceData* ) = 0;
-  virtual void inc_vxc( XCDeviceData*, bool ) = 0;
+  virtual void inc_vxc( XCDeviceData* , density_id, bool) = 0;
   virtual void inc_exc_grad_lda( XCDeviceData* ) = 0;
   virtual void inc_exc_grad_gga( XCDeviceData* ) = 0;
   virtual void inc_exx_k( XCDeviceData* ) = 0;
-  virtual void symmetrize_vxc( XCDeviceData* ) = 0;
+  virtual void symmetrize_vxc( XCDeviceData*, density_id ) = 0;
   virtual void symmetrize_exx_k( XCDeviceData* ) = 0;
 
   virtual void eval_exx_ek_screening_bfn_stats( XCDeviceData* ) = 0;
diff --git a/src/xc_integrator/local_work_driver/device/scheme1_base.cxx b/src/xc_integrator/local_work_driver/device/scheme1_base.cxx
index f42aff3c..5ffa8443 100644
--- a/src/xc_integrator/local_work_driver/device/scheme1_base.cxx
+++ b/src/xc_integrator/local_work_driver/device/scheme1_base.cxx
@@ -17,6 +17,7 @@
 #include "device/common/increment_exc_grad.hpp"
 #include "device/common/exx_ek_screening.hpp"
 
+#include "buffer_adaptor.hpp"
 
 #include "device/common/shell_pair_to_task.hpp"
 #ifdef GAUXC_HAS_CUDA
@@ -93,7 +94,7 @@ namespace XGPU {
         size_t max_ntask,
         const GauXC::ShellPairToTaskDevice* sp2task,
         GauXC::XCDeviceTask*                device_tasks,
-		    double *boys_table,
+        double *boys_table,
         cudaStream_t stream); 
 
   void integral_1_1_task_batched(
@@ -116,7 +117,7 @@ namespace XGPU {
         size_t max_ntask,
         const GauXC::ShellPairToTaskDevice* sp2task,
         GauXC::XCDeviceTask*                device_tasks,
-		    double *boys_table,
+        double *boys_table,
         cudaStream_t stream); 
 
   void integral_2_2_task_batched(
@@ -139,7 +140,7 @@ namespace XGPU {
         size_t max_ntask,
         const GauXC::ShellPairToTaskDevice* sp2task,
         GauXC::XCDeviceTask*                device_tasks,
-		    double *boys_table,
+        double *boys_table,
         cudaStream_t stream); 
         
   void integral_1_0_task_batched(
@@ -164,7 +165,7 @@ namespace XGPU {
         size_t max_ntask,
         const GauXC::ShellPairToTaskDevice* sp2task,
         GauXC::XCDeviceTask*                device_tasks,
-		    double *boys_table,
+        double *boys_table,
         cudaStream_t stream); 
 
   void integral_2_0_task_batched(
@@ -189,7 +190,7 @@ namespace XGPU {
         size_t max_ntask,
         const GauXC::ShellPairToTaskDevice* sp2task,
         GauXC::XCDeviceTask*                device_tasks,
-		    double *boys_table,
+        double *boys_table,
         cudaStream_t stream); 
 
   void integral_2_1_task_batched(
@@ -214,7 +215,7 @@ namespace XGPU {
         size_t max_ntask,
         const GauXC::ShellPairToTaskDevice* sp2task,
         GauXC::XCDeviceTask*                device_tasks,
-		    double *boys_table,
+        double *boys_table,
         cudaStream_t stream); 
 }
 #endif
@@ -234,7 +235,7 @@ AoSScheme1Base::~AoSScheme1Base() noexcept {
 #endif
 }
 
-void AoSScheme1Base::eval_zmat_lda_vxc_rks( XCDeviceData* _data){
+void AoSScheme1Base::eval_zmat_lda_vxc( XCDeviceData* _data, integrator_ks_scheme scheme, density_id den ) {
 
   auto* data = dynamic_cast<Data*>(_data);
   if( !data ) GAUXC_BAD_LWD_DATA_CAST();
@@ -250,13 +251,13 @@ void AoSScheme1Base::eval_zmat_lda_vxc_rks( XCDeviceData* _data){
   }
 
   auto aos_stack     = data->aos_stack;
-  zmat_lda_vxc( ntasks, nbe_max, npts_max, aos_stack.device_tasks,
+  zmat_lda_vxc( ntasks, nbe_max, npts_max, aos_stack.device_tasks, scheme, den,
     data->device_backend_->queue() );
 
   data->device_backend_->check_error("zmat_lda" __FILE__ ": " + std::to_string(__LINE__));
 }
 
-void AoSScheme1Base::eval_zmat_gga_vxc_rks( XCDeviceData* _data){
+void AoSScheme1Base::eval_zmat_gga_vxc( XCDeviceData* _data, integrator_ks_scheme scheme, density_id den ) {
 
   auto* data = dynamic_cast<Data*>(_data);
   if( !data ) GAUXC_BAD_LWD_DATA_CAST();
@@ -272,13 +273,13 @@ void AoSScheme1Base::eval_zmat_gga_vxc_rks( XCDeviceData* _data){
   }
 
   auto aos_stack     = data->aos_stack;
-  zmat_gga_vxc( ntasks, nbe_max, npts_max, aos_stack.device_tasks,
+  zmat_gga_vxc( ntasks, nbe_max, npts_max, aos_stack.device_tasks, scheme, den,
     data->device_backend_->queue() );
 
   data->device_backend_->check_error("zmat_gga" __FILE__ ": " + std::to_string(__LINE__));
 }
 
-void AoSScheme1Base::eval_zmat_mgga_vxc_rks( XCDeviceData* _data, bool do_lapl){
+void AoSScheme1Base::eval_zmat_mgga_vxc( XCDeviceData* _data, bool do_lapl){
 
   auto* data = dynamic_cast<Data*>(_data);
   if( !data ) GAUXC_BAD_LWD_DATA_CAST();
@@ -301,39 +302,8 @@ void AoSScheme1Base::eval_zmat_mgga_vxc_rks( XCDeviceData* _data, bool do_lapl){
   data->device_backend_->check_error("zmat_mgga" __FILE__ ": " + std::to_string(__LINE__));
 }
 
-void AoSScheme1Base::eval_zmat_lda_vxc_uks( XCDeviceData* ){
-
-  GAUXC_GENERIC_EXCEPTION("UKS Not Yet Implemented For Device");
-
-}
-
-void AoSScheme1Base::eval_zmat_gga_vxc_uks( XCDeviceData* ){
-
-  GAUXC_GENERIC_EXCEPTION("UKS Not Yet Implemented For Device");
 
-}
-
-void AoSScheme1Base::eval_zmat_mgga_vxc_uks( XCDeviceData*, bool /*do_lapl*/){
-  GAUXC_GENERIC_EXCEPTION("UKS MGGA Not Yet Implemented For Device");
-}
-
-void AoSScheme1Base::eval_zmat_lda_vxc_gks( XCDeviceData* ){
-
-  GAUXC_GENERIC_EXCEPTION("GKS Not Yet Implemented For Device");
-
-}
-
-void AoSScheme1Base::eval_zmat_gga_vxc_gks( XCDeviceData* ){
-
-  GAUXC_GENERIC_EXCEPTION("GKS Not Yet Implemented For Device");
-
-}
-
-void AoSScheme1Base::eval_zmat_mgga_vxc_gks( XCDeviceData*, bool /*do_lapl*/){
-  GAUXC_GENERIC_EXCEPTION("GKS MGGA Not Yet Implemented For Device");
-}
-
-void AoSScheme1Base::eval_mmat_mgga_vxc_rks( XCDeviceData* _data, bool do_lapl){
+void AoSScheme1Base::eval_mmat_mgga_vxc( XCDeviceData* _data, bool do_lapl){
 
   auto* data = dynamic_cast<Data*>(_data);
   if( !data ) GAUXC_BAD_LWD_DATA_CAST();
@@ -355,12 +325,6 @@ void AoSScheme1Base::eval_mmat_mgga_vxc_rks( XCDeviceData* _data, bool do_lapl){
 
   data->device_backend_->check_error("mmat_mgga" __FILE__ ": " + std::to_string(__LINE__));
 }
-void AoSScheme1Base::eval_mmat_mgga_vxc_uks( XCDeviceData*, bool /*do_lapl*/){
-  GAUXC_GENERIC_EXCEPTION("UKS MGGA Not Yet Implemented For Device");
-}
-void AoSScheme1Base::eval_mmat_mgga_vxc_gks( XCDeviceData*, bool /*do_lapl*/){
-  GAUXC_GENERIC_EXCEPTION("GKS MGGA Not Yet Implemented For Device");
-}
 
 void AoSScheme1Base::eval_collocation( XCDeviceData* _data ) {
 
@@ -479,10 +443,20 @@ void AoSScheme1Base::inc_exc( XCDeviceData* _data ){
 
   auto base_stack    = data->base_stack;
   auto static_stack  = data->static_stack;
+  const bool is_RKS  = data->allocated_terms.ks_scheme == RKS;
+  const bool is_UKS  = data->allocated_terms.ks_scheme == UKS;
+  const bool is_GKS  = data->allocated_terms.ks_scheme == GKS;
+  const bool is_pol  = is_UKS or is_GKS;
+  
   gdot( data->device_backend_->master_blas_handle(), data->total_npts_task_batch,
-    base_stack.eps_eval_device, 1, base_stack.den_eval_device, 1, 
+    base_stack.eps_eval_device, 1, base_stack.den_s_eval_device, 1, 
     static_stack.acc_scr_device, static_stack.exc_device );
 
+  if( is_pol ) {
+    gdot( data->device_backend_->master_blas_handle(), data->total_npts_task_batch,
+      base_stack.eps_eval_device, 1, base_stack.den_z_eval_device, 1, 
+      static_stack.acc_scr_device, static_stack.exc_device );
+  }
   
   data->device_backend_->check_error("inc exc" __FILE__ ": " + std::to_string(__LINE__));
 }
@@ -495,32 +469,29 @@ void AoSScheme1Base::inc_nel( XCDeviceData* _data ){
 
   auto base_stack    = data->base_stack;
   auto static_stack  = data->static_stack;
+
+  const bool is_RKS  = data->allocated_terms.ks_scheme == RKS;
+  const bool is_UKS  = data->allocated_terms.ks_scheme == UKS;
+  const bool is_GKS  = data->allocated_terms.ks_scheme == GKS;
+  const bool is_pol  = is_UKS or is_GKS;
+  
   gdot( data->device_backend_->master_blas_handle(), data->total_npts_task_batch,
-    base_stack.weights_device, 1, base_stack.den_eval_device, 1, 
+    base_stack.weights_device, 1, base_stack.den_s_eval_device, 1, 
     static_stack.acc_scr_device, static_stack.nel_device );
 
+  if( is_pol ) {
+    gdot( data->device_backend_->master_blas_handle(), data->total_npts_task_batch,
+      base_stack.weights_device, 1, base_stack.den_z_eval_device, 1, 
+      static_stack.acc_scr_device, static_stack.nel_device );
+  }
   
   data->device_backend_->check_error("inc nel" __FILE__ ": " + std::to_string(__LINE__));
 }
 
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-void AoSScheme1Base::eval_uvvar_lda_rks( XCDeviceData* _data ){
-
+void AoSScheme1Base::eval_uvars_lda( XCDeviceData* _data, integrator_ks_scheme ks_scheme){
   auto* data = dynamic_cast<Data*>(_data);
-  if( !data ) GAUXC_BAD_LWD_DATA_CAST();
+  if ( !data ) GAUXC_BAD_LWD_DATA_CAST();
 
   if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND();
 
@@ -528,31 +499,23 @@ void AoSScheme1Base::eval_uvvar_lda_rks( XCDeviceData* _data ){
   const auto ntasks = tasks.size();
   size_t nbe_max = 0, npts_max = 0;
   for( auto& task : tasks ) {
-    nbe_max  = std::max( nbe_max, task.bfn_screening.nbe );
     npts_max = std::max( npts_max, task.npts );
   }
 
-  // Zero density
   auto base_stack    = data->base_stack;
-  data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, base_stack.den_eval_device, "Den Zero" );
-    
-
+  
   // Evaluate U variables
   auto aos_stack     = data->aos_stack;
-  eval_uvvars_lda( ntasks, nbe_max, npts_max, 
+  GauXC::eval_uvars_lda( ntasks, npts_max, ks_scheme,
     aos_stack.device_tasks, data->device_backend_->queue() );
 
   
   data->device_backend_->check_error("uvvar lda" __FILE__ ": " + std::to_string(__LINE__));
 }
 
-
-
-
-void AoSScheme1Base::eval_uvvar_gga_rks( XCDeviceData* _data ){
-
+void AoSScheme1Base::eval_uvars_gga( XCDeviceData* _data, integrator_ks_scheme ks_scheme){
   auto* data = dynamic_cast<Data*>(_data);
-  if( !data ) GAUXC_BAD_LWD_DATA_CAST();
+  if ( !data ) GAUXC_BAD_LWD_DATA_CAST();
 
   if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND();
 
@@ -560,29 +523,21 @@ void AoSScheme1Base::eval_uvvar_gga_rks( XCDeviceData* _data ){
   const auto ntasks = tasks.size();
   size_t nbe_max = 0, npts_max = 0;
   for( auto& task : tasks ) {
-    nbe_max  = std::max( nbe_max, task.bfn_screening.nbe );
     npts_max = std::max( npts_max, task.npts );
   }
 
-  // Zero density + gradient
   auto base_stack    = data->base_stack;
-  data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, base_stack.den_eval_device, "Den Zero" );
-  data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, base_stack.den_x_eval_device, "DenX Zero" );
-  data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, base_stack.den_y_eval_device, "DenY Zero" );
-  data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, base_stack.den_z_eval_device, "DenZ Zero" );
-
-  // Evaluate U variables
+  
+  // Evaluate U variable
   auto aos_stack     = data->aos_stack;
-  eval_uvvars_gga( ntasks, data->total_npts_task_batch, nbe_max, npts_max, 
-    aos_stack.device_tasks, base_stack.den_x_eval_device, base_stack.den_y_eval_device,
-    base_stack.den_z_eval_device, base_stack.gamma_eval_device, 
-    data->device_backend_->queue() );
+  GauXC::eval_uvars_gga( ntasks, npts_max, ks_scheme,
+    aos_stack.device_tasks, data->device_backend_->queue() );
 
   
   data->device_backend_->check_error("uvvar gga" __FILE__ ": " + std::to_string(__LINE__));
 }
 
-void AoSScheme1Base::eval_uvvar_mgga_rks( XCDeviceData* _data, bool do_lapl ){
+void AoSScheme1Base::eval_uvars_mgga( XCDeviceData* _data, bool do_lapl ){
 
   auto* data = dynamic_cast<Data*>(_data);
   if( !data ) GAUXC_BAD_LWD_DATA_CAST();
@@ -597,12 +552,8 @@ void AoSScheme1Base::eval_uvvar_mgga_rks( XCDeviceData* _data, bool do_lapl ){
     npts_max = std::max( npts_max, task.npts );
   }
 
-  // Zero density + gradient
+  // Zero tau
   auto base_stack    = data->base_stack;
-  data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, base_stack.den_eval_device, "Den Zero" );
-  data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, base_stack.den_x_eval_device, "DenX Zero" );
-  data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, base_stack.den_y_eval_device, "DenY Zero" );
-  data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, base_stack.den_z_eval_device, "DenZ Zero" );
   data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, base_stack.tau_eval_device,   "Tau Zero" );
   if(do_lapl) {
     data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, base_stack.den_lapl_eval_device, "DenLapl Zero" );
@@ -611,47 +562,78 @@ void AoSScheme1Base::eval_uvvar_mgga_rks( XCDeviceData* _data, bool do_lapl ){
 
   // Evaluate U variables
   auto aos_stack     = data->aos_stack;
-  eval_uvvars_mgga( ntasks, data->total_npts_task_batch, nbe_max, npts_max, 
-    aos_stack.device_tasks, base_stack.den_x_eval_device, base_stack.den_y_eval_device,
-    base_stack.den_z_eval_device, base_stack.gamma_eval_device, 
-    do_lapl, data->device_backend_->queue() );
+  GauXC::eval_uvars_mgga( ntasks, data->total_npts_task_batch, nbe_max, npts_max, do_lapl,
+    aos_stack.device_tasks, data->device_backend_->queue() );
 
   
   data->device_backend_->check_error("uvvar mgga" __FILE__ ": " + std::to_string(__LINE__));
 }
 
-void AoSScheme1Base::eval_uvvar_lda_uks( XCDeviceData* ){
-
-  GAUXC_GENERIC_EXCEPTION("UKS Not Yet Implemented For Device"); 
-
-}
-
-void AoSScheme1Base::eval_uvvar_gga_uks( XCDeviceData* ){
-
-  GAUXC_GENERIC_EXCEPTION("UKS Not Yet Implemented For Device");
-
-}
-
-void AoSScheme1Base::eval_uvvar_mgga_uks( XCDeviceData*, bool /*do_lapl*/ ){
-  GAUXC_GENERIC_EXCEPTION("UKS MGGA Not Yet Implemented For Device"); 
-}
 
+void AoSScheme1Base::eval_vvar( XCDeviceData* _data, density_id den_select, bool do_grad){
+  auto* data = dynamic_cast<Data*>(_data);
+  if ( !data ) GAUXC_BAD_LWD_DATA_CAST();
 
-void AoSScheme1Base::eval_uvvar_lda_gks( XCDeviceData* ){
+  if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND();
 
-    GAUXC_GENERIC_EXCEPTION("GKS Not Yet Implemented For Device");
+  auto& tasks = data->host_device_tasks;
+  const auto ntasks = tasks.size();
+  size_t nbe_max = 0, npts_max = 0;
+  for( auto& task : tasks ) {
+    nbe_max  = std::max( nbe_max, task.bfn_screening.nbe );
+    npts_max = std::max( npts_max, task.npts );
+  }
 
-}
+  // Zero density
+  auto base_stack    = data->base_stack;
+  double* den_eval_ptr    = nullptr;
+  double* den_x_eval_ptr  = nullptr;
+  double* den_y_eval_ptr  = nullptr;
+  double* den_z_eval_ptr  = nullptr;
+  switch ( den_select ) {
+    case DEN_S:
+      den_eval_ptr = base_stack.den_s_eval_device;
+      if (do_grad) { den_x_eval_ptr = base_stack.dden_sx_eval_device;
+                     den_y_eval_ptr = base_stack.dden_sy_eval_device;
+                     den_z_eval_ptr = base_stack.dden_sz_eval_device; }
+      break;
+    case DEN_Z:
+      den_eval_ptr = base_stack.den_z_eval_device;
+      if (do_grad) { den_x_eval_ptr = base_stack.dden_zx_eval_device;
+                     den_y_eval_ptr = base_stack.dden_zy_eval_device;
+                     den_z_eval_ptr = base_stack.dden_zz_eval_device; }
+      break;
+    case DEN_Y:
+      den_eval_ptr = base_stack.den_y_eval_device;
+      if (do_grad) { den_x_eval_ptr = base_stack.dden_yx_eval_device;
+                     den_y_eval_ptr = base_stack.dden_yy_eval_device;
+                     den_z_eval_ptr = base_stack.dden_yz_eval_device; }
+      break;
+    case DEN_X:
+      den_eval_ptr = base_stack.den_x_eval_device;
+      if (do_grad) { den_x_eval_ptr = base_stack.dden_xx_eval_device;
+                     den_y_eval_ptr = base_stack.dden_xy_eval_device;
+                     den_z_eval_ptr = base_stack.dden_xz_eval_device; }
+      break;
+    default:
+      GAUXC_GENERIC_EXCEPTION( "eval_vvar called with invalid density selected!" );
+  }
 
-void AoSScheme1Base::eval_uvvar_gga_gks( XCDeviceData* ){
+  data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_eval_ptr, "Den Zero" );
 
-    GAUXC_GENERIC_EXCEPTION("GKS Not Yet Implemented For Device");
+  if (do_grad) {
+    data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_x_eval_ptr, "Den Zero" );
+    data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_y_eval_ptr, "Den Zero" );
+    data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_z_eval_ptr, "Den Zero" );
+  }
+  
+  // Evaluate V variable
+  auto aos_stack     = data->aos_stack;
+  GauXC::eval_vvar( ntasks, nbe_max, npts_max, do_grad, den_select,
+    aos_stack.device_tasks, data->device_backend_->queue() );
 
 }
 
-void AoSScheme1Base::eval_uvvar_mgga_gks( XCDeviceData*, bool /*do_lapl*/ ){
-  GAUXC_GENERIC_EXCEPTION("GKS MGGA Not Yet Implemented For Device"); 
-}
 
 void AoSScheme1Base::eval_kern_exc_vxc_lda( const functional_type& func, 
   XCDeviceData* _data ) {
@@ -664,15 +646,50 @@ void AoSScheme1Base::eval_kern_exc_vxc_lda( const functional_type& func,
   if( !func.is_lda() ) GAUXC_GENERIC_EXCEPTION("XC Kernel not LDA!");
 
   auto base_stack    = data->base_stack;
-  GauXC::eval_kern_exc_vxc_lda( func, data->total_npts_task_batch, 
-    base_stack.den_eval_device, base_stack.eps_eval_device, 
+
+  const bool is_RKS = data->allocated_terms.ks_scheme == RKS;
+  const bool is_UKS = data->allocated_terms.ks_scheme == UKS;
+  const bool is_GKS = data->allocated_terms.ks_scheme == GKS;
+  const bool is_pol = is_UKS or is_GKS;
+  const bool is_excgrad = data->allocated_terms.exc_grad;
+
+  const size_t npts = data->total_npts_task_batch ;
+  
+  auto* dep = base_stack.den_s_eval_device;
+
+  if ( is_pol ) {
+    dep = base_stack.den_eval_device;
+    // Interleave pos/neg densities before passing it to ExchCXX
+    data->device_backend_->
+      copy_async_2d( 1, npts, base_stack.den_s_eval_device, 1, base_stack.den_eval_device  , 2, "den_s -> den_eval" );
+    data->device_backend_->
+      copy_async_2d( 1, npts, base_stack.den_z_eval_device, 1, base_stack.den_eval_device+1, 2, "den_z -> den_eval" );
+  }
+
+  GauXC::eval_kern_exc_vxc_lda( func, npts,
+    dep, base_stack.eps_eval_device, 
     base_stack.vrho_eval_device, data->device_backend_->queue() );
 
   hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, 
-                    base_stack.weights_device, 1, base_stack.eps_eval_device, 1 );
-  hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, 
-                    base_stack.weights_device, 1, base_stack.vrho_eval_device, 1 );
+                  base_stack.weights_device, 1, base_stack.eps_eval_device, 1 );
 
+  if( not is_pol ) {
+    hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, 
+                    base_stack.weights_device, 1, base_stack.vrho_eval_device, 1 );
+  }
+  else if( is_pol ) {
+    // De-interleave pos/neg densities
+    data->device_backend_->
+      copy_async_2d( 1, npts, base_stack.vrho_eval_device  , 2, base_stack.vrho_pos_eval_device, 1, "vrho->vrho_pos" );
+    data->device_backend_->
+      copy_async_2d( 1, npts, base_stack.vrho_eval_device+1, 2, base_stack.vrho_neg_eval_device, 1, "vrho->vrho_pos" );
+
+    // Weight results point-by-point
+    hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1,
+                    base_stack.weights_device, 1, base_stack.vrho_pos_eval_device, 1 );
+    hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1,
+                    base_stack.weights_device, 1, base_stack.vrho_neg_eval_device, 1 );
+  }
   
   data->device_backend_->check_error("exc_vxc lda" __FILE__ ": " + std::to_string(__LINE__));
 }
@@ -689,17 +706,80 @@ void AoSScheme1Base::eval_kern_exc_vxc_gga( const functional_type& func,
   if( !func.is_gga() ) GAUXC_GENERIC_EXCEPTION("XC Kernel not GGA!");
 
   auto base_stack    = data->base_stack;
+  double* den_eval_ptr = base_stack.den_s_eval_device;
+  
+  const bool is_RKS = data->allocated_terms.ks_scheme == RKS;
+  const bool is_UKS = data->allocated_terms.ks_scheme == UKS;
+  const bool is_GKS = data->allocated_terms.ks_scheme == GKS;
+  const bool is_pol  = is_UKS or is_GKS;
+  const bool is_excgrad = data->allocated_terms.exc_grad;
+
+  const size_t npts = data->total_npts_task_batch ;
+  
+  
+
+  if ( is_pol ) {
+    den_eval_ptr = base_stack.den_eval_device;
+    // Interleave pos/neg densities before passing it to ExchCXX
+    data->device_backend_->
+      copy_async_2d( 1, npts, base_stack.den_s_eval_device, 1, base_stack.den_eval_device  , 2, "den_s -> den_eval" );
+    data->device_backend_->
+      copy_async_2d( 1, npts, base_stack.den_z_eval_device, 1, base_stack.den_eval_device+1, 2, "den_z -> den_eval" );
+    // Interleave gamma pp, pm, mm
+    data->device_backend_->
+      copy_async_2d( 1, npts, base_stack.gamma_pp_eval_device, 1, base_stack.gamma_eval_device  , 3, "gamma_pp -> gamma_eval");
+    data->device_backend_->
+      copy_async_2d( 1, npts, base_stack.gamma_pm_eval_device, 1, base_stack.gamma_eval_device+1, 3, "gamma_pm -> gamma_eval");
+    data->device_backend_->
+      copy_async_2d( 1, npts, base_stack.gamma_mm_eval_device, 1, base_stack.gamma_eval_device+2, 3, "gamma_mm -> gamma_eval");
+  }
+
   GauXC::eval_kern_exc_vxc_gga( func, data->total_npts_task_batch, 
-    base_stack.den_eval_device, base_stack.gamma_eval_device, 
+    den_eval_ptr, base_stack.gamma_eval_device, 
     base_stack.eps_eval_device, base_stack.vrho_eval_device, 
     base_stack.vgamma_eval_device, data->device_backend_->queue() );
 
+
   hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, 
                     base_stack.weights_device, 1, base_stack.eps_eval_device, 1 );
-  hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, 
+
+  if( not is_pol ) {
+    hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, 
                     base_stack.weights_device, 1, base_stack.vrho_eval_device, 1 );
-  hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, 
+    hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, 
                     base_stack.weights_device, 1, base_stack.vgamma_eval_device, 1 );
+  }
+  else if( is_pol ) {
+      // De-interleave pos/neg densities
+      data->device_backend_->
+        copy_async_2d( 1, npts, base_stack.vrho_eval_device  , 2, base_stack.vrho_pos_eval_device, 1, "vrho->vrho_pos" );
+      data->device_backend_->
+        copy_async_2d( 1, npts, base_stack.vrho_eval_device+1, 2, base_stack.vrho_neg_eval_device, 1, "vrho->vrho_pos" );
+
+      // Multiply by weights point-by-point
+      hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1,
+                      base_stack.weights_device, 1, base_stack.vrho_pos_eval_device, 1 );
+      hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1,
+                      base_stack.weights_device, 1, base_stack.vrho_neg_eval_device, 1 );
+
+      // De-interleave vgamma
+      data->device_backend_->
+        copy_async_2d( 1, npts, base_stack.vgamma_eval_device  , 3, base_stack.vgamma_pp_eval_device, 1, "vgamma_eval -> vgamma_pp" );
+      data->device_backend_->
+        copy_async_2d( 1, npts, base_stack.vgamma_eval_device+1, 3, base_stack.vgamma_pm_eval_device, 1, "vgamma_eval -> vgamma_pm" );
+      data->device_backend_->
+        copy_async_2d( 1, npts, base_stack.vgamma_eval_device+2, 3, base_stack.vgamma_mm_eval_device, 1, "vgamma_eval -> vgamma_mm" );
+
+      hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1,
+                      base_stack.weights_device, 1, base_stack.vgamma_pp_eval_device, 1 );
+      hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1,
+                      base_stack.weights_device, 1, base_stack.vgamma_pm_eval_device, 1 );
+      hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1,
+                      base_stack.weights_device, 1, base_stack.vgamma_mm_eval_device, 1 );
+      
+      
+  }
+
 
   
   data->device_backend_->check_error("exc_vxc gga" __FILE__ ": " + std::to_string(__LINE__));
@@ -723,7 +803,7 @@ void AoSScheme1Base::eval_kern_exc_vxc_mgga( const functional_type& func,
   }
 
   GauXC::eval_kern_exc_vxc_mgga( func, data->total_npts_task_batch, 
-    base_stack.den_eval_device, base_stack.gamma_eval_device, 
+    base_stack.den_s_eval_device, base_stack.gamma_eval_device, 
     base_stack.tau_eval_device, base_stack.den_lapl_eval_device,
     base_stack.eps_eval_device, base_stack.vrho_eval_device, 
     base_stack.vgamma_eval_device, base_stack.vtau_eval_device,
@@ -754,7 +834,7 @@ void AoSScheme1Base::eval_kern_exc_vxc_mgga( const functional_type& func,
 
 
 
-void AoSScheme1Base::eval_xmat( double fac, XCDeviceData* _data, bool do_grad ){
+void AoSScheme1Base::eval_xmat( double fac, XCDeviceData* _data, bool do_grad, density_id den_select ){
 
   auto* data = dynamic_cast<Data*>(_data);
   if( !data ) GAUXC_BAD_LWD_DATA_CAST();
@@ -764,12 +844,31 @@ void AoSScheme1Base::eval_xmat( double fac, XCDeviceData* _data, bool do_grad ){
   auto tasks = data->host_device_tasks;
   const auto ntasks = tasks.size();
 
-  // Pack density matrix 
+  // Set correct density matrix pointer on the stack
   const auto nbf = data->global_dims.nbf;
   const auto submat_block_size = data->get_submat_chunk_size( nbf, 0 );
   auto static_stack  = data->static_stack;
   auto aos_stack     = data->aos_stack;
-  sym_pack_submat( ntasks, aos_stack.device_tasks, static_stack.dmat_device, 
+  double* dmat_ptr = nullptr;
+  switch ( den_select ) {
+    case DEN_S:
+      dmat_ptr = static_stack.dmat_s_device;
+      break;
+    case DEN_Z:
+      dmat_ptr = static_stack.dmat_z_device;
+      break;
+    case DEN_X:
+      dmat_ptr = static_stack.dmat_x_device;
+      break;
+    case DEN_Y:
+      dmat_ptr = static_stack.dmat_y_device;
+      break;
+    default:
+      GAUXC_GENERIC_EXCEPTION("eval_xmat: den_select not set");
+  }
+
+  // Pack density matrix 
+  sym_pack_submat( ntasks, aos_stack.device_tasks, dmat_ptr, 
     nbf, submat_block_size, data->device_backend_->queue() );
 
 
@@ -783,10 +882,13 @@ void AoSScheme1Base::eval_xmat( double fac, XCDeviceData* _data, bool do_grad ){
 
   // Launch GEMM in round-robin
   const auto n_blas_streams = data->device_backend_->blas_pool_size();
+  
+   
+
   //size_t nsingle = 0;
   for( size_t iT = 0; iT < ntasks; ++iT ) {
     auto& task = tasks[iT];
-      auto den_ptr = task.bfn_screening.ncut > 1 ? task.nbe_scr : static_stack.dmat_device + task.bfn_screening.ibf_begin*(nbf+1);
+      auto den_ptr = task.bfn_screening.ncut > 1 ? task.nbe_scr : dmat_ptr + task.bfn_screening.ibf_begin*(nbf+1);
       int  ldden   = task.bfn_screening.ncut > 1 ? task.bfn_screening.nbe : nbf;
       auto handle = data->device_backend_->blas_pool_handle( iT % n_blas_streams );
       do_gemm( handle, task.npts, task.bfn_screening.nbe, task.bf, den_ptr, ldden, task.zmat );
@@ -811,14 +913,7 @@ void AoSScheme1Base::eval_xmat( double fac, XCDeviceData* _data, bool do_grad ){
 
 
 
-
-
-
-
-
-
-
-void AoSScheme1Base::inc_vxc( XCDeviceData* _data, bool do_m ){
+void AoSScheme1Base::inc_vxc( XCDeviceData* _data, density_id den_selector, bool do_m ){
 
   auto* data = dynamic_cast<Data*>(_data);
   if( !data ) GAUXC_BAD_LWD_DATA_CAST();
@@ -857,8 +952,25 @@ void AoSScheme1Base::inc_vxc( XCDeviceData* _data, bool do_m ){
   const auto submat_block_size = data->get_submat_chunk_size( nbf, 0 );
   auto static_stack  = data->static_stack;
   auto aos_stack     = data->aos_stack;
-  sym_task_inc_potential( ntasks, aos_stack.device_tasks, 
-    static_stack.vxc_device, nbf, submat_block_size, 
+  double* vxc_ptr    = nullptr;
+  switch( den_selector ) {
+    case DEN_S:
+      vxc_ptr = static_stack.vxc_s_device;
+      break;
+    case DEN_Z:
+      vxc_ptr = static_stack.vxc_z_device;
+      break;
+    case DEN_Y:
+      vxc_ptr = static_stack.vxc_y_device;
+      break;
+    case DEN_X:
+      vxc_ptr = static_stack.vxc_x_device;
+      break;
+    default:
+      GAUXC_GENERIC_EXCEPTION( "inc_vxc called with invalid density selected" );
+  }
+  sym_task_inc_potential( ntasks, aos_stack.device_tasks,
+    vxc_ptr, nbf, submat_block_size,
     data->device_backend_->queue() );
   
   data->device_backend_->check_error("inc_vxc" __FILE__ ": " + std::to_string(__LINE__));
@@ -874,9 +986,7 @@ void AoSScheme1Base::inc_vxc( XCDeviceData* _data, bool do_m ){
 
 
 
-
-
-void AoSScheme1Base::symmetrize_vxc( XCDeviceData* _data) {
+void AoSScheme1Base::symmetrize_vxc( XCDeviceData* _data, density_id den_selector) {
 
   auto* data = dynamic_cast<Data*>(_data);
   if( !data ) GAUXC_BAD_LWD_DATA_CAST();
@@ -885,9 +995,26 @@ void AoSScheme1Base::symmetrize_vxc( XCDeviceData* _data) {
 
   const auto nbf = data->global_dims.nbf;
   auto static_stack  = data->static_stack;
-  symmetrize_matrix( nbf, static_stack.vxc_device, nbf, 
-    data->device_backend_->queue() ); 
-
+  switch ( den_selector ) {
+    case DEN_S:
+      symmetrize_matrix( nbf, static_stack.vxc_s_device, nbf, 
+            data->device_backend_->queue() ); 
+      break;
+    case DEN_Z:
+      symmetrize_matrix( nbf, static_stack.vxc_z_device, nbf, 
+            data->device_backend_->queue() ); 
+      break;
+    case DEN_Y:
+      symmetrize_matrix( nbf, static_stack.vxc_y_device, nbf, 
+            data->device_backend_->queue() ); 
+      break;
+    case DEN_X:
+      symmetrize_matrix( nbf, static_stack.vxc_x_device, nbf, 
+            data->device_backend_->queue() ); 
+      break;
+    default:
+      GAUXC_GENERIC_EXCEPTION( "symmetrize_vxc: invalid density selected" );
+  }
   
   data->device_backend_->check_error("symmetrize vxc" __FILE__ ": " + std::to_string(__LINE__));
 }
@@ -949,7 +1076,7 @@ void AoSScheme1Base::eval_exx_fmat( XCDeviceData* _data ) {
   // Pack the density matrix into (bfn, cou) shape
   const auto submat_block_size = data->get_submat_chunk_size( nbf, 0 );
   auto aos_stack     = data->aos_stack;
-  asym_pack_submat( ntasks, aos_stack.device_tasks, static_stack.dmat_device,
+  asym_pack_submat( ntasks, aos_stack.device_tasks, static_stack.dmat_s_device,
     nbf, submat_block_size, data->device_backend_->queue() );
 
   // Sync blas streams with master stream
@@ -1426,7 +1553,7 @@ void AoSScheme1Base::exx_ek_shellpair_collision( double eps_E, double eps_K,
   auto static_stack    = data->static_stack;
 
   GauXC::exx_ek_shellpair_collision( ntasks, nshells, nbf,
-    static_stack.dmat_device, nbf,
+    static_stack.dmat_s_device, nbf,
     static_stack.vshell_max_sparse_device, 
     static_stack.shpair_row_ind_device,
     static_stack.shpair_col_ind_device,
diff --git a/src/xc_integrator/local_work_driver/device/scheme1_base.hpp b/src/xc_integrator/local_work_driver/device/scheme1_base.hpp
index dc980762..37964914 100644
--- a/src/xc_integrator/local_work_driver/device/scheme1_base.hpp
+++ b/src/xc_integrator/local_work_driver/device/scheme1_base.hpp
@@ -18,30 +18,16 @@ struct AoSScheme1Base : public detail::LocalDeviceWorkDriverPIMPL {
   void eval_collocation_gradient( XCDeviceData* ) override final;
   void eval_collocation_hessian( XCDeviceData* ) override final;
   void eval_collocation_laplacian( XCDeviceData* ) override final;
-  void eval_uvvar_lda_rks( XCDeviceData* ) override final;
-  void eval_uvvar_gga_rks( XCDeviceData* ) override final;
-  void eval_uvvar_mgga_rks( XCDeviceData*, bool ) override final;
-  void eval_zmat_lda_vxc_rks( XCDeviceData* ) override final;
-  void eval_zmat_gga_vxc_rks( XCDeviceData* ) override final;
-  void eval_zmat_mgga_vxc_rks( XCDeviceData*, bool ) override final;
 
-  void eval_uvvar_lda_uks( XCDeviceData* ) override final;
-  void eval_uvvar_gga_uks( XCDeviceData* ) override final;
-  void eval_uvvar_mgga_uks( XCDeviceData*, bool ) override final;
-  void eval_zmat_lda_vxc_uks( XCDeviceData* ) override final;
-  void eval_zmat_gga_vxc_uks( XCDeviceData* ) override final;
-  void eval_zmat_mgga_vxc_uks( XCDeviceData*, bool ) override final;
+  void eval_uvars_lda( XCDeviceData*, integrator_ks_scheme ) override final;
+  void eval_uvars_gga( XCDeviceData*, integrator_ks_scheme ) override final;
+  void eval_uvars_mgga( XCDeviceData*, bool ) override final;
+  void eval_vvar( XCDeviceData*, density_id, bool ) override final;
 
-  void eval_uvvar_lda_gks( XCDeviceData* ) override final;
-  void eval_uvvar_gga_gks( XCDeviceData* ) override final;
-  void eval_uvvar_mgga_gks( XCDeviceData*, bool ) override final;
-  void eval_zmat_lda_vxc_gks( XCDeviceData* ) override final;
-  void eval_zmat_gga_vxc_gks( XCDeviceData* ) override final;
-  void eval_zmat_mgga_vxc_gks( XCDeviceData*, bool ) override final;
-
-  void eval_mmat_mgga_vxc_rks( XCDeviceData*, bool ) override final;
-  void eval_mmat_mgga_vxc_uks( XCDeviceData*, bool ) override final;
-  void eval_mmat_mgga_vxc_gks( XCDeviceData*, bool ) override final;
+  void eval_zmat_lda_vxc( XCDeviceData*, integrator_ks_scheme, density_id ) override final;
+  void eval_zmat_gga_vxc( XCDeviceData*, integrator_ks_scheme, density_id ) override final;
+  void eval_zmat_mgga_vxc( XCDeviceData*, bool ) override final;
+  void eval_mmat_mgga_vxc( XCDeviceData*, bool ) override final;
 
   void eval_kern_exc_vxc_lda( const functional_type&, XCDeviceData* ) override final;
   void eval_kern_exc_vxc_gga( const functional_type&, XCDeviceData* ) override final;
@@ -50,7 +36,7 @@ struct AoSScheme1Base : public detail::LocalDeviceWorkDriverPIMPL {
   void inc_nel( XCDeviceData* ) override final;
   void inc_exc_grad_lda( XCDeviceData* ) override final;
   void inc_exc_grad_gga( XCDeviceData* ) override final;
-  void symmetrize_vxc( XCDeviceData* ) override final;
+  void symmetrize_vxc( XCDeviceData* , density_id) override final;
   void symmetrize_exx_k( XCDeviceData* ) override final;
   //void eval_exx_gmat( XCDeviceData* ) override final;
   void eval_exx_gmat( XCDeviceData*, const BasisSetMap& ) override final;
@@ -62,11 +48,12 @@ struct AoSScheme1Base : public detail::LocalDeviceWorkDriverPIMPL {
 
 
   // Overridable APIs
-  virtual void eval_xmat( double fac, XCDeviceData*, bool do_grad ) override;
+  virtual void eval_xmat( double fac, XCDeviceData*, bool , density_id ) override;
   virtual void eval_exx_fmat( XCDeviceData* ) override;
-  virtual void inc_vxc( XCDeviceData*, bool ) override;
+  virtual void inc_vxc( XCDeviceData*, density_id, bool ) override;
   virtual void inc_exx_k( XCDeviceData* ) override;
 
+
   using Data = Scheme1DataBase;
 
   AoSScheme1Base();
diff --git a/src/xc_integrator/local_work_driver/device/scheme1_magma_base.cxx b/src/xc_integrator/local_work_driver/device/scheme1_magma_base.cxx
index fb484ed0..8c3c15cc 100644
--- a/src/xc_integrator/local_work_driver/device/scheme1_magma_base.cxx
+++ b/src/xc_integrator/local_work_driver/device/scheme1_magma_base.cxx
@@ -16,7 +16,7 @@
 
 namespace GauXC {
 
-void AoSScheme1MAGMABase::eval_xmat( double fac, XCDeviceData* _data, bool do_grad ){
+void AoSScheme1MAGMABase::eval_xmat( double fac, XCDeviceData* _data, bool do_grad, density_id den ){
 
   if( do_grad ) GAUXC_GENERIC_EXCEPTION("MAGMA + X Gradient NYI");
 
@@ -33,12 +33,42 @@ void AoSScheme1MAGMABase::eval_xmat( double fac, XCDeviceData* _data, bool do_gr
   const auto submat_block_size = data->get_submat_chunk_size( nbf, 0 );
   auto static_stack  = data->static_stack;
   auto aos_stack     = data->aos_stack;
-  sym_pack_submat( ntasks, aos_stack.device_tasks, static_stack.dmat_device, 
-    nbf, submat_block_size, data->device_backend_->queue() );
-
-
   auto master_queue = data->device_backend_->master_magma_queue();
   auto magma_stack = data->magma_stack;
+  double* dmat_ptr   = nullptr;
+  switch( den ) {
+    case DEN_S:
+      dmat_ptr = static_stack.dmat_s_device;
+      break;
+    case DEN_Z:
+      dmat_ptr = static_stack.dmat_z_device;
+      break;
+    case DEN_Y:
+      dmat_ptr = static_stack.dmat_y_device;
+      break;
+    case DEN_X:
+      dmat_ptr = static_stack.dmat_x_device;
+      break;
+    default:
+      GAUXC_GENERIC_EXCEPTION( "eval_xmat called with invalid density specifier" );
+  }
+  sym_pack_submat( ntasks, aos_stack.device_tasks, dmat_ptr,
+    nbf, submat_block_size, data->device_backend_->queue() );
+  
+  // Update dmat on magma_stack 
+  std::vector<double*> dmat_host( ntasks );
+
+  for( auto i = 0; i < ntasks; i++ ) {
+    auto& task = tasks[i];
+    if( task.bfn_screening.ncut > 1 ) {
+      dmat_host[i]  = task.nbe_scr;
+    } else {
+      dmat_host[i]  = dmat_ptr + task.bfn_screening.ibf_begin*(nbf+1);
+    }
+  }
+  data->device_backend_->copy_async( ntasks, dmat_host.data(), 
+      magma_stack.xdmat_array_device, "send xdmat array");
+
   magmablas_dgemm_vbatched( MagmaNoTrans, MagmaNoTrans,
     magma_stack.xmat_m_array_device, magma_stack.xmat_n_array_device, 
     magma_stack.xmat_k_array_device, 
@@ -66,7 +96,7 @@ void AoSScheme1MAGMABase::eval_exx_fmat( XCDeviceData* _data ) {
   const auto submat_block_size = data->get_submat_chunk_size( nbf, 0 );
   auto static_stack  = data->static_stack;
   auto aos_stack     = data->aos_stack;
-  asym_pack_submat( ntasks, aos_stack.device_tasks, static_stack.dmat_device,
+  asym_pack_submat( ntasks, aos_stack.device_tasks, static_stack.dmat_s_device,
     nbf, submat_block_size, data->device_backend_->queue() );
 
 
@@ -82,7 +112,7 @@ void AoSScheme1MAGMABase::eval_exx_fmat( XCDeviceData* _data ) {
 #endif
 }
 
-void AoSScheme1MAGMABase::inc_vxc( XCDeviceData* _data, bool do_m){
+void AoSScheme1MAGMABase::inc_vxc( XCDeviceData* _data, density_id den, bool do_m){
 
   auto* data = dynamic_cast<Data*>(_data);
   if( !data ) GAUXC_BAD_LWD_DATA_CAST();
@@ -108,8 +138,25 @@ void AoSScheme1MAGMABase::inc_vxc( XCDeviceData* _data, bool do_m){
   const auto submat_block_size = data->get_submat_chunk_size( nbf, 0 );
   auto static_stack  = data->static_stack;
   auto aos_stack     = data->aos_stack;
+  double* vxc_ptr    = nullptr;
+  switch (den) {
+    case DEN_S:
+      vxc_ptr = static_stack.vxc_s_device;
+      break;
+    case DEN_Z:
+      vxc_ptr = static_stack.vxc_z_device;
+      break;
+    case DEN_Y:
+      vxc_ptr = static_stack.vxc_y_device;
+      break;
+    case DEN_X:
+      vxc_ptr = static_stack.vxc_x_device;
+      break;
+    default:
+      GAUXC_GENERIC_EXCEPTION( "Inc_vxc called with invalid density specifier" );
+  }
   sym_task_inc_potential( ntasks, aos_stack.device_tasks, 
-    static_stack.vxc_device, nbf, submat_block_size, 
+    vxc_ptr,  nbf, submat_block_size, 
     data->device_backend_->queue() );
 }
 
diff --git a/src/xc_integrator/local_work_driver/device/scheme1_magma_base.hpp b/src/xc_integrator/local_work_driver/device/scheme1_magma_base.hpp
index 2b99bb60..99dde319 100644
--- a/src/xc_integrator/local_work_driver/device/scheme1_magma_base.hpp
+++ b/src/xc_integrator/local_work_driver/device/scheme1_magma_base.hpp
@@ -12,9 +12,9 @@ namespace GauXC {
 
 struct AoSScheme1MAGMABase : public AoSScheme1Base {
 
-  void eval_xmat( double fac, XCDeviceData*, bool do_grad ) override final;
+  void eval_xmat( double fac, XCDeviceData*, bool do_grad, density_id den ) override final;
   void eval_exx_fmat( XCDeviceData* ) override final;
-  void inc_vxc( XCDeviceData*, bool ) override final;
+  void inc_vxc( XCDeviceData*, density_id den, bool ) override final;
   void inc_exx_k( XCDeviceData* ) override final;
 
   struct Data;
diff --git a/src/xc_integrator/local_work_driver/device/scheme1_magma_data_base.cxx b/src/xc_integrator/local_work_driver/device/scheme1_magma_data_base.cxx
index fda0d11b..95b818be 100644
--- a/src/xc_integrator/local_work_driver/device/scheme1_magma_data_base.cxx
+++ b/src/xc_integrator/local_work_driver/device/scheme1_magma_data_base.cxx
@@ -104,6 +104,7 @@ void AoSScheme1MAGMABase::Data::pack_and_send(
 
 }
 
+
 void AoSScheme1MAGMABase::Data::pack_and_send_xmat( 
   host_task_iterator task_begin, host_task_iterator task_end 
 ) {
@@ -115,7 +116,7 @@ void AoSScheme1MAGMABase::Data::pack_and_send_xmat(
                        ld_dmat_host( ntask ), ld_zmat_host( ntask ), 
                        ld_vmat_host( ntask ), ld_bf_host( ntask );
 
-  double* static_dmat = static_stack.dmat_device;
+  double* static_dmat = static_stack.dmat_s_device;
   const auto nbf = global_dims.nbf;
 
   // host_device_tasks should be populated by parent impl called at top
diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator.hpp
index 8ac2fc57..30403175 100644
--- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator.hpp
+++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator.hpp
@@ -83,33 +83,13 @@ class IncoreReplicatedXCDeviceIntegrator :
                             host_task_iterator task_begin, host_task_iterator task_end,
                             XCDeviceData& device_data );
 
-  void exc_vxc_local_work_( const basis_type& basis, const value_type* P, int64_t ldp, 
-                            host_task_iterator task_begin, host_task_iterator task_end,
-                            XCDeviceData& device_data, bool do_vxc = true );
-
-  void exc_vxc_local_work_( const basis_type& basis, const value_type* P, int64_t ldp, 
-                            value_type* VXC, int64_t ldvxc, value_type* EXC, value_type *N_EL,
-                            host_task_iterator task_begin, host_task_iterator task_end,
-                            XCDeviceData& device_data );
-
-  void exc_vxc_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps,
-                                const value_type* Pz, int64_t ldpz,
-                            host_task_iterator task_begin, host_task_iterator task_end,
-                            XCDeviceData& device_data );
-
-  void exc_vxc_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps,
-                            const value_type* Pz, int64_t ldpz,
-                            value_type* VXC, int64_t ldvxc,
-                            value_type* VXCz, int64_t ldvxcz, value_type* EXC, value_type *N_EL,
-                            host_task_iterator task_begin, host_task_iterator task_end,
-                            XCDeviceData& device_data );
 
   void exc_vxc_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps,
                             const value_type* Pz, int64_t ldpz,
                             const value_type* Py, int64_t ldpy,
                             const value_type* Px, int64_t ldpx,
                             host_task_iterator task_begin, host_task_iterator task_end,
-                            XCDeviceData& device_data );
+                            XCDeviceData& device_data, bool do_vxc );
 
   void exc_vxc_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps,
                             const value_type* Pz, int64_t ldpz,
@@ -157,7 +137,7 @@ class IncoreReplicatedXCDeviceIntegrator :
 
   virtual ~IncoreReplicatedXCDeviceIntegrator() noexcept;
 
-
+  
   template <typename... Args>
   void exc_vxc_local_work(Args&&... args) {
     exc_vxc_local_work_( std::forward<Args>(args)... );
diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc.hpp
index 36b4ebe3..a8c32da3 100644
--- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc.hpp
+++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc.hpp
@@ -24,7 +24,6 @@ void IncoreReplicatedXCDeviceIntegrator<ValueType>::
              value_type* EXC, const IntegratorSettingsXC& settings ) {
 
 
-  if(Pz) GAUXC_GENERIC_EXCEPTION("UKS/GKS + EXC Only Device NYI");
   const auto& basis = this->load_balancer_->basis();
 
   // Check that P / VXC are sane
@@ -53,8 +52,10 @@ void IncoreReplicatedXCDeviceIntegrator<ValueType>::
   // Compute local contributions to EXC/VXC and retrieve
   // data from device 
   this->timer_.time_op("XCIntegrator.LocalWork_EXC", [&](){
-    exc_vxc_local_work_( basis, Ps, ldps, nullptr, 0, EXC, 
-      &N_EL, tasks.begin(), tasks.end(), *device_data_ptr);
+    exc_vxc_local_work_( basis, Ps, ldps, Pz, ldpz, Py, ldpy, Px, ldpx,
+        // Passing nullptr for VXCs disables VXC entirely
+        nullptr, 0, nullptr, 0, nullptr, 0, nullptr, 0, EXC, &N_EL,
+       tasks.begin(), tasks.end(), *device_data_ptr);
   });
 
   GAUXC_MPI_CODE(
diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_grad.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_grad.hpp
index ff88e14a..a49eee7f 100644
--- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_grad.hpp
+++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_grad.hpp
@@ -108,7 +108,7 @@ void IncoreReplicatedXCDeviceIntegrator<ValueType>::
   const auto natoms  = mol.size();
   device_data.reset_allocations();
   device_data.allocate_static_data_exc_grad( nbf, nshells, natoms );
-  device_data.send_static_data_density_basis( P, ldp, basis );
+  device_data.send_static_data_density_basis( P, ldp, nullptr, 0, nullptr, 0, nullptr, 0, basis );
 
   // Zero integrands
   device_data.zero_exc_grad_integrands();
@@ -116,6 +116,7 @@ void IncoreReplicatedXCDeviceIntegrator<ValueType>::
   // Processes batches in groups that saturadate available device memory
   integrator_term_tracker enabled_terms;
   enabled_terms.exc_grad = true;
+  enabled_terms.ks_scheme = RKS;
   if( func.is_lda() )      enabled_terms.xc_approx = integrator_xc_approx::LDA; 
   else if( func.is_gga() ) enabled_terms.xc_approx = integrator_xc_approx::GGA; 
   else GAUXC_GENERIC_EXCEPTION("XC Approx NYI");
@@ -135,11 +136,14 @@ void IncoreReplicatedXCDeviceIntegrator<ValueType>::
 
     // Evaluate X matrix
     const bool do_xmat_grad = func.is_gga();
-    lwd->eval_xmat( 2.0, &device_data, do_xmat_grad );
-
-    // Evaluate U/V variables
-    if( func.is_gga() ) lwd->eval_uvvar_gga_rks( &device_data );
-    else                lwd->eval_uvvar_lda_rks( &device_data );
+    lwd->eval_xmat( 2.0, &device_data, do_xmat_grad, DEN_S );
+    
+    // Evaluate V variable
+    lwd->eval_vvar( &device_data, DEN_S, do_xmat_grad );
+
+    // Evaluate U variables
+    if( func.is_gga() ) lwd->eval_uvars_gga( &device_data, enabled_terms.ks_scheme );
+    else                lwd->eval_uvars_lda( &device_data, enabled_terms.ks_scheme );
 
     // Evaluate XC functional (we need VXC for EXC Gradient)
     if( func.is_gga() ) lwd->eval_kern_exc_vxc_gga( func, &device_data );
diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_vxc.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_vxc.hpp
index cae5bec5..c8cae61d 100644
--- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_vxc.hpp
+++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_vxc.hpp
@@ -20,7 +20,42 @@ void IncoreReplicatedXCDeviceIntegrator<ValueType>::
   eval_exc_vxc_( int64_t m, int64_t n, const value_type* P,
                  int64_t ldp, value_type* VXC, int64_t ldvxc,
                  value_type* EXC, const IntegratorSettingsXC& settings ) {
+  eval_exc_vxc_( m, n, P, ldp, nullptr, 0, nullptr, 0, nullptr, 0, 
+                      VXC, ldvxc, nullptr, 0, nullptr, 0, nullptr, 0, EXC, settings );
+}
+
+
+template <typename ValueType>
+void IncoreReplicatedXCDeviceIntegrator<ValueType>::
+  eval_exc_vxc_( int64_t m, int64_t n, const value_type* Ps,
+                      int64_t ldps,
+                      const value_type* Pz,
+                      int64_t ldpz,
+                      value_type* VXCs, int64_t ldvxcs,
+                      value_type* VXCz, int64_t ldvxcz,
+                      value_type* EXC, const IntegratorSettingsXC& settings ) { 
+  eval_exc_vxc_( m, n, Ps, ldps, Pz, ldpz, nullptr, 0, nullptr, 0, 
+                VXCs, ldvxcs, VXCz, ldvxcz, nullptr, 0, nullptr, 0, EXC, settings );
+}
 
+template <typename ValueType>
+void IncoreReplicatedXCDeviceIntegrator<ValueType>::
+  eval_exc_vxc_( int64_t m, int64_t n, const value_type* Ps,
+                      int64_t ldps,
+                      const value_type* Pz,
+                      int64_t ldpz,
+                      const value_type* Py,
+                      int64_t ldpy,
+                      const value_type* Px,
+                      int64_t ldpx,
+                      value_type* VXCs, int64_t ldvxcs,
+                      value_type* VXCz, int64_t ldvxcz,
+                      value_type* VXCy, int64_t ldvxcy,
+                      value_type* VXCx, int64_t ldvxcx,
+                      value_type* EXC, const IntegratorSettingsXC& settings ) {
+  const bool is_gks = (Pz != nullptr) and (Py != nullptr) and (Px != nullptr);
+  const bool is_uks = (Pz != nullptr) and (Py == nullptr) and (Px == nullptr);
+  const bool is_rks = (Ps != nullptr) and (not is_uks and not is_gks);
 
   const auto& basis = this->load_balancer_->basis();
 
@@ -30,10 +65,27 @@ void IncoreReplicatedXCDeviceIntegrator<ValueType>::
     GAUXC_GENERIC_EXCEPTION("P/VXC Must Be Square");
   if( m != nbf ) 
     GAUXC_GENERIC_EXCEPTION("P/VXC Must Have Same Dimension as Basis");
-  if( ldp < nbf )
-    GAUXC_GENERIC_EXCEPTION("Invalid LDP");
-  if( ldvxc < nbf )
-    GAUXC_GENERIC_EXCEPTION("Invalid LDVXC");
+  if( ldps < nbf )
+    GAUXC_GENERIC_EXCEPTION("Invalid LDPs");
+  if( ldvxcs < nbf )
+    GAUXC_GENERIC_EXCEPTION("Invalid LDVXCs");
+
+  if( not is_rks ) {
+    if( ldpz < nbf )
+      GAUXC_GENERIC_EXCEPTION("Invalid LDPz");
+    if( ldvxcz < nbf )
+      GAUXC_GENERIC_EXCEPTION("Invalid LDVXCz");
+    if( is_gks ) {
+      if( ldpy < nbf )
+        GAUXC_GENERIC_EXCEPTION("Invalid LDPy");
+      if( ldvxcy < nbf )
+        GAUXC_GENERIC_EXCEPTION("Invalid LDVXCy");
+      if( ldpx < nbf )
+        GAUXC_GENERIC_EXCEPTION("Invalid LDPx");
+      if( ldvxcx < nbf )
+        GAUXC_GENERIC_EXCEPTION("Invalid LDVXCx");
+    }
+  }
 
 
   // Get Tasks
@@ -52,10 +104,10 @@ void IncoreReplicatedXCDeviceIntegrator<ValueType>::
   if( this->reduction_driver_->takes_device_memory() ) {
 
     // If we can do reductions on the device (e.g. NCCL)
-    // Don't communicate data back to the hot before reduction
+    // Don't communicate data back to the host before reduction
     this->timer_.time_op("XCIntegrator.LocalWork_EXC_VXC", [&](){
-      exc_vxc_local_work_( basis, P, ldp, tasks.begin(), tasks.end(), 
-        *device_data_ptr);
+      exc_vxc_local_work_( basis, Ps, ldps, Pz, ldpz, Py, ldpy, Px, ldpx, tasks.begin(), tasks.end(), 
+        *device_data_ptr, true);
     });
 
     GAUXC_MPI_CODE(
@@ -65,19 +117,52 @@ void IncoreReplicatedXCDeviceIntegrator<ValueType>::
     )
 
     // Reduce results in device memory
-    auto vxc_device = device_data_ptr->vxc_device_data();
+    double* vxc_s_device = device_data_ptr->vxc_s_device_data();
+    double* vxc_z_device;
+    double* vxc_y_device;
+    double* vxc_x_device;
     auto exc_device = device_data_ptr->exc_device_data();
     auto nel_device = device_data_ptr->nel_device_data();
     auto queue      = device_data_ptr->queue();
-    this->timer_.time_op("XCIntegrator.Allreduce_EXC_VXC", [&](){
-      this->reduction_driver_->allreduce_inplace( vxc_device, nbf*nbf, ReductionOp::Sum, queue );
-      this->reduction_driver_->allreduce_inplace( exc_device, 1,       ReductionOp::Sum, queue );
-      this->reduction_driver_->allreduce_inplace( nel_device, 1,       ReductionOp::Sum, queue );
-    });
+    
+    if( not is_rks ) {
+      vxc_z_device = device_data_ptr->vxc_z_device_data();
+      if( is_gks ) {
+        // GKS
+        vxc_y_device = device_data_ptr->vxc_y_device_data();
+        vxc_x_device = device_data_ptr->vxc_x_device_data();
+        this->timer_.time_op("XCIntegrator.Allreduce_EXC_VXC", [&](){
+          this->reduction_driver_->allreduce_inplace( vxc_s_device, nbf*nbf, ReductionOp::Sum, queue );
+          this->reduction_driver_->allreduce_inplace( vxc_z_device, nbf*nbf, ReductionOp::Sum, queue );
+          this->reduction_driver_->allreduce_inplace( vxc_y_device, nbf*nbf, ReductionOp::Sum, queue );
+          this->reduction_driver_->allreduce_inplace( vxc_x_device, nbf*nbf, ReductionOp::Sum, queue );
+          this->reduction_driver_->allreduce_inplace( exc_device, 1,       ReductionOp::Sum, queue );
+          this->reduction_driver_->allreduce_inplace( nel_device, 1,       ReductionOp::Sum, queue );
+        });
+      } else {
+        // UKS
+        this->timer_.time_op("XCIntegrator.Allreduce_EXC_VXC", [&](){
+          this->reduction_driver_->allreduce_inplace( vxc_s_device, nbf*nbf, ReductionOp::Sum, queue );
+          this->reduction_driver_->allreduce_inplace( vxc_z_device, nbf*nbf, ReductionOp::Sum, queue );
+          this->reduction_driver_->allreduce_inplace( exc_device, 1,       ReductionOp::Sum, queue );
+          this->reduction_driver_->allreduce_inplace( nel_device, 1,       ReductionOp::Sum, queue );
+        });
+
+      }
+    } else {
+      // RKS
+      this->timer_.time_op("XCIntegrator.Allreduce_EXC_VXC", [&](){
+        this->reduction_driver_->allreduce_inplace( vxc_s_device, nbf*nbf, ReductionOp::Sum, queue );
+        this->reduction_driver_->allreduce_inplace( exc_device, 1,       ReductionOp::Sum, queue );
+        this->reduction_driver_->allreduce_inplace( nel_device, 1,       ReductionOp::Sum, queue );
+      });
+    }
+
 
     // Retrieve data to host
     this->timer_.time_op("XCIntegrator.DeviceToHostCopy_EXC_VXC",[&](){
-      device_data_ptr->retrieve_exc_vxc_integrands( EXC, &N_EL, VXC, ldvxc );
+      device_data_ptr->retrieve_exc_vxc_integrands( EXC, &N_EL, VXCs, ldvxcs, VXCz, ldvxcz,
+                                                                VXCy, ldvxcy, VXCx, ldvxcx );
     });
 
 
@@ -86,8 +171,9 @@ void IncoreReplicatedXCDeviceIntegrator<ValueType>::
     // Compute local contributions to EXC/VXC and retrieve
     // data from device 
     this->timer_.time_op("XCIntegrator.LocalWork_EXC_VXC", [&](){
-      exc_vxc_local_work_( basis, P, ldp, VXC, ldvxc, EXC, 
-        &N_EL, tasks.begin(), tasks.end(), *device_data_ptr);
+      exc_vxc_local_work_( basis, Ps, ldps, Pz, ldpz, Py, ldpy, Px, ldpx,
+                                VXCs, ldvxcs, VXCz, ldvxcz, VXCy, ldvxcy, VXCx, ldvxcx, EXC, 
+                              &N_EL, tasks.begin(), tasks.end(), *device_data_ptr);
     });
 
     GAUXC_MPI_CODE(
@@ -97,62 +183,62 @@ void IncoreReplicatedXCDeviceIntegrator<ValueType>::
     )
 
     // Reduce Results in host mem
-    this->timer_.time_op("XCIntegrator.Allreduce_EXC_VXC", [&](){
-      this->reduction_driver_->allreduce_inplace( VXC, nbf*nbf, ReductionOp::Sum );
-      this->reduction_driver_->allreduce_inplace( EXC,   1    , ReductionOp::Sum );
-      this->reduction_driver_->allreduce_inplace( &N_EL, 1    , ReductionOp::Sum );
-    });
-
+    if( is_rks ) {
+      this->timer_.time_op("XCIntegrator.Allreduce_EXC_VXC", [&](){
+        this->reduction_driver_->allreduce_inplace( VXCs, nbf*nbf, ReductionOp::Sum );
+        this->reduction_driver_->allreduce_inplace( EXC, 1,       ReductionOp::Sum );
+        this->reduction_driver_->allreduce_inplace( &N_EL, 1,       ReductionOp::Sum );
+      });
+    } else {
+      if( is_gks ) {
+        this->timer_.time_op("XCIntegrator.Allreduce_EXC_VXC", [&](){
+          this->reduction_driver_->allreduce_inplace( VXCs, nbf*nbf, ReductionOp::Sum );
+          this->reduction_driver_->allreduce_inplace( VXCz, nbf*nbf, ReductionOp::Sum );
+          this->reduction_driver_->allreduce_inplace( VXCy, nbf*nbf, ReductionOp::Sum );
+          this->reduction_driver_->allreduce_inplace( VXCx, nbf*nbf, ReductionOp::Sum );
+          this->reduction_driver_->allreduce_inplace( EXC, 1,       ReductionOp::Sum );
+          this->reduction_driver_->allreduce_inplace( &N_EL, 1,       ReductionOp::Sum );
+        });
+      } else {
+        // UKS
+        this->timer_.time_op("XCIntegrator.Allreduce_EXC_VXC", [&](){
+          this->reduction_driver_->allreduce_inplace( VXCs, nbf*nbf, ReductionOp::Sum );
+          this->reduction_driver_->allreduce_inplace( VXCz, nbf*nbf, ReductionOp::Sum );
+          this->reduction_driver_->allreduce_inplace( EXC, 1,       ReductionOp::Sum );
+          this->reduction_driver_->allreduce_inplace( &N_EL, 1,       ReductionOp::Sum );
+        });
+
+      }
+    }
   }
 }
 
 
 template <typename ValueType>
 void IncoreReplicatedXCDeviceIntegrator<ValueType>::
-  eval_exc_vxc_( int64_t m, int64_t n, const value_type* Ps,
-                      int64_t ldps,
-                      const value_type* Pz,
-                      int64_t ldpz,
-                      value_type* VXCs, int64_t ldvxcs,
-                      value_type* VXCz, int64_t ldvxcz,
-                      value_type* EXC, const IntegratorSettingsXC& settings ) {
-  GauXC::util::unused(m,n,Ps,ldps,Pz,ldpz,VXCs,ldvxcs,VXCz,ldvxcz,EXC,settings);
-  GAUXC_GENERIC_EXCEPTION("UKS Not Yet Implemented For Device");
-}
-
-template <typename ValueType>
-void IncoreReplicatedXCDeviceIntegrator<ValueType>::
-  eval_exc_vxc_( int64_t m, int64_t n, const value_type* Ps,
-                      int64_t ldps,
-                      const value_type* Pz,
-                      int64_t ldpz,
-                      const value_type* Py,
-                      int64_t ldpy,
-                      const value_type* Px,
-                      int64_t ldpx,
-                      value_type* VXCs, int64_t ldvxcs,
-                      value_type* VXCz, int64_t ldvxcz,
-                      value_type* VXCy, int64_t ldvxcy,
-                      value_type* VXCx, int64_t ldvxcx,
-                      value_type* EXC, const IntegratorSettingsXC& settings ) {
-  GauXC::util::unused(m,n,Ps,ldps,Pz,ldpz,Py,ldpy,Px,ldpx,VXCs,ldvxcs,VXCz,ldvxcz,VXCy,ldvxcy,VXCx,ldvxcx,EXC,settings);
-  GAUXC_GENERIC_EXCEPTION("GKS Not Yet Implemented For Device");
-}
-
-
-template <typename ValueType>
-void IncoreReplicatedXCDeviceIntegrator<ValueType>::
-  exc_vxc_local_work_( const basis_type& basis, const value_type* P, int64_t ldp, 
-                       host_task_iterator task_begin, host_task_iterator task_end,
-                       XCDeviceData& device_data, bool do_vxc ) {
-
+  exc_vxc_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps,
+                            const value_type* Pz, int64_t ldpz,
+                            const value_type* Py, int64_t ldpy,
+                            const value_type* Px, int64_t ldpx,
+                            host_task_iterator task_begin, host_task_iterator task_end,
+                            XCDeviceData& device_data, bool do_vxc ) {
+  const bool is_gks = (Pz != nullptr) and (Py != nullptr) and (Px != nullptr);
+  const bool is_uks = (Pz != nullptr) and (Py == nullptr) and (Px == nullptr);
+  const bool is_rks = (Ps != nullptr) and (not is_uks and not is_gks);
+  if (not is_rks and not is_uks and not is_gks) {
+    GAUXC_GENERIC_EXCEPTION("MUST BE EITHER RKS, UKS, or GKS!");
+  }
+  
 
+  // Cast LWD to LocalDeviceWorkDriver
   auto* lwd = dynamic_cast<LocalDeviceWorkDriver*>(this->local_work_driver_.get() );
 
   // Setup Aliases
   const auto& func  = *this->func_;
   const auto& mol   = this->load_balancer_->molecule();
 
+  if( func.is_mgga() and (is_uks or is_gks) ) GAUXC_GENERIC_EXCEPTION("Device + Polarized mGGAs NYI!");
+
   // Get basis map
   BasisSetMap basis_map(basis,mol);
 
@@ -172,38 +258,15 @@ void IncoreReplicatedXCDeviceIntegrator<ValueType>::
   if( not lb_state.modified_weights_are_stored ) {
     GAUXC_GENERIC_EXCEPTION("Weights Have Not Been Modified"); 
   }
+  
 
-#if 0
-  this->timer_.time_op("XCIntegrator.ScreenWeights",[&](){
-
-  constexpr double weight_thresh = std::numeric_limits<double>::epsilon();
-  for( auto it = task_begin; it != task_end; ++it ) {
-    it->max_weight = *std::max_element( it->weights.begin(), it->weights.end() );
-  }
-
-  size_t old_ntasks = std::distance( task_begin, task_end );
-  task_end = std::stable_partition(task_begin, task_end,
-    [&](const auto& a){ return a.max_weight > weight_thresh; } );
-
-  size_t new_ntasks = std::distance( task_begin, task_end );
-  std::cout << old_ntasks << ", " << new_ntasks << std::endl;
-
-  });
-#endif
-
-  // Do XC integration in task batches
-  const auto nbf     = basis.nbf();
-  const auto nshells = basis.nshells();
-  device_data.reset_allocations();
-  device_data.allocate_static_data_exc_vxc( nbf, nshells, do_vxc );
-  device_data.send_static_data_density_basis( P, ldp, basis );
-
-  // Zero integrands
-  device_data.zero_exc_vxc_integrands();
-
-  // Processes batches in groups that saturadate available device memory
   integrator_term_tracker enabled_terms;
   enabled_terms.exc_vxc = true;
+
+  if (is_rks) enabled_terms.ks_scheme = RKS;
+  else if (is_uks) enabled_terms.ks_scheme = UKS;
+  else if (is_gks) enabled_terms.ks_scheme = GKS;
+
   if( func.is_lda() )      
     enabled_terms.xc_approx = integrator_xc_approx::LDA; 
   else if( func.is_gga() ) 
@@ -212,15 +275,20 @@ void IncoreReplicatedXCDeviceIntegrator<ValueType>::
     enabled_terms.xc_approx = integrator_xc_approx::MGGA_LAPL;
   else
     enabled_terms.xc_approx = integrator_xc_approx::MGGA_TAU;
+  
+  // Do XC integration in task batches
+  const auto nbf     = basis.nbf();
+  const auto nshells = basis.nshells();
+  device_data.reset_allocations();
+  device_data.allocate_static_data_exc_vxc( nbf, nshells, enabled_terms, do_vxc );
+  
+  device_data.send_static_data_density_basis( Ps, ldps, Pz, ldpz, Px, ldpx, Py, ldpy, basis );
+
+
+
+  // Zero integrands
+  device_data.zero_exc_vxc_integrands(enabled_terms);
 
-#if 0
-  std::vector<int32_t> full_shell_list(nshells);
-  std::iota(full_shell_list.begin(),full_shell_list.end(),0);
-  for( auto it = task_begin; it != task_end; ++it ) {
-    it->cou_screening.shell_list = full_shell_list;
-    it->cou_screening.nbe        = nbf;
-  }
-#endif
 
   auto task_it = task_begin;
   while( task_it != task_end ) {
@@ -230,8 +298,8 @@ void IncoreReplicatedXCDeviceIntegrator<ValueType>::
       device_data.generate_buffers( enabled_terms, basis_map, task_it, task_end );
 
     /*** Process the batches ***/
+    
     const bool need_lapl = func.needs_laplacian();
-
     // Evaluate collocation
     if( func.is_mgga() ) {
       if(need_lapl) lwd->eval_collocation_laplacian( &device_data );
@@ -239,100 +307,77 @@ void IncoreReplicatedXCDeviceIntegrator<ValueType>::
     }
     else if( func.is_gga() ) lwd->eval_collocation_gradient( &device_data );
     else                     lwd->eval_collocation( &device_data );
-
-    // Evaluate X matrix
+      
+    const double xmat_fac = is_rks ? 2.0 : 1.0;
     const bool need_xmat_grad = func.is_mgga();
-    lwd->eval_xmat( 2.0, &device_data, need_xmat_grad );
+    const bool need_vvar_grad = func.is_mgga() or func.is_gga();
+
+    // Evaluate X matrix and V vars
+    auto do_xmat_vvar = [&](density_id den_id) {
+      lwd->eval_xmat( xmat_fac, &device_data, need_xmat_grad, den_id );
+      lwd->eval_vvar( &device_data, den_id, need_vvar_grad );
+    };
+
+    do_xmat_vvar(DEN_S);
+    if (not is_rks) {
+      do_xmat_vvar(DEN_Z);
+      if (not is_uks) {
+        do_xmat_vvar(DEN_Y);
+        do_xmat_vvar(DEN_X);
+      }
+    }
+
 
-    // Evaluate U/V variables
-    if( func.is_mgga() )     lwd->eval_uvvar_mgga_rks( &device_data, need_lapl );
-    else if( func.is_gga() ) lwd->eval_uvvar_gga_rks( &device_data );
-    else                     lwd->eval_uvvar_lda_rks( &device_data );
+    // Evaluate U variables
+    if( func.is_mgga() )      lwd->eval_uvars_mgga( &device_data, need_lapl );     //<<< TODO: Fn call is different because MGGA U/GKS NYI
+    else if( func.is_gga() )  lwd->eval_uvars_gga( &device_data, enabled_terms.ks_scheme );
+    else                      lwd->eval_uvars_lda( &device_data, enabled_terms.ks_scheme );
 
     // Evaluate XC functional
     if( func.is_mgga() )     lwd->eval_kern_exc_vxc_mgga( func, &device_data );
     else if( func.is_gga() ) lwd->eval_kern_exc_vxc_gga( func, &device_data );
     else                     lwd->eval_kern_exc_vxc_lda( func, &device_data );
+    
 
     // Do scalar EXC/N_EL integrations
     lwd->inc_exc( &device_data );
     lwd->inc_nel( &device_data );
     if( not do_vxc ) continue;
 
-    // Evaluate Z (+ M) matrix
-    if( func.is_mgga() ) {
-      lwd->eval_zmat_mgga_vxc_rks( &device_data, need_lapl );
-      lwd->eval_mmat_mgga_vxc_rks( &device_data, need_lapl );
-    }
-    else if( func.is_gga() ) lwd->eval_zmat_gga_vxc_rks( &device_data );
-    else                     lwd->eval_zmat_lda_vxc_rks( &device_data );
+   auto do_zmat_vxc = [&](density_id den_id) {
+     if( func.is_mgga() ) {
+       lwd->eval_zmat_mgga_vxc( &device_data, need_lapl);
+       lwd->eval_mmat_mgga_vxc( &device_data, need_lapl);
+     }
+     else if( func.is_gga() ) 
+       lwd->eval_zmat_gga_vxc( &device_data, enabled_terms.ks_scheme, den_id );
+     else 
+       lwd->eval_zmat_lda_vxc( &device_data, enabled_terms.ks_scheme, den_id );
+     lwd->inc_vxc( &device_data, den_id, func.is_mgga() );
+  };
 
-    // Increment VXC (LT)
-    lwd->inc_vxc( &device_data, func.is_mgga() );
+  do_zmat_vxc(DEN_S);
+  if(not is_rks) {
+    do_zmat_vxc(DEN_Z);
+    if(not is_uks) {
+      do_zmat_vxc(DEN_Y);
+      do_zmat_vxc(DEN_X);
+    }
+  } 
 
   } // Loop over batches of batches 
 
   // Symmetrize VXC in device memory
-  if(do_vxc) lwd->symmetrize_vxc( &device_data );
-
-}
-
-
-
-template <typename ValueType>
-void IncoreReplicatedXCDeviceIntegrator<ValueType>::
-  exc_vxc_local_work_( const basis_type& basis, const value_type* P, int64_t ldp, 
-                       value_type* VXC, int64_t ldvxc, value_type* EXC, 
-                       value_type *N_EL,
-                       host_task_iterator task_begin, host_task_iterator task_end,
-                       XCDeviceData& device_data ) {
-
-  // Get integrate and keep data on device
-  const bool do_vxc = VXC;
-  exc_vxc_local_work_( basis, P, ldp, task_begin, task_end, device_data, do_vxc );
-  auto rt  = detail::as_device_runtime(this->load_balancer_->runtime());
-  rt.device_backend()->master_queue_synchronize();
-
-  // Receive XC terms from host
-  this->timer_.time_op("XCIntegrator.DeviceToHostCopy_EXC_VXC",[&](){
-    device_data.retrieve_exc_vxc_integrands( EXC, N_EL, VXC, ldvxc );
-  });
-
-}
-
-template <typename ValueType>
-void IncoreReplicatedXCDeviceIntegrator<ValueType>::
-  exc_vxc_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps,
-                                const value_type* Pz, int64_t ldpz,
-                            host_task_iterator task_begin, host_task_iterator task_end,
-                            XCDeviceData& device_data ) {
-  GauXC::util::unused(basis,Ps,ldps,Pz,ldpz,task_begin,task_end,device_data);
-  GAUXC_GENERIC_EXCEPTION("UKS Not Yet Implemented For Device");
-}
-
-template <typename ValueType>
-void IncoreReplicatedXCDeviceIntegrator<ValueType>::
-  exc_vxc_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps,
-                            const value_type* Pz, int64_t ldpz,
-                            value_type* VXCs, int64_t ldvxcs,
-                            value_type* VXCz, int64_t ldvxcz, value_type* EXC, value_type *N_EL,
-                            host_task_iterator task_begin, host_task_iterator task_end,
-                            XCDeviceData& device_data ) {
-
-  GauXC::util::unused(basis,Ps,ldps,Pz,ldpz,VXCs,ldvxcs,VXCz,ldvxcz,EXC,N_EL,task_begin,task_end,device_data);
-  GAUXC_GENERIC_EXCEPTION("UKS Not Yet Implemented For Device");
-}
-
-template <typename ValueType>
-void IncoreReplicatedXCDeviceIntegrator<ValueType>::
-  exc_vxc_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps,
-                            const value_type* Pz, int64_t ldpz,
-                            const value_type* Py, int64_t ldpy,
-                            const value_type* Px, int64_t ldpx,
-                            host_task_iterator task_begin, host_task_iterator task_end,
-                            XCDeviceData& device_data ) {
-  GauXC::util::unused(basis,Ps,ldps,Pz,ldpz,Py,ldpy,Px,ldpx,task_begin,task_end,device_data);
-  GAUXC_GENERIC_EXCEPTION("GKS Not Yet Implemented For Device");
+  if( do_vxc ) {
+    lwd->symmetrize_vxc( &device_data, DEN_S );
+    if (not is_rks) {
+      lwd->symmetrize_vxc( &device_data, DEN_Z );
+      if (not is_uks) {
+        lwd->symmetrize_vxc( &device_data, DEN_Y );
+        lwd->symmetrize_vxc( &device_data, DEN_X );
+      }
+    }
+  }
 }
 
 template <typename ValueType>
@@ -347,9 +392,18 @@ void IncoreReplicatedXCDeviceIntegrator<ValueType>::
                             value_type* VXCx, int64_t ldvxcx, value_type* EXC, value_type *N_EL,
                             host_task_iterator task_begin, host_task_iterator task_end,
                             XCDeviceData& device_data ) {
+  
+  // Get integrate and keep data on device
+  const bool do_vxc = VXCs;
+  exc_vxc_local_work_( basis, Ps, ldps, Pz, ldpz, Py, ldpy, Px, ldpx, task_begin, task_end, device_data, do_vxc );
+  auto rt  = detail::as_device_runtime(this->load_balancer_->runtime());
+  rt.device_backend()->master_queue_synchronize();
+
+  // Receive XC terms from host
+  this->timer_.time_op("XCIntegrator.DeviceToHostCopy_EXC_VXC",[&](){
+    device_data.retrieve_exc_vxc_integrands( EXC, N_EL, VXCs, ldvxcs, VXCz, ldvxcz, VXCy, ldvxcy, VXCx, ldvxcx ); 
+  });
 
-  GauXC::util::unused(basis,Ps,ldps,Pz,ldpz,Py,ldpy,Px,ldpx,VXCs,ldvxcs,VXCz,ldvxcz,VXCy,ldvxcy,VXCx,ldvxcx,EXC,N_EL,task_begin,task_end,device_data);
-  GAUXC_GENERIC_EXCEPTION("GKS Not Yet Implemented For Device");
 }
 
 
diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exx.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exx.hpp
index 3cc19868..1aec0077 100644
--- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exx.hpp
+++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exx.hpp
@@ -355,7 +355,7 @@ void IncoreReplicatedXCDeviceIntegrator<ValueType>::
   // Do EXX integration in task batches
   device_data.reset_allocations();
   device_data.allocate_static_data_exx( nbf, nshells, shell_pairs.npairs(), shell_pairs.nprim_pair_total(), basis_map.max_l() );
-  device_data.send_static_data_density_basis( P, ldp, basis );
+  device_data.send_static_data_density_basis( P, ldp, nullptr, 0, nullptr, 0, nullptr, 0, basis );
   device_data.send_static_data_shell_pairs( basis, shell_pairs );
 
   // Zero integrands
@@ -401,7 +401,7 @@ void IncoreReplicatedXCDeviceIntegrator<ValueType>::
 
 #if 1
   // Symmetrize K in device memory
-  lwd->symmetrize_exx_k( &device_data );
+  lwd->symmetrize_exx_k( &device_data);
 #endif
 
 }
diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_integrate_den.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_integrate_den.hpp
index eb610769..764bf3c0 100644
--- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_integrate_den.hpp
+++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_integrate_den.hpp
@@ -103,7 +103,7 @@ void IncoreReplicatedXCDeviceIntegrator<ValueType>::
   const auto nshells = basis.nshells();
   device_data.reset_allocations();
   device_data.allocate_static_data_den( nbf, nshells );
-  device_data.send_static_data_density_basis( P, ldp, basis );
+  device_data.send_static_data_density_basis( P, ldp, nullptr, 0, nullptr, 0, nullptr, 0,  basis );
 
   // Zero integrands
   device_data.zero_den_integrands();
@@ -125,10 +125,12 @@ void IncoreReplicatedXCDeviceIntegrator<ValueType>::
     lwd->eval_collocation( &device_data );
 
     // Evaluate X matrix
-    lwd->eval_xmat( 1.0, &device_data );
+    const bool do_xmat_grad = false;
+    lwd->eval_xmat( 1.0, &device_data, do_xmat_grad, DEN_S );
 
     // Evaluate the density
-    lwd->eval_uvvar_lda_rks( &device_data );
+    const bool do_vvar_grad = false;
+    lwd->eval_vvar( &device_data, DEN_S, do_vvar_grad );
 
     // Do scalar N_EL integration
     lwd->inc_nel( &device_data );
diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_vxc.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_vxc.hpp
index 960d81f2..e62ae760 100644
--- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_vxc.hpp
+++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_vxc.hpp
@@ -384,6 +384,7 @@ void ReferenceReplicatedXCHostIntegrator<ValueType>::
     const auto xmat_fac = is_rks ? 2.0 : 1.0; // TODO Fix for spinor RKS input
     lwd->eval_xmat( mgga_dim_scal * npts, nbf, nbe, submat_map, xmat_fac, Ps, ldps, basis_eval, nbe,
       zmat, nbe, nbe_scr );
+		
 
     // X matrix for Pz
     if(not is_rks) {
@@ -451,7 +452,6 @@ void ReferenceReplicatedXCHostIntegrator<ValueType>::
       vrho[sds*i] *= weights[i];
       if(not is_rks) vrho[sds*i+1] *= weights[i];
     }
-     
     if( func.is_gga() ){
       for( int32_t i = 0; i < npts; ++i ) {
          vgamma[gga_dim_scal*i] *= weights[i];
diff --git a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_vxc.hpp b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_vxc.hpp
index f6505826..4b56692b 100644
--- a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_vxc.hpp
+++ b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_vxc.hpp
@@ -390,18 +390,11 @@ void ShellBatchedReplicatedXCIntegrator<BaseIntegratorType, IncoreIntegratorType
   // Process selected task batch
 #ifdef GAUXC_HAS_DEVICE
   if constexpr (IncoreIntegratorType::is_device) {
-    if(Pz or Py or Py) 
-      GAUXC_GENERIC_EXCEPTION("Device UKS/GKS + ShellBatched NYI");
 
     incore_integrator.exc_vxc_local_work( basis_subset, Ps_submat, nbe, 
-      VXCs_submat, nbe, &EXC_tmp, &NEL_tmp, task_begin, task_end, 
-      *device_data_ptr_ );
-    // TODO: Make this work for UKS/GKS after 
-    // https://github.com/wavefunction91/GauXC/pull/91
-    //incore_integrator.exc_vxc_local_work( basis_subset, Ps_submat, nbe, 
-    //  Pz_submat, nbe, Py_submat, nbe, Px_submat, nbe, VXCs_submat, nbe,
-    //  VXCz_submat, nbe, VXCy_submat, nbe, VXCx_submat, nbe,
-    //  &EXC_tmp, &NEL_tmp, task_begin, task_end, *device_data_ptr_ );
+      Pz_submat, nbe, Py_submat, nbe, Px_submat, nbe, VXCs_submat, nbe,
+      VXCz_submat, nbe, VXCy_submat, nbe, VXCx_submat, nbe,
+      &EXC_tmp, &NEL_tmp, task_begin, task_end, *device_data_ptr_ );
   } else if constexpr (not IncoreIntegratorType::is_device) {
 #endif
     incore_integrator.exc_vxc_local_work( basis_subset, Ps_submat, nbe, 
diff --git a/src/xc_integrator/xc_data/device/xc_device_aos_data.cxx b/src/xc_integrator/xc_data/device/xc_device_aos_data.cxx
index a12acbee..af985115 100644
--- a/src/xc_integrator/xc_data/device/xc_device_aos_data.cxx
+++ b/src/xc_integrator/xc_data/device/xc_device_aos_data.cxx
@@ -161,13 +161,11 @@ XCDeviceAoSData::device_buffer_t XCDeviceAoSData::allocate_dynamic_stack(
     const size_t ntask = std::distance( task_begin, task_end );
     aos_stack.device_tasks = mem.aligned_alloc<XCDeviceTask>( ntask, csl );
   }
-
   // Map packed to unpacked indices
   if(reqt.task_bfn_shell_indirection) {
     aos_stack.bfn_shell_indirection_device =
       mem.aligned_alloc<int32_t>( total_nbe_bfn_task_batch, csl );
   }
-
   // Collocation + derivatives 
   const size_t bfn_msz = total_nbe_bfn_npts_task_batch;
   if(reqt.task_bfn) {
@@ -198,7 +196,6 @@ XCDeviceAoSData::device_buffer_t XCDeviceAoSData::allocate_dynamic_stack(
     aos_stack.zmat_vxc_device = 
       mem.aligned_alloc<double>( bfn_msz, csl);
   }
-
   // X Matrix Gradient (for GGA EXC Gradient)
   if(reqt.task_xmat_grad) {
     aos_stack.xmat_dx_device = mem.aligned_alloc<double>( bfn_msz, csl);
@@ -242,7 +239,6 @@ XCDeviceAoSData::device_buffer_t XCDeviceAoSData::allocate_dynamic_stack(
       mem.aligned_alloc<int32_t>(total_nblock_cou_task_batch, csl);
   }
 
-
   // Update dynmem data for derived impls
   return device_buffer_t{ mem.stack(), mem.nleft() };
 }
@@ -473,20 +469,66 @@ void XCDeviceAoSData::pack_and_send(
     buffer_adaptor xmat_dx_mem( aos_stack.xmat_dx_device, total_nbe_bfn_npts );
     buffer_adaptor xmat_dy_mem( aos_stack.xmat_dy_device, total_nbe_bfn_npts );
     buffer_adaptor xmat_dz_mem( aos_stack.xmat_dz_device, total_nbe_bfn_npts );
-
-    buffer_adaptor den_mem   ( base_stack.den_eval_device,   total_npts );
-    buffer_adaptor dden_x_mem( base_stack.den_x_eval_device, total_npts );
-    buffer_adaptor dden_y_mem( base_stack.den_y_eval_device, total_npts );
-    buffer_adaptor dden_z_mem( base_stack.den_z_eval_device, total_npts );
+    
+    const bool is_rks = terms.ks_scheme == RKS;
+    const bool is_uks = terms.ks_scheme == UKS;
+    const bool is_gks = terms.ks_scheme == GKS;
+    const bool is_pol  = is_uks or is_gks;
+    const bool is_gga = terms.xc_approx == GGA;
+    const int den_fac   = is_pol ? 2 : 1;
+    const int gamma_fac = is_pol ? 3 : 1;
+    
+
+
+    buffer_adaptor eps_mem    ( base_stack.eps_eval_device,     total_npts             );
+
+    // RKS
+    buffer_adaptor den_s_mem  ( base_stack.den_s_eval_device,     total_npts  );
+    buffer_adaptor gamma_mem  ( base_stack.gamma_eval_device,     total_npts * gamma_fac );
+    buffer_adaptor vrho_mem   ( base_stack.vrho_eval_device,      total_npts * den_fac   );
+    buffer_adaptor vgamma_mem ( base_stack.vgamma_eval_device,    total_npts * gamma_fac );
+    
+    buffer_adaptor den_mem    ( base_stack.den_eval_device,       total_npts * den_fac   );
+       
+
+    // Polarized KS
+    buffer_adaptor den_z_mem  ( base_stack.den_z_eval_device,     total_npts  );
+    buffer_adaptor den_y_mem  ( base_stack.den_y_eval_device,     total_npts  );
+    buffer_adaptor den_x_mem  ( base_stack.den_x_eval_device,     total_npts  );
+    buffer_adaptor vrho_pos_mem( base_stack.vrho_pos_eval_device, total_npts );
+    buffer_adaptor vrho_neg_mem( base_stack.vrho_neg_eval_device, total_npts );
+    buffer_adaptor K_z_mem    ( base_stack.K_z_eval_device,       total_npts );
+    buffer_adaptor K_y_mem    ( base_stack.K_y_eval_device,       total_npts );
+    buffer_adaptor K_x_mem    ( base_stack.K_x_eval_device,       total_npts );
+    buffer_adaptor H_z_mem    ( base_stack.H_z_eval_device,       total_npts );
+    buffer_adaptor H_y_mem    ( base_stack.H_y_eval_device,       total_npts );
+    buffer_adaptor H_x_mem    ( base_stack.H_x_eval_device,       total_npts );
+    buffer_adaptor gamma_pp_mem( base_stack.gamma_pp_eval_device, total_npts );
+    buffer_adaptor gamma_pm_mem( base_stack.gamma_pm_eval_device, total_npts );
+    buffer_adaptor gamma_mm_mem( base_stack.gamma_mm_eval_device, total_npts );
+    buffer_adaptor vgamma_pp_mem( base_stack.vgamma_pp_eval_device, total_npts );
+    buffer_adaptor vgamma_pm_mem( base_stack.vgamma_pm_eval_device, total_npts );
+    buffer_adaptor vgamma_mm_mem( base_stack.vgamma_mm_eval_device, total_npts );
+
+    // Gradients
+    buffer_adaptor dden_sx_mem( base_stack.dden_sx_eval_device,     total_npts );
+    buffer_adaptor dden_sy_mem( base_stack.dden_sy_eval_device,     total_npts );
+    buffer_adaptor dden_sz_mem( base_stack.dden_sz_eval_device,     total_npts );
+    buffer_adaptor dden_zx_mem( base_stack.dden_zx_eval_device,     total_npts );
+    buffer_adaptor dden_zy_mem( base_stack.dden_zy_eval_device,     total_npts );
+    buffer_adaptor dden_zz_mem( base_stack.dden_zz_eval_device,     total_npts );
+    buffer_adaptor dden_yx_mem( base_stack.dden_yx_eval_device,     total_npts );
+    buffer_adaptor dden_yy_mem( base_stack.dden_yy_eval_device,     total_npts );
+    buffer_adaptor dden_yz_mem( base_stack.dden_yz_eval_device,     total_npts );
+    buffer_adaptor dden_xx_mem( base_stack.dden_xx_eval_device,     total_npts );
+    buffer_adaptor dden_xy_mem( base_stack.dden_xy_eval_device,     total_npts );
+    buffer_adaptor dden_xz_mem( base_stack.dden_xz_eval_device,     total_npts );
+    
+    // MGGA
     buffer_adaptor dden_lapl_mem( base_stack.den_lapl_eval_device, total_npts );
-
-    buffer_adaptor eps_mem( base_stack.eps_eval_device, total_npts );
-    buffer_adaptor gamma_mem( base_stack.gamma_eval_device, total_npts );
+    buffer_adaptor vlapl_mem( base_stack.vlapl_eval_device, total_npts );
     buffer_adaptor tau_mem( base_stack.tau_eval_device, total_npts );
-    buffer_adaptor vrho_mem( base_stack.vrho_eval_device, total_npts );
-    buffer_adaptor vgamma_mem( base_stack.vgamma_eval_device, total_npts );
     buffer_adaptor vtau_mem( base_stack.vtau_eval_device, total_npts );
-    buffer_adaptor vlapl_mem( base_stack.vlapl_eval_device, total_npts );
 
     for( auto& task : host_device_tasks ) {
       const auto npts    = task.npts;
@@ -562,29 +604,84 @@ void XCDeviceAoSData::pack_and_send(
 
 
       // Grid function evaluations
-      task.den = den_mem.aligned_alloc<double>(reqt.grid_den_size(npts), csl);
+      if (reqt.grid_den) {
+        task.den_s        = den_s_mem.aligned_alloc<double>( npts, csl );
+        if(is_pol) {
+          task.den          = den_mem.aligned_alloc<double>(npts*2, csl); //Interleaved memory
+          task.den_z        = den_z_mem.aligned_alloc<double>( npts, csl);
+          if ( is_gks ) {
+            task.den_y        = den_y_mem.aligned_alloc<double>( npts, csl);
+            task.den_x        = den_x_mem.aligned_alloc<double>( npts, csl);
+          }
+        }
+      }
+
+      if( reqt.grid_vrho ) {
+        task.vrho =   vrho_mem.aligned_alloc<double>( npts*den_fac, csl);
+        if( is_pol ) {
+          task.vrho_pos     = vrho_pos_mem.aligned_alloc<double>( npts, csl);
+          task.vrho_neg     = vrho_neg_mem.aligned_alloc<double>( npts, csl); 
+        }
+      }
+
+      if( reqt.grid_vgamma ) {
+        task.vgamma = vgamma_mem.aligned_alloc<double>( npts*gamma_fac, csl);
+        if( is_pol ) {
+            task.vgamma_pp    = vgamma_pp_mem.aligned_alloc<double>( npts, csl);
+            task.vgamma_pm    = vgamma_pm_mem.aligned_alloc<double>( npts, csl);
+            task.vgamma_mm    = vgamma_mm_mem.aligned_alloc<double>( npts, csl);
+        }
+      }
+      if( reqt.grid_gamma ) {
+        task.gamma = gamma_mem.aligned_alloc<double>( npts*gamma_fac, csl);
+        if( is_pol ) {
+            task.gamma_pp    = gamma_pp_mem.aligned_alloc<double>( npts, csl);
+            task.gamma_pm    = gamma_pm_mem.aligned_alloc<double>( npts, csl);
+            task.gamma_mm    = gamma_mm_mem.aligned_alloc<double>( npts, csl);
+        }
+      }
 
       if(reqt.grid_den_grad) {
-        task.ddenx = dden_x_mem.aligned_alloc<double>(npts, csl);
-        task.ddeny = dden_y_mem.aligned_alloc<double>(npts, csl);
-        task.ddenz = dden_z_mem.aligned_alloc<double>(npts, csl);
+        task.dden_sx = dden_sx_mem.aligned_alloc<double>(npts, csl);
+        task.dden_sy = dden_sy_mem.aligned_alloc<double>(npts, csl);
+        task.dden_sz = dden_sz_mem.aligned_alloc<double>(npts, csl);
+        if( is_pol ) {
+          task.dden_zx    = dden_zx_mem.aligned_alloc<double>( npts, csl );
+          task.dden_zy    = dden_zy_mem.aligned_alloc<double>( npts, csl );
+          task.dden_zz    = dden_zz_mem.aligned_alloc<double>( npts, csl );
+          if( is_gks ) {
+            task.dden_yx    = dden_yx_mem.aligned_alloc<double>( npts, csl );
+            task.dden_yy    = dden_yy_mem.aligned_alloc<double>( npts, csl );
+            task.dden_yz    = dden_yz_mem.aligned_alloc<double>( npts, csl );
+            task.dden_xx    = dden_xx_mem.aligned_alloc<double>( npts, csl );
+            task.dden_xy    = dden_xy_mem.aligned_alloc<double>( npts, csl );
+            task.dden_xz    = dden_xz_mem.aligned_alloc<double>( npts, csl );
+          }
+        }
+      }
+      
+      // H, K terms (GKS)
+      if( is_gks ) {
+        task.K_x    = K_x_mem.aligned_alloc<double>( npts, csl );
+        task.K_y    = K_y_mem.aligned_alloc<double>( npts, csl );
+        task.K_z    = K_z_mem.aligned_alloc<double>( npts, csl );
+        if( is_gga ) {
+          task.H_x    = H_x_mem.aligned_alloc<double>( npts, csl );
+          task.H_y    = H_y_mem.aligned_alloc<double>( npts, csl );
+          task.H_z    = H_z_mem.aligned_alloc<double>( npts, csl );
+        }
       }
+      
+      task.eps  =   eps_mem.aligned_alloc<double>( reqt.grid_eps_size(npts), csl);
 
+        
       if(reqt.grid_den_lapl) {
         task.denlapl = dden_lapl_mem.aligned_alloc<double>(npts, csl);
       }
 
-      task.gamma = 
-        gamma_mem.aligned_alloc<double>( reqt.grid_gamma_size(npts), csl);
       task.tau = 
         tau_mem.aligned_alloc<double>( reqt.grid_tau_size(npts), csl);
 
-      task.eps  = 
-        eps_mem.aligned_alloc<double>( reqt.grid_eps_size(npts), csl);
-      task.vrho = 
-        vrho_mem.aligned_alloc<double>( reqt.grid_vrho_size(npts), csl);
-      task.vgamma = 
-        vgamma_mem.aligned_alloc<double>( reqt.grid_vgamma_size(npts), csl);
       task.vtau = 
         vtau_mem.aligned_alloc<double>( reqt.grid_vtau_size(npts), csl);
       task.vlapl = 
diff --git a/src/xc_integrator/xc_data/device/xc_device_data.hpp b/src/xc_integrator/xc_data/device/xc_device_data.hpp
index 6c1d6a21..fde8158b 100644
--- a/src/xc_integrator/xc_data/device/xc_device_data.hpp
+++ b/src/xc_integrator/xc_data/device/xc_device_data.hpp
@@ -20,11 +20,26 @@
 namespace GauXC {
 
 enum integrator_xc_approx : uint32_t {
-  _UNDEFINED = 0,
-  LDA        = 1,
-  GGA        = 2,
-  MGGA_TAU   = 3,
-  MGGA_LAPL  = 4
+  _UNDEF_APPROX         = 0,
+  LDA                   = 1,
+  GGA                   = 2,
+  MGGA_TAU              = 3,
+  MGGA_LAPL             = 4
+};
+
+enum integrator_ks_scheme : uint32_t {
+  _UNDEF_SCHEME             = 0,
+  RKS                       = 1,
+  UKS                       = 2,
+  GKS                       = 3
+};
+
+enum density_id : uint32_t {
+  _UNDEF_DEN      = 0,
+  DEN_S           = 1,    // RKS, UKS, GKS
+  DEN_Z           = 2,    // UKS, GKS
+  DEN_Y           = 3,    // GKS
+  DEN_X           = 4     // GKS
 };
 
 struct integrator_term_tracker {
@@ -34,13 +49,14 @@ struct integrator_term_tracker {
   bool exc_grad                  = false;
   bool exx                       = false;
   bool exx_ek_screening          = false;
-  integrator_xc_approx xc_approx = _UNDEFINED;
+  integrator_xc_approx xc_approx = _UNDEF_APPROX;
+  integrator_ks_scheme ks_scheme = _UNDEF_SCHEME;
   inline void reset() {
     std::memset( this, 0, sizeof(integrator_term_tracker) );
   }
 };
 
-#define PRDVL(pred,val) (pred) ? (val) : 0ul;
+#define PRDVL(pred,val) (pred) ? (val) : 0ul
 
 struct required_term_storage {
   bool grid_points  = false;
@@ -65,18 +81,42 @@ struct required_term_storage {
   bool grid_vtau     = false;
   bool grid_vlapl    = false;
 
+
+  // Reference flags for memory management use
+  integrator_term_tracker ref_tracker;
+  
   inline size_t grid_den_size(size_t npts){ 
-    return PRDVL(grid_den, npts);
+    // For RKS, only den_s_eval is used
+    if( grid_den ) {
+      if( ref_tracker.ks_scheme == RKS ) return npts; 
+      if( ref_tracker.den )              return npts; 
+      // 2*npts for S,Z densities, 2*npts for interleaved density
+      if( ref_tracker.ks_scheme == UKS ) return 4*npts;
+      // Same as above, but also X,Y densities
+      if( ref_tracker.ks_scheme == GKS ) return 6*npts;  
+    }
+    return 0ul;
   }
   inline size_t grid_den_grad_size(size_t npts){ 
-    return PRDVL(grid_den_grad, 3 * npts);
+    if( grid_den_grad ) {
+      // 3*npts for each density in play
+      if( ref_tracker.ks_scheme == RKS ) return 3*npts;
+      if( ref_tracker.ks_scheme == UKS ) return 6*npts;
+      if( ref_tracker.ks_scheme == GKS ) return 12*npts;
+    }
+    return 0ul;
+  }
+  inline size_t grid_gamma_size(size_t npts){
+    if( grid_gamma ) {
+      if(  ref_tracker.ks_scheme == RKS ) return npts;
+      if(  ref_tracker.ks_scheme == UKS 
+        or ref_tracker.ks_scheme == GKS ) return 6*npts;
+    }
+    return 0ul;
   }
   inline size_t grid_den_lapl_size(size_t npts){ 
     return PRDVL(grid_den_lapl, npts);
   }
-  inline size_t grid_gamma_size(size_t npts){ 
-    return PRDVL(grid_gamma, npts);
-  }
   inline size_t grid_tau_size(size_t npts){ 
     return PRDVL(grid_tau, npts);
   }
@@ -84,10 +124,27 @@ struct required_term_storage {
     return PRDVL(grid_eps, npts);
   }
   inline size_t grid_vrho_size(size_t npts){ 
-    return PRDVL(grid_vrho, npts);
+    if( grid_vrho ) {
+      if(   ref_tracker.ks_scheme == RKS ) return npts;
+      if(   ref_tracker.ks_scheme == UKS 
+        or  ref_tracker.ks_scheme == GKS ) return 4*npts;
+    }
+    return 0ul;
   }
   inline size_t grid_vgamma_size(size_t npts){ 
-    return PRDVL(grid_vgamma, npts);
+    if( grid_vgamma ) {
+      if(   ref_tracker.ks_scheme == RKS ) return npts;
+      if(   ref_tracker.ks_scheme == UKS 
+        or  ref_tracker.ks_scheme == GKS ) return 6*npts;
+    }
+    return 0ul;
+  }
+  inline size_t grid_HK_size(size_t npts){
+    if( ref_tracker.ks_scheme == GKS ) {
+      if( ref_tracker.xc_approx == GGA ) return 6*npts;
+      if( ref_tracker.xc_approx == LDA ) return 3*npts;
+    }
+    return 0ul;
   }
   inline size_t grid_vtau_size(size_t npts){ 
     return PRDVL(grid_vtau, npts);
@@ -96,6 +153,8 @@ struct required_term_storage {
     return PRDVL(grid_vlapl, npts);
   }
 
+
+
   // Task-local matrices
   bool task_bfn           = false;
   bool task_bfn_grad      = false;
@@ -245,11 +304,16 @@ struct required_term_storage {
       grid_to_parent_center       = true;
     }
 
-    // Allocated terms for XC aclculations
+    // Allocated terms for XC calculations
     const bool is_xc = tracker.exc_vxc or tracker.exc_grad;
+    
+    ref_tracker = tracker;
+
     if(is_xc) {
-      if( tracker.xc_approx == _UNDEFINED )
+      if( tracker.xc_approx == _UNDEF_APPROX )
         GAUXC_GENERIC_EXCEPTION("No XC Approx Set");
+      if( tracker.ks_scheme == _UNDEF_SCHEME )
+        GAUXC_GENERIC_EXCEPTION("No KS Scheme Set");
       //const bool is_lda  = is_xc and tracker.xc_approx == LDA;
       const bool is_gga  = is_xc and tracker.xc_approx == GGA;
       const bool need_tau  = tracker.xc_approx == MGGA_TAU;
@@ -366,7 +430,7 @@ struct XCDeviceData {
   /// Allocate device memory for data that will persist on the device.
   virtual void reset_allocations() = 0;
   virtual void allocate_static_data_weights( int32_t natoms ) = 0;
-  virtual void allocate_static_data_exc_vxc( int32_t nbf, int32_t nshells, bool do_vxc ) = 0;
+  virtual void allocate_static_data_exc_vxc( int32_t nbf, int32_t nshells, integrator_term_tracker enabled_terms, bool do_vxc ) = 0;
   virtual void allocate_static_data_den( int32_t nbf, int32_t nshells ) = 0;
   virtual void allocate_static_data_exc_grad( int32_t nbf, int32_t nshells, int32_t natoms ) = 0;
   virtual void allocate_static_data_exx( int32_t nbf, int32_t nshells, size_t nshell_pairs, size_t nprim_pair_total, int32_t max_l ) = 0;
@@ -374,7 +438,7 @@ struct XCDeviceData {
 
   // Send persistent data from host to device
   virtual void send_static_data_weights( const Molecule& mol, const MolMeta& meta ) = 0;
-  virtual void send_static_data_density_basis( const double* P, int32_t ldp, const BasisSet<double>& basis ) = 0;
+  virtual void send_static_data_density_basis( const double* Ps, int32_t ldps, const double* Pz, int32_t ldpz, const double* Py, int32_t ldpy, const double* Px, int32_t ldpx, const BasisSet<double>& basis ) = 0;
   virtual void send_static_data_shell_pairs( const BasisSet<double>&, const ShellPairCollection<double>& ) = 0;
   virtual void send_static_data_exx_ek_screening( const double* V_max, int32_t ldv, const BasisSetMap&, const ShellPairCollection<double>& ) = 0;
 
@@ -382,7 +446,7 @@ struct XCDeviceData {
   virtual void zero_den_integrands() = 0;
 
   /// Zero out the EXC / VXC integrands in device memory
-  virtual void zero_exc_vxc_integrands() = 0;
+  virtual void zero_exc_vxc_integrands(integrator_term_tracker enabled_terms) = 0;
 
   /// Zero out the EXC Gradient integrands in device memory
   virtual void zero_exc_grad_integrands() = 0;
@@ -420,7 +484,8 @@ struct XCDeviceData {
    *  @param[out[ VXC  Integrated XC potential (host) for XC queue
    */
   virtual void retrieve_exc_vxc_integrands( double* EXC, double* N_EL,
-    double* VXC, int32_t ldvxc ) = 0;
+    double* VXCs, int32_t ldvxcs, double* VXCz, int32_t ldvxcz,
+    double* VXCy, int32_t ldvxcy, double* VXCx, int32_t ldvxcx ) = 0;
 
   /** Retreive EXC Gradient integrands from device memory
    *
@@ -444,7 +509,10 @@ struct XCDeviceData {
   virtual void copy_weights_to_tasks( host_task_iterator task_begin, host_task_iterator task_end ) = 0;
   virtual void populate_submat_maps ( size_t, host_task_iterator begin, host_task_iterator end, const BasisSetMap& ) = 0;
 
-  virtual double* vxc_device_data() = 0;
+  virtual double* vxc_z_device_data() = 0;
+  virtual double* vxc_s_device_data() = 0;
+  virtual double* vxc_y_device_data() = 0;
+  virtual double* vxc_x_device_data() = 0;
   virtual double* exc_device_data() = 0;
   virtual double* nel_device_data() = 0;
   virtual double* exx_k_device_data() = 0;
diff --git a/src/xc_integrator/xc_data/device/xc_device_stack_data.cxx b/src/xc_integrator/xc_data/device/xc_device_stack_data.cxx
index a2635f11..1aadd1ab 100644
--- a/src/xc_integrator/xc_data/device/xc_device_stack_data.cxx
+++ b/src/xc_integrator/xc_data/device/xc_device_stack_data.cxx
@@ -34,7 +34,10 @@ XCDeviceStackData::XCDeviceStackData(const DeviceRuntimeEnvironment& rt) :
 XCDeviceStackData::~XCDeviceStackData() noexcept = default;
 
 
-double* XCDeviceStackData::vxc_device_data() { return static_stack.vxc_device; }
+double* XCDeviceStackData::vxc_s_device_data() { return static_stack.vxc_s_device; }
+double* XCDeviceStackData::vxc_z_device_data() { return static_stack.vxc_z_device; }
+double* XCDeviceStackData::vxc_y_device_data() { return static_stack.vxc_y_device; }
+double* XCDeviceStackData::vxc_x_device_data() { return static_stack.vxc_x_device; }
 double* XCDeviceStackData::exc_device_data() { return static_stack.exc_device; }
 double* XCDeviceStackData::nel_device_data() { return static_stack.nel_device; }
 double* XCDeviceStackData::exx_k_device_data() { return static_stack.exx_k_device; }
@@ -80,10 +83,12 @@ void XCDeviceStackData::allocate_static_data_weights( int32_t natoms ) {
   allocated_terms.weights = true;
 }
 
-void XCDeviceStackData::allocate_static_data_exc_vxc( int32_t nbf, int32_t nshells, bool do_vxc ) {
+void XCDeviceStackData::allocate_static_data_exc_vxc( int32_t nbf, int32_t nshells, integrator_term_tracker enabled_terms, bool do_vxc ) {
 
   if( allocated_terms.exc_vxc ) 
     GAUXC_GENERIC_EXCEPTION("Attempting to reallocate Stack EXC VXC");
+  if( enabled_terms.ks_scheme == _UNDEF_SCHEME )
+    GAUXC_GENERIC_EXCEPTION("Must have a KS Scheme set to allocate Stack EXC VXC");
 
   // Save state
   global_dims.nshells = nshells;
@@ -96,18 +101,35 @@ void XCDeviceStackData::allocate_static_data_exc_vxc( int32_t nbf, int32_t nshel
   static_stack.exc_device        = mem.aligned_alloc<double>( 1 , csl);
   static_stack.nel_device        = mem.aligned_alloc<double>( 1 , csl);
   static_stack.acc_scr_device    = mem.aligned_alloc<double>( 1 , csl);
+  
+  allocated_terms.ks_scheme = enabled_terms.ks_scheme;
+  static_stack.dmat_s_device  = mem.aligned_alloc<double>( nbf * nbf , csl );
+  if( not (allocated_terms.ks_scheme == RKS) ) {
+      static_stack.dmat_z_device  = mem.aligned_alloc<double>( nbf * nbf , csl );
+      if( allocated_terms.ks_scheme == GKS ) {
+        static_stack.dmat_y_device  = mem.aligned_alloc<double>( nbf * nbf , csl );
+        static_stack.dmat_x_device  = mem.aligned_alloc<double>( nbf * nbf , csl );
+      }
+  }
 
-  if(do_vxc) static_stack.vxc_device = mem.aligned_alloc<double>( nbf * nbf , csl);
-
-  static_stack.dmat_device = mem.aligned_alloc<double>( nbf * nbf , csl);
+  if( do_vxc ) {
+    static_stack.vxc_s_device  = mem.aligned_alloc<double>( nbf * nbf , csl );
+    if( not (allocated_terms.ks_scheme == RKS) ) {
+        static_stack.vxc_z_device  = mem.aligned_alloc<double>( nbf * nbf , csl );
+        if( allocated_terms.ks_scheme == GKS ) {
+          static_stack.vxc_y_device  = mem.aligned_alloc<double>( nbf * nbf , csl );
+          static_stack.vxc_x_device  = mem.aligned_alloc<double>( nbf * nbf , csl );
+        }
+    }
+  }
 
   // Get current stack location
   dynmem_ptr = mem.stack();
   dynmem_sz  = mem.nleft(); 
+    
 
   allocated_terms.exc_vxc = true;
 }
-
 void XCDeviceStackData::allocate_static_data_den( int32_t nbf, int32_t nshells ) {
 
   if( allocated_terms.den ) 
@@ -124,7 +146,7 @@ void XCDeviceStackData::allocate_static_data_den( int32_t nbf, int32_t nshells )
   static_stack.acc_scr_device    = mem.aligned_alloc<double>( 1 , csl);
   static_stack.nel_device        = mem.aligned_alloc<double>( 1 , csl);
 
-  static_stack.dmat_device = mem.aligned_alloc<double>( nbf * nbf , csl);
+  static_stack.dmat_s_device = mem.aligned_alloc<double>( nbf * nbf , csl);
 
   // Get current stack location
   dynmem_ptr = mem.stack();
@@ -151,7 +173,7 @@ void XCDeviceStackData::allocate_static_data_exc_grad( int32_t nbf, int32_t nshe
   static_stack.nel_device        = mem.aligned_alloc<double>( 1 , csl);
   static_stack.acc_scr_device    = mem.aligned_alloc<double>( 1 , csl);
 
-  static_stack.dmat_device = mem.aligned_alloc<double>( nbf * nbf , csl);
+  static_stack.dmat_s_device = mem.aligned_alloc<double>( nbf * nbf , csl);
 
   // Get current stack location
   dynmem_ptr = mem.stack();
@@ -181,7 +203,7 @@ void XCDeviceStackData::allocate_static_data_exx( int32_t nbf, int32_t nshells,
       mem.aligned_alloc<PrimitivePair<double>>(nprim_pair_total, csl);
 
   static_stack.exx_k_device = mem.aligned_alloc<double>( nbf * nbf , csl);
-  static_stack.dmat_device  = mem.aligned_alloc<double>( nbf * nbf , csl);
+  static_stack.dmat_s_device  = mem.aligned_alloc<double>( nbf * nbf , csl);
 
   // Get current stack location
   dynmem_ptr = mem.stack();
@@ -208,7 +230,7 @@ void XCDeviceStackData::allocate_static_data_exx_ek_screening( size_t ntasks, in
   buffer_adaptor mem( dynmem_ptr, dynmem_sz );
 
   static_stack.shells_device = mem.aligned_alloc<Shell<double>>( nshells , csl);
-  static_stack.dmat_device   = mem.aligned_alloc<double>( nbf * nbf , csl);
+  static_stack.dmat_s_device   = mem.aligned_alloc<double>( nbf * nbf , csl);
   static_stack.ek_max_bfn_sum_device =
     mem.aligned_alloc<double>( ntasks , csl);
   static_stack.vshell_max_sparse_device = 
@@ -264,24 +286,39 @@ void XCDeviceStackData::send_static_data_weights( const Molecule& mol, const Mol
   device_backend_->master_queue_synchronize(); 
 }
 
-void XCDeviceStackData::send_static_data_density_basis( const double* P, int32_t ldp,
+void XCDeviceStackData::send_static_data_density_basis( const double* Ps, int32_t ldps, const double* Pz, int32_t ldpz, const double* Py, int32_t ldpy, const double* Px, int32_t ldpx,
   const BasisSet<double>& basis ) {
+  const bool is_gks = (Pz != nullptr) and (Py != nullptr) and (Px != nullptr);
+  const bool is_uks = (Pz != nullptr) and (Py == nullptr) and (Px == nullptr);
+  const bool is_rks = (Ps != nullptr) and (not is_uks and not is_gks);
+  if( not is_rks and not is_uks and not is_gks )
+    GAUXC_GENERIC_EXCEPTION("Densities do not match RKS, UKS, or GKS schemes");
 
   if( not (allocated_terms.exx or allocated_terms.exc_vxc or allocated_terms.exc_grad or allocated_terms.den or allocated_terms.exx_ek_screening) ) 
     GAUXC_GENERIC_EXCEPTION("Density/Basis Not Stack Allocated");
 
-  const auto nbf    = global_dims.nbf;
-  if( ldp != (int)nbf ) GAUXC_GENERIC_EXCEPTION("LDP must bf NBF");
   if( not device_backend_ ) GAUXC_GENERIC_EXCEPTION("Invalid Device Backend");
 
-  // Copy Density
-  device_backend_->copy_async( nbf*nbf, P, static_stack.dmat_device, "P H2D" );
+
+  const auto nbf    = global_dims.nbf;
+  // Check dimensions and copy density
+  if( ldps != (int)nbf ) GAUXC_GENERIC_EXCEPTION("LDPs must bf NBF");
+  device_backend_->copy_async( nbf*nbf, Ps, static_stack.dmat_s_device, "P_scalar H2D" );
+  if( not is_rks ) {
+    if( ldpz != (int)nbf ) GAUXC_GENERIC_EXCEPTION("LDPz must bf NBF");
+    device_backend_->copy_async( nbf*nbf, Pz, static_stack.dmat_z_device, "P_z H2D" );
+    if( is_gks ) {
+      if( ldpy != (int)nbf ) GAUXC_GENERIC_EXCEPTION("LDPy must bf NBF");
+      if( ldpx != (int)nbf ) GAUXC_GENERIC_EXCEPTION("LDPx must bf NBF");
+      device_backend_->copy_async( nbf*nbf, Py, static_stack.dmat_y_device, "P_y H2D" );
+      device_backend_->copy_async( nbf*nbf, Px, static_stack.dmat_x_device, "P_x H2D" );
+    }
+  }
 
   // Copy Basis Set
   device_backend_->copy_async( basis.nshells(), basis.data(), static_stack.shells_device,
     "Shells H2D" );
 
-
   device_backend_->master_queue_synchronize(); 
 }
 
@@ -424,12 +461,15 @@ void XCDeviceStackData::zero_den_integrands() {
 }
 
 
-void XCDeviceStackData::zero_exc_vxc_integrands() {
+void XCDeviceStackData::zero_exc_vxc_integrands(integrator_term_tracker enabled_terms) {
 
   if( not device_backend_ ) GAUXC_GENERIC_EXCEPTION("Invalid Device Backend");
 
   const auto nbf = global_dims.nbf;
-  if(static_stack.vxc_device) device_backend_->set_zero( nbf*nbf, static_stack.vxc_device, "VXC Zero" );
+  if(static_stack.vxc_s_device) device_backend_->set_zero( nbf*nbf, static_stack.vxc_s_device, "VXCs Zero" );
+  if(static_stack.vxc_z_device) device_backend_->set_zero( nbf*nbf, static_stack.vxc_z_device, "VXCz Zero" );
+  if(static_stack.vxc_y_device) device_backend_->set_zero( nbf*nbf, static_stack.vxc_y_device, "VXCy Zero" );
+  if(static_stack.vxc_x_device) device_backend_->set_zero( nbf*nbf, static_stack.vxc_x_device, "VXCx Zero" );
   device_backend_->set_zero( 1,       static_stack.exc_device, "EXC Zero" );
   device_backend_->set_zero( 1,       static_stack.nel_device, "NEL Zero" );
 
@@ -466,21 +506,31 @@ void XCDeviceStackData::zero_exx_ek_screening_intermediates() {
 }
 
 
-
-
 void XCDeviceStackData::retrieve_exc_vxc_integrands( double* EXC, double* N_EL,
-  double* VXC, int32_t ldvxc ) {
+  double* VXCs, int32_t ldvxcs, double* VXCz, int32_t ldvxcz,
+  double* VXCy, int32_t ldvxcy, double* VXCx, int32_t ldvxcx ) {
 
   const auto nbf = global_dims.nbf;
-  if( ldvxc and ldvxc != (int)nbf ) GAUXC_GENERIC_EXCEPTION("LDVXC must bf NBF");
-  if( not device_backend_ ) GAUXC_GENERIC_EXCEPTION("Invalid Device Backend");
-  
-  if(VXC)
-  device_backend_->copy_async( nbf*nbf, static_stack.vxc_device, VXC,  "VXC D2H" );
 
   device_backend_->copy_async( 1,       static_stack.nel_device, N_EL, "NEL D2H" );
   device_backend_->copy_async( 1,       static_stack.exc_device, EXC,  "EXC D2H" );
 
+  if( ldvxcs and (ldvxcs != (int)nbf) ) GAUXC_GENERIC_EXCEPTION("LDVXCs must be NBF");
+  if( VXCs )
+    device_backend_->copy_async( nbf*nbf, static_stack.vxc_s_device, VXCs,  "VXCs D2H" );
+
+  if( ldvxcz and (ldvxcz != (int)nbf) ) GAUXC_GENERIC_EXCEPTION("LDVXCz must be NBF");
+  if( VXCz )
+    device_backend_->copy_async( nbf*nbf, static_stack.vxc_z_device, VXCz,  "VXCz D2H" );
+
+  if( ldvxcy and (ldvxcy != (int)nbf) ) GAUXC_GENERIC_EXCEPTION("LDVXCy must be NBF");
+  if( VXCy )
+    device_backend_->copy_async( nbf*nbf, static_stack.vxc_y_device, VXCy,  "VXCy D2H" );
+
+  if( ldvxcx and (ldvxcx != (int)nbf) ) GAUXC_GENERIC_EXCEPTION("LDVXCx must be NBF");
+  if( VXCx )
+    device_backend_->copy_async( nbf*nbf, static_stack.vxc_x_device, VXCx,  "VXCx D2H" );
+
 }
 
 void XCDeviceStackData::retrieve_den_integrands( double* N_EL ) {
@@ -580,26 +630,30 @@ size_t XCDeviceStackData::get_mem_req(
   const size_t npts  = points.size();
 
   required_term_storage reqt(terms);
+  
   size_t mem_req = 
     // Grid
-    reqt.grid_points_size (npts) * sizeof(double) + 
-    reqt.grid_weights_size(npts) * sizeof(double) +
+    reqt.grid_points_size (npts)  * sizeof(double) + 
+    reqt.grid_weights_size(npts)  * sizeof(double) +
 
     // U Variables
     reqt.grid_den_size(npts)      * sizeof(double) + 
     reqt.grid_den_grad_size(npts) * sizeof(double) +
     reqt.grid_den_lapl_size(npts) * sizeof(double) +
 
+    // H/K Matrices (GKS)
+    reqt.grid_HK_size(npts)       * sizeof(double) +
+
     // V Variables
-    reqt.grid_gamma_size(npts)  * sizeof(double) +
-    reqt.grid_tau_size(npts)    * sizeof(double) +
+    reqt.grid_gamma_size(npts)    * sizeof(double) +
+    reqt.grid_tau_size(npts)      * sizeof(double) +
 
     // XC output
-    reqt.grid_eps_size(npts)    * sizeof(double) +
-    reqt.grid_vrho_size(npts)   * sizeof(double) +
-    reqt.grid_vgamma_size(npts) * sizeof(double) +
-    reqt.grid_vtau_size(npts)   * sizeof(double) +
-    reqt.grid_vlapl_size(npts)  * sizeof(double) ;
+    reqt.grid_eps_size(npts)      * sizeof(double) +
+    reqt.grid_vrho_size(npts)     * sizeof(double) +
+    reqt.grid_vgamma_size(npts)   * sizeof(double) +
+    reqt.grid_vtau_size(npts)     * sizeof(double) +
+    reqt.grid_vlapl_size(npts)    * sizeof(double) ;
 
   return mem_req;
 }
@@ -630,7 +684,15 @@ XCDeviceStackData::device_buffer_t XCDeviceStackData::allocate_dynamic_stack(
   required_term_storage reqt(terms);
   const size_t msz = total_npts_task_batch;
   const size_t aln = 256;
+  
+  const bool is_rks = terms.ks_scheme == RKS;
+  const bool is_uks = terms.ks_scheme == UKS;
+  const bool is_gks = terms.ks_scheme == GKS;
+  const bool is_pol = is_uks or is_gks;
+  const bool is_gga = terms.xc_approx == GGA;
 
+  const bool is_den = terms.den;
+  
   // Grid Points
   if( reqt.grid_points ) {
     base_stack.points_x_device = mem.aligned_alloc<double>( msz, aln, csl);
@@ -638,21 +700,37 @@ XCDeviceStackData::device_buffer_t XCDeviceStackData::allocate_dynamic_stack(
     base_stack.points_z_device = mem.aligned_alloc<double>( msz, aln, csl);
   }
 
+
   // Grid Weights
   if( reqt.grid_weights ) {
     base_stack.weights_device = mem.aligned_alloc<double>(msz, csl);
   }
 
   // Grid function evaluations
-
   if( reqt.grid_den ) { // Density 
-    base_stack.den_eval_device = mem.aligned_alloc<double>(msz, aln, csl);
+    base_stack.den_s_eval_device     = mem.aligned_alloc<double>(msz, aln, csl);
+    if( is_pol )  {  base_stack.den_eval_device       = mem.aligned_alloc<double>(2*msz, aln, csl);
+                    base_stack.den_z_eval_device     = mem.aligned_alloc<double>(msz, aln, csl); 
+    if( is_gks ){   base_stack.den_y_eval_device     = mem.aligned_alloc<double>(msz, aln, csl); 
+                    base_stack.den_x_eval_device     = mem.aligned_alloc<double>(msz, aln, csl); }
+    }
   }
 
   if( reqt.grid_den_grad ) { // Density gradient
-    base_stack.den_x_eval_device = mem.aligned_alloc<double>(msz, aln, csl);
-    base_stack.den_y_eval_device = mem.aligned_alloc<double>(msz, aln, csl);
-    base_stack.den_z_eval_device = mem.aligned_alloc<double>(msz, aln, csl);
+                   base_stack.dden_sx_eval_device = mem.aligned_alloc<double>(msz, aln, csl);
+                   base_stack.dden_sy_eval_device = mem.aligned_alloc<double>(msz, aln, csl);
+                   base_stack.dden_sz_eval_device = mem.aligned_alloc<double>(msz, aln, csl); 
+
+    if( is_pol )  { base_stack.dden_zx_eval_device = mem.aligned_alloc<double>(msz, aln, csl);
+                   base_stack.dden_zy_eval_device = mem.aligned_alloc<double>(msz, aln, csl); 
+                   base_stack.dden_zz_eval_device = mem.aligned_alloc<double>(msz, aln, csl); 
+    if( is_gks ) { base_stack.dden_yx_eval_device = mem.aligned_alloc<double>(msz, aln, csl);
+                   base_stack.dden_yy_eval_device = mem.aligned_alloc<double>(msz, aln, csl); 
+                   base_stack.dden_yz_eval_device = mem.aligned_alloc<double>(msz, aln, csl); 
+                   base_stack.dden_xx_eval_device = mem.aligned_alloc<double>(msz, aln, csl);
+                   base_stack.dden_xy_eval_device = mem.aligned_alloc<double>(msz, aln, csl); 
+                   base_stack.dden_xz_eval_device = mem.aligned_alloc<double>(msz, aln, csl); }
+    }
   }
 
   if( reqt.grid_den_lapl ) { // Density Laplacian
@@ -660,23 +738,45 @@ XCDeviceStackData::device_buffer_t XCDeviceStackData::allocate_dynamic_stack(
   }
 
   if( reqt.grid_gamma ) { // Gamma
-    base_stack.gamma_eval_device = mem.aligned_alloc<double>(msz, aln, csl);
+    if( is_pol  ) {  base_stack.gamma_eval_device    = mem.aligned_alloc<double>(3 * msz, aln, csl);
+                    base_stack.gamma_pp_eval_device = mem.aligned_alloc<double>(msz, aln, csl);
+                    base_stack.gamma_pm_eval_device = mem.aligned_alloc<double>(msz, aln, csl);
+                    base_stack.gamma_mm_eval_device = mem.aligned_alloc<double>(msz, aln, csl); }
+    else            base_stack.gamma_eval_device = mem.aligned_alloc<double>(msz, aln, csl);
   }
 
-  if( reqt.grid_tau ) { // Tau 
-    base_stack.tau_eval_device = mem.aligned_alloc<double>(msz, aln, csl);
+  if( reqt.grid_vrho ) { // Vrho
+    if( is_pol  ) { base_stack.vrho_eval_device = mem.aligned_alloc<double>(2 * msz, aln, csl);
+                   base_stack.vrho_pos_eval_device = mem.aligned_alloc<double>(msz, aln, csl);
+                   base_stack.vrho_neg_eval_device = mem.aligned_alloc<double>(msz, aln, csl); }
+    else           base_stack.vrho_eval_device = mem.aligned_alloc<double>(msz, aln, csl);
   }
 
-  if( reqt.grid_eps ) { // Energy density 
-    base_stack.eps_eval_device = mem.aligned_alloc<double>(msz, aln, csl);
+  if( reqt.grid_vgamma ) { // Vgamma
+    if( is_pol  ) {  base_stack.vgamma_eval_device    = mem.aligned_alloc<double>(3*msz, aln, csl);
+                    base_stack.vgamma_pp_eval_device = mem.aligned_alloc<double>(msz, aln, csl);
+                    base_stack.vgamma_pm_eval_device = mem.aligned_alloc<double>(msz, aln, csl);
+                    base_stack.vgamma_mm_eval_device = mem.aligned_alloc<double>(msz, aln, csl); }
+    else            base_stack.vgamma_eval_device    = mem.aligned_alloc<double>(msz, aln, csl);
   }
 
-  if( reqt.grid_vrho ) { // Vrho
-    base_stack.vrho_eval_device = mem.aligned_alloc<double>(msz, aln, csl);
+  if( reqt.grid_tau ) { // Tau 
+    base_stack.tau_eval_device = mem.aligned_alloc<double>(msz, aln, csl);
   }
 
-  if( reqt.grid_vgamma ) { // Vgamma
-    base_stack.vgamma_eval_device = mem.aligned_alloc<double>(msz, aln, csl);
+  if( is_gks ) {       // H, K matrices
+    base_stack.K_x_eval_device   = mem.aligned_alloc<double>(msz, aln, csl);
+    base_stack.K_y_eval_device   = mem.aligned_alloc<double>(msz, aln, csl);
+    base_stack.K_z_eval_device   = mem.aligned_alloc<double>(msz, aln, csl);
+    if( is_gga ) {
+      base_stack.H_x_eval_device   = mem.aligned_alloc<double>(msz, aln, csl);
+      base_stack.H_y_eval_device   = mem.aligned_alloc<double>(msz, aln, csl);
+      base_stack.H_z_eval_device   = mem.aligned_alloc<double>(msz, aln, csl);
+    }
+  }
+
+  if( reqt.grid_eps ) { // Energy density 
+    base_stack.eps_eval_device = mem.aligned_alloc<double>(msz, aln, csl);
   }
 
   if( reqt.grid_vtau ) { // Vtau
@@ -687,11 +787,13 @@ XCDeviceStackData::device_buffer_t XCDeviceStackData::allocate_dynamic_stack(
     base_stack.vlapl_eval_device = mem.aligned_alloc<double>(msz, aln, csl);
   }
 
+
+
   // Update dynmem data for derived impls
   return device_buffer_t{ mem.stack(), mem.nleft() };
 }
 
-void XCDeviceStackData::pack_and_send( integrator_term_tracker ,
+void XCDeviceStackData::pack_and_send( integrator_term_tracker terms,
   host_task_iterator task_begin, host_task_iterator task_end, const BasisSetMap& ) {
 
   if( not device_backend_ ) GAUXC_GENERIC_EXCEPTION("Invalid Device Backend");
diff --git a/src/xc_integrator/xc_data/device/xc_device_stack_data.hpp b/src/xc_integrator/xc_data/device/xc_device_stack_data.hpp
index af0b2b0b..e1a72ec4 100644
--- a/src/xc_integrator/xc_data/device/xc_device_stack_data.hpp
+++ b/src/xc_integrator/xc_data/device/xc_device_stack_data.hpp
@@ -47,13 +47,11 @@ struct XCDeviceStackData : public XCDeviceData {
     Shell<double>* shells_device = nullptr; ///< Array of static basis shells (nshells)
     PrimitivePair<double>* prim_pairs_device = nullptr;
 
-    double* dmat_device   = nullptr; ///< Static density matrix storage (nbf,nbf)
     double* rab_device    = nullptr; ///< Static RAB matrix storage (*,natoms)
     double* coords_device = nullptr; ///< Static atomic positions (3 * natoms)
 
     double* exc_device     = nullptr;  ///< EXC storage (1)
     double* nel_device     = nullptr;  ///< N_EL storage (1)
-    double* vxc_device     = nullptr;  ///< VXC storage (nbf,nbf)
     double* exx_k_device   = nullptr;  ///< EXX K storage (nbf,nbf)
     double* acc_scr_device = nullptr;  ///< Accumulaion scratch (1)
     double* exc_grad_device = nullptr; ///< EXC Gradient storage (3*natoms)
@@ -66,6 +64,15 @@ struct XCDeviceStackData : public XCDeviceData {
     int32_t* shell_to_bf_device = nullptr;
     int32_t* shell_sizes_device = nullptr;
 
+    double* dmat_s_device   = nullptr;  ///< Static density matrix storage (nbf,nbf)
+    double* dmat_z_device   = nullptr;  /// Ditto for Z,Y,X densities
+    double* dmat_y_device   = nullptr;
+    double* dmat_x_device   = nullptr;
+    double* vxc_s_device    = nullptr;  ///< VXC storage (nbf, nbf)
+    double* vxc_z_device    = nullptr;  /// Ditto for Z,Y,X densities
+    double* vxc_y_device    = nullptr;
+    double* vxc_x_device    = nullptr;
+
     inline void reset() { std::memset( this, 0, sizeof(static_data) ); }
   };
 
@@ -85,10 +92,28 @@ struct XCDeviceStackData : public XCDeviceData {
     double* weights_device = nullptr; ///< Grid weights for task batch
 
     // U variables
-    double* den_eval_device      = nullptr; ///< density for task batch
-    double* den_x_eval_device    = nullptr; ///< d/dx density for task batch
-    double* den_y_eval_device    = nullptr; ///< d/dy density for task batch
-    double* den_z_eval_device    = nullptr; ///< d/dz density for task batch
+    double* den_s_eval_device   = nullptr; ///< scalar density for task batch
+    double* dden_sx_eval_device = nullptr; ///< d/dx scalar density for task batch
+    double* dden_sy_eval_device = nullptr; ///< d/dy scalar density for task batch
+    double* dden_sz_eval_device = nullptr; ///< d/dz scalar density for task batch
+    
+    double* den_z_eval_device   = nullptr; ///< z density for task batch
+    double* dden_zx_eval_device = nullptr; ///< d/dx z density for task batch
+    double* dden_zy_eval_device = nullptr; ///< d/dy z density for task batch
+    double* dden_zz_eval_device = nullptr; ///< d/dz z density for task batch
+
+    double* den_y_eval_device   = nullptr; ///< y density for task batch
+    double* dden_yx_eval_device = nullptr; ///< d/dx y density for task batch
+    double* dden_yy_eval_device = nullptr; ///< d/dy y density for task batch
+    double* dden_yz_eval_device = nullptr; ///< d/dz y density for task batch
+
+    double* den_x_eval_device   = nullptr; ///< x density for task batch
+    double* dden_xx_eval_device = nullptr; ///< d/dx x density for task batch
+    double* dden_xy_eval_device = nullptr; ///< d/dy x density for task batch
+    double* dden_xz_eval_device = nullptr; ///< d/dz x density for task batch
+    
+    double* den_eval_device     = nullptr; /// Storage for interleaved density (non-RKS only)
+
     double* den_lapl_eval_device = nullptr; ///< density Laplacian for task batch
 
     // V variables / XC output
@@ -100,6 +125,23 @@ struct XCDeviceStackData : public XCDeviceData {
     double* vtau_eval_device   = nullptr; ///< Tau XC derivative for task batch
     double* vlapl_eval_device  = nullptr; ///< Lapl XC derivative for task batch
 
+    double* vrho_pos_eval_device  = nullptr;  ///< Polarized Rho+ XC derivative for task batch
+    double* vrho_neg_eval_device  = nullptr;  ///< Polarized Rho+ XC derivative for task batch
+
+    double* gamma_pp_eval_device  = nullptr;  ///< Polarized Gamma++ for task batch
+    double* gamma_pm_eval_device  = nullptr;  ///< Polarized Gamma+- for task batch
+    double* gamma_mm_eval_device  = nullptr;  ///< Polarized Gamma-- for task batch
+    double* vgamma_pp_eval_device  = nullptr; ///< Polarized Gamma++ XC derivative for task batch
+    double* vgamma_pm_eval_device  = nullptr; ///< Polarized Gamma+- XC derivative for task batch
+    double* vgamma_mm_eval_device  = nullptr; ///< Polarized Gamma-- XC derivative for task batch
+
+    double* H_x_eval_device     = nullptr;    ///< norm(m) dependent GGA X transformation factor for task batch
+    double* H_y_eval_device     = nullptr;    ///< norm(m) dependent GGA Y transformation factor for task batch
+    double* H_z_eval_device     = nullptr;    ///< norm(m) dependent GGA Z transformation factor for task batch
+    double* K_x_eval_device     = nullptr;    ///< norm(m) dependent LDA X transformation factor for task batch
+    double* K_y_eval_device     = nullptr;    ///< norm(m) dependent LDA Y transformation factor for task batch
+    double* K_z_eval_device     = nullptr;    ///< norm(m) dependent LDA Z transformation factor for task batch
+
     inline void reset() { std::memset( this, 0, sizeof(base_stack_data) ); }
   };
 
@@ -118,31 +160,36 @@ struct XCDeviceStackData : public XCDeviceData {
   host_task_iterator generate_buffers( integrator_term_tracker, const BasisSetMap&,
     host_task_iterator, host_task_iterator) override final;
   void allocate_static_data_weights( int32_t natoms ) override final;
-  void allocate_static_data_exc_vxc( int32_t nbf, int32_t nshells, bool do_vxc ) override final;
+  void allocate_static_data_exc_vxc( int32_t nbf, int32_t nshells, integrator_term_tracker enabled_terms, bool do_vxc ) override final;
   void allocate_static_data_den( int32_t nbf, int32_t nshells ) override final;
   void allocate_static_data_exc_grad( int32_t nbf, int32_t nshells, int32_t natoms ) override final;
   void allocate_static_data_exx( int32_t nbf, int32_t nshells, size_t nshell_pairs, size_t nprim_pair_total, int32_t max_l ) override final;
   void allocate_static_data_exx_ek_screening( size_t ntasks, int32_t nbf, int32_t nshells, int nshell_pairs, int32_t max_l ) override final;
   void send_static_data_weights( const Molecule& mol, const MolMeta& meta ) override final;
-  void send_static_data_density_basis( const double* P, int32_t ldp, 
+  void send_static_data_density_basis( const double* Ps, int32_t ldps, const double* Pz, int32_t ldpz,
+                                        const double* Py, int32_t ldpy, const double* Px, int32_t ldpx,
     const BasisSet<double>& basis ) override final;
   void send_static_data_shell_pairs( const BasisSet<double>&, const ShellPairCollection<double>& ) 
     override final;
   void send_static_data_exx_ek_screening( const double* V_max, int32_t ldv, const BasisSetMap&, const ShellPairCollection<double>& ) override final;
   void zero_den_integrands() override final;
-  void zero_exc_vxc_integrands() override final;
+  void zero_exc_vxc_integrands(integrator_term_tracker t) override final;
   void zero_exc_grad_integrands() override final;
   void zero_exx_integrands() override final;
   void zero_exx_ek_screening_intermediates() override final;
   void retrieve_exc_vxc_integrands( double* EXC, double* N_EL,
-    double* VXC, int32_t ldvxc ) override final;
+    double* VXCscalar, int32_t ldvxcscalar, double* VXCz, int32_t ldvxcz,
+    double* VXCy     , int32_t ldvxcy     , double* VXCx, int32_t ldvxcx ) override final;
   void retrieve_exc_grad_integrands( double* EXC_GRAD, double* N_EL ) override final;
   void retrieve_den_integrands( double* N_EL ) override final;
   void retrieve_exx_integrands( double* K, int32_t ldk ) override final;
   void retrieve_exx_ek_max_bfn_sum( double* MBS, int32_t nt) override final;
   void copy_weights_to_tasks( host_task_iterator task_begin, host_task_iterator task_end ) override final;
 
-  double* vxc_device_data() override;
+  double* vxc_s_device_data() override;
+  double* vxc_z_device_data() override;
+  double* vxc_y_device_data() override;
+  double* vxc_x_device_data() override;
   double* exc_device_data() override;
   double* nel_device_data() override;
   double* exx_k_device_data() override;
diff --git a/src/xc_integrator/xc_data/device/xc_device_task.hpp b/src/xc_integrator/xc_data/device/xc_device_task.hpp
index 8b071f90..696ef185 100644
--- a/src/xc_integrator/xc_data/device/xc_device_task.hpp
+++ b/src/xc_integrator/xc_data/device/xc_device_task.hpp
@@ -50,17 +50,54 @@ struct XCDeviceTask {
   double*   d2bfyy    = nullptr;
   double*   d2bfyz    = nullptr;
   double*   d2bfzz    = nullptr;
-  double*   d2bflapl    = nullptr;
-  double*   den     = nullptr;
-  double*   ddenx   = nullptr;
-  double*   ddeny   = nullptr;
-  double*   ddenz   = nullptr;
-  double*   denlapl     = nullptr;
   double*   eps     = nullptr;
-  double*   gamma   = nullptr;
-  double*   tau   = nullptr;
+
+  double* den       = nullptr;
+  double* gamma     = nullptr;
   double*   vrho    = nullptr;
   double*   vgamma  = nullptr;
+    
+  // (S,Z,Y,X) densities
+  double* den_s     = nullptr;
+  double* den_z     = nullptr;
+  double* den_y     = nullptr;
+  double* den_x     = nullptr;
+  // Del(S,Z,Y,X) Gradients
+  double* dden_sx   = nullptr;
+  double* dden_sy   = nullptr;
+  double* dden_sz   = nullptr;
+  double* dden_zx   = nullptr;
+  double* dden_zy   = nullptr;
+  double* dden_zz   = nullptr;
+  double* dden_yx   = nullptr;
+  double* dden_yy   = nullptr;
+  double* dden_yz   = nullptr;
+  double* dden_xx   = nullptr;
+  double* dden_xy   = nullptr;
+  double* dden_xz   = nullptr;
+  
+  // 2C U vars
+  double* vrho_pos  = nullptr;
+  double* vrho_neg  = nullptr;
+  double* gamma_pp  = nullptr;
+  double* gamma_pm  = nullptr;
+  double* gamma_mm  = nullptr;
+  double* vgamma_pp  = nullptr;
+  double* vgamma_pm  = nullptr;
+  double* vgamma_mm  = nullptr;
+
+  // GKS K,H matrices
+  double* K_z        = nullptr;
+  double* K_y        = nullptr;
+  double* K_x        = nullptr;
+  double* H_z        = nullptr;
+  double* H_y        = nullptr;
+  double* H_x        = nullptr;
+
+  // MGGA
+  double*   d2bflapl    = nullptr;
+  double*   denlapl     = nullptr;
+  double*   tau   = nullptr;
   double*   vtau  = nullptr;
   double*   vlapl  = nullptr;
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 2c76faf6..d3881e91 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+# GauXC Copyright (c) 2020-2024, The Regents of the University of California,
 # through Lawrence Berkeley National Laboratory (subject to receipt of
 # any required approvals from the U.S. Dept. of Energy). All rights reserved.
 #
diff --git a/tests/basis/parse_basis.cxx b/tests/basis/parse_basis.cxx
index 7afb61d1..aef5d612 100644
--- a/tests/basis/parse_basis.cxx
+++ b/tests/basis/parse_basis.cxx
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/basis/parse_basis.hpp b/tests/basis/parse_basis.hpp
index 914e24a1..5815584c 100644
--- a/tests/basis/parse_basis.hpp
+++ b/tests/basis/parse_basis.hpp
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/basisset_test.cxx b/tests/basisset_test.cxx
index fd116dba..d8aec9d0 100644
--- a/tests/basisset_test.cxx
+++ b/tests/basisset_test.cxx
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/cmake/discovery/CMakeLists.txt b/tests/cmake/discovery/CMakeLists.txt
index 7ba3e00d..3a03749f 100644
--- a/tests/cmake/discovery/CMakeLists.txt
+++ b/tests/cmake/discovery/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+# GauXC Copyright (c) 2020-2024, The Regents of the University of California,
 # through Lawrence Berkeley National Laboratory (subject to receipt of
 # any required approvals from the U.S. Dept. of Energy). All rights reserved.
 #
diff --git a/tests/cmake/discovery/gauxc_link_tester.cxx b/tests/cmake/discovery/gauxc_link_tester.cxx
index d4eabe39..2ba40e22 100644
--- a/tests/cmake/discovery/gauxc_link_tester.cxx
+++ b/tests/cmake/discovery/gauxc_link_tester.cxx
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/cmake/subproject/CMakeLists.txt b/tests/cmake/subproject/CMakeLists.txt
index af6cc42a..0bbf72b9 100644
--- a/tests/cmake/subproject/CMakeLists.txt
+++ b/tests/cmake/subproject/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+# GauXC Copyright (c) 2020-2024, The Regents of the University of California,
 # through Lawrence Berkeley National Laboratory (subject to receipt of
 # any required approvals from the U.S. Dept. of Energy). All rights reserved.
 #
diff --git a/tests/cmake/subproject/gauxc_link_tester.cxx b/tests/cmake/subproject/gauxc_link_tester.cxx
index d4eabe39..2ba40e22 100644
--- a/tests/cmake/subproject/gauxc_link_tester.cxx
+++ b/tests/cmake/subproject/gauxc_link_tester.cxx
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/collocation.cxx b/tests/collocation.cxx
index 1ab9aafc..fb8a0393 100644
--- a/tests/collocation.cxx
+++ b/tests/collocation.cxx
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/collocation_common.hpp b/tests/collocation_common.hpp
index a8cf3bc1..91baa780 100644
--- a/tests/collocation_common.hpp
+++ b/tests/collocation_common.hpp
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/collocation_cuda.hpp b/tests/collocation_cuda.hpp
index 929fe47c..42ae0eb4 100644
--- a/tests/collocation_cuda.hpp
+++ b/tests/collocation_cuda.hpp
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/collocation_hip.hpp b/tests/collocation_hip.hpp
index 286dd09f..c6314aac 100644
--- a/tests/collocation_hip.hpp
+++ b/tests/collocation_hip.hpp
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/collocation_host.hpp b/tests/collocation_host.hpp
index fbe68a1c..a64ce8ee 100644
--- a/tests/collocation_host.hpp
+++ b/tests/collocation_host.hpp
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/conv_cereal_to_hdf5.cxx b/tests/conv_cereal_to_hdf5.cxx
index 7cb60381..d717cf63 100644
--- a/tests/conv_cereal_to_hdf5.cxx
+++ b/tests/conv_cereal_to_hdf5.cxx
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/eigen3_matrix_serialization.hpp b/tests/eigen3_matrix_serialization.hpp
index 9ed8a6ef..38537170 100644
--- a/tests/eigen3_matrix_serialization.hpp
+++ b/tests/eigen3_matrix_serialization.hpp
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/grid_opt.cxx b/tests/grid_opt.cxx
index 8d465a0e..ced74d20 100644
--- a/tests/grid_opt.cxx
+++ b/tests/grid_opt.cxx
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/grid_test.cxx b/tests/grid_test.cxx
index a0a0d37d..4e8782e6 100644
--- a/tests/grid_test.cxx
+++ b/tests/grid_test.cxx
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/ini_input.cxx b/tests/ini_input.cxx
index 2fc71a32..972eeaaa 100644
--- a/tests/ini_input.cxx
+++ b/tests/ini_input.cxx
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/ini_input.hpp b/tests/ini_input.hpp
index 00071a92..bd0de809 100644
--- a/tests/ini_input.hpp
+++ b/tests/ini_input.hpp
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/load_balancer_test.cxx b/tests/load_balancer_test.cxx
index 599367f9..1c55f3e8 100644
--- a/tests/load_balancer_test.cxx
+++ b/tests/load_balancer_test.cxx
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/molgrid_test.cxx b/tests/molgrid_test.cxx
index 9b657aae..191b8e7b 100644
--- a/tests/molgrid_test.cxx
+++ b/tests/molgrid_test.cxx
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/moltypes_test.cxx b/tests/moltypes_test.cxx
index 32c08376..f63dce91 100644
--- a/tests/moltypes_test.cxx
+++ b/tests/moltypes_test.cxx
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/runtime.cxx b/tests/runtime.cxx
index 189d5a34..1f8b6933 100644
--- a/tests/runtime.cxx
+++ b/tests/runtime.cxx
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/standalone_driver.cxx b/tests/standalone_driver.cxx
index 2b4ffc07..efc9ce59 100644
--- a/tests/standalone_driver.cxx
+++ b/tests/standalone_driver.cxx
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/standards.cxx b/tests/standards.cxx
index 3aa63c86..6ca6473a 100644
--- a/tests/standards.cxx
+++ b/tests/standards.cxx
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/standards.hpp b/tests/standards.hpp
index 89e57035..a9db6759 100644
--- a/tests/standards.hpp
+++ b/tests/standards.hpp
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/ut_common.hpp.in b/tests/ut_common.hpp.in
index e7590d6c..628ef5f4 100644
--- a/tests/ut_common.hpp.in
+++ b/tests/ut_common.hpp.in
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/ut_main.cxx b/tests/ut_main.cxx
index 972d23d0..0ccd3be1 100644
--- a/tests/ut_main.cxx
+++ b/tests/ut_main.cxx
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/weights.cxx b/tests/weights.cxx
index 7028d31e..a56df0fc 100644
--- a/tests/weights.cxx
+++ b/tests/weights.cxx
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/weights_cuda.hpp b/tests/weights_cuda.hpp
index c332435b..bd1561a5 100644
--- a/tests/weights_cuda.hpp
+++ b/tests/weights_cuda.hpp
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/weights_generate.hpp b/tests/weights_generate.hpp
index 9b5b65d5..7d586a8b 100644
--- a/tests/weights_generate.hpp
+++ b/tests/weights_generate.hpp
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/weights_hip.hpp b/tests/weights_hip.hpp
index 0db42a48..d5ad45f7 100644
--- a/tests/weights_hip.hpp
+++ b/tests/weights_hip.hpp
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/weights_host.hpp b/tests/weights_host.hpp
index 22707fcf..e78694d8 100644
--- a/tests/weights_host.hpp
+++ b/tests/weights_host.hpp
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
diff --git a/tests/xc_integrator.cxx b/tests/xc_integrator.cxx
index 7d458dff..88527fa8 100644
--- a/tests/xc_integrator.cxx
+++ b/tests/xc_integrator.cxx
@@ -1,5 +1,5 @@
 /**
- * GauXC Copyright (c) 2020-2023, The Regents of the University of California,
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
  * any required approvals from the U.S. Dept. of Energy). All rights reserved.
  *
@@ -125,9 +125,7 @@ void test_xc_integrator( ExecutionSpace ex, const RuntimeEnvironment& rt,
     }
   }
 
-  if(uks and ex == ExecutionSpace::Device) return;
-  if(gks and ex == ExecutionSpace::Device) return;
-
+  if( (uks or gks) and ex == ExecutionSpace::Device and func.is_mgga() ) return;
 
   for( auto& sh : basis ) 
     sh.set_shell_tolerance( std::numeric_limits<double>::epsilon() );
@@ -302,19 +300,23 @@ void test_integrator(std::string reference_file, functional_type& func, PruningS
 
     #ifdef GAUXC_HAS_MAGMA
     SECTION( "Incore - MPI Reduction - MAGMA" ) {
-      test_xc_integrator( ExecutionSpace::Device, rt,
-        reference_file, func, pruning_scheme,
-        false, true, check_k, "Default", "Default", 
-        "Scheme1-MAGMA" );
+      if(not func.is_mgga() and not func.is_polarized()) {
+        test_xc_integrator( ExecutionSpace::Device, rt,
+          reference_file, func, pruning_scheme,
+          false, true, check_k, "Default", "Default", 
+          "Scheme1-MAGMA" );
+      }
     }
     #endif
 
     #ifdef GAUXC_HAS_CUTLASS
     SECTION( "Incore - MPI Reduction - CUTLASS" ) {
-      test_xc_integrator( ExecutionSpace::Device, rt, 
-        reference_file, func, pruning_scheme,
-        false, true, false, "Default", "Default", 
-        "Scheme1-CUTLASS" );
+      if(not func.is_mgga() and not func.is_polarized()) {
+        test_xc_integrator( ExecutionSpace::Device, rt, 
+          reference_file, func, pruning_scheme,
+          false, true, false, "Default", "Default", 
+          "Scheme1-CUTLASS" );
+      }
     }
     #endif