diff --git a/include/gauxc/exceptions.hpp b/include/gauxc/exceptions.hpp index 42b6b7fc..ac16bfbd 100644 --- a/include/gauxc/exceptions.hpp +++ b/include/gauxc/exceptions.hpp @@ -34,7 +34,7 @@ class magma_exception; class cutlass_exception; #endif -/// C++ Excpetion for genertic GauXC errors +/// C++ Exception for generic GauXC errors class generic_gauxc_exception : public std::exception { std::string file_; diff --git a/src/xc_integrator/integrator_util/exx_screening.cxx b/src/xc_integrator/integrator_util/exx_screening.cxx index dfbfea06..ccd716eb 100644 --- a/src/xc_integrator/integrator_util/exx_screening.cxx +++ b/src/xc_integrator/integrator_util/exx_screening.cxx @@ -255,7 +255,7 @@ void exx_ek_screening( device_data.reset_allocations(); device_data.allocate_static_data_exx_ek_screening( ntasks, nbf, nshells, shpairs.npairs(), basis_map.max_l() ); - device_data.send_static_data_density_basis( P_abs, ldp, basis ); + device_data.send_static_data_density_basis( P_abs, ldp, nullptr, 0, nullptr, 0, nullptr, 0, basis ); device_data.send_static_data_exx_ek_screening( V_shell_max, ldv, basis_map, shpairs ); diff --git a/src/xc_integrator/local_work_driver/device/common/device_blas.hpp b/src/xc_integrator/local_work_driver/device/common/device_blas.hpp index 2ef12537..e590bb90 100644 --- a/src/xc_integrator/local_work_driver/device/common/device_blas.hpp +++ b/src/xc_integrator/local_work_driver/device/common/device_blas.hpp @@ -63,5 +63,6 @@ void syr2k( device_blas_handle handle, int M, int K, T ALPHA, const T* A, int LDA, const T* B, int LDB, T BETA, T* C, int LDC ); + } diff --git a/src/xc_integrator/local_work_driver/device/common/uvvars.hpp b/src/xc_integrator/local_work_driver/device/common/uvvars.hpp index 3bc7a447..78ce5324 100644 --- a/src/xc_integrator/local_work_driver/device/common/uvvars.hpp +++ b/src/xc_integrator/local_work_driver/device/common/uvvars.hpp @@ -7,20 +7,23 @@ */ #pragma once #include "device/xc_device_task.hpp" +#include "device/xc_device_data.hpp" #include "device/device_queue.hpp" namespace GauXC { -void eval_uvvars_lda( size_t ntasks, int32_t nbe_max, int32_t npts_max, + +void eval_uvars_lda( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, XCDeviceTask* device_tasks, device_queue queue ); -void eval_uvvars_gga( size_t ntasks, size_t npts_total, int32_t nbe_max, - int32_t npts_max, XCDeviceTask* device_tasks, const double* denx, - const double* deny, const double* denz, double* gamma, device_queue queue ); +void eval_uvars_gga( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, + XCDeviceTask* device_tasks, device_queue queue ); -void eval_uvvars_mgga( size_t ntasks, size_t npts_total, int32_t nbe_max, - int32_t npts_max, XCDeviceTask* device_tasks, const double* denx, - const double* deny, const double* denz, double* gamma, bool do_lapl, +void eval_uvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max, + int32_t npts_max, bool do_lapl, XCDeviceTask* device_tasks, device_queue queue ); +void eval_vvar( size_t ntasks, int32_t nbf_max, int32_t npts_max, bool do_grad, density_id den_select, + XCDeviceTask* device_tasks, device_queue queue ); + } diff --git a/src/xc_integrator/local_work_driver/device/common/zmat_vxc.hpp b/src/xc_integrator/local_work_driver/device/common/zmat_vxc.hpp index 2900f42b..514b6d38 100644 --- a/src/xc_integrator/local_work_driver/device/common/zmat_vxc.hpp +++ b/src/xc_integrator/local_work_driver/device/common/zmat_vxc.hpp @@ -6,6 +6,7 @@ * See LICENSE.txt for details */ #include "device/xc_device_task.hpp" +#include "device/xc_device_data.hpp" #include "device/device_queue.hpp" namespace GauXC { @@ -14,12 +15,16 @@ void zmat_lda_vxc( size_t ntasks, int32_t max_nbf, int32_t max_npts, XCDeviceTask* tasks_device, + integrator_ks_scheme s, + density_id sel, device_queue queue ); void zmat_gga_vxc( size_t ntasks, int32_t max_nbf, int32_t max_npts, XCDeviceTask* tasks_device, + integrator_ks_scheme s, + density_id sel, device_queue queue ); void zmat_mgga_vxc( size_t ntasks, diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_device_template.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_device_template.cu index bdfc8724..62557401 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_device_template.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_device_template.cu @@ -478,6 +478,88 @@ void eval_collocation_shell_to_task_hessian( } +uint32_t max_threads_shell_to_task_collocation_laplacian( int32_t l, bool pure ) { + if( pure ) { + switch(l) { + case 0: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_0 );\ + $for( L in range(1, L_max + 1) ) + case $(L): return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_laplacian_$(L) ); + $endfor + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = $(L_max)"); + } + } else { + switch(l) {\ + $for( L in range(L_max + 1) ) + case $(L): return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_$(L) );\ + $endfor + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = $(L_max)"); + } + } + return 0; +} + + + + + +template +void dispatch_shell_to_task_collocation_laplacian( cudaStream_t stream, int32_t l, + bool pure, uint32_t ntask_average, uint32_t nshells, Args&&... args ) { + + dim3 threads = max_threads_shell_to_task_collocation(l,pure); + int nwarp_per_block = threads.x / cuda::warp_size; + int n_task_blocks = util::div_ceil( ntask_average, nwarp_per_block ); + dim3 block(n_task_blocks, 1, nshells); + + if( pure ) { + switch(l) { + case 0: + collocation_device_shell_to_task_kernel_cartesian_laplacian_0<<>>( nshells, std::forward(args)... ); + break; + $for( L in range(1, L_max + 1) ) + case $(L): + collocation_device_shell_to_task_kernel_spherical_laplacian_$(L)<<>>( nshells, std::forward(args)... ); + break;\ + $endfor + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = $(L_max)"); + } + } else { + switch(l) {\ + $for( L in range(0, L_max + 1) ) + case $(L): + collocation_device_shell_to_task_kernel_cartesian_laplacian_$(L)<<>>( nshells, std::forward(args)... ); + break;\ + $endfor + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = $(L_max)"); + } + } + +} + + + +void eval_collocation_shell_to_task_laplacian( + uint32_t max_l, + AngularMomentumShellToTaskBatch* l_batched_shell_to_task, + XCDeviceTask* device_tasks, + device_queue queue +) { + + cudaStream_t stream = queue.queue_as() ; + + for( auto l = 0u; l <= max_l; ++l ) { + auto pure = l_batched_shell_to_task[l].pure; + auto shell_to_task_device = l_batched_shell_to_task[l].shell_to_task_device; + auto nshells = l_batched_shell_to_task[l].nshells_in_batch; + auto ntask_average = std::max(1ul, l_batched_shell_to_task[l].ntask_average); + dispatch_shell_to_task_collocation_laplacian( stream, l, pure, + ntask_average, nshells, shell_to_task_device, device_tasks ); + auto stat = cudaGetLastError(); + GAUXC_CUDA_ERROR("LAP", stat); + } + + +} diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_shell_to_task_kernels_template.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_shell_to_task_kernels_template.hpp index f1a62f95..544554b3 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_shell_to_task_kernels_template.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_shell_to_task_kernels_template.hpp @@ -19,6 +19,10 @@ #include "collocation/collocation_shell_to_task_kernels_cartesian_l$(L)_hessian.hpp"\ $endfor +$for( L in range(L_max + 1)) +#include "collocation/collocation_shell_to_task_kernels_cartesian_l$(L)_laplacian.hpp"\ +$endfor + $for( L in range(L_max + 1)) #include "collocation/collocation_shell_to_task_kernels_spherical_l$(L).hpp"\ $endfor @@ -30,3 +34,7 @@ $for( L in range(L_max + 1)) #include "collocation/collocation_shell_to_task_kernels_spherical_l$(L)_hessian.hpp"\ $endfor + +$for( L in range(L_max + 1)) +#include "collocation/collocation_shell_to_task_kernels_spherical_l$(L)_laplacian.hpp"\ +$endfor diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/cublas_extensions.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/cublas_extensions.cu index f421c4f3..ee7c7746 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/cublas_extensions.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/cublas_extensions.cu @@ -181,5 +181,6 @@ void syr2k( device_blas_handle generic_handle, } + } diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/increment_exc_grad.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/increment_exc_grad.cu index 242c73e4..4dd328bd 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/increment_exc_grad.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/increment_exc_grad.cu @@ -165,9 +165,9 @@ __global__ __launch_bounds__(512,1) void increment_exc_grad_gga_kernel( const auto* __restrict__ vrho = task->vrho; const auto* __restrict__ vgamma = task->vgamma; - const auto* __restrict__ den_x = task->ddenx; - const auto* __restrict__ den_y = task->ddeny; - const auto* __restrict__ den_z = task->ddenz; + const auto* __restrict__ den_x = task->dden_sx; + const auto* __restrict__ den_y = task->dden_sy; + const auto* __restrict__ den_z = task->dden_sz; #pragma unroll 1 for( uint32_t ipt = threadIdx.x % cuda::warp_size; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu index c42d2182..6aea5225 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu @@ -10,10 +10,20 @@ #include "device_specific/cuda_device_constants.hpp" #include #include "device_specific/cuda_util.hpp" +#include "device/xc_device_data.hpp" namespace GauXC { -__global__ void eval_uvars_lda_kernel( size_t ntasks, +#define VVAR_KERNEL_SM_BLOCK 32 +#define GGA_KERNEL_SM_WARPS 16 +#define MGGA_KERNEL_SM_BLOCK 32 + +__global__ void eval_uvars_lda_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + // eval_vvars populated uvar storage already in the case of LDA+RKS + return; +} + +__global__ void eval_uvars_lda_uks_kernel( size_t ntasks, XCDeviceTask* tasks_device ) { const int batch_idx = blockIdx.z; @@ -22,131 +32,275 @@ __global__ void eval_uvars_lda_kernel( size_t ntasks, auto& task = tasks_device[ batch_idx ]; const auto npts = task.npts; - const auto nbf = task.bfn_screening.nbe; - auto* den_eval_device = task.den; + auto* den_pos_eval_device = task.den_s; + auto* den_neg_eval_device = task.den_z; - const auto* basis_eval_device = task.bf; - const auto* den_basis_prod_device = task.zmat; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - register double den_reg = 0.; + if( tid < npts ) { + const auto ps = den_pos_eval_device[ tid ]; + const auto pz = den_neg_eval_device[ tid ]; + den_pos_eval_device[ tid ] = 0.5*(ps + pz); + den_neg_eval_device[ tid ] = 0.5*(ps - pz); - if( tid_x < nbf and tid_y < npts ) { + } +} - const double* bf_col = basis_eval_device + tid_x*npts; - const double* db_col = den_basis_prod_device + tid_x*npts; +__global__ void eval_uvars_lda_gks_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { - den_reg = bf_col[ tid_y ] * db_col[ tid_y ]; + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; - } + auto& task = tasks_device[ batch_idx ]; - // Warp blocks are stored col major - constexpr auto warp_size = cuda::warp_size; - //constexpr auto max_warps_per_thread_block = cuda::max_warps_per_thread_block; - den_reg = cuda::warp_reduce_sum( den_reg ); + const auto npts = task.npts; + auto* den_z_eval_device = task.den_s; + auto* den_s_eval_device = task.den_z; + auto* den_y_eval_device = task.den_y; + auto* den_x_eval_device = task.den_x; + auto* K_z_eval_device = task.K_z; + auto* K_y_eval_device = task.K_y; + auto* K_x_eval_device = task.K_x; + const double dtolsq = 1e-24; // TODO: make variable - if( threadIdx.x == 0 and tid_y < npts ) { - atomicAdd( den_eval_device + tid_y, den_reg ); - } + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + + + if( tid < npts ) { + const auto ps = den_s_eval_device[ tid ]; + const auto pz = den_z_eval_device[ tid ]; + const auto py = den_y_eval_device[ tid ]; + const auto px = den_x_eval_device[ tid ]; + const auto mtemp = pz*pz + px*px + py*py; + double mnorm = 0.; + if (mtemp > dtolsq) { + const double inv_mnorm = rsqrt(mtemp); + mnorm = 1./inv_mnorm; + K_z_eval_device[ tid ] = pz * inv_mnorm; + K_y_eval_device[ tid ] = py * inv_mnorm; + K_x_eval_device[ tid ] = px * inv_mnorm; + } + else { + mnorm = (1. / 3.) * (px + py + pz); + K_z_eval_device[ tid ] = 1. / 3.; + K_y_eval_device[ tid ] = 1. / 3.; + K_x_eval_device[ tid ] = 1. / 3.; + } + + den_s_eval_device[ tid ] = 0.5*(ps + mnorm); + den_z_eval_device[ tid ] = 0.5*(ps - mnorm); + } } +__global__ void eval_uvars_gga_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + const auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + + const auto* dden_sx_eval_device = task.dden_sx; + const auto* dden_sy_eval_device = task.dden_sy; + const auto* dden_sz_eval_device = task.dden_sz; + auto* gamma_eval_device = task.gamma; -#define GGA_KERNEL_SM_BLOCK_Y 32 + const int tid = threadIdx.x + blockIdx.x * blockDim.x; -__global__ void eval_uvars_gga_kernel( size_t ntasks, - XCDeviceTask* tasks_device ) { + if( tid < npts ) { + const double dx = dden_sx_eval_device[ tid ]; + const double dy = dden_sy_eval_device[ tid ]; + const double dz = dden_sz_eval_device[ tid ]; - constexpr auto warp_size = cuda::warp_size; - //constexpr auto max_warps_per_thread_block = cuda::max_warps_per_thread_block; + gamma_eval_device[ tid ] = dx*dx + dy*dy + dz*dz; + + } + +} + +__global__ void eval_uvars_gga_uks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { const int batch_idx = blockIdx.z; if( batch_idx >= ntasks ) return; - auto& task = tasks_device[ batch_idx ]; - + const auto& task = tasks_device[ batch_idx ]; const auto npts = task.npts; - const auto nbf = task.bfn_screening.nbe; - auto* den_eval_device = task.den; - auto* den_x_eval_device = task.ddenx; - auto* den_y_eval_device = task.ddeny; - auto* den_z_eval_device = task.ddenz; + auto* den_pos_eval_device = task.den_s; + const auto* den_pos_x_eval_device = task.dden_sx; + const auto* den_pos_y_eval_device = task.dden_sy; + const auto* den_pos_z_eval_device = task.dden_sz; - const auto* basis_eval_device = task.bf; - const auto* dbasis_x_eval_device = task.dbfx; - const auto* dbasis_y_eval_device = task.dbfy; - const auto* dbasis_z_eval_device = task.dbfz; + auto* den_neg_eval_device = task.den_z; + const auto* den_neg_x_eval_device = task.dden_zx; + const auto* den_neg_y_eval_device = task.dden_zy; + const auto* den_neg_z_eval_device = task.dden_zz; - const auto* den_basis_prod_device = task.zmat; + auto* gamma_pp_eval_device = task.gamma_pp; + auto* gamma_pm_eval_device = task.gamma_pm; + auto* gamma_mm_eval_device = task.gamma_mm; - __shared__ double den_shared[4][warp_size][GGA_KERNEL_SM_BLOCK_Y+1]; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; - for ( int bid_x = blockIdx.x * blockDim.x; - bid_x < nbf; - bid_x += blockDim.x * gridDim.x ) { - - for ( int bid_y = blockIdx.y * GGA_KERNEL_SM_BLOCK_Y; - bid_y < npts; - bid_y += GGA_KERNEL_SM_BLOCK_Y * gridDim.y ) { - - for (int sm_y = threadIdx.y; sm_y < GGA_KERNEL_SM_BLOCK_Y; sm_y += blockDim.y) { - den_shared[0][threadIdx.x][sm_y] = 0.; - den_shared[1][threadIdx.x][sm_y] = 0.; - den_shared[2][threadIdx.x][sm_y] = 0.; - den_shared[3][threadIdx.x][sm_y] = 0.; + if( tid < npts ) { + const double ps = den_pos_eval_device[ tid ]; + const double pz = den_neg_eval_device[ tid ]; + const double dndx = den_pos_x_eval_device[ tid ]; + const double dndy = den_pos_y_eval_device[ tid ]; + const double dndz = den_pos_z_eval_device[ tid ]; + const double dMzdx = den_neg_x_eval_device[ tid ]; + const double dMzdy = den_neg_y_eval_device[ tid ]; + const double dMzdz = den_neg_z_eval_device[ tid ]; + + // (del n).(del n) + const auto dn_sq = dndx*dndx + dndy*dndy + dndz*dndz; + // (del Mz).(del Mz) + const auto dMz_sq = dMzdx*dMzdx + dMzdy*dMzdy + dMzdz*dMzdz; + // (del n).(del Mz) + const auto dn_dMz = dndx*dMzdx + dndy*dMzdy + dndz*dMzdz; + + gamma_pp_eval_device[ tid ] = 0.25*(dn_sq + dMz_sq) + 0.5*dn_dMz; + gamma_pm_eval_device[ tid ] = 0.25*(dn_sq - dMz_sq); + gamma_mm_eval_device[ tid ] = 0.25*(dn_sq + dMz_sq) - 0.5*dn_dMz; + + den_pos_eval_device[ tid ] = 0.5*(ps + pz); + den_neg_eval_device[ tid ] = 0.5*(ps - pz); + } - if (bid_y + threadIdx.x < npts and bid_x + sm_y < nbf) { - const double* db_col = den_basis_prod_device + (bid_x + sm_y)*npts; - const double* bf_col = basis_eval_device + (bid_x + sm_y)*npts; - const double* bf_x_col = dbasis_x_eval_device + (bid_x + sm_y)*npts; - const double* bf_y_col = dbasis_y_eval_device + (bid_x + sm_y)*npts; - const double* bf_z_col = dbasis_z_eval_device + (bid_x + sm_y)*npts; +} - den_shared[0][threadIdx.x][sm_y] = bf_col [ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; - den_shared[1][threadIdx.x][sm_y] = bf_x_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; - den_shared[2][threadIdx.x][sm_y] = bf_y_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; - den_shared[3][threadIdx.x][sm_y] = bf_z_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; - } - } - __syncthreads(); +__global__ void eval_uvars_gga_gks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; - for (int sm_y = threadIdx.y; sm_y < GGA_KERNEL_SM_BLOCK_Y; sm_y += blockDim.y) { - const int tid_y = bid_y + sm_y; - register double den_reg = den_shared[0][sm_y][threadIdx.x]; - register double dx_reg = den_shared[1][sm_y][threadIdx.x]; - register double dy_reg = den_shared[2][sm_y][threadIdx.x]; - register double dz_reg = den_shared[3][sm_y][threadIdx.x]; + const auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; - // Warp blocks are stored col major - den_reg = cuda::warp_reduce_sum( den_reg ); - dx_reg = 2 * cuda::warp_reduce_sum( dx_reg ); - dy_reg = 2 * cuda::warp_reduce_sum( dy_reg ); - dz_reg = 2 * cuda::warp_reduce_sum( dz_reg ); + auto* den_s_eval_device = task.den_s; + const auto* dden_sx_eval_device = task.dden_sx; + const auto* dden_sy_eval_device = task.dden_sy; + const auto* dden_sz_eval_device = task.dden_sz; + auto* den_z_eval_device = task.den_z; + const auto* dden_zx_eval_device = task.dden_zx; + const auto* dden_zy_eval_device = task.dden_zy; + const auto* dden_zz_eval_device = task.dden_zz; - if( threadIdx.x == 0 and tid_y < npts ) { - atomicAdd( den_eval_device + tid_y, den_reg ); - atomicAdd( den_x_eval_device + tid_y, dx_reg ); - atomicAdd( den_y_eval_device + tid_y, dy_reg ); - atomicAdd( den_z_eval_device + tid_y, dz_reg ); - } - } - __syncthreads(); + const auto* den_y_eval_device = task.den_y; + const auto* dden_yx_eval_device = task.dden_yx; + const auto* dden_yy_eval_device = task.dden_yy; + const auto* dden_yz_eval_device = task.dden_yz; + + const auto* den_x_eval_device = task.den_x; + const auto* dden_xx_eval_device = task.dden_xx; + const auto* dden_xy_eval_device = task.dden_xy; + const auto* dden_xz_eval_device = task.dden_xz; + + auto* gamma_pp_eval_device = task.gamma_pp; + auto* gamma_pm_eval_device = task.gamma_pm; + auto* gamma_mm_eval_device = task.gamma_mm; + + auto* H_z_eval_device = task.H_z; + auto* H_y_eval_device = task.H_y; + auto* H_x_eval_device = task.H_x; + auto* K_z_eval_device = task.K_z; + auto* K_y_eval_device = task.K_y; + auto* K_x_eval_device = task.K_x; + + const double dtolsq = 1e-24; // TODO: make variable + + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if( tid < npts ) { + const double dndz = dden_sz_eval_device[ tid ]; + const double dndy = dden_sy_eval_device[ tid ]; + const double dndx = dden_sx_eval_device[ tid ]; + + const double dMzdz = dden_zz_eval_device[ tid ]; + const double dMzdy = dden_zy_eval_device[ tid ]; + const double dMzdx = dden_zx_eval_device[ tid ]; + + const double dMydz = dden_yz_eval_device[ tid ]; + const double dMydy = dden_yy_eval_device[ tid ]; + const double dMydx = dden_yx_eval_device[ tid ]; + + const double dMxdz = dden_xz_eval_device[ tid ]; + const double dMxdy = dden_xy_eval_device[ tid ]; + const double dMxdx = dden_xx_eval_device[ tid ]; + + const auto ps = den_s_eval_device[ tid ]; + const auto pz = den_z_eval_device[ tid ]; + const auto py = den_y_eval_device[ tid ]; + const auto px = den_x_eval_device[ tid ]; + + const auto mtemp = pz*pz + px*px + py*py; + double mnorm = 0.; + + const auto dels_dot_dels = dndx * dndx + dndy * dndy + dndz * dndz; + const auto delz_dot_delz = dMzdx * dMzdx + dMzdy * dMzdy + dMzdz * dMzdz; + const auto delx_dot_delx = dMxdx * dMxdx + dMxdy * dMxdy + dMxdz * dMxdz; + const auto dely_dot_dely = dMydx * dMydx + dMydy * dMydy + dMydz * dMydz; + + const auto dels_dot_delz = dndx * dMzdx + dndy * dMzdy + dndz * dMzdz; + const auto dels_dot_delx = dndx * dMxdx + dndy * dMxdy + dndz * dMxdz; + const auto dels_dot_dely = dndx * dMydx + dndy * dMydy + dndz * dMydz; + + const auto sum = delz_dot_delz + delx_dot_delx + dely_dot_dely; + const auto s_sum = + dels_dot_delz * pz + dels_dot_delx * px + dels_dot_dely * py; + + const auto inv_sqsum2 = + rsqrt(dels_dot_delz * dels_dot_delz + dels_dot_delx * dels_dot_delx + + dels_dot_dely * dels_dot_dely); + const auto sqsum2 = 1./inv_sqsum2; + + double sign = 1.; + if( signbit(s_sum)) + sign = -1.; + + + if (mtemp > dtolsq) { + const double inv_mnorm = rsqrt(mtemp); + mnorm = 1./inv_mnorm; + K_z_eval_device[ tid ] = pz * inv_mnorm; + K_y_eval_device[ tid ] = py * inv_mnorm; + K_x_eval_device[ tid ] = px * inv_mnorm; + H_z_eval_device[ tid ] = sign * dels_dot_delz * inv_sqsum2; + H_y_eval_device[ tid ] = sign * dels_dot_dely * inv_sqsum2; + H_x_eval_device[ tid ] = sign * dels_dot_delx * inv_sqsum2; + } + else { + mnorm = (1. / 3.) * (px + py + pz); + K_z_eval_device[ tid ] = 1. / 3.; + K_y_eval_device[ tid ] = 1. / 3.; + K_x_eval_device[ tid ] = 1. / 3.; + + H_z_eval_device[ tid ] = sign / 3.; + H_y_eval_device[ tid ] = sign / 3.; + H_x_eval_device[ tid ] = sign / 3.; } + + gamma_pp_eval_device[ tid ] = 0.25*(dels_dot_dels + sum) + 0.5*sign*sqsum2; + gamma_pm_eval_device[ tid ] = 0.25*(dels_dot_dels - sum); + gamma_mm_eval_device[ tid ] = 0.25*(dels_dot_dels + sum) - 0.5*sign*sqsum2; + + den_s_eval_device[ tid ] = 0.5*(ps + mnorm); + den_z_eval_device[ tid ] = 0.5*(ps - mnorm); + } + } template -__global__ void eval_uvars_mgga_kernel( size_t ntasks, +__global__ void eval_uvars_mgga_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device ) { constexpr auto warp_size = cuda::warp_size; @@ -184,17 +338,17 @@ __global__ void eval_uvars_mgga_kernel( size_t ntasks, den_basis_prod_device = task.zmat; } - __shared__ double den_shared[3+!!need_lapl][warp_size][GGA_KERNEL_SM_BLOCK_Y+1]; + __shared__ double den_shared[3+!!need_lapl][warp_size][MGGA_KERNEL_SM_BLOCK+1]; for ( int bid_x = blockIdx.x * blockDim.x; bid_x < nbf; bid_x += blockDim.x * gridDim.x ) { - for ( int bid_y = blockIdx.y * GGA_KERNEL_SM_BLOCK_Y; + for ( int bid_y = blockIdx.y * MGGA_KERNEL_SM_BLOCK; bid_y < npts; - bid_y += GGA_KERNEL_SM_BLOCK_Y * gridDim.y ) { + bid_y += MGGA_KERNEL_SM_BLOCK * gridDim.y ) { - for (int sm_y = threadIdx.y; sm_y < GGA_KERNEL_SM_BLOCK_Y; sm_y += blockDim.y) { + for (int sm_y = threadIdx.y; sm_y < MGGA_KERNEL_SM_BLOCK; sm_y += blockDim.y) { den_shared[0][threadIdx.x][sm_y] = 0.; den_shared[1][threadIdx.x][sm_y] = 0.; den_shared[2][threadIdx.x][sm_y] = 0.; @@ -226,7 +380,7 @@ __global__ void eval_uvars_mgga_kernel( size_t ntasks, __syncthreads(); - for (int sm_y = threadIdx.y; sm_y < GGA_KERNEL_SM_BLOCK_Y; sm_y += blockDim.y) { + for (int sm_y = threadIdx.y; sm_y < MGGA_KERNEL_SM_BLOCK; sm_y += blockDim.y) { const int tid_y = bid_y + sm_y; register double tx_reg = den_shared[0][sm_y][threadIdx.x]; @@ -258,93 +412,280 @@ __global__ void eval_uvars_mgga_kernel( size_t ntasks, } -__global__ void eval_vvars_gga_kernel( - size_t npts, - const double* den_x_eval_device, - const double* den_y_eval_device, - const double* den_z_eval_device, - double* gamma_eval_device -) { +#define EVAL_UVARS_KERNEL(xc_approx) \ + cudaStream_t stream = queue.queue_as(); \ + dim3 blocks( util::div_ceil( npts_max, threads.x ), \ + 1, \ + ntasks ); \ + switch ( ks_scheme ) { \ + case RKS: \ + eval_uvars_##xc_approx##_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); \ + break; \ + case UKS: \ + eval_uvars_##xc_approx##_uks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); \ + break; \ + case GKS: \ + eval_uvars_##xc_approx##_gks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); \ + break; \ + default: \ + GAUXC_GENERIC_EXCEPTION( "Unexpected KS scheme when attempting to evaluate UV vars" ); \ + } + +void eval_uvars_lda( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, + XCDeviceTask* device_tasks, device_queue queue ) { + dim3 threads( cuda::max_warps_per_thread_block * cuda::warp_size, 1, 1 ); + EVAL_UVARS_KERNEL(lda); +} - const int tid = threadIdx.x + blockIdx.x * blockDim.x; - if( tid < npts ) { - const double dx = den_x_eval_device[ tid ]; - const double dy = den_y_eval_device[ tid ]; - const double dz = den_z_eval_device[ tid ]; - gamma_eval_device[tid] = dx*dx + dy*dy + dz*dz; +void eval_uvars_gga( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, + XCDeviceTask* device_tasks, device_queue queue ) { + dim3 threads( GGA_KERNEL_SM_WARPS * cuda::warp_size, 1, 1 ); + EVAL_UVARS_KERNEL(gga); +} + + +void eval_uvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max, + int32_t npts_max, bool do_lapl, XCDeviceTask* device_tasks, + device_queue queue ) { + // TODO: This interface should be unified with the lda/gga interfaces + cudaStream_t stream = queue.queue_as(); + + // U Variables + { + dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block / 2, 1 ); + dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )), + std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )), + ntasks ); + if(do_lapl) + eval_uvars_mgga_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + else + eval_uvars_mgga_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); } + // V variables (GAMMA) + dim3 threads( cuda::max_threads_per_thread_block ); + dim3 blocks( util::div_ceil( npts_total, threads.x ), + 1, + ntasks ); + eval_uvars_gga_rks_kernel <<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); } -void eval_uvvars_lda( size_t ntasks, int32_t nbf_max, int32_t npts_max, - XCDeviceTask* device_tasks, device_queue queue ) { - cudaStream_t stream = queue.queue_as(); - dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block, 1 ); - dim3 blocks( util::div_ceil( nbf_max, threads.x ), - util::div_ceil( npts_max, threads.y ), - ntasks ); - eval_uvars_lda_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + +template +__global__ void eval_vvar_grad_kern( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + + double* den_eval_device = nullptr; + double* den_x_eval_device = nullptr; + double* den_y_eval_device = nullptr; + double* den_z_eval_device = nullptr; + + constexpr auto warp_size = cuda::warp_size; + + if constexpr (den_select == DEN_S) { + den_eval_device = task.den_s; + den_x_eval_device = task.dden_sx; + den_y_eval_device = task.dden_sy; + den_z_eval_device = task.dden_sz; + } + if constexpr (den_select == DEN_Z) { + den_eval_device = task.den_z; + den_x_eval_device = task.dden_zx; + den_y_eval_device = task.dden_zy; + den_z_eval_device = task.dden_zz; + } + if constexpr (den_select == DEN_Y) { + den_eval_device = task.den_y; + den_x_eval_device = task.dden_yx; + den_y_eval_device = task.dden_yy; + den_z_eval_device = task.dden_yz; + } + if constexpr (den_select == DEN_X) { + den_eval_device = task.den_x; + den_x_eval_device = task.dden_xx; + den_y_eval_device = task.dden_xy; + den_z_eval_device = task.dden_xz; + } + + const auto* basis_eval_device = task.bf; + const auto* dbasis_x_eval_device = task.dbfx; + const auto* dbasis_y_eval_device = task.dbfy; + const auto* dbasis_z_eval_device = task.dbfz; + + const auto* den_basis_prod_device = task.zmat; + + __shared__ double den_shared[4][warp_size][VVAR_KERNEL_SM_BLOCK+1]; + + for ( int bid_x = blockIdx.x * blockDim.x; + bid_x < nbf; + bid_x += blockDim.x * gridDim.x ) { + + for ( int bid_y = blockIdx.y * VVAR_KERNEL_SM_BLOCK; + bid_y < npts; + bid_y += VVAR_KERNEL_SM_BLOCK * gridDim.y ) { + + for (int sm_y = threadIdx.y; sm_y < VVAR_KERNEL_SM_BLOCK; sm_y += blockDim.y) { + den_shared[0][threadIdx.x][sm_y] = 0.; + den_shared[1][threadIdx.x][sm_y] = 0.; + den_shared[2][threadIdx.x][sm_y] = 0.; + den_shared[3][threadIdx.x][sm_y] = 0.; + + if (bid_y + threadIdx.x < npts and bid_x + sm_y < nbf) { + const double* db_col = den_basis_prod_device + (bid_x + sm_y)*npts; + const double* bf_col = basis_eval_device + (bid_x + sm_y)*npts; + const double* bf_x_col = dbasis_x_eval_device + (bid_x + sm_y)*npts; + const double* bf_y_col = dbasis_y_eval_device + (bid_x + sm_y)*npts; + const double* bf_z_col = dbasis_z_eval_device + (bid_x + sm_y)*npts; + + den_shared[0][threadIdx.x][sm_y] = bf_col [ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; + den_shared[1][threadIdx.x][sm_y] = bf_x_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; + den_shared[2][threadIdx.x][sm_y] = bf_y_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; + den_shared[3][threadIdx.x][sm_y] = bf_z_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; + } + } + __syncthreads(); + + + for (int sm_y = threadIdx.y; sm_y < VVAR_KERNEL_SM_BLOCK; sm_y += blockDim.y) { + const int tid_y = bid_y + sm_y; + register double den_reg = den_shared[0][sm_y][threadIdx.x]; + register double dx_reg = den_shared[1][sm_y][threadIdx.x]; + register double dy_reg = den_shared[2][sm_y][threadIdx.x]; + register double dz_reg = den_shared[3][sm_y][threadIdx.x]; + + // Warp blocks are stored col major + den_reg = cuda::warp_reduce_sum( den_reg ); + dx_reg = 2. * cuda::warp_reduce_sum( dx_reg ); + dy_reg = 2. * cuda::warp_reduce_sum( dy_reg ); + dz_reg = 2. * cuda::warp_reduce_sum( dz_reg ); + + + if( threadIdx.x == 0 and tid_y < npts ) { + atomicAdd( den_eval_device + tid_y, den_reg ); + atomicAdd( den_x_eval_device + tid_y, dx_reg ); + atomicAdd( den_y_eval_device + tid_y, dy_reg ); + atomicAdd( den_z_eval_device + tid_y, dz_reg ); + } + } + __syncthreads(); + } + } } -void eval_uvvars_gga( size_t ntasks, size_t npts_total, int32_t nbf_max, - int32_t npts_max, XCDeviceTask* device_tasks, const double* denx, - const double* deny, const double* denz, double* gamma, device_queue queue ) { - cudaStream_t stream = queue.queue_as(); +template +__global__ void eval_vvar_kern( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + + double* den_eval_device = nullptr; + // use the "U" variable (+/- for UKS) even though at this point the density (S/Z) is stored + if constexpr (den_select == DEN_S) den_eval_device = task.den_s; + if constexpr (den_select == DEN_Z) den_eval_device = task.den_z; + if constexpr (den_select == DEN_Y) den_eval_device = task.den_y; + if constexpr (den_select == DEN_X) den_eval_device = task.den_x; + + const auto* basis_eval_device = task.bf; + + const auto* den_basis_prod_device = task.zmat; + + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + + register double den_reg = 0.; + + if( tid_x < nbf and tid_y < npts ) { + + const double* bf_col = basis_eval_device + tid_x*npts; + const double* db_col = den_basis_prod_device + tid_x*npts; + + den_reg = bf_col[ tid_y ] * db_col[ tid_y ]; - // U Variables - { - dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block / 2, 1 ); - dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )), - std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )), - ntasks ); - eval_uvars_gga_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); } - // V variables - dim3 threads( cuda::max_threads_per_thread_block ); - dim3 blocks( util::div_ceil( npts_total, threads.x ) ); - eval_vvars_gga_kernel<<< blocks, threads, 0, stream >>>( - npts_total, denx, deny, denz, gamma - ); + // Warp blocks are stored col major + constexpr auto warp_size = cuda::warp_size; + //constexpr auto max_warps_per_thread_block = cuda::max_warps_per_thread_block; + den_reg = cuda::warp_reduce_sum( den_reg ); + + + if( threadIdx.x == 0 and tid_y < npts ) { + atomicAdd( den_eval_device + tid_y, den_reg ); + } + + } -void eval_uvvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max, - int32_t npts_max, XCDeviceTask* device_tasks, const double* denx, - const double* deny, const double* denz, double* gamma, bool do_lapl, - device_queue queue ) { - cudaStream_t stream = queue.queue_as(); - // U Variables - { - dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block / 2, 1 ); - dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )), - std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )), - ntasks ); - eval_uvars_gga_kernel <<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); - if(do_lapl) - eval_uvars_mgga_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); - else - eval_uvars_mgga_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + +void eval_vvar( size_t ntasks, int32_t nbf_max, int32_t npts_max, bool do_grad, density_id den_select, + XCDeviceTask* device_tasks, device_queue queue ) { + + cudaStream_t stream = queue.queue_as(); + dim3 threads; + dim3 blocks; + if( do_grad ) { + threads = dim3( cuda::warp_size, cuda::max_warps_per_thread_block / 2, 1 ); + blocks = dim3( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )), + std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )), + ntasks ); + } else { + threads = dim3( cuda::warp_size, cuda::max_warps_per_thread_block, 1 ); + blocks = dim3( util::div_ceil( nbf_max, threads.x ), + util::div_ceil( npts_max, threads.y ), + ntasks ); + } + switch( den_select ) { + case DEN_S: + if (do_grad) eval_vvar_grad_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + else eval_vvar_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + break; + case DEN_Z: + if (do_grad) eval_vvar_grad_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + else eval_vvar_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + break; + case DEN_Y: + if (do_grad) eval_vvar_grad_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + else eval_vvar_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + break; + case DEN_X: + if (do_grad) eval_vvar_grad_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + else eval_vvar_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + break; + default: + GAUXC_GENERIC_EXCEPTION( "eval_vvar called with improper density selected" ); } - // V variables (GAMMA) - dim3 threads( cuda::max_threads_per_thread_block ); - dim3 blocks( util::div_ceil( npts_total, threads.x ) ); - eval_vvars_gga_kernel<<< blocks, threads, 0, stream >>>( - npts_total, denx, deny, denz, gamma - ); } + + + + } diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/zmat_vxc.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/zmat_vxc.cu index 5c7851e6..616e0bd5 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/zmat_vxc.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/zmat_vxc.cu @@ -13,7 +13,7 @@ namespace GauXC { -__global__ void zmat_lda_vxc_kernel( size_t ntasks, +__global__ void zmat_lda_vxc_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device ) { const int batch_idx = blockIdx.z; @@ -45,34 +45,93 @@ __global__ void zmat_lda_vxc_kernel( size_t ntasks, -void zmat_lda_vxc( size_t ntasks, - int32_t max_nbf, - int32_t max_npts, - XCDeviceTask* tasks_device, - device_queue queue ) { - cudaStream_t stream = queue.queue_as() ; - dim3 threads(cuda::warp_size,cuda::max_warps_per_thread_block,1); - dim3 blocks( util::div_ceil( max_npts, threads.x ), - util::div_ceil( max_nbf, threads.y ), - ntasks ); - zmat_lda_vxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + + + + + +template +__global__ void zmat_lda_vxc_uks_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + const double* vrho_pos_device = task.vrho_pos; + const double* vrho_neg_device = task.vrho_neg; + + + const auto* basis_eval_device = task.bf; + + + auto* z_matrix_device = task.zmat; + + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + + if( tid_x < npts and tid_y < nbf ) { + + const size_t ibfoff = tid_y * npts + tid_x; + const double factp = 0.5 * vrho_pos_device[tid_x]; + const double factm = 0.5 * vrho_neg_device[tid_x]; + double sign = 1.0; + if constexpr ( den_selector == DEN_Z ) sign = -1.0; + + z_matrix_device[ ibfoff ] = 0.5*(factp * basis_eval_device[ ibfoff ] + sign * factm * basis_eval_device[ ibfoff ]); + } } +template +__global__ void zmat_lda_vxc_gks_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + const double* vrho_pos_device = task.vrho_pos; + const double* vrho_neg_device = task.vrho_neg; + double* K_device; + if constexpr ( den_selector == DEN_Z ) K_device = task.K_z; + if constexpr ( den_selector == DEN_Y ) K_device = task.K_y; + if constexpr ( den_selector == DEN_X ) K_device = task.K_x; + + const auto* basis_eval_device = task.bf; + auto* z_matrix_device = task.zmat; + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + if( tid_x < npts and tid_y < nbf ) { + const size_t ibfoff = tid_y * npts + tid_x; + const double factp = 0.5 * vrho_pos_device[tid_x]; + const double factm = 0.5 * vrho_neg_device[tid_x]; + if constexpr ( den_selector == DEN_S ) { + z_matrix_device[ ibfoff ] = 0.5*(factp * basis_eval_device[ ibfoff ] + factm * basis_eval_device[ ibfoff ]); + } + else { + const double factk = 0.5 * (factp - factm); + z_matrix_device[ ibfoff ] = K_device[ ibfoff ] * factk * basis_eval_device[ ibfoff ]; + } + } +} @@ -82,7 +141,7 @@ void zmat_lda_vxc( size_t ntasks, -__global__ void zmat_gga_vxc_kernel( size_t ntasks, +__global__ void zmat_gga_vxc_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device ) { const int batch_idx = blockIdx.z; @@ -93,9 +152,9 @@ __global__ void zmat_gga_vxc_kernel( size_t ntasks, const auto nbf = task.bfn_screening.nbe; const auto* vrho_device = task.vrho; const auto* vgamma_device = task.vgamma; - const auto* den_x_eval_device = task.ddenx; - const auto* den_y_eval_device = task.ddeny; - const auto* den_z_eval_device = task.ddenz; + const auto* den_x_eval_device = task.dden_sx; + const auto* den_y_eval_device = task.dden_sy; + const auto* den_z_eval_device = task.dden_sz; const auto* basis_eval_device = task.bf; const auto* dbasis_x_eval_device = task.dbfx; @@ -123,28 +182,219 @@ __global__ void zmat_gga_vxc_kernel( size_t ntasks, } } -void zmat_gga_vxc( size_t ntasks, - int32_t max_nbf, - int32_t max_npts, - XCDeviceTask* tasks_device, - device_queue queue ) { - cudaStream_t stream = queue.queue_as() ; - dim3 threads(cuda::warp_size,cuda::max_warps_per_thread_block,1); - dim3 blocks( util::div_ceil( max_npts, threads.x ), - util::div_ceil( max_nbf, threads.y ), - ntasks ); - zmat_gga_vxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + + +template +__global__ void zmat_gga_vxc_uks_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + + const double* vrho_pos_device = task.vrho_pos; + const double* vrho_neg_device = task.vrho_neg; + const double* vgamma_pp_device = task.vgamma_pp; + const double* vgamma_pm_device = task.vgamma_pm; + const double* vgamma_mm_device = task.vgamma_mm; + + const auto* den_pos_x_eval_device = task.dden_sx; + const auto* den_pos_y_eval_device = task.dden_sy; + const auto* den_pos_z_eval_device = task.dden_sz; + const auto* den_neg_x_eval_device = task.dden_zx; + const auto* den_neg_y_eval_device = task.dden_zy; + const auto* den_neg_z_eval_device = task.dden_zz; + + + const auto* basis_eval_device = task.bf; + const auto* dbasis_x_eval_device = task.dbfx; + const auto* dbasis_y_eval_device = task.dbfy; + const auto* dbasis_z_eval_device = task.dbfz; + + auto* z_matrix_device = task.zmat; + + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + + if( tid_x < npts and tid_y < nbf ) { + + const size_t ibfoff = tid_y * npts + tid_x; + + const double factp = 0.25 * vrho_pos_device[tid_x]; + const double factm = 0.25 * vrho_neg_device[tid_x]; + + const auto gga_fact_pp = vgamma_pp_device[tid_x]; + const auto gga_fact_pm = vgamma_pm_device[tid_x]; + const auto gga_fact_mm = vgamma_mm_device[tid_x]; + + const auto gga_fact_1 = 0.5*(gga_fact_pp + gga_fact_pm + gga_fact_mm); + const auto gga_fact_2 = 0.5*(gga_fact_pp - gga_fact_mm); + const auto gga_fact_3 = 0.5*(gga_fact_pp - gga_fact_pm + gga_fact_mm); + + double sign = 1.0; + + double x_fact, y_fact, z_fact; + + if constexpr ( den_selector == DEN_S ) { + x_fact = gga_fact_1 * den_pos_x_eval_device[ tid_x ] + gga_fact_2 * den_neg_x_eval_device[ tid_x ]; + y_fact = gga_fact_1 * den_pos_y_eval_device[ tid_x ] + gga_fact_2 * den_neg_y_eval_device[ tid_x ]; + z_fact = gga_fact_1 * den_pos_z_eval_device[ tid_x ] + gga_fact_2 * den_neg_z_eval_device[ tid_x ]; + + + } + if constexpr ( den_selector == DEN_Z ) { + sign = -1.0; + x_fact = gga_fact_3 * den_neg_x_eval_device[ tid_x ] + gga_fact_2 * den_pos_x_eval_device[ tid_x ]; + y_fact = gga_fact_3 * den_neg_y_eval_device[ tid_x ] + gga_fact_2 * den_pos_y_eval_device[ tid_x ]; + z_fact = gga_fact_3 * den_neg_z_eval_device[ tid_x ] + gga_fact_2 * den_pos_z_eval_device[ tid_x ]; + + } + + z_matrix_device[ ibfoff ] = x_fact * dbasis_x_eval_device[ ibfoff ] + + y_fact * dbasis_y_eval_device[ ibfoff ] + + z_fact * dbasis_z_eval_device[ ibfoff ] + + (factp + sign * factm) * basis_eval_device[ ibfoff ]; + } } - + + +template +__global__ void zmat_gga_vxc_gks_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + + const double* vrho_pos_device = task.vrho_pos; + const double* vrho_neg_device = task.vrho_neg; + const double* vgamma_pp_device = task.vgamma_pp; + const double* vgamma_pm_device = task.vgamma_pm; + const double* vgamma_mm_device = task.vgamma_mm; + + + // for non-DEN_S + double* K_device; + double* H_device; + if constexpr ( den_selector == DEN_Z ) { K_device = task.K_z; H_device = task.H_z; } + if constexpr ( den_selector == DEN_Y ) { K_device = task.K_y; H_device = task.H_y; } + if constexpr ( den_selector == DEN_X ) { K_device = task.K_x; H_device = task.H_x; } + + const auto* dden_sx_eval_device = task.dden_sx; + const auto* dden_sy_eval_device = task.dden_sy; + const auto* dden_sz_eval_device = task.dden_sz; + const auto* dden_zx_eval_device = task.dden_zx; + const auto* dden_zy_eval_device = task.dden_zy; + const auto* dden_zz_eval_device = task.dden_zz; + const auto* dden_yx_eval_device = task.dden_yx; + const auto* dden_yy_eval_device = task.dden_yy; + const auto* dden_yz_eval_device = task.dden_yz; + const auto* dden_xx_eval_device = task.dden_xx; + const auto* dden_xy_eval_device = task.dden_xy; + const auto* dden_xz_eval_device = task.dden_xz; + + + const auto* basis_eval_device = task.bf; + const auto* dbasis_x_eval_device = task.dbfx; + const auto* dbasis_y_eval_device = task.dbfy; + const auto* dbasis_z_eval_device = task.dbfz; + + auto* z_matrix_device = task.zmat; + + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + + if( tid_x < npts and tid_y < nbf ) { + + const size_t ibfoff = tid_y * npts + tid_x; + + const double fact_p = 0.5*vrho_pos_device[tid_x]; + const double fact_m = 0.5*vrho_neg_device[tid_x]; + + const auto gga_fact_pp = vgamma_pp_device[tid_x]; + const auto gga_fact_pm = vgamma_pm_device[tid_x]; + const auto gga_fact_mm = vgamma_mm_device[tid_x]; + + const auto gga_fact_1 = 0.5*(gga_fact_pp + gga_fact_pm + gga_fact_mm); + const auto gga_fact_2 = 0.5*(gga_fact_pp - gga_fact_mm); + const auto gga_fact_3 = 0.5*(gga_fact_pp - gga_fact_pm + gga_fact_mm); + + double s_fact, x_fact, y_fact, z_fact; + + if constexpr ( den_selector == DEN_S ) { + const double* Hz_device = task.H_z; + const double* Hy_device = task.H_y; + const double* Hx_device = task.H_x; + + s_fact = 0.5 * (fact_p + fact_m); + + x_fact = gga_fact_1 * dden_sx_eval_device[ tid_x ] + + gga_fact_2 * (Hz_device[ tid_x ] * dden_zx_eval_device[ tid_x ] + + Hy_device[ tid_x ] * dden_yx_eval_device[ tid_x ] + + Hx_device[ tid_x ] * dden_xx_eval_device[ tid_x ] ); + y_fact = gga_fact_1 * dden_sy_eval_device[ tid_x ] + + gga_fact_2 * (Hz_device[ tid_x ] * dden_zy_eval_device[ tid_x ] + + Hy_device[ tid_x ] * dden_yy_eval_device[ tid_x ] + + Hx_device[ tid_x ] * dden_xy_eval_device[ tid_x ] ); + z_fact = gga_fact_1 * dden_sz_eval_device[ tid_x ] + + gga_fact_2 * (Hz_device[ tid_x ] * dden_zz_eval_device[ tid_x ] + + Hy_device[ tid_x ] * dden_yz_eval_device[ tid_x ] + + Hx_device[ tid_x ] * dden_xz_eval_device[ tid_x ] ); + } + + if constexpr ( den_selector == DEN_Z ) { + s_fact = K_device[ tid_x ] * 0.5 * (fact_p - fact_m); + x_fact = gga_fact_3 * dden_zx_eval_device[ tid_x ] + + gga_fact_2 * H_device[ tid_x ] * dden_sx_eval_device[ tid_x ]; + y_fact = gga_fact_3 * dden_zy_eval_device[ tid_x ] + + gga_fact_2 * H_device[ tid_x ] * dden_sy_eval_device[ tid_x ]; + z_fact = gga_fact_3 * dden_zz_eval_device[ tid_x ] + + gga_fact_2 * H_device[ tid_x ] * dden_sz_eval_device[ tid_x ]; + } + + if constexpr ( den_selector == DEN_Y ) { + s_fact = K_device[ tid_x ] * 0.5 * (fact_p - fact_m); + x_fact = gga_fact_3 * dden_yx_eval_device[ tid_x ] + + gga_fact_2 * H_device[ tid_x ] * dden_sx_eval_device[ tid_x ]; + y_fact = gga_fact_3 * dden_yy_eval_device[ tid_x ] + + gga_fact_2 * H_device[ tid_x ] * dden_sy_eval_device[ tid_x ]; + z_fact = gga_fact_3 * dden_yz_eval_device[ tid_x ] + + gga_fact_2 * H_device[ tid_x ] * dden_sz_eval_device[ tid_x ]; + } + + if constexpr ( den_selector == DEN_X ) { + s_fact = K_device[ tid_x ] * 0.5 * (fact_p - fact_m); + x_fact = gga_fact_3 * dden_xx_eval_device[ tid_x ] + + gga_fact_2 * H_device[ tid_x ] * dden_sx_eval_device[ tid_x ]; + y_fact = gga_fact_3 * dden_xy_eval_device[ tid_x ] + + gga_fact_2 * H_device[ tid_x ] * dden_sy_eval_device[ tid_x ]; + z_fact = gga_fact_3 * dden_xz_eval_device[ tid_x ] + + gga_fact_2 * H_device[ tid_x ] * dden_sz_eval_device[ tid_x ]; + } + + z_matrix_device[ ibfoff ] = x_fact * dbasis_x_eval_device[ ibfoff ] + + y_fact * dbasis_y_eval_device[ ibfoff ] + + z_fact * dbasis_z_eval_device[ ibfoff ] + + s_fact * basis_eval_device[ ibfoff ]; + + } +} + template -__global__ void zmat_mgga_vxc_kernel( size_t ntasks, +__global__ void zmat_mgga_vxc_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device ) { const int batch_idx = blockIdx.z; @@ -156,9 +406,9 @@ __global__ void zmat_mgga_vxc_kernel( size_t ntasks, const auto* vrho_device = task.vrho; const auto* vgamma_device = task.vgamma; const double* vlapl_device = need_lapl ? task.vlapl : nullptr; - const auto* den_x_eval_device = task.ddenx; - const auto* den_y_eval_device = task.ddeny; - const auto* den_z_eval_device = task.ddenz; + const auto* den_x_eval_device = task.dden_sx; + const auto* den_y_eval_device = task.dden_sy; + const auto* den_z_eval_device = task.dden_sz; const auto* basis_eval_device = task.bf; const auto* dbasis_x_eval_device = task.dbfx; @@ -194,6 +444,60 @@ __global__ void zmat_mgga_vxc_kernel( size_t ntasks, } } + + +#define ZMAT_VXC_KERN(xc_approx) \ + cudaStream_t stream = queue.queue_as(); \ + dim3 threads(cuda::warp_size,cuda::max_warps_per_thread_block,1); \ + dim3 blocks( util::div_ceil( max_npts, threads.x ), \ + util::div_ceil( max_nbf, threads.y ), \ + ntasks ); \ + switch( scheme ) { \ + case RKS: \ + zmat_##xc_approx##_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \ + break; \ + case UKS: \ + if ( sel == DEN_S ) zmat_##xc_approx##_vxc_uks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \ + else if ( sel == DEN_Z ) zmat_##xc_approx##_vxc_uks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \ + else GAUXC_GENERIC_EXCEPTION( "zmat_##xc_approx##_vxc invalid density" ); \ + break; \ + case GKS: \ + if ( sel == DEN_S ) zmat_##xc_approx##_vxc_gks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \ + else if ( sel == DEN_Z ) zmat_##xc_approx##_vxc_gks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \ + else if ( sel == DEN_Y ) zmat_##xc_approx##_vxc_gks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \ + else if ( sel == DEN_X ) zmat_##xc_approx##_vxc_gks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \ + else GAUXC_GENERIC_EXCEPTION( "zmat_##xc_approx##_vxc invalid density" ); \ + break; \ + default: \ + GAUXC_GENERIC_EXCEPTION( "zmat_##xc_approx##_vxc invalid KS scheme" ); \ + } + + + +void zmat_lda_vxc( size_t ntasks, + int32_t max_nbf, + int32_t max_npts, + XCDeviceTask* tasks_device, + integrator_ks_scheme scheme, + density_id sel, + device_queue queue ) { +ZMAT_VXC_KERN(lda) +} + + + +void zmat_gga_vxc( size_t ntasks, + int32_t max_nbf, + int32_t max_npts, + XCDeviceTask* tasks_device, + integrator_ks_scheme scheme, + density_id sel, + device_queue queue ) { +ZMAT_VXC_KERN(gga) +} + + + void zmat_mgga_vxc( size_t ntasks, int32_t max_nbf, int32_t max_npts, @@ -210,9 +514,9 @@ void zmat_mgga_vxc( size_t ntasks, ntasks ); if(do_lapl) - zmat_mgga_vxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + zmat_mgga_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); else - zmat_mgga_vxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + zmat_mgga_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); } @@ -232,7 +536,7 @@ void zmat_mgga_vxc( size_t ntasks, template -__global__ void mmat_mgga_vxc_kernel( size_t ntasks, +__global__ void mmat_mgga_vxc_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device ) { const int batch_idx = blockIdx.z; @@ -332,9 +636,9 @@ void mmat_mgga_vxc( size_t ntasks, ntasks ); if(do_lapl) - mmat_mgga_vxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + mmat_mgga_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); else - mmat_mgga_vxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + mmat_mgga_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); //print_zmat_stats<<<1,1,0,stream>>>(ntasks,tasks_device); } diff --git a/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.cxx b/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.cxx index 237f5647..3c11a5fe 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.cxx +++ b/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.cxx @@ -16,9 +16,10 @@ namespace GauXC { -void AoSScheme1CUTLASSBase::eval_xmat(double fac, XCDeviceData* _data, bool do_grad ){ +void AoSScheme1CUTLASSBase::eval_xmat(double fac, XCDeviceData* _data, bool do_grad, density_id den_id ){ if( do_grad ) GAUXC_GENERIC_EXCEPTION("CUTLASS + X Gradient NYI"); + if( den_id != DEN_S ) GAUXC_GENERIC_EXCEPTION("CUTLASS + U/GKS NYI"); auto* data = dynamic_cast(_data); if( !data ) GAUXC_BAD_LWD_DATA_CAST(); @@ -33,7 +34,7 @@ void AoSScheme1CUTLASSBase::eval_xmat(double fac, XCDeviceData* _data, bool do_g const auto submat_block_size = data->get_submat_chunk_size( nbf, 0 ); auto static_stack = data->static_stack; auto aos_stack = data->aos_stack; - sym_pack_submat( ntasks, aos_stack.device_tasks, static_stack.dmat_device, + sym_pack_submat( ntasks, aos_stack.device_tasks, static_stack.dmat_s_device, nbf, submat_block_size, data->device_backend_->queue() ); auto cutlass_stack = data->cutlass_stack; @@ -50,7 +51,7 @@ void AoSScheme1CUTLASSBase::eval_xmat(double fac, XCDeviceData* _data, bool do_g ); } -void AoSScheme1CUTLASSBase::inc_vxc( XCDeviceData* _data, bool do_m){ +void AoSScheme1CUTLASSBase::inc_vxc( XCDeviceData* _data, density_id den_id, bool do_m){ auto* data = dynamic_cast(_data); if( !data ) GAUXC_BAD_LWD_DATA_CAST(); @@ -58,6 +59,7 @@ void AoSScheme1CUTLASSBase::inc_vxc( XCDeviceData* _data, bool do_m){ if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); if(do_m) GAUXC_GENERIC_EXCEPTION("CUTLASS + MGGA NYI"); + if( den_id != DEN_S ) GAUXC_GENERIC_EXCEPTION("CUTLASS + U/GKS NYI"); auto& tasks = data->host_device_tasks; const auto ntasks = tasks.size(); @@ -81,7 +83,7 @@ void AoSScheme1CUTLASSBase::inc_vxc( XCDeviceData* _data, bool do_m){ auto static_stack = data->static_stack; auto aos_stack = data->aos_stack; sym_task_inc_potential( ntasks, aos_stack.device_tasks, - static_stack.vxc_device, nbf, submat_block_size, + static_stack.vxc_s_device, nbf, submat_block_size, data->device_backend_->queue() ); } diff --git a/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.hpp b/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.hpp index 8654a1e7..0f3ec69e 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.hpp @@ -17,8 +17,8 @@ namespace GauXC { struct AoSScheme1CUTLASSBase : public AoSScheme1Base { - void eval_xmat(double fac, XCDeviceData*, bool do_grad ) override final; - void inc_vxc( XCDeviceData*, bool ) override final; + void eval_xmat(double fac, XCDeviceData*, bool do_grad, density_id ) override final; + void inc_vxc( XCDeviceData*, density_id, bool ) override final; struct Data; diff --git a/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_data_base.cxx b/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_data_base.cxx index 96fbc97d..6bf35c75 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_data_base.cxx +++ b/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_data_base.cxx @@ -96,7 +96,7 @@ void AoSScheme1CUTLASSBase::Data::pack_and_send( std::vector ld64_dmat_host( ntask ), ld64_zmat_host( ntask ), ld64_vmat_host( ntask ), ld64_bf_host( ntask ); - double* static_dmat = static_stack.dmat_device; + double* static_dmat = static_stack.dmat_s_device; const auto nbf = global_dims.nbf; // host_device_tasks should be populated by parent impl called at top diff --git a/src/xc_integrator/local_work_driver/device/local_device_work_driver.cxx b/src/xc_integrator/local_work_driver/device/local_device_work_driver.cxx index e4d288b8..2a83e76c 100644 --- a/src/xc_integrator/local_work_driver/device/local_device_work_driver.cxx +++ b/src/xc_integrator/local_work_driver/device/local_device_work_driver.cxx @@ -36,6 +36,28 @@ void LocalDeviceWorkDriver::NAME( XCDeviceData* device_data, bool b ) { \ pimpl_->NAME(device_data, b); \ } +#define FWD_TO_PIMPL_DEN_ID(NAME) \ +void LocalDeviceWorkDriver::NAME( XCDeviceData* device_data, density_id den ) { \ + throw_if_invalid_pimpl(pimpl_); \ + pimpl_->NAME(device_data, den); \ +} + +#define FWD_TO_PIMPL_DEN_ID_BOOL(NAME) \ +void LocalDeviceWorkDriver::NAME( XCDeviceData* device_data, density_id den, bool b ) { \ + throw_if_invalid_pimpl(pimpl_); \ + pimpl_->NAME(device_data, den, b); \ +} + +#define FWD_TO_PIMPL_KS_SCHEME(NAME) \ +void LocalDeviceWorkDriver::NAME( XCDeviceData* device_data, integrator_ks_scheme track ) { \ + throw_if_invalid_pimpl(pimpl_); \ + pimpl_->NAME(device_data, track); \ +} +#define FWD_TO_PIMPL_KS_SCHEME_DEN_ID(NAME) \ +void LocalDeviceWorkDriver::NAME( XCDeviceData* device_data, integrator_ks_scheme track, density_id den ) { \ + throw_if_invalid_pimpl(pimpl_); \ + pimpl_->NAME(device_data, track, den); \ +} FWD_TO_PIMPL(partition_weights) // Partition weights @@ -44,63 +66,42 @@ FWD_TO_PIMPL(eval_collocation_gradient) // Collocation Gradient FWD_TO_PIMPL(eval_collocation_hessian) // Collocation Hessian FWD_TO_PIMPL(eval_collocation_laplacian) // Collocation Laplacian -FWD_TO_PIMPL(eval_uvvar_lda_rks) // U/VVar LDA (density) -FWD_TO_PIMPL(eval_uvvar_gga_rks) // U/VVar GGA (density + grad, gamma) - -FWD_TO_PIMPL(eval_uvvar_lda_uks) // U/VVar LDA (density) -FWD_TO_PIMPL(eval_uvvar_gga_uks) // U/VVar GGA (density + grad, gamma) -FWD_TO_PIMPL(eval_uvvar_lda_gks) // U/VVar LDA (density) -FWD_TO_PIMPL(eval_uvvar_gga_gks) // U/VVar GGA (density + grad, gamma) +FWD_TO_PIMPL_KS_SCHEME(eval_uvars_lda) // U variables LDA (rho) +FWD_TO_PIMPL_KS_SCHEME(eval_uvars_gga) // U variables GGA (gamma) +FWD_TO_PIMPL_BOOL(eval_uvars_mgga) // U variables MGGA (tau, lapl) +FWD_TO_PIMPL_DEN_ID_BOOL(eval_vvar) // V variable (density + grad) -FWD_TO_PIMPL(eval_zmat_lda_vxc_rks) // Eval Z Matrix LDA VXC -FWD_TO_PIMPL(eval_zmat_gga_vxc_rks) // Eval Z Matrix GGA VXC - -FWD_TO_PIMPL(eval_zmat_lda_vxc_uks) // Eval Z Matrix LDA VXC -FWD_TO_PIMPL(eval_zmat_gga_vxc_uks) // Eval Z Matrix GGA VXC - -FWD_TO_PIMPL(eval_zmat_lda_vxc_gks) // Eval Z Matrix LDA VXC -FWD_TO_PIMPL(eval_zmat_gga_vxc_gks) // Eval Z Matrix GGA VXC - - -FWD_TO_PIMPL_BOOL(eval_uvvar_mgga_rks) // U/VVar MGGA (density + grad, gamma, tau, lapl) -FWD_TO_PIMPL_BOOL(eval_uvvar_mgga_uks) // U/VVar MGGA (density + grad, gamma, tau, lapl) -FWD_TO_PIMPL_BOOL(eval_uvvar_mgga_gks) // U/VVar MGGA (density + grad, gamma, tau, lapl) - -FWD_TO_PIMPL_BOOL(eval_zmat_mgga_vxc_rks) // Eval Z Matrix MGGA VXC -FWD_TO_PIMPL_BOOL(eval_zmat_mgga_vxc_uks) // Eval Z Matrix MGGA VXC -FWD_TO_PIMPL_BOOL(eval_zmat_mgga_vxc_gks) // Eval Z Matrix MGGA VXC - -FWD_TO_PIMPL_BOOL(eval_mmat_mgga_vxc_rks) // Eval M Matrix MGGA VXC -FWD_TO_PIMPL_BOOL(eval_mmat_mgga_vxc_uks) // Eval M Matrix MGGA VXC -FWD_TO_PIMPL_BOOL(eval_mmat_mgga_vxc_gks) // Eval M Matrix MGGA VXC +FWD_TO_PIMPL_KS_SCHEME_DEN_ID(eval_zmat_lda_vxc) // Eval Z Matrix LDA VXC +FWD_TO_PIMPL_KS_SCHEME_DEN_ID(eval_zmat_gga_vxc) // Eval Z Matrix GGA VXC +FWD_TO_PIMPL_BOOL(eval_zmat_mgga_vxc) // Eval Z Matrix mGGA VXC +FWD_TO_PIMPL_BOOL(eval_mmat_mgga_vxc) // Eval M Matrix mGGA VXC FWD_TO_PIMPL(eval_exx_fmat) // Eval EXX F Matrix //FWD_TO_PIMPL(eval_exx_gmat) // Eval EXX G Matrix + FWD_TO_PIMPL(inc_exc) FWD_TO_PIMPL(inc_nel) +FWD_TO_PIMPL_DEN_ID_BOOL(inc_vxc) // Increment VXC_I by Z + FWD_TO_PIMPL(inc_exx_k) FWD_TO_PIMPL(inc_exc_grad_lda) FWD_TO_PIMPL(inc_exc_grad_gga) -FWD_TO_PIMPL(symmetrize_vxc) +FWD_TO_PIMPL_DEN_ID(symmetrize_vxc) FWD_TO_PIMPL(symmetrize_exx_k) FWD_TO_PIMPL(eval_exx_ek_screening_bfn_stats) // X = fac * P * B // dX/dx = fac * P * dB/dx (do_grad) -void LocalDeviceWorkDriver::eval_xmat( double fac, XCDeviceData* device_data, bool do_grad ) { - throw_if_invalid_pimpl(pimpl_); - pimpl_->eval_xmat(fac, device_data, do_grad); -} -// Increment VXC by Z (+ M{x,y,z} for MMGA) -void LocalDeviceWorkDriver::inc_vxc( XCDeviceData* device_data, bool do_m ) { +void LocalDeviceWorkDriver::eval_xmat( double fac, XCDeviceData* device_data, bool do_grad, density_id den ) { throw_if_invalid_pimpl(pimpl_); - pimpl_->inc_vxc(device_data, do_m); + pimpl_->eval_xmat(fac, device_data, do_grad, den); } + void LocalDeviceWorkDriver::eval_exx_gmat( XCDeviceData* device_data, const BasisSetMap& basis_map) { throw_if_invalid_pimpl(pimpl_); diff --git a/src/xc_integrator/local_work_driver/device/local_device_work_driver.hpp b/src/xc_integrator/local_work_driver/device/local_device_work_driver.hpp index d02c8810..604f0739 100644 --- a/src/xc_integrator/local_work_driver/device/local_device_work_driver.hpp +++ b/src/xc_integrator/local_work_driver/device/local_device_work_driver.hpp @@ -61,47 +61,30 @@ class LocalDeviceWorkDriver : public LocalWorkDriver { void eval_collocation_gradient( XCDeviceData* ); void eval_collocation_hessian( XCDeviceData* ); void eval_collocation_laplacian( XCDeviceData* ); + void eval_xmat( double fac, XCDeviceData*, bool do_grad, density_id den ); + + void eval_uvars_lda( XCDeviceData*, integrator_ks_scheme ) ; + void eval_uvars_gga( XCDeviceData*, integrator_ks_scheme ) ; + void eval_uvars_mgga( XCDeviceData*, bool ) ; + void eval_vvar( XCDeviceData*, density_id, bool ) ; - void eval_xmat( double fac, XCDeviceData*, bool do_grad = false ); - - void eval_uvvar_lda_rks( XCDeviceData* ); - void eval_uvvar_gga_rks( XCDeviceData* ); - void eval_uvvar_mgga_rks( XCDeviceData*, bool ); - - void eval_uvvar_lda_uks( XCDeviceData* ); - void eval_uvvar_gga_uks( XCDeviceData* ); - void eval_uvvar_mgga_uks( XCDeviceData*, bool ); - - void eval_uvvar_lda_gks( XCDeviceData* ); - void eval_uvvar_gga_gks( XCDeviceData* ); - void eval_uvvar_mgga_gks( XCDeviceData*, bool ); void eval_kern_exc_vxc_lda( const functional_type&, XCDeviceData* ); void eval_kern_exc_vxc_gga( const functional_type&, XCDeviceData* ); void eval_kern_exc_vxc_mgga( const functional_type&, XCDeviceData* ); - void eval_zmat_lda_vxc_rks( XCDeviceData* ); - void eval_zmat_gga_vxc_rks( XCDeviceData* ); - void eval_zmat_mgga_vxc_rks( XCDeviceData*, bool ); - - void eval_zmat_lda_vxc_uks( XCDeviceData* ); - void eval_zmat_gga_vxc_uks( XCDeviceData* ); - void eval_zmat_mgga_vxc_uks( XCDeviceData*, bool ); - - void eval_zmat_lda_vxc_gks( XCDeviceData* ); - void eval_zmat_gga_vxc_gks( XCDeviceData* ); - void eval_zmat_mgga_vxc_gks( XCDeviceData*, bool ); - void eval_mmat_mgga_vxc_rks( XCDeviceData*, bool ); - void eval_mmat_mgga_vxc_uks( XCDeviceData*, bool ); - void eval_mmat_mgga_vxc_gks( XCDeviceData*, bool ); + void eval_zmat_lda_vxc( XCDeviceData*, integrator_ks_scheme, density_id ) ; + void eval_zmat_gga_vxc( XCDeviceData*, integrator_ks_scheme, density_id ) ; + void eval_zmat_mgga_vxc( XCDeviceData*, bool ) ; + void eval_mmat_mgga_vxc( XCDeviceData*, bool ); void eval_exx_fmat( XCDeviceData* ); void eval_exx_gmat( XCDeviceData*, const BasisSetMap& ); void inc_exc( XCDeviceData* ); void inc_nel( XCDeviceData* ); - void inc_vxc( XCDeviceData*, bool do_m = false ); + void inc_vxc( XCDeviceData*, density_id, bool do_m = false ); void inc_exc_grad_lda( XCDeviceData* ); void inc_exc_grad_gga( XCDeviceData* ); void inc_exx_k( XCDeviceData* ); @@ -110,7 +93,7 @@ class LocalDeviceWorkDriver : public LocalWorkDriver { void exx_ek_shellpair_collision( double eps_E, double eps_K, XCDeviceData*, host_task_iterator, host_task_iterator, const ShellPairCollection& ); - void symmetrize_vxc( XCDeviceData* ); + void symmetrize_vxc( XCDeviceData*, density_id ); void symmetrize_exx_k( XCDeviceData* ); std::unique_ptr create_device_data(const DeviceRuntimeEnvironment&); diff --git a/src/xc_integrator/local_work_driver/device/local_device_work_driver_pimpl.hpp b/src/xc_integrator/local_work_driver/device/local_device_work_driver_pimpl.hpp index 7a31b0e3..f43dd12c 100644 --- a/src/xc_integrator/local_work_driver/device/local_device_work_driver_pimpl.hpp +++ b/src/xc_integrator/local_work_driver/device/local_device_work_driver_pimpl.hpp @@ -34,41 +34,28 @@ struct LocalDeviceWorkDriverPIMPL { virtual void eval_collocation_gradient( XCDeviceData* ) = 0; virtual void eval_collocation_hessian( XCDeviceData* ) = 0; virtual void eval_collocation_laplacian( XCDeviceData* ) = 0; - virtual void eval_xmat( double fac, XCDeviceData*, bool do_grad ) = 0; + virtual void eval_xmat( double fac, XCDeviceData*, bool do_grad, density_id den ) = 0; virtual void eval_exx_fmat( XCDeviceData* ) = 0; //virtual void eval_exx_gmat( XCDeviceData* ) = 0; virtual void eval_exx_gmat( XCDeviceData*, const BasisSetMap& ) = 0; - virtual void eval_uvvar_lda_rks( XCDeviceData* ) = 0; - virtual void eval_uvvar_gga_rks( XCDeviceData* ) = 0; - virtual void eval_uvvar_mgga_rks( XCDeviceData*, bool ) = 0; - virtual void eval_uvvar_lda_uks( XCDeviceData* ) = 0; - virtual void eval_uvvar_gga_uks( XCDeviceData* ) = 0; - virtual void eval_uvvar_mgga_uks( XCDeviceData*, bool ) = 0; - virtual void eval_uvvar_lda_gks( XCDeviceData* ) = 0; - virtual void eval_uvvar_gga_gks( XCDeviceData* ) = 0; - virtual void eval_uvvar_mgga_gks( XCDeviceData*, bool ) = 0; + virtual void eval_uvars_lda( XCDeviceData*, integrator_ks_scheme ) = 0; + virtual void eval_uvars_gga( XCDeviceData*, integrator_ks_scheme ) = 0; + virtual void eval_uvars_mgga( XCDeviceData*, bool ) = 0; + virtual void eval_vvar( XCDeviceData*, density_id, bool ) = 0; virtual void eval_kern_exc_vxc_lda( const functional_type&, XCDeviceData* ) = 0; virtual void eval_kern_exc_vxc_gga( const functional_type&, XCDeviceData* ) = 0; virtual void eval_kern_exc_vxc_mgga( const functional_type&, XCDeviceData* ) = 0; - virtual void eval_zmat_lda_vxc_rks( XCDeviceData* ) = 0; - virtual void eval_zmat_gga_vxc_rks( XCDeviceData* ) = 0; - virtual void eval_zmat_mgga_vxc_rks( XCDeviceData*, bool ) = 0; - virtual void eval_zmat_lda_vxc_uks( XCDeviceData* ) = 0; - virtual void eval_zmat_gga_vxc_uks( XCDeviceData* ) = 0; - virtual void eval_zmat_mgga_vxc_uks( XCDeviceData*, bool ) = 0; - virtual void eval_zmat_lda_vxc_gks( XCDeviceData* ) = 0; - virtual void eval_zmat_gga_vxc_gks( XCDeviceData* ) = 0; - virtual void eval_zmat_mgga_vxc_gks( XCDeviceData*, bool ) = 0; - virtual void eval_mmat_mgga_vxc_rks( XCDeviceData*, bool ) = 0; - virtual void eval_mmat_mgga_vxc_uks( XCDeviceData*, bool ) = 0; - virtual void eval_mmat_mgga_vxc_gks( XCDeviceData*, bool ) = 0; + virtual void eval_zmat_lda_vxc( XCDeviceData*, integrator_ks_scheme, density_id ) = 0; + virtual void eval_zmat_gga_vxc( XCDeviceData*, integrator_ks_scheme, density_id ) = 0; + virtual void eval_zmat_mgga_vxc( XCDeviceData*, bool ) = 0; + virtual void eval_mmat_mgga_vxc( XCDeviceData*, bool ) = 0; virtual void inc_exc( XCDeviceData* ) = 0; virtual void inc_nel( XCDeviceData* ) = 0; - virtual void inc_vxc( XCDeviceData*, bool ) = 0; + virtual void inc_vxc( XCDeviceData* , density_id, bool) = 0; virtual void inc_exc_grad_lda( XCDeviceData* ) = 0; virtual void inc_exc_grad_gga( XCDeviceData* ) = 0; virtual void inc_exx_k( XCDeviceData* ) = 0; - virtual void symmetrize_vxc( XCDeviceData* ) = 0; + virtual void symmetrize_vxc( XCDeviceData*, density_id ) = 0; virtual void symmetrize_exx_k( XCDeviceData* ) = 0; virtual void eval_exx_ek_screening_bfn_stats( XCDeviceData* ) = 0; diff --git a/src/xc_integrator/local_work_driver/device/scheme1_base.cxx b/src/xc_integrator/local_work_driver/device/scheme1_base.cxx index f42aff3c..5ffa8443 100644 --- a/src/xc_integrator/local_work_driver/device/scheme1_base.cxx +++ b/src/xc_integrator/local_work_driver/device/scheme1_base.cxx @@ -17,6 +17,7 @@ #include "device/common/increment_exc_grad.hpp" #include "device/common/exx_ek_screening.hpp" +#include "buffer_adaptor.hpp" #include "device/common/shell_pair_to_task.hpp" #ifdef GAUXC_HAS_CUDA @@ -93,7 +94,7 @@ namespace XGPU { size_t max_ntask, const GauXC::ShellPairToTaskDevice* sp2task, GauXC::XCDeviceTask* device_tasks, - double *boys_table, + double *boys_table, cudaStream_t stream); void integral_1_1_task_batched( @@ -116,7 +117,7 @@ namespace XGPU { size_t max_ntask, const GauXC::ShellPairToTaskDevice* sp2task, GauXC::XCDeviceTask* device_tasks, - double *boys_table, + double *boys_table, cudaStream_t stream); void integral_2_2_task_batched( @@ -139,7 +140,7 @@ namespace XGPU { size_t max_ntask, const GauXC::ShellPairToTaskDevice* sp2task, GauXC::XCDeviceTask* device_tasks, - double *boys_table, + double *boys_table, cudaStream_t stream); void integral_1_0_task_batched( @@ -164,7 +165,7 @@ namespace XGPU { size_t max_ntask, const GauXC::ShellPairToTaskDevice* sp2task, GauXC::XCDeviceTask* device_tasks, - double *boys_table, + double *boys_table, cudaStream_t stream); void integral_2_0_task_batched( @@ -189,7 +190,7 @@ namespace XGPU { size_t max_ntask, const GauXC::ShellPairToTaskDevice* sp2task, GauXC::XCDeviceTask* device_tasks, - double *boys_table, + double *boys_table, cudaStream_t stream); void integral_2_1_task_batched( @@ -214,7 +215,7 @@ namespace XGPU { size_t max_ntask, const GauXC::ShellPairToTaskDevice* sp2task, GauXC::XCDeviceTask* device_tasks, - double *boys_table, + double *boys_table, cudaStream_t stream); } #endif @@ -234,7 +235,7 @@ AoSScheme1Base::~AoSScheme1Base() noexcept { #endif } -void AoSScheme1Base::eval_zmat_lda_vxc_rks( XCDeviceData* _data){ +void AoSScheme1Base::eval_zmat_lda_vxc( XCDeviceData* _data, integrator_ks_scheme scheme, density_id den ) { auto* data = dynamic_cast(_data); if( !data ) GAUXC_BAD_LWD_DATA_CAST(); @@ -250,13 +251,13 @@ void AoSScheme1Base::eval_zmat_lda_vxc_rks( XCDeviceData* _data){ } auto aos_stack = data->aos_stack; - zmat_lda_vxc( ntasks, nbe_max, npts_max, aos_stack.device_tasks, + zmat_lda_vxc( ntasks, nbe_max, npts_max, aos_stack.device_tasks, scheme, den, data->device_backend_->queue() ); data->device_backend_->check_error("zmat_lda" __FILE__ ": " + std::to_string(__LINE__)); } -void AoSScheme1Base::eval_zmat_gga_vxc_rks( XCDeviceData* _data){ +void AoSScheme1Base::eval_zmat_gga_vxc( XCDeviceData* _data, integrator_ks_scheme scheme, density_id den ) { auto* data = dynamic_cast(_data); if( !data ) GAUXC_BAD_LWD_DATA_CAST(); @@ -272,13 +273,13 @@ void AoSScheme1Base::eval_zmat_gga_vxc_rks( XCDeviceData* _data){ } auto aos_stack = data->aos_stack; - zmat_gga_vxc( ntasks, nbe_max, npts_max, aos_stack.device_tasks, + zmat_gga_vxc( ntasks, nbe_max, npts_max, aos_stack.device_tasks, scheme, den, data->device_backend_->queue() ); data->device_backend_->check_error("zmat_gga" __FILE__ ": " + std::to_string(__LINE__)); } -void AoSScheme1Base::eval_zmat_mgga_vxc_rks( XCDeviceData* _data, bool do_lapl){ +void AoSScheme1Base::eval_zmat_mgga_vxc( XCDeviceData* _data, bool do_lapl){ auto* data = dynamic_cast(_data); if( !data ) GAUXC_BAD_LWD_DATA_CAST(); @@ -301,39 +302,8 @@ void AoSScheme1Base::eval_zmat_mgga_vxc_rks( XCDeviceData* _data, bool do_lapl){ data->device_backend_->check_error("zmat_mgga" __FILE__ ": " + std::to_string(__LINE__)); } -void AoSScheme1Base::eval_zmat_lda_vxc_uks( XCDeviceData* ){ - - GAUXC_GENERIC_EXCEPTION("UKS Not Yet Implemented For Device"); - -} - -void AoSScheme1Base::eval_zmat_gga_vxc_uks( XCDeviceData* ){ - - GAUXC_GENERIC_EXCEPTION("UKS Not Yet Implemented For Device"); -} - -void AoSScheme1Base::eval_zmat_mgga_vxc_uks( XCDeviceData*, bool /*do_lapl*/){ - GAUXC_GENERIC_EXCEPTION("UKS MGGA Not Yet Implemented For Device"); -} - -void AoSScheme1Base::eval_zmat_lda_vxc_gks( XCDeviceData* ){ - - GAUXC_GENERIC_EXCEPTION("GKS Not Yet Implemented For Device"); - -} - -void AoSScheme1Base::eval_zmat_gga_vxc_gks( XCDeviceData* ){ - - GAUXC_GENERIC_EXCEPTION("GKS Not Yet Implemented For Device"); - -} - -void AoSScheme1Base::eval_zmat_mgga_vxc_gks( XCDeviceData*, bool /*do_lapl*/){ - GAUXC_GENERIC_EXCEPTION("GKS MGGA Not Yet Implemented For Device"); -} - -void AoSScheme1Base::eval_mmat_mgga_vxc_rks( XCDeviceData* _data, bool do_lapl){ +void AoSScheme1Base::eval_mmat_mgga_vxc( XCDeviceData* _data, bool do_lapl){ auto* data = dynamic_cast(_data); if( !data ) GAUXC_BAD_LWD_DATA_CAST(); @@ -355,12 +325,6 @@ void AoSScheme1Base::eval_mmat_mgga_vxc_rks( XCDeviceData* _data, bool do_lapl){ data->device_backend_->check_error("mmat_mgga" __FILE__ ": " + std::to_string(__LINE__)); } -void AoSScheme1Base::eval_mmat_mgga_vxc_uks( XCDeviceData*, bool /*do_lapl*/){ - GAUXC_GENERIC_EXCEPTION("UKS MGGA Not Yet Implemented For Device"); -} -void AoSScheme1Base::eval_mmat_mgga_vxc_gks( XCDeviceData*, bool /*do_lapl*/){ - GAUXC_GENERIC_EXCEPTION("GKS MGGA Not Yet Implemented For Device"); -} void AoSScheme1Base::eval_collocation( XCDeviceData* _data ) { @@ -479,10 +443,20 @@ void AoSScheme1Base::inc_exc( XCDeviceData* _data ){ auto base_stack = data->base_stack; auto static_stack = data->static_stack; + const bool is_RKS = data->allocated_terms.ks_scheme == RKS; + const bool is_UKS = data->allocated_terms.ks_scheme == UKS; + const bool is_GKS = data->allocated_terms.ks_scheme == GKS; + const bool is_pol = is_UKS or is_GKS; + gdot( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, - base_stack.eps_eval_device, 1, base_stack.den_eval_device, 1, + base_stack.eps_eval_device, 1, base_stack.den_s_eval_device, 1, static_stack.acc_scr_device, static_stack.exc_device ); + if( is_pol ) { + gdot( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, + base_stack.eps_eval_device, 1, base_stack.den_z_eval_device, 1, + static_stack.acc_scr_device, static_stack.exc_device ); + } data->device_backend_->check_error("inc exc" __FILE__ ": " + std::to_string(__LINE__)); } @@ -495,32 +469,29 @@ void AoSScheme1Base::inc_nel( XCDeviceData* _data ){ auto base_stack = data->base_stack; auto static_stack = data->static_stack; + + const bool is_RKS = data->allocated_terms.ks_scheme == RKS; + const bool is_UKS = data->allocated_terms.ks_scheme == UKS; + const bool is_GKS = data->allocated_terms.ks_scheme == GKS; + const bool is_pol = is_UKS or is_GKS; + gdot( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, - base_stack.weights_device, 1, base_stack.den_eval_device, 1, + base_stack.weights_device, 1, base_stack.den_s_eval_device, 1, static_stack.acc_scr_device, static_stack.nel_device ); + if( is_pol ) { + gdot( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, + base_stack.weights_device, 1, base_stack.den_z_eval_device, 1, + static_stack.acc_scr_device, static_stack.nel_device ); + } data->device_backend_->check_error("inc nel" __FILE__ ": " + std::to_string(__LINE__)); } - - - - - - - - - - - - - -void AoSScheme1Base::eval_uvvar_lda_rks( XCDeviceData* _data ){ - +void AoSScheme1Base::eval_uvars_lda( XCDeviceData* _data, integrator_ks_scheme ks_scheme){ auto* data = dynamic_cast(_data); - if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + if ( !data ) GAUXC_BAD_LWD_DATA_CAST(); if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); @@ -528,31 +499,23 @@ void AoSScheme1Base::eval_uvvar_lda_rks( XCDeviceData* _data ){ const auto ntasks = tasks.size(); size_t nbe_max = 0, npts_max = 0; for( auto& task : tasks ) { - nbe_max = std::max( nbe_max, task.bfn_screening.nbe ); npts_max = std::max( npts_max, task.npts ); } - // Zero density auto base_stack = data->base_stack; - data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, base_stack.den_eval_device, "Den Zero" ); - - + // Evaluate U variables auto aos_stack = data->aos_stack; - eval_uvvars_lda( ntasks, nbe_max, npts_max, + GauXC::eval_uvars_lda( ntasks, npts_max, ks_scheme, aos_stack.device_tasks, data->device_backend_->queue() ); data->device_backend_->check_error("uvvar lda" __FILE__ ": " + std::to_string(__LINE__)); } - - - -void AoSScheme1Base::eval_uvvar_gga_rks( XCDeviceData* _data ){ - +void AoSScheme1Base::eval_uvars_gga( XCDeviceData* _data, integrator_ks_scheme ks_scheme){ auto* data = dynamic_cast(_data); - if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + if ( !data ) GAUXC_BAD_LWD_DATA_CAST(); if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); @@ -560,29 +523,21 @@ void AoSScheme1Base::eval_uvvar_gga_rks( XCDeviceData* _data ){ const auto ntasks = tasks.size(); size_t nbe_max = 0, npts_max = 0; for( auto& task : tasks ) { - nbe_max = std::max( nbe_max, task.bfn_screening.nbe ); npts_max = std::max( npts_max, task.npts ); } - // Zero density + gradient auto base_stack = data->base_stack; - data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, base_stack.den_eval_device, "Den Zero" ); - data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, base_stack.den_x_eval_device, "DenX Zero" ); - data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, base_stack.den_y_eval_device, "DenY Zero" ); - data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, base_stack.den_z_eval_device, "DenZ Zero" ); - - // Evaluate U variables + + // Evaluate U variable auto aos_stack = data->aos_stack; - eval_uvvars_gga( ntasks, data->total_npts_task_batch, nbe_max, npts_max, - aos_stack.device_tasks, base_stack.den_x_eval_device, base_stack.den_y_eval_device, - base_stack.den_z_eval_device, base_stack.gamma_eval_device, - data->device_backend_->queue() ); + GauXC::eval_uvars_gga( ntasks, npts_max, ks_scheme, + aos_stack.device_tasks, data->device_backend_->queue() ); data->device_backend_->check_error("uvvar gga" __FILE__ ": " + std::to_string(__LINE__)); } -void AoSScheme1Base::eval_uvvar_mgga_rks( XCDeviceData* _data, bool do_lapl ){ +void AoSScheme1Base::eval_uvars_mgga( XCDeviceData* _data, bool do_lapl ){ auto* data = dynamic_cast(_data); if( !data ) GAUXC_BAD_LWD_DATA_CAST(); @@ -597,12 +552,8 @@ void AoSScheme1Base::eval_uvvar_mgga_rks( XCDeviceData* _data, bool do_lapl ){ npts_max = std::max( npts_max, task.npts ); } - // Zero density + gradient + // Zero tau auto base_stack = data->base_stack; - data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, base_stack.den_eval_device, "Den Zero" ); - data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, base_stack.den_x_eval_device, "DenX Zero" ); - data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, base_stack.den_y_eval_device, "DenY Zero" ); - data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, base_stack.den_z_eval_device, "DenZ Zero" ); data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, base_stack.tau_eval_device, "Tau Zero" ); if(do_lapl) { data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, base_stack.den_lapl_eval_device, "DenLapl Zero" ); @@ -611,47 +562,78 @@ void AoSScheme1Base::eval_uvvar_mgga_rks( XCDeviceData* _data, bool do_lapl ){ // Evaluate U variables auto aos_stack = data->aos_stack; - eval_uvvars_mgga( ntasks, data->total_npts_task_batch, nbe_max, npts_max, - aos_stack.device_tasks, base_stack.den_x_eval_device, base_stack.den_y_eval_device, - base_stack.den_z_eval_device, base_stack.gamma_eval_device, - do_lapl, data->device_backend_->queue() ); + GauXC::eval_uvars_mgga( ntasks, data->total_npts_task_batch, nbe_max, npts_max, do_lapl, + aos_stack.device_tasks, data->device_backend_->queue() ); data->device_backend_->check_error("uvvar mgga" __FILE__ ": " + std::to_string(__LINE__)); } -void AoSScheme1Base::eval_uvvar_lda_uks( XCDeviceData* ){ - - GAUXC_GENERIC_EXCEPTION("UKS Not Yet Implemented For Device"); - -} - -void AoSScheme1Base::eval_uvvar_gga_uks( XCDeviceData* ){ - - GAUXC_GENERIC_EXCEPTION("UKS Not Yet Implemented For Device"); - -} - -void AoSScheme1Base::eval_uvvar_mgga_uks( XCDeviceData*, bool /*do_lapl*/ ){ - GAUXC_GENERIC_EXCEPTION("UKS MGGA Not Yet Implemented For Device"); -} +void AoSScheme1Base::eval_vvar( XCDeviceData* _data, density_id den_select, bool do_grad){ + auto* data = dynamic_cast(_data); + if ( !data ) GAUXC_BAD_LWD_DATA_CAST(); -void AoSScheme1Base::eval_uvvar_lda_gks( XCDeviceData* ){ + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); - GAUXC_GENERIC_EXCEPTION("GKS Not Yet Implemented For Device"); + auto& tasks = data->host_device_tasks; + const auto ntasks = tasks.size(); + size_t nbe_max = 0, npts_max = 0; + for( auto& task : tasks ) { + nbe_max = std::max( nbe_max, task.bfn_screening.nbe ); + npts_max = std::max( npts_max, task.npts ); + } -} + // Zero density + auto base_stack = data->base_stack; + double* den_eval_ptr = nullptr; + double* den_x_eval_ptr = nullptr; + double* den_y_eval_ptr = nullptr; + double* den_z_eval_ptr = nullptr; + switch ( den_select ) { + case DEN_S: + den_eval_ptr = base_stack.den_s_eval_device; + if (do_grad) { den_x_eval_ptr = base_stack.dden_sx_eval_device; + den_y_eval_ptr = base_stack.dden_sy_eval_device; + den_z_eval_ptr = base_stack.dden_sz_eval_device; } + break; + case DEN_Z: + den_eval_ptr = base_stack.den_z_eval_device; + if (do_grad) { den_x_eval_ptr = base_stack.dden_zx_eval_device; + den_y_eval_ptr = base_stack.dden_zy_eval_device; + den_z_eval_ptr = base_stack.dden_zz_eval_device; } + break; + case DEN_Y: + den_eval_ptr = base_stack.den_y_eval_device; + if (do_grad) { den_x_eval_ptr = base_stack.dden_yx_eval_device; + den_y_eval_ptr = base_stack.dden_yy_eval_device; + den_z_eval_ptr = base_stack.dden_yz_eval_device; } + break; + case DEN_X: + den_eval_ptr = base_stack.den_x_eval_device; + if (do_grad) { den_x_eval_ptr = base_stack.dden_xx_eval_device; + den_y_eval_ptr = base_stack.dden_xy_eval_device; + den_z_eval_ptr = base_stack.dden_xz_eval_device; } + break; + default: + GAUXC_GENERIC_EXCEPTION( "eval_vvar called with invalid density selected!" ); + } -void AoSScheme1Base::eval_uvvar_gga_gks( XCDeviceData* ){ + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_eval_ptr, "Den Zero" ); - GAUXC_GENERIC_EXCEPTION("GKS Not Yet Implemented For Device"); + if (do_grad) { + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_x_eval_ptr, "Den Zero" ); + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_y_eval_ptr, "Den Zero" ); + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_z_eval_ptr, "Den Zero" ); + } + + // Evaluate V variable + auto aos_stack = data->aos_stack; + GauXC::eval_vvar( ntasks, nbe_max, npts_max, do_grad, den_select, + aos_stack.device_tasks, data->device_backend_->queue() ); } -void AoSScheme1Base::eval_uvvar_mgga_gks( XCDeviceData*, bool /*do_lapl*/ ){ - GAUXC_GENERIC_EXCEPTION("GKS MGGA Not Yet Implemented For Device"); -} void AoSScheme1Base::eval_kern_exc_vxc_lda( const functional_type& func, XCDeviceData* _data ) { @@ -664,15 +646,50 @@ void AoSScheme1Base::eval_kern_exc_vxc_lda( const functional_type& func, if( !func.is_lda() ) GAUXC_GENERIC_EXCEPTION("XC Kernel not LDA!"); auto base_stack = data->base_stack; - GauXC::eval_kern_exc_vxc_lda( func, data->total_npts_task_batch, - base_stack.den_eval_device, base_stack.eps_eval_device, + + const bool is_RKS = data->allocated_terms.ks_scheme == RKS; + const bool is_UKS = data->allocated_terms.ks_scheme == UKS; + const bool is_GKS = data->allocated_terms.ks_scheme == GKS; + const bool is_pol = is_UKS or is_GKS; + const bool is_excgrad = data->allocated_terms.exc_grad; + + const size_t npts = data->total_npts_task_batch ; + + auto* dep = base_stack.den_s_eval_device; + + if ( is_pol ) { + dep = base_stack.den_eval_device; + // Interleave pos/neg densities before passing it to ExchCXX + data->device_backend_-> + copy_async_2d( 1, npts, base_stack.den_s_eval_device, 1, base_stack.den_eval_device , 2, "den_s -> den_eval" ); + data->device_backend_-> + copy_async_2d( 1, npts, base_stack.den_z_eval_device, 1, base_stack.den_eval_device+1, 2, "den_z -> den_eval" ); + } + + GauXC::eval_kern_exc_vxc_lda( func, npts, + dep, base_stack.eps_eval_device, base_stack.vrho_eval_device, data->device_backend_->queue() ); hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.eps_eval_device, 1 ); - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vrho_eval_device, 1 ); + base_stack.weights_device, 1, base_stack.eps_eval_device, 1 ); + if( not is_pol ) { + hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, + base_stack.weights_device, 1, base_stack.vrho_eval_device, 1 ); + } + else if( is_pol ) { + // De-interleave pos/neg densities + data->device_backend_-> + copy_async_2d( 1, npts, base_stack.vrho_eval_device , 2, base_stack.vrho_pos_eval_device, 1, "vrho->vrho_pos" ); + data->device_backend_-> + copy_async_2d( 1, npts, base_stack.vrho_eval_device+1, 2, base_stack.vrho_neg_eval_device, 1, "vrho->vrho_pos" ); + + // Weight results point-by-point + hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, + base_stack.weights_device, 1, base_stack.vrho_pos_eval_device, 1 ); + hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, + base_stack.weights_device, 1, base_stack.vrho_neg_eval_device, 1 ); + } data->device_backend_->check_error("exc_vxc lda" __FILE__ ": " + std::to_string(__LINE__)); } @@ -689,17 +706,80 @@ void AoSScheme1Base::eval_kern_exc_vxc_gga( const functional_type& func, if( !func.is_gga() ) GAUXC_GENERIC_EXCEPTION("XC Kernel not GGA!"); auto base_stack = data->base_stack; + double* den_eval_ptr = base_stack.den_s_eval_device; + + const bool is_RKS = data->allocated_terms.ks_scheme == RKS; + const bool is_UKS = data->allocated_terms.ks_scheme == UKS; + const bool is_GKS = data->allocated_terms.ks_scheme == GKS; + const bool is_pol = is_UKS or is_GKS; + const bool is_excgrad = data->allocated_terms.exc_grad; + + const size_t npts = data->total_npts_task_batch ; + + + + if ( is_pol ) { + den_eval_ptr = base_stack.den_eval_device; + // Interleave pos/neg densities before passing it to ExchCXX + data->device_backend_-> + copy_async_2d( 1, npts, base_stack.den_s_eval_device, 1, base_stack.den_eval_device , 2, "den_s -> den_eval" ); + data->device_backend_-> + copy_async_2d( 1, npts, base_stack.den_z_eval_device, 1, base_stack.den_eval_device+1, 2, "den_z -> den_eval" ); + // Interleave gamma pp, pm, mm + data->device_backend_-> + copy_async_2d( 1, npts, base_stack.gamma_pp_eval_device, 1, base_stack.gamma_eval_device , 3, "gamma_pp -> gamma_eval"); + data->device_backend_-> + copy_async_2d( 1, npts, base_stack.gamma_pm_eval_device, 1, base_stack.gamma_eval_device+1, 3, "gamma_pm -> gamma_eval"); + data->device_backend_-> + copy_async_2d( 1, npts, base_stack.gamma_mm_eval_device, 1, base_stack.gamma_eval_device+2, 3, "gamma_mm -> gamma_eval"); + } + GauXC::eval_kern_exc_vxc_gga( func, data->total_npts_task_batch, - base_stack.den_eval_device, base_stack.gamma_eval_device, + den_eval_ptr, base_stack.gamma_eval_device, base_stack.eps_eval_device, base_stack.vrho_eval_device, base_stack.vgamma_eval_device, data->device_backend_->queue() ); + hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, base_stack.weights_device, 1, base_stack.eps_eval_device, 1 ); - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, + + if( not is_pol ) { + hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, base_stack.weights_device, 1, base_stack.vrho_eval_device, 1 ); - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, + hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, base_stack.weights_device, 1, base_stack.vgamma_eval_device, 1 ); + } + else if( is_pol ) { + // De-interleave pos/neg densities + data->device_backend_-> + copy_async_2d( 1, npts, base_stack.vrho_eval_device , 2, base_stack.vrho_pos_eval_device, 1, "vrho->vrho_pos" ); + data->device_backend_-> + copy_async_2d( 1, npts, base_stack.vrho_eval_device+1, 2, base_stack.vrho_neg_eval_device, 1, "vrho->vrho_pos" ); + + // Multiply by weights point-by-point + hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, + base_stack.weights_device, 1, base_stack.vrho_pos_eval_device, 1 ); + hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, + base_stack.weights_device, 1, base_stack.vrho_neg_eval_device, 1 ); + + // De-interleave vgamma + data->device_backend_-> + copy_async_2d( 1, npts, base_stack.vgamma_eval_device , 3, base_stack.vgamma_pp_eval_device, 1, "vgamma_eval -> vgamma_pp" ); + data->device_backend_-> + copy_async_2d( 1, npts, base_stack.vgamma_eval_device+1, 3, base_stack.vgamma_pm_eval_device, 1, "vgamma_eval -> vgamma_pm" ); + data->device_backend_-> + copy_async_2d( 1, npts, base_stack.vgamma_eval_device+2, 3, base_stack.vgamma_mm_eval_device, 1, "vgamma_eval -> vgamma_mm" ); + + hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, + base_stack.weights_device, 1, base_stack.vgamma_pp_eval_device, 1 ); + hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, + base_stack.weights_device, 1, base_stack.vgamma_pm_eval_device, 1 ); + hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, + base_stack.weights_device, 1, base_stack.vgamma_mm_eval_device, 1 ); + + + } + data->device_backend_->check_error("exc_vxc gga" __FILE__ ": " + std::to_string(__LINE__)); @@ -723,7 +803,7 @@ void AoSScheme1Base::eval_kern_exc_vxc_mgga( const functional_type& func, } GauXC::eval_kern_exc_vxc_mgga( func, data->total_npts_task_batch, - base_stack.den_eval_device, base_stack.gamma_eval_device, + base_stack.den_s_eval_device, base_stack.gamma_eval_device, base_stack.tau_eval_device, base_stack.den_lapl_eval_device, base_stack.eps_eval_device, base_stack.vrho_eval_device, base_stack.vgamma_eval_device, base_stack.vtau_eval_device, @@ -754,7 +834,7 @@ void AoSScheme1Base::eval_kern_exc_vxc_mgga( const functional_type& func, -void AoSScheme1Base::eval_xmat( double fac, XCDeviceData* _data, bool do_grad ){ +void AoSScheme1Base::eval_xmat( double fac, XCDeviceData* _data, bool do_grad, density_id den_select ){ auto* data = dynamic_cast(_data); if( !data ) GAUXC_BAD_LWD_DATA_CAST(); @@ -764,12 +844,31 @@ void AoSScheme1Base::eval_xmat( double fac, XCDeviceData* _data, bool do_grad ){ auto tasks = data->host_device_tasks; const auto ntasks = tasks.size(); - // Pack density matrix + // Set correct density matrix pointer on the stack const auto nbf = data->global_dims.nbf; const auto submat_block_size = data->get_submat_chunk_size( nbf, 0 ); auto static_stack = data->static_stack; auto aos_stack = data->aos_stack; - sym_pack_submat( ntasks, aos_stack.device_tasks, static_stack.dmat_device, + double* dmat_ptr = nullptr; + switch ( den_select ) { + case DEN_S: + dmat_ptr = static_stack.dmat_s_device; + break; + case DEN_Z: + dmat_ptr = static_stack.dmat_z_device; + break; + case DEN_X: + dmat_ptr = static_stack.dmat_x_device; + break; + case DEN_Y: + dmat_ptr = static_stack.dmat_y_device; + break; + default: + GAUXC_GENERIC_EXCEPTION("eval_xmat: den_select not set"); + } + + // Pack density matrix + sym_pack_submat( ntasks, aos_stack.device_tasks, dmat_ptr, nbf, submat_block_size, data->device_backend_->queue() ); @@ -783,10 +882,13 @@ void AoSScheme1Base::eval_xmat( double fac, XCDeviceData* _data, bool do_grad ){ // Launch GEMM in round-robin const auto n_blas_streams = data->device_backend_->blas_pool_size(); + + + //size_t nsingle = 0; for( size_t iT = 0; iT < ntasks; ++iT ) { auto& task = tasks[iT]; - auto den_ptr = task.bfn_screening.ncut > 1 ? task.nbe_scr : static_stack.dmat_device + task.bfn_screening.ibf_begin*(nbf+1); + auto den_ptr = task.bfn_screening.ncut > 1 ? task.nbe_scr : dmat_ptr + task.bfn_screening.ibf_begin*(nbf+1); int ldden = task.bfn_screening.ncut > 1 ? task.bfn_screening.nbe : nbf; auto handle = data->device_backend_->blas_pool_handle( iT % n_blas_streams ); do_gemm( handle, task.npts, task.bfn_screening.nbe, task.bf, den_ptr, ldden, task.zmat ); @@ -811,14 +913,7 @@ void AoSScheme1Base::eval_xmat( double fac, XCDeviceData* _data, bool do_grad ){ - - - - - - - -void AoSScheme1Base::inc_vxc( XCDeviceData* _data, bool do_m ){ +void AoSScheme1Base::inc_vxc( XCDeviceData* _data, density_id den_selector, bool do_m ){ auto* data = dynamic_cast(_data); if( !data ) GAUXC_BAD_LWD_DATA_CAST(); @@ -857,8 +952,25 @@ void AoSScheme1Base::inc_vxc( XCDeviceData* _data, bool do_m ){ const auto submat_block_size = data->get_submat_chunk_size( nbf, 0 ); auto static_stack = data->static_stack; auto aos_stack = data->aos_stack; - sym_task_inc_potential( ntasks, aos_stack.device_tasks, - static_stack.vxc_device, nbf, submat_block_size, + double* vxc_ptr = nullptr; + switch( den_selector ) { + case DEN_S: + vxc_ptr = static_stack.vxc_s_device; + break; + case DEN_Z: + vxc_ptr = static_stack.vxc_z_device; + break; + case DEN_Y: + vxc_ptr = static_stack.vxc_y_device; + break; + case DEN_X: + vxc_ptr = static_stack.vxc_x_device; + break; + default: + GAUXC_GENERIC_EXCEPTION( "inc_vxc called with invalid density selected" ); + } + sym_task_inc_potential( ntasks, aos_stack.device_tasks, + vxc_ptr, nbf, submat_block_size, data->device_backend_->queue() ); data->device_backend_->check_error("inc_vxc" __FILE__ ": " + std::to_string(__LINE__)); @@ -874,9 +986,7 @@ void AoSScheme1Base::inc_vxc( XCDeviceData* _data, bool do_m ){ - - -void AoSScheme1Base::symmetrize_vxc( XCDeviceData* _data) { +void AoSScheme1Base::symmetrize_vxc( XCDeviceData* _data, density_id den_selector) { auto* data = dynamic_cast(_data); if( !data ) GAUXC_BAD_LWD_DATA_CAST(); @@ -885,9 +995,26 @@ void AoSScheme1Base::symmetrize_vxc( XCDeviceData* _data) { const auto nbf = data->global_dims.nbf; auto static_stack = data->static_stack; - symmetrize_matrix( nbf, static_stack.vxc_device, nbf, - data->device_backend_->queue() ); - + switch ( den_selector ) { + case DEN_S: + symmetrize_matrix( nbf, static_stack.vxc_s_device, nbf, + data->device_backend_->queue() ); + break; + case DEN_Z: + symmetrize_matrix( nbf, static_stack.vxc_z_device, nbf, + data->device_backend_->queue() ); + break; + case DEN_Y: + symmetrize_matrix( nbf, static_stack.vxc_y_device, nbf, + data->device_backend_->queue() ); + break; + case DEN_X: + symmetrize_matrix( nbf, static_stack.vxc_x_device, nbf, + data->device_backend_->queue() ); + break; + default: + GAUXC_GENERIC_EXCEPTION( "symmetrize_vxc: invalid density selected" ); + } data->device_backend_->check_error("symmetrize vxc" __FILE__ ": " + std::to_string(__LINE__)); } @@ -949,7 +1076,7 @@ void AoSScheme1Base::eval_exx_fmat( XCDeviceData* _data ) { // Pack the density matrix into (bfn, cou) shape const auto submat_block_size = data->get_submat_chunk_size( nbf, 0 ); auto aos_stack = data->aos_stack; - asym_pack_submat( ntasks, aos_stack.device_tasks, static_stack.dmat_device, + asym_pack_submat( ntasks, aos_stack.device_tasks, static_stack.dmat_s_device, nbf, submat_block_size, data->device_backend_->queue() ); // Sync blas streams with master stream @@ -1426,7 +1553,7 @@ void AoSScheme1Base::exx_ek_shellpair_collision( double eps_E, double eps_K, auto static_stack = data->static_stack; GauXC::exx_ek_shellpair_collision( ntasks, nshells, nbf, - static_stack.dmat_device, nbf, + static_stack.dmat_s_device, nbf, static_stack.vshell_max_sparse_device, static_stack.shpair_row_ind_device, static_stack.shpair_col_ind_device, diff --git a/src/xc_integrator/local_work_driver/device/scheme1_base.hpp b/src/xc_integrator/local_work_driver/device/scheme1_base.hpp index dc980762..37964914 100644 --- a/src/xc_integrator/local_work_driver/device/scheme1_base.hpp +++ b/src/xc_integrator/local_work_driver/device/scheme1_base.hpp @@ -18,30 +18,16 @@ struct AoSScheme1Base : public detail::LocalDeviceWorkDriverPIMPL { void eval_collocation_gradient( XCDeviceData* ) override final; void eval_collocation_hessian( XCDeviceData* ) override final; void eval_collocation_laplacian( XCDeviceData* ) override final; - void eval_uvvar_lda_rks( XCDeviceData* ) override final; - void eval_uvvar_gga_rks( XCDeviceData* ) override final; - void eval_uvvar_mgga_rks( XCDeviceData*, bool ) override final; - void eval_zmat_lda_vxc_rks( XCDeviceData* ) override final; - void eval_zmat_gga_vxc_rks( XCDeviceData* ) override final; - void eval_zmat_mgga_vxc_rks( XCDeviceData*, bool ) override final; - void eval_uvvar_lda_uks( XCDeviceData* ) override final; - void eval_uvvar_gga_uks( XCDeviceData* ) override final; - void eval_uvvar_mgga_uks( XCDeviceData*, bool ) override final; - void eval_zmat_lda_vxc_uks( XCDeviceData* ) override final; - void eval_zmat_gga_vxc_uks( XCDeviceData* ) override final; - void eval_zmat_mgga_vxc_uks( XCDeviceData*, bool ) override final; + void eval_uvars_lda( XCDeviceData*, integrator_ks_scheme ) override final; + void eval_uvars_gga( XCDeviceData*, integrator_ks_scheme ) override final; + void eval_uvars_mgga( XCDeviceData*, bool ) override final; + void eval_vvar( XCDeviceData*, density_id, bool ) override final; - void eval_uvvar_lda_gks( XCDeviceData* ) override final; - void eval_uvvar_gga_gks( XCDeviceData* ) override final; - void eval_uvvar_mgga_gks( XCDeviceData*, bool ) override final; - void eval_zmat_lda_vxc_gks( XCDeviceData* ) override final; - void eval_zmat_gga_vxc_gks( XCDeviceData* ) override final; - void eval_zmat_mgga_vxc_gks( XCDeviceData*, bool ) override final; - - void eval_mmat_mgga_vxc_rks( XCDeviceData*, bool ) override final; - void eval_mmat_mgga_vxc_uks( XCDeviceData*, bool ) override final; - void eval_mmat_mgga_vxc_gks( XCDeviceData*, bool ) override final; + void eval_zmat_lda_vxc( XCDeviceData*, integrator_ks_scheme, density_id ) override final; + void eval_zmat_gga_vxc( XCDeviceData*, integrator_ks_scheme, density_id ) override final; + void eval_zmat_mgga_vxc( XCDeviceData*, bool ) override final; + void eval_mmat_mgga_vxc( XCDeviceData*, bool ) override final; void eval_kern_exc_vxc_lda( const functional_type&, XCDeviceData* ) override final; void eval_kern_exc_vxc_gga( const functional_type&, XCDeviceData* ) override final; @@ -50,7 +36,7 @@ struct AoSScheme1Base : public detail::LocalDeviceWorkDriverPIMPL { void inc_nel( XCDeviceData* ) override final; void inc_exc_grad_lda( XCDeviceData* ) override final; void inc_exc_grad_gga( XCDeviceData* ) override final; - void symmetrize_vxc( XCDeviceData* ) override final; + void symmetrize_vxc( XCDeviceData* , density_id) override final; void symmetrize_exx_k( XCDeviceData* ) override final; //void eval_exx_gmat( XCDeviceData* ) override final; void eval_exx_gmat( XCDeviceData*, const BasisSetMap& ) override final; @@ -62,11 +48,12 @@ struct AoSScheme1Base : public detail::LocalDeviceWorkDriverPIMPL { // Overridable APIs - virtual void eval_xmat( double fac, XCDeviceData*, bool do_grad ) override; + virtual void eval_xmat( double fac, XCDeviceData*, bool , density_id ) override; virtual void eval_exx_fmat( XCDeviceData* ) override; - virtual void inc_vxc( XCDeviceData*, bool ) override; + virtual void inc_vxc( XCDeviceData*, density_id, bool ) override; virtual void inc_exx_k( XCDeviceData* ) override; + using Data = Scheme1DataBase; AoSScheme1Base(); diff --git a/src/xc_integrator/local_work_driver/device/scheme1_magma_base.cxx b/src/xc_integrator/local_work_driver/device/scheme1_magma_base.cxx index fb484ed0..8c3c15cc 100644 --- a/src/xc_integrator/local_work_driver/device/scheme1_magma_base.cxx +++ b/src/xc_integrator/local_work_driver/device/scheme1_magma_base.cxx @@ -16,7 +16,7 @@ namespace GauXC { -void AoSScheme1MAGMABase::eval_xmat( double fac, XCDeviceData* _data, bool do_grad ){ +void AoSScheme1MAGMABase::eval_xmat( double fac, XCDeviceData* _data, bool do_grad, density_id den ){ if( do_grad ) GAUXC_GENERIC_EXCEPTION("MAGMA + X Gradient NYI"); @@ -33,12 +33,42 @@ void AoSScheme1MAGMABase::eval_xmat( double fac, XCDeviceData* _data, bool do_gr const auto submat_block_size = data->get_submat_chunk_size( nbf, 0 ); auto static_stack = data->static_stack; auto aos_stack = data->aos_stack; - sym_pack_submat( ntasks, aos_stack.device_tasks, static_stack.dmat_device, - nbf, submat_block_size, data->device_backend_->queue() ); - - auto master_queue = data->device_backend_->master_magma_queue(); auto magma_stack = data->magma_stack; + double* dmat_ptr = nullptr; + switch( den ) { + case DEN_S: + dmat_ptr = static_stack.dmat_s_device; + break; + case DEN_Z: + dmat_ptr = static_stack.dmat_z_device; + break; + case DEN_Y: + dmat_ptr = static_stack.dmat_y_device; + break; + case DEN_X: + dmat_ptr = static_stack.dmat_x_device; + break; + default: + GAUXC_GENERIC_EXCEPTION( "eval_xmat called with invalid density specifier" ); + } + sym_pack_submat( ntasks, aos_stack.device_tasks, dmat_ptr, + nbf, submat_block_size, data->device_backend_->queue() ); + + // Update dmat on magma_stack + std::vector dmat_host( ntasks ); + + for( auto i = 0; i < ntasks; i++ ) { + auto& task = tasks[i]; + if( task.bfn_screening.ncut > 1 ) { + dmat_host[i] = task.nbe_scr; + } else { + dmat_host[i] = dmat_ptr + task.bfn_screening.ibf_begin*(nbf+1); + } + } + data->device_backend_->copy_async( ntasks, dmat_host.data(), + magma_stack.xdmat_array_device, "send xdmat array"); + magmablas_dgemm_vbatched( MagmaNoTrans, MagmaNoTrans, magma_stack.xmat_m_array_device, magma_stack.xmat_n_array_device, magma_stack.xmat_k_array_device, @@ -66,7 +96,7 @@ void AoSScheme1MAGMABase::eval_exx_fmat( XCDeviceData* _data ) { const auto submat_block_size = data->get_submat_chunk_size( nbf, 0 ); auto static_stack = data->static_stack; auto aos_stack = data->aos_stack; - asym_pack_submat( ntasks, aos_stack.device_tasks, static_stack.dmat_device, + asym_pack_submat( ntasks, aos_stack.device_tasks, static_stack.dmat_s_device, nbf, submat_block_size, data->device_backend_->queue() ); @@ -82,7 +112,7 @@ void AoSScheme1MAGMABase::eval_exx_fmat( XCDeviceData* _data ) { #endif } -void AoSScheme1MAGMABase::inc_vxc( XCDeviceData* _data, bool do_m){ +void AoSScheme1MAGMABase::inc_vxc( XCDeviceData* _data, density_id den, bool do_m){ auto* data = dynamic_cast(_data); if( !data ) GAUXC_BAD_LWD_DATA_CAST(); @@ -108,8 +138,25 @@ void AoSScheme1MAGMABase::inc_vxc( XCDeviceData* _data, bool do_m){ const auto submat_block_size = data->get_submat_chunk_size( nbf, 0 ); auto static_stack = data->static_stack; auto aos_stack = data->aos_stack; + double* vxc_ptr = nullptr; + switch (den) { + case DEN_S: + vxc_ptr = static_stack.vxc_s_device; + break; + case DEN_Z: + vxc_ptr = static_stack.vxc_z_device; + break; + case DEN_Y: + vxc_ptr = static_stack.vxc_y_device; + break; + case DEN_X: + vxc_ptr = static_stack.vxc_x_device; + break; + default: + GAUXC_GENERIC_EXCEPTION( "Inc_vxc called with invalid density specifier" ); + } sym_task_inc_potential( ntasks, aos_stack.device_tasks, - static_stack.vxc_device, nbf, submat_block_size, + vxc_ptr, nbf, submat_block_size, data->device_backend_->queue() ); } diff --git a/src/xc_integrator/local_work_driver/device/scheme1_magma_base.hpp b/src/xc_integrator/local_work_driver/device/scheme1_magma_base.hpp index 2b99bb60..99dde319 100644 --- a/src/xc_integrator/local_work_driver/device/scheme1_magma_base.hpp +++ b/src/xc_integrator/local_work_driver/device/scheme1_magma_base.hpp @@ -12,9 +12,9 @@ namespace GauXC { struct AoSScheme1MAGMABase : public AoSScheme1Base { - void eval_xmat( double fac, XCDeviceData*, bool do_grad ) override final; + void eval_xmat( double fac, XCDeviceData*, bool do_grad, density_id den ) override final; void eval_exx_fmat( XCDeviceData* ) override final; - void inc_vxc( XCDeviceData*, bool ) override final; + void inc_vxc( XCDeviceData*, density_id den, bool ) override final; void inc_exx_k( XCDeviceData* ) override final; struct Data; diff --git a/src/xc_integrator/local_work_driver/device/scheme1_magma_data_base.cxx b/src/xc_integrator/local_work_driver/device/scheme1_magma_data_base.cxx index fda0d11b..95b818be 100644 --- a/src/xc_integrator/local_work_driver/device/scheme1_magma_data_base.cxx +++ b/src/xc_integrator/local_work_driver/device/scheme1_magma_data_base.cxx @@ -104,6 +104,7 @@ void AoSScheme1MAGMABase::Data::pack_and_send( } + void AoSScheme1MAGMABase::Data::pack_and_send_xmat( host_task_iterator task_begin, host_task_iterator task_end ) { @@ -115,7 +116,7 @@ void AoSScheme1MAGMABase::Data::pack_and_send_xmat( ld_dmat_host( ntask ), ld_zmat_host( ntask ), ld_vmat_host( ntask ), ld_bf_host( ntask ); - double* static_dmat = static_stack.dmat_device; + double* static_dmat = static_stack.dmat_s_device; const auto nbf = global_dims.nbf; // host_device_tasks should be populated by parent impl called at top diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator.hpp index 8ac2fc57..30403175 100644 --- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator.hpp +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator.hpp @@ -83,33 +83,13 @@ class IncoreReplicatedXCDeviceIntegrator : host_task_iterator task_begin, host_task_iterator task_end, XCDeviceData& device_data ); - void exc_vxc_local_work_( const basis_type& basis, const value_type* P, int64_t ldp, - host_task_iterator task_begin, host_task_iterator task_end, - XCDeviceData& device_data, bool do_vxc = true ); - - void exc_vxc_local_work_( const basis_type& basis, const value_type* P, int64_t ldp, - value_type* VXC, int64_t ldvxc, value_type* EXC, value_type *N_EL, - host_task_iterator task_begin, host_task_iterator task_end, - XCDeviceData& device_data ); - - void exc_vxc_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps, - const value_type* Pz, int64_t ldpz, - host_task_iterator task_begin, host_task_iterator task_end, - XCDeviceData& device_data ); - - void exc_vxc_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps, - const value_type* Pz, int64_t ldpz, - value_type* VXC, int64_t ldvxc, - value_type* VXCz, int64_t ldvxcz, value_type* EXC, value_type *N_EL, - host_task_iterator task_begin, host_task_iterator task_end, - XCDeviceData& device_data ); void exc_vxc_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps, const value_type* Pz, int64_t ldpz, const value_type* Py, int64_t ldpy, const value_type* Px, int64_t ldpx, host_task_iterator task_begin, host_task_iterator task_end, - XCDeviceData& device_data ); + XCDeviceData& device_data, bool do_vxc ); void exc_vxc_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps, const value_type* Pz, int64_t ldpz, @@ -157,7 +137,7 @@ class IncoreReplicatedXCDeviceIntegrator : virtual ~IncoreReplicatedXCDeviceIntegrator() noexcept; - + template void exc_vxc_local_work(Args&&... args) { exc_vxc_local_work_( std::forward(args)... ); diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc.hpp index 36b4ebe3..a8c32da3 100644 --- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc.hpp +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc.hpp @@ -24,7 +24,6 @@ void IncoreReplicatedXCDeviceIntegrator:: value_type* EXC, const IntegratorSettingsXC& settings ) { - if(Pz) GAUXC_GENERIC_EXCEPTION("UKS/GKS + EXC Only Device NYI"); const auto& basis = this->load_balancer_->basis(); // Check that P / VXC are sane @@ -53,8 +52,10 @@ void IncoreReplicatedXCDeviceIntegrator:: // Compute local contributions to EXC/VXC and retrieve // data from device this->timer_.time_op("XCIntegrator.LocalWork_EXC", [&](){ - exc_vxc_local_work_( basis, Ps, ldps, nullptr, 0, EXC, - &N_EL, tasks.begin(), tasks.end(), *device_data_ptr); + exc_vxc_local_work_( basis, Ps, ldps, Pz, ldpz, Py, ldpy, Px, ldpx, + // Passing nullptr for VXCs disables VXC entirely + nullptr, 0, nullptr, 0, nullptr, 0, nullptr, 0, EXC, &N_EL, + tasks.begin(), tasks.end(), *device_data_ptr); }); GAUXC_MPI_CODE( diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_grad.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_grad.hpp index ff88e14a..a49eee7f 100644 --- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_grad.hpp +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_grad.hpp @@ -108,7 +108,7 @@ void IncoreReplicatedXCDeviceIntegrator:: const auto natoms = mol.size(); device_data.reset_allocations(); device_data.allocate_static_data_exc_grad( nbf, nshells, natoms ); - device_data.send_static_data_density_basis( P, ldp, basis ); + device_data.send_static_data_density_basis( P, ldp, nullptr, 0, nullptr, 0, nullptr, 0, basis ); // Zero integrands device_data.zero_exc_grad_integrands(); @@ -116,6 +116,7 @@ void IncoreReplicatedXCDeviceIntegrator:: // Processes batches in groups that saturadate available device memory integrator_term_tracker enabled_terms; enabled_terms.exc_grad = true; + enabled_terms.ks_scheme = RKS; if( func.is_lda() ) enabled_terms.xc_approx = integrator_xc_approx::LDA; else if( func.is_gga() ) enabled_terms.xc_approx = integrator_xc_approx::GGA; else GAUXC_GENERIC_EXCEPTION("XC Approx NYI"); @@ -135,11 +136,14 @@ void IncoreReplicatedXCDeviceIntegrator:: // Evaluate X matrix const bool do_xmat_grad = func.is_gga(); - lwd->eval_xmat( 2.0, &device_data, do_xmat_grad ); - - // Evaluate U/V variables - if( func.is_gga() ) lwd->eval_uvvar_gga_rks( &device_data ); - else lwd->eval_uvvar_lda_rks( &device_data ); + lwd->eval_xmat( 2.0, &device_data, do_xmat_grad, DEN_S ); + + // Evaluate V variable + lwd->eval_vvar( &device_data, DEN_S, do_xmat_grad ); + + // Evaluate U variables + if( func.is_gga() ) lwd->eval_uvars_gga( &device_data, enabled_terms.ks_scheme ); + else lwd->eval_uvars_lda( &device_data, enabled_terms.ks_scheme ); // Evaluate XC functional (we need VXC for EXC Gradient) if( func.is_gga() ) lwd->eval_kern_exc_vxc_gga( func, &device_data ); diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_vxc.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_vxc.hpp index cae5bec5..c8cae61d 100644 --- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_vxc.hpp +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_vxc.hpp @@ -20,7 +20,42 @@ void IncoreReplicatedXCDeviceIntegrator:: eval_exc_vxc_( int64_t m, int64_t n, const value_type* P, int64_t ldp, value_type* VXC, int64_t ldvxc, value_type* EXC, const IntegratorSettingsXC& settings ) { + eval_exc_vxc_( m, n, P, ldp, nullptr, 0, nullptr, 0, nullptr, 0, + VXC, ldvxc, nullptr, 0, nullptr, 0, nullptr, 0, EXC, settings ); +} + + +template +void IncoreReplicatedXCDeviceIntegrator:: + eval_exc_vxc_( int64_t m, int64_t n, const value_type* Ps, + int64_t ldps, + const value_type* Pz, + int64_t ldpz, + value_type* VXCs, int64_t ldvxcs, + value_type* VXCz, int64_t ldvxcz, + value_type* EXC, const IntegratorSettingsXC& settings ) { + eval_exc_vxc_( m, n, Ps, ldps, Pz, ldpz, nullptr, 0, nullptr, 0, + VXCs, ldvxcs, VXCz, ldvxcz, nullptr, 0, nullptr, 0, EXC, settings ); +} +template +void IncoreReplicatedXCDeviceIntegrator:: + eval_exc_vxc_( int64_t m, int64_t n, const value_type* Ps, + int64_t ldps, + const value_type* Pz, + int64_t ldpz, + const value_type* Py, + int64_t ldpy, + const value_type* Px, + int64_t ldpx, + value_type* VXCs, int64_t ldvxcs, + value_type* VXCz, int64_t ldvxcz, + value_type* VXCy, int64_t ldvxcy, + value_type* VXCx, int64_t ldvxcx, + value_type* EXC, const IntegratorSettingsXC& settings ) { + const bool is_gks = (Pz != nullptr) and (Py != nullptr) and (Px != nullptr); + const bool is_uks = (Pz != nullptr) and (Py == nullptr) and (Px == nullptr); + const bool is_rks = (Ps != nullptr) and (not is_uks and not is_gks); const auto& basis = this->load_balancer_->basis(); @@ -30,10 +65,27 @@ void IncoreReplicatedXCDeviceIntegrator:: GAUXC_GENERIC_EXCEPTION("P/VXC Must Be Square"); if( m != nbf ) GAUXC_GENERIC_EXCEPTION("P/VXC Must Have Same Dimension as Basis"); - if( ldp < nbf ) - GAUXC_GENERIC_EXCEPTION("Invalid LDP"); - if( ldvxc < nbf ) - GAUXC_GENERIC_EXCEPTION("Invalid LDVXC"); + if( ldps < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDPs"); + if( ldvxcs < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDVXCs"); + + if( not is_rks ) { + if( ldpz < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDPz"); + if( ldvxcz < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDVXCz"); + if( is_gks ) { + if( ldpy < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDPy"); + if( ldvxcy < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDVXCy"); + if( ldpx < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDPx"); + if( ldvxcx < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDVXCx"); + } + } // Get Tasks @@ -52,10 +104,10 @@ void IncoreReplicatedXCDeviceIntegrator:: if( this->reduction_driver_->takes_device_memory() ) { // If we can do reductions on the device (e.g. NCCL) - // Don't communicate data back to the hot before reduction + // Don't communicate data back to the host before reduction this->timer_.time_op("XCIntegrator.LocalWork_EXC_VXC", [&](){ - exc_vxc_local_work_( basis, P, ldp, tasks.begin(), tasks.end(), - *device_data_ptr); + exc_vxc_local_work_( basis, Ps, ldps, Pz, ldpz, Py, ldpy, Px, ldpx, tasks.begin(), tasks.end(), + *device_data_ptr, true); }); GAUXC_MPI_CODE( @@ -65,19 +117,52 @@ void IncoreReplicatedXCDeviceIntegrator:: ) // Reduce results in device memory - auto vxc_device = device_data_ptr->vxc_device_data(); + double* vxc_s_device = device_data_ptr->vxc_s_device_data(); + double* vxc_z_device; + double* vxc_y_device; + double* vxc_x_device; auto exc_device = device_data_ptr->exc_device_data(); auto nel_device = device_data_ptr->nel_device_data(); auto queue = device_data_ptr->queue(); - this->timer_.time_op("XCIntegrator.Allreduce_EXC_VXC", [&](){ - this->reduction_driver_->allreduce_inplace( vxc_device, nbf*nbf, ReductionOp::Sum, queue ); - this->reduction_driver_->allreduce_inplace( exc_device, 1, ReductionOp::Sum, queue ); - this->reduction_driver_->allreduce_inplace( nel_device, 1, ReductionOp::Sum, queue ); - }); + + if( not is_rks ) { + vxc_z_device = device_data_ptr->vxc_z_device_data(); + if( is_gks ) { + // GKS + vxc_y_device = device_data_ptr->vxc_y_device_data(); + vxc_x_device = device_data_ptr->vxc_x_device_data(); + this->timer_.time_op("XCIntegrator.Allreduce_EXC_VXC", [&](){ + this->reduction_driver_->allreduce_inplace( vxc_s_device, nbf*nbf, ReductionOp::Sum, queue ); + this->reduction_driver_->allreduce_inplace( vxc_z_device, nbf*nbf, ReductionOp::Sum, queue ); + this->reduction_driver_->allreduce_inplace( vxc_y_device, nbf*nbf, ReductionOp::Sum, queue ); + this->reduction_driver_->allreduce_inplace( vxc_x_device, nbf*nbf, ReductionOp::Sum, queue ); + this->reduction_driver_->allreduce_inplace( exc_device, 1, ReductionOp::Sum, queue ); + this->reduction_driver_->allreduce_inplace( nel_device, 1, ReductionOp::Sum, queue ); + }); + } else { + // UKS + this->timer_.time_op("XCIntegrator.Allreduce_EXC_VXC", [&](){ + this->reduction_driver_->allreduce_inplace( vxc_s_device, nbf*nbf, ReductionOp::Sum, queue ); + this->reduction_driver_->allreduce_inplace( vxc_z_device, nbf*nbf, ReductionOp::Sum, queue ); + this->reduction_driver_->allreduce_inplace( exc_device, 1, ReductionOp::Sum, queue ); + this->reduction_driver_->allreduce_inplace( nel_device, 1, ReductionOp::Sum, queue ); + }); + + } + } else { + // RKS + this->timer_.time_op("XCIntegrator.Allreduce_EXC_VXC", [&](){ + this->reduction_driver_->allreduce_inplace( vxc_s_device, nbf*nbf, ReductionOp::Sum, queue ); + this->reduction_driver_->allreduce_inplace( exc_device, 1, ReductionOp::Sum, queue ); + this->reduction_driver_->allreduce_inplace( nel_device, 1, ReductionOp::Sum, queue ); + }); + } + // Retrieve data to host this->timer_.time_op("XCIntegrator.DeviceToHostCopy_EXC_VXC",[&](){ - device_data_ptr->retrieve_exc_vxc_integrands( EXC, &N_EL, VXC, ldvxc ); + device_data_ptr->retrieve_exc_vxc_integrands( EXC, &N_EL, VXCs, ldvxcs, VXCz, ldvxcz, + VXCy, ldvxcy, VXCx, ldvxcx ); }); @@ -86,8 +171,9 @@ void IncoreReplicatedXCDeviceIntegrator:: // Compute local contributions to EXC/VXC and retrieve // data from device this->timer_.time_op("XCIntegrator.LocalWork_EXC_VXC", [&](){ - exc_vxc_local_work_( basis, P, ldp, VXC, ldvxc, EXC, - &N_EL, tasks.begin(), tasks.end(), *device_data_ptr); + exc_vxc_local_work_( basis, Ps, ldps, Pz, ldpz, Py, ldpy, Px, ldpx, + VXCs, ldvxcs, VXCz, ldvxcz, VXCy, ldvxcy, VXCx, ldvxcx, EXC, + &N_EL, tasks.begin(), tasks.end(), *device_data_ptr); }); GAUXC_MPI_CODE( @@ -97,62 +183,62 @@ void IncoreReplicatedXCDeviceIntegrator:: ) // Reduce Results in host mem - this->timer_.time_op("XCIntegrator.Allreduce_EXC_VXC", [&](){ - this->reduction_driver_->allreduce_inplace( VXC, nbf*nbf, ReductionOp::Sum ); - this->reduction_driver_->allreduce_inplace( EXC, 1 , ReductionOp::Sum ); - this->reduction_driver_->allreduce_inplace( &N_EL, 1 , ReductionOp::Sum ); - }); - + if( is_rks ) { + this->timer_.time_op("XCIntegrator.Allreduce_EXC_VXC", [&](){ + this->reduction_driver_->allreduce_inplace( VXCs, nbf*nbf, ReductionOp::Sum ); + this->reduction_driver_->allreduce_inplace( EXC, 1, ReductionOp::Sum ); + this->reduction_driver_->allreduce_inplace( &N_EL, 1, ReductionOp::Sum ); + }); + } else { + if( is_gks ) { + this->timer_.time_op("XCIntegrator.Allreduce_EXC_VXC", [&](){ + this->reduction_driver_->allreduce_inplace( VXCs, nbf*nbf, ReductionOp::Sum ); + this->reduction_driver_->allreduce_inplace( VXCz, nbf*nbf, ReductionOp::Sum ); + this->reduction_driver_->allreduce_inplace( VXCy, nbf*nbf, ReductionOp::Sum ); + this->reduction_driver_->allreduce_inplace( VXCx, nbf*nbf, ReductionOp::Sum ); + this->reduction_driver_->allreduce_inplace( EXC, 1, ReductionOp::Sum ); + this->reduction_driver_->allreduce_inplace( &N_EL, 1, ReductionOp::Sum ); + }); + } else { + // UKS + this->timer_.time_op("XCIntegrator.Allreduce_EXC_VXC", [&](){ + this->reduction_driver_->allreduce_inplace( VXCs, nbf*nbf, ReductionOp::Sum ); + this->reduction_driver_->allreduce_inplace( VXCz, nbf*nbf, ReductionOp::Sum ); + this->reduction_driver_->allreduce_inplace( EXC, 1, ReductionOp::Sum ); + this->reduction_driver_->allreduce_inplace( &N_EL, 1, ReductionOp::Sum ); + }); + + } + } } } template void IncoreReplicatedXCDeviceIntegrator:: - eval_exc_vxc_( int64_t m, int64_t n, const value_type* Ps, - int64_t ldps, - const value_type* Pz, - int64_t ldpz, - value_type* VXCs, int64_t ldvxcs, - value_type* VXCz, int64_t ldvxcz, - value_type* EXC, const IntegratorSettingsXC& settings ) { - GauXC::util::unused(m,n,Ps,ldps,Pz,ldpz,VXCs,ldvxcs,VXCz,ldvxcz,EXC,settings); - GAUXC_GENERIC_EXCEPTION("UKS Not Yet Implemented For Device"); -} - -template -void IncoreReplicatedXCDeviceIntegrator:: - eval_exc_vxc_( int64_t m, int64_t n, const value_type* Ps, - int64_t ldps, - const value_type* Pz, - int64_t ldpz, - const value_type* Py, - int64_t ldpy, - const value_type* Px, - int64_t ldpx, - value_type* VXCs, int64_t ldvxcs, - value_type* VXCz, int64_t ldvxcz, - value_type* VXCy, int64_t ldvxcy, - value_type* VXCx, int64_t ldvxcx, - value_type* EXC, const IntegratorSettingsXC& settings ) { - GauXC::util::unused(m,n,Ps,ldps,Pz,ldpz,Py,ldpy,Px,ldpx,VXCs,ldvxcs,VXCz,ldvxcz,VXCy,ldvxcy,VXCx,ldvxcx,EXC,settings); - GAUXC_GENERIC_EXCEPTION("GKS Not Yet Implemented For Device"); -} - - -template -void IncoreReplicatedXCDeviceIntegrator:: - exc_vxc_local_work_( const basis_type& basis, const value_type* P, int64_t ldp, - host_task_iterator task_begin, host_task_iterator task_end, - XCDeviceData& device_data, bool do_vxc ) { - + exc_vxc_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, + const value_type* Py, int64_t ldpy, + const value_type* Px, int64_t ldpx, + host_task_iterator task_begin, host_task_iterator task_end, + XCDeviceData& device_data, bool do_vxc ) { + const bool is_gks = (Pz != nullptr) and (Py != nullptr) and (Px != nullptr); + const bool is_uks = (Pz != nullptr) and (Py == nullptr) and (Px == nullptr); + const bool is_rks = (Ps != nullptr) and (not is_uks and not is_gks); + if (not is_rks and not is_uks and not is_gks) { + GAUXC_GENERIC_EXCEPTION("MUST BE EITHER RKS, UKS, or GKS!"); + } + + // Cast LWD to LocalDeviceWorkDriver auto* lwd = dynamic_cast(this->local_work_driver_.get() ); // Setup Aliases const auto& func = *this->func_; const auto& mol = this->load_balancer_->molecule(); + if( func.is_mgga() and (is_uks or is_gks) ) GAUXC_GENERIC_EXCEPTION("Device + Polarized mGGAs NYI!"); + // Get basis map BasisSetMap basis_map(basis,mol); @@ -172,38 +258,15 @@ void IncoreReplicatedXCDeviceIntegrator:: if( not lb_state.modified_weights_are_stored ) { GAUXC_GENERIC_EXCEPTION("Weights Have Not Been Modified"); } + -#if 0 - this->timer_.time_op("XCIntegrator.ScreenWeights",[&](){ - - constexpr double weight_thresh = std::numeric_limits::epsilon(); - for( auto it = task_begin; it != task_end; ++it ) { - it->max_weight = *std::max_element( it->weights.begin(), it->weights.end() ); - } - - size_t old_ntasks = std::distance( task_begin, task_end ); - task_end = std::stable_partition(task_begin, task_end, - [&](const auto& a){ return a.max_weight > weight_thresh; } ); - - size_t new_ntasks = std::distance( task_begin, task_end ); - std::cout << old_ntasks << ", " << new_ntasks << std::endl; - - }); -#endif - - // Do XC integration in task batches - const auto nbf = basis.nbf(); - const auto nshells = basis.nshells(); - device_data.reset_allocations(); - device_data.allocate_static_data_exc_vxc( nbf, nshells, do_vxc ); - device_data.send_static_data_density_basis( P, ldp, basis ); - - // Zero integrands - device_data.zero_exc_vxc_integrands(); - - // Processes batches in groups that saturadate available device memory integrator_term_tracker enabled_terms; enabled_terms.exc_vxc = true; + + if (is_rks) enabled_terms.ks_scheme = RKS; + else if (is_uks) enabled_terms.ks_scheme = UKS; + else if (is_gks) enabled_terms.ks_scheme = GKS; + if( func.is_lda() ) enabled_terms.xc_approx = integrator_xc_approx::LDA; else if( func.is_gga() ) @@ -212,15 +275,20 @@ void IncoreReplicatedXCDeviceIntegrator:: enabled_terms.xc_approx = integrator_xc_approx::MGGA_LAPL; else enabled_terms.xc_approx = integrator_xc_approx::MGGA_TAU; + + // Do XC integration in task batches + const auto nbf = basis.nbf(); + const auto nshells = basis.nshells(); + device_data.reset_allocations(); + device_data.allocate_static_data_exc_vxc( nbf, nshells, enabled_terms, do_vxc ); + + device_data.send_static_data_density_basis( Ps, ldps, Pz, ldpz, Px, ldpx, Py, ldpy, basis ); + + + + // Zero integrands + device_data.zero_exc_vxc_integrands(enabled_terms); -#if 0 - std::vector full_shell_list(nshells); - std::iota(full_shell_list.begin(),full_shell_list.end(),0); - for( auto it = task_begin; it != task_end; ++it ) { - it->cou_screening.shell_list = full_shell_list; - it->cou_screening.nbe = nbf; - } -#endif auto task_it = task_begin; while( task_it != task_end ) { @@ -230,8 +298,8 @@ void IncoreReplicatedXCDeviceIntegrator:: device_data.generate_buffers( enabled_terms, basis_map, task_it, task_end ); /*** Process the batches ***/ + const bool need_lapl = func.needs_laplacian(); - // Evaluate collocation if( func.is_mgga() ) { if(need_lapl) lwd->eval_collocation_laplacian( &device_data ); @@ -239,100 +307,77 @@ void IncoreReplicatedXCDeviceIntegrator:: } else if( func.is_gga() ) lwd->eval_collocation_gradient( &device_data ); else lwd->eval_collocation( &device_data ); - - // Evaluate X matrix + + const double xmat_fac = is_rks ? 2.0 : 1.0; const bool need_xmat_grad = func.is_mgga(); - lwd->eval_xmat( 2.0, &device_data, need_xmat_grad ); + const bool need_vvar_grad = func.is_mgga() or func.is_gga(); + + // Evaluate X matrix and V vars + auto do_xmat_vvar = [&](density_id den_id) { + lwd->eval_xmat( xmat_fac, &device_data, need_xmat_grad, den_id ); + lwd->eval_vvar( &device_data, den_id, need_vvar_grad ); + }; + + do_xmat_vvar(DEN_S); + if (not is_rks) { + do_xmat_vvar(DEN_Z); + if (not is_uks) { + do_xmat_vvar(DEN_Y); + do_xmat_vvar(DEN_X); + } + } + - // Evaluate U/V variables - if( func.is_mgga() ) lwd->eval_uvvar_mgga_rks( &device_data, need_lapl ); - else if( func.is_gga() ) lwd->eval_uvvar_gga_rks( &device_data ); - else lwd->eval_uvvar_lda_rks( &device_data ); + // Evaluate U variables + if( func.is_mgga() ) lwd->eval_uvars_mgga( &device_data, need_lapl ); //<<< TODO: Fn call is different because MGGA U/GKS NYI + else if( func.is_gga() ) lwd->eval_uvars_gga( &device_data, enabled_terms.ks_scheme ); + else lwd->eval_uvars_lda( &device_data, enabled_terms.ks_scheme ); // Evaluate XC functional if( func.is_mgga() ) lwd->eval_kern_exc_vxc_mgga( func, &device_data ); else if( func.is_gga() ) lwd->eval_kern_exc_vxc_gga( func, &device_data ); else lwd->eval_kern_exc_vxc_lda( func, &device_data ); + // Do scalar EXC/N_EL integrations lwd->inc_exc( &device_data ); lwd->inc_nel( &device_data ); if( not do_vxc ) continue; - // Evaluate Z (+ M) matrix - if( func.is_mgga() ) { - lwd->eval_zmat_mgga_vxc_rks( &device_data, need_lapl ); - lwd->eval_mmat_mgga_vxc_rks( &device_data, need_lapl ); - } - else if( func.is_gga() ) lwd->eval_zmat_gga_vxc_rks( &device_data ); - else lwd->eval_zmat_lda_vxc_rks( &device_data ); + auto do_zmat_vxc = [&](density_id den_id) { + if( func.is_mgga() ) { + lwd->eval_zmat_mgga_vxc( &device_data, need_lapl); + lwd->eval_mmat_mgga_vxc( &device_data, need_lapl); + } + else if( func.is_gga() ) + lwd->eval_zmat_gga_vxc( &device_data, enabled_terms.ks_scheme, den_id ); + else + lwd->eval_zmat_lda_vxc( &device_data, enabled_terms.ks_scheme, den_id ); + lwd->inc_vxc( &device_data, den_id, func.is_mgga() ); + }; - // Increment VXC (LT) - lwd->inc_vxc( &device_data, func.is_mgga() ); + do_zmat_vxc(DEN_S); + if(not is_rks) { + do_zmat_vxc(DEN_Z); + if(not is_uks) { + do_zmat_vxc(DEN_Y); + do_zmat_vxc(DEN_X); + } + } } // Loop over batches of batches // Symmetrize VXC in device memory - if(do_vxc) lwd->symmetrize_vxc( &device_data ); - -} - - - -template -void IncoreReplicatedXCDeviceIntegrator:: - exc_vxc_local_work_( const basis_type& basis, const value_type* P, int64_t ldp, - value_type* VXC, int64_t ldvxc, value_type* EXC, - value_type *N_EL, - host_task_iterator task_begin, host_task_iterator task_end, - XCDeviceData& device_data ) { - - // Get integrate and keep data on device - const bool do_vxc = VXC; - exc_vxc_local_work_( basis, P, ldp, task_begin, task_end, device_data, do_vxc ); - auto rt = detail::as_device_runtime(this->load_balancer_->runtime()); - rt.device_backend()->master_queue_synchronize(); - - // Receive XC terms from host - this->timer_.time_op("XCIntegrator.DeviceToHostCopy_EXC_VXC",[&](){ - device_data.retrieve_exc_vxc_integrands( EXC, N_EL, VXC, ldvxc ); - }); - -} - -template -void IncoreReplicatedXCDeviceIntegrator:: - exc_vxc_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps, - const value_type* Pz, int64_t ldpz, - host_task_iterator task_begin, host_task_iterator task_end, - XCDeviceData& device_data ) { - GauXC::util::unused(basis,Ps,ldps,Pz,ldpz,task_begin,task_end,device_data); - GAUXC_GENERIC_EXCEPTION("UKS Not Yet Implemented For Device"); -} - -template -void IncoreReplicatedXCDeviceIntegrator:: - exc_vxc_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps, - const value_type* Pz, int64_t ldpz, - value_type* VXCs, int64_t ldvxcs, - value_type* VXCz, int64_t ldvxcz, value_type* EXC, value_type *N_EL, - host_task_iterator task_begin, host_task_iterator task_end, - XCDeviceData& device_data ) { - - GauXC::util::unused(basis,Ps,ldps,Pz,ldpz,VXCs,ldvxcs,VXCz,ldvxcz,EXC,N_EL,task_begin,task_end,device_data); - GAUXC_GENERIC_EXCEPTION("UKS Not Yet Implemented For Device"); -} - -template -void IncoreReplicatedXCDeviceIntegrator:: - exc_vxc_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps, - const value_type* Pz, int64_t ldpz, - const value_type* Py, int64_t ldpy, - const value_type* Px, int64_t ldpx, - host_task_iterator task_begin, host_task_iterator task_end, - XCDeviceData& device_data ) { - GauXC::util::unused(basis,Ps,ldps,Pz,ldpz,Py,ldpy,Px,ldpx,task_begin,task_end,device_data); - GAUXC_GENERIC_EXCEPTION("GKS Not Yet Implemented For Device"); + if( do_vxc ) { + lwd->symmetrize_vxc( &device_data, DEN_S ); + if (not is_rks) { + lwd->symmetrize_vxc( &device_data, DEN_Z ); + if (not is_uks) { + lwd->symmetrize_vxc( &device_data, DEN_Y ); + lwd->symmetrize_vxc( &device_data, DEN_X ); + } + } + } } template @@ -347,9 +392,18 @@ void IncoreReplicatedXCDeviceIntegrator:: value_type* VXCx, int64_t ldvxcx, value_type* EXC, value_type *N_EL, host_task_iterator task_begin, host_task_iterator task_end, XCDeviceData& device_data ) { + + // Get integrate and keep data on device + const bool do_vxc = VXCs; + exc_vxc_local_work_( basis, Ps, ldps, Pz, ldpz, Py, ldpy, Px, ldpx, task_begin, task_end, device_data, do_vxc ); + auto rt = detail::as_device_runtime(this->load_balancer_->runtime()); + rt.device_backend()->master_queue_synchronize(); + + // Receive XC terms from host + this->timer_.time_op("XCIntegrator.DeviceToHostCopy_EXC_VXC",[&](){ + device_data.retrieve_exc_vxc_integrands( EXC, N_EL, VXCs, ldvxcs, VXCz, ldvxcz, VXCy, ldvxcy, VXCx, ldvxcx ); + }); - GauXC::util::unused(basis,Ps,ldps,Pz,ldpz,Py,ldpy,Px,ldpx,VXCs,ldvxcs,VXCz,ldvxcz,VXCy,ldvxcy,VXCx,ldvxcx,EXC,N_EL,task_begin,task_end,device_data); - GAUXC_GENERIC_EXCEPTION("GKS Not Yet Implemented For Device"); } diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exx.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exx.hpp index 3cc19868..1aec0077 100644 --- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exx.hpp +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exx.hpp @@ -355,7 +355,7 @@ void IncoreReplicatedXCDeviceIntegrator:: // Do EXX integration in task batches device_data.reset_allocations(); device_data.allocate_static_data_exx( nbf, nshells, shell_pairs.npairs(), shell_pairs.nprim_pair_total(), basis_map.max_l() ); - device_data.send_static_data_density_basis( P, ldp, basis ); + device_data.send_static_data_density_basis( P, ldp, nullptr, 0, nullptr, 0, nullptr, 0, basis ); device_data.send_static_data_shell_pairs( basis, shell_pairs ); // Zero integrands @@ -401,7 +401,7 @@ void IncoreReplicatedXCDeviceIntegrator:: #if 1 // Symmetrize K in device memory - lwd->symmetrize_exx_k( &device_data ); + lwd->symmetrize_exx_k( &device_data); #endif } diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_integrate_den.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_integrate_den.hpp index eb610769..764bf3c0 100644 --- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_integrate_den.hpp +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_integrate_den.hpp @@ -103,7 +103,7 @@ void IncoreReplicatedXCDeviceIntegrator:: const auto nshells = basis.nshells(); device_data.reset_allocations(); device_data.allocate_static_data_den( nbf, nshells ); - device_data.send_static_data_density_basis( P, ldp, basis ); + device_data.send_static_data_density_basis( P, ldp, nullptr, 0, nullptr, 0, nullptr, 0, basis ); // Zero integrands device_data.zero_den_integrands(); @@ -125,10 +125,12 @@ void IncoreReplicatedXCDeviceIntegrator:: lwd->eval_collocation( &device_data ); // Evaluate X matrix - lwd->eval_xmat( 1.0, &device_data ); + const bool do_xmat_grad = false; + lwd->eval_xmat( 1.0, &device_data, do_xmat_grad, DEN_S ); // Evaluate the density - lwd->eval_uvvar_lda_rks( &device_data ); + const bool do_vvar_grad = false; + lwd->eval_vvar( &device_data, DEN_S, do_vvar_grad ); // Do scalar N_EL integration lwd->inc_nel( &device_data ); diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_vxc.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_vxc.hpp index 960d81f2..e62ae760 100644 --- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_vxc.hpp +++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_vxc.hpp @@ -384,6 +384,7 @@ void ReferenceReplicatedXCHostIntegrator:: const auto xmat_fac = is_rks ? 2.0 : 1.0; // TODO Fix for spinor RKS input lwd->eval_xmat( mgga_dim_scal * npts, nbf, nbe, submat_map, xmat_fac, Ps, ldps, basis_eval, nbe, zmat, nbe, nbe_scr ); + // X matrix for Pz if(not is_rks) { @@ -451,7 +452,6 @@ void ReferenceReplicatedXCHostIntegrator:: vrho[sds*i] *= weights[i]; if(not is_rks) vrho[sds*i+1] *= weights[i]; } - if( func.is_gga() ){ for( int32_t i = 0; i < npts; ++i ) { vgamma[gga_dim_scal*i] *= weights[i]; diff --git a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_vxc.hpp b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_vxc.hpp index f6505826..4b56692b 100644 --- a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_vxc.hpp +++ b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_vxc.hpp @@ -390,18 +390,11 @@ void ShellBatchedReplicatedXCIntegrator( ntask, csl ); } - // Map packed to unpacked indices if(reqt.task_bfn_shell_indirection) { aos_stack.bfn_shell_indirection_device = mem.aligned_alloc( total_nbe_bfn_task_batch, csl ); } - // Collocation + derivatives const size_t bfn_msz = total_nbe_bfn_npts_task_batch; if(reqt.task_bfn) { @@ -198,7 +196,6 @@ XCDeviceAoSData::device_buffer_t XCDeviceAoSData::allocate_dynamic_stack( aos_stack.zmat_vxc_device = mem.aligned_alloc( bfn_msz, csl); } - // X Matrix Gradient (for GGA EXC Gradient) if(reqt.task_xmat_grad) { aos_stack.xmat_dx_device = mem.aligned_alloc( bfn_msz, csl); @@ -242,7 +239,6 @@ XCDeviceAoSData::device_buffer_t XCDeviceAoSData::allocate_dynamic_stack( mem.aligned_alloc(total_nblock_cou_task_batch, csl); } - // Update dynmem data for derived impls return device_buffer_t{ mem.stack(), mem.nleft() }; } @@ -473,20 +469,66 @@ void XCDeviceAoSData::pack_and_send( buffer_adaptor xmat_dx_mem( aos_stack.xmat_dx_device, total_nbe_bfn_npts ); buffer_adaptor xmat_dy_mem( aos_stack.xmat_dy_device, total_nbe_bfn_npts ); buffer_adaptor xmat_dz_mem( aos_stack.xmat_dz_device, total_nbe_bfn_npts ); - - buffer_adaptor den_mem ( base_stack.den_eval_device, total_npts ); - buffer_adaptor dden_x_mem( base_stack.den_x_eval_device, total_npts ); - buffer_adaptor dden_y_mem( base_stack.den_y_eval_device, total_npts ); - buffer_adaptor dden_z_mem( base_stack.den_z_eval_device, total_npts ); + + const bool is_rks = terms.ks_scheme == RKS; + const bool is_uks = terms.ks_scheme == UKS; + const bool is_gks = terms.ks_scheme == GKS; + const bool is_pol = is_uks or is_gks; + const bool is_gga = terms.xc_approx == GGA; + const int den_fac = is_pol ? 2 : 1; + const int gamma_fac = is_pol ? 3 : 1; + + + + buffer_adaptor eps_mem ( base_stack.eps_eval_device, total_npts ); + + // RKS + buffer_adaptor den_s_mem ( base_stack.den_s_eval_device, total_npts ); + buffer_adaptor gamma_mem ( base_stack.gamma_eval_device, total_npts * gamma_fac ); + buffer_adaptor vrho_mem ( base_stack.vrho_eval_device, total_npts * den_fac ); + buffer_adaptor vgamma_mem ( base_stack.vgamma_eval_device, total_npts * gamma_fac ); + + buffer_adaptor den_mem ( base_stack.den_eval_device, total_npts * den_fac ); + + + // Polarized KS + buffer_adaptor den_z_mem ( base_stack.den_z_eval_device, total_npts ); + buffer_adaptor den_y_mem ( base_stack.den_y_eval_device, total_npts ); + buffer_adaptor den_x_mem ( base_stack.den_x_eval_device, total_npts ); + buffer_adaptor vrho_pos_mem( base_stack.vrho_pos_eval_device, total_npts ); + buffer_adaptor vrho_neg_mem( base_stack.vrho_neg_eval_device, total_npts ); + buffer_adaptor K_z_mem ( base_stack.K_z_eval_device, total_npts ); + buffer_adaptor K_y_mem ( base_stack.K_y_eval_device, total_npts ); + buffer_adaptor K_x_mem ( base_stack.K_x_eval_device, total_npts ); + buffer_adaptor H_z_mem ( base_stack.H_z_eval_device, total_npts ); + buffer_adaptor H_y_mem ( base_stack.H_y_eval_device, total_npts ); + buffer_adaptor H_x_mem ( base_stack.H_x_eval_device, total_npts ); + buffer_adaptor gamma_pp_mem( base_stack.gamma_pp_eval_device, total_npts ); + buffer_adaptor gamma_pm_mem( base_stack.gamma_pm_eval_device, total_npts ); + buffer_adaptor gamma_mm_mem( base_stack.gamma_mm_eval_device, total_npts ); + buffer_adaptor vgamma_pp_mem( base_stack.vgamma_pp_eval_device, total_npts ); + buffer_adaptor vgamma_pm_mem( base_stack.vgamma_pm_eval_device, total_npts ); + buffer_adaptor vgamma_mm_mem( base_stack.vgamma_mm_eval_device, total_npts ); + + // Gradients + buffer_adaptor dden_sx_mem( base_stack.dden_sx_eval_device, total_npts ); + buffer_adaptor dden_sy_mem( base_stack.dden_sy_eval_device, total_npts ); + buffer_adaptor dden_sz_mem( base_stack.dden_sz_eval_device, total_npts ); + buffer_adaptor dden_zx_mem( base_stack.dden_zx_eval_device, total_npts ); + buffer_adaptor dden_zy_mem( base_stack.dden_zy_eval_device, total_npts ); + buffer_adaptor dden_zz_mem( base_stack.dden_zz_eval_device, total_npts ); + buffer_adaptor dden_yx_mem( base_stack.dden_yx_eval_device, total_npts ); + buffer_adaptor dden_yy_mem( base_stack.dden_yy_eval_device, total_npts ); + buffer_adaptor dden_yz_mem( base_stack.dden_yz_eval_device, total_npts ); + buffer_adaptor dden_xx_mem( base_stack.dden_xx_eval_device, total_npts ); + buffer_adaptor dden_xy_mem( base_stack.dden_xy_eval_device, total_npts ); + buffer_adaptor dden_xz_mem( base_stack.dden_xz_eval_device, total_npts ); + + // MGGA buffer_adaptor dden_lapl_mem( base_stack.den_lapl_eval_device, total_npts ); - - buffer_adaptor eps_mem( base_stack.eps_eval_device, total_npts ); - buffer_adaptor gamma_mem( base_stack.gamma_eval_device, total_npts ); + buffer_adaptor vlapl_mem( base_stack.vlapl_eval_device, total_npts ); buffer_adaptor tau_mem( base_stack.tau_eval_device, total_npts ); - buffer_adaptor vrho_mem( base_stack.vrho_eval_device, total_npts ); - buffer_adaptor vgamma_mem( base_stack.vgamma_eval_device, total_npts ); buffer_adaptor vtau_mem( base_stack.vtau_eval_device, total_npts ); - buffer_adaptor vlapl_mem( base_stack.vlapl_eval_device, total_npts ); for( auto& task : host_device_tasks ) { const auto npts = task.npts; @@ -562,29 +604,84 @@ void XCDeviceAoSData::pack_and_send( // Grid function evaluations - task.den = den_mem.aligned_alloc(reqt.grid_den_size(npts), csl); + if (reqt.grid_den) { + task.den_s = den_s_mem.aligned_alloc( npts, csl ); + if(is_pol) { + task.den = den_mem.aligned_alloc(npts*2, csl); //Interleaved memory + task.den_z = den_z_mem.aligned_alloc( npts, csl); + if ( is_gks ) { + task.den_y = den_y_mem.aligned_alloc( npts, csl); + task.den_x = den_x_mem.aligned_alloc( npts, csl); + } + } + } + + if( reqt.grid_vrho ) { + task.vrho = vrho_mem.aligned_alloc( npts*den_fac, csl); + if( is_pol ) { + task.vrho_pos = vrho_pos_mem.aligned_alloc( npts, csl); + task.vrho_neg = vrho_neg_mem.aligned_alloc( npts, csl); + } + } + + if( reqt.grid_vgamma ) { + task.vgamma = vgamma_mem.aligned_alloc( npts*gamma_fac, csl); + if( is_pol ) { + task.vgamma_pp = vgamma_pp_mem.aligned_alloc( npts, csl); + task.vgamma_pm = vgamma_pm_mem.aligned_alloc( npts, csl); + task.vgamma_mm = vgamma_mm_mem.aligned_alloc( npts, csl); + } + } + if( reqt.grid_gamma ) { + task.gamma = gamma_mem.aligned_alloc( npts*gamma_fac, csl); + if( is_pol ) { + task.gamma_pp = gamma_pp_mem.aligned_alloc( npts, csl); + task.gamma_pm = gamma_pm_mem.aligned_alloc( npts, csl); + task.gamma_mm = gamma_mm_mem.aligned_alloc( npts, csl); + } + } if(reqt.grid_den_grad) { - task.ddenx = dden_x_mem.aligned_alloc(npts, csl); - task.ddeny = dden_y_mem.aligned_alloc(npts, csl); - task.ddenz = dden_z_mem.aligned_alloc(npts, csl); + task.dden_sx = dden_sx_mem.aligned_alloc(npts, csl); + task.dden_sy = dden_sy_mem.aligned_alloc(npts, csl); + task.dden_sz = dden_sz_mem.aligned_alloc(npts, csl); + if( is_pol ) { + task.dden_zx = dden_zx_mem.aligned_alloc( npts, csl ); + task.dden_zy = dden_zy_mem.aligned_alloc( npts, csl ); + task.dden_zz = dden_zz_mem.aligned_alloc( npts, csl ); + if( is_gks ) { + task.dden_yx = dden_yx_mem.aligned_alloc( npts, csl ); + task.dden_yy = dden_yy_mem.aligned_alloc( npts, csl ); + task.dden_yz = dden_yz_mem.aligned_alloc( npts, csl ); + task.dden_xx = dden_xx_mem.aligned_alloc( npts, csl ); + task.dden_xy = dden_xy_mem.aligned_alloc( npts, csl ); + task.dden_xz = dden_xz_mem.aligned_alloc( npts, csl ); + } + } + } + + // H, K terms (GKS) + if( is_gks ) { + task.K_x = K_x_mem.aligned_alloc( npts, csl ); + task.K_y = K_y_mem.aligned_alloc( npts, csl ); + task.K_z = K_z_mem.aligned_alloc( npts, csl ); + if( is_gga ) { + task.H_x = H_x_mem.aligned_alloc( npts, csl ); + task.H_y = H_y_mem.aligned_alloc( npts, csl ); + task.H_z = H_z_mem.aligned_alloc( npts, csl ); + } } + + task.eps = eps_mem.aligned_alloc( reqt.grid_eps_size(npts), csl); + if(reqt.grid_den_lapl) { task.denlapl = dden_lapl_mem.aligned_alloc(npts, csl); } - task.gamma = - gamma_mem.aligned_alloc( reqt.grid_gamma_size(npts), csl); task.tau = tau_mem.aligned_alloc( reqt.grid_tau_size(npts), csl); - task.eps = - eps_mem.aligned_alloc( reqt.grid_eps_size(npts), csl); - task.vrho = - vrho_mem.aligned_alloc( reqt.grid_vrho_size(npts), csl); - task.vgamma = - vgamma_mem.aligned_alloc( reqt.grid_vgamma_size(npts), csl); task.vtau = vtau_mem.aligned_alloc( reqt.grid_vtau_size(npts), csl); task.vlapl = diff --git a/src/xc_integrator/xc_data/device/xc_device_data.hpp b/src/xc_integrator/xc_data/device/xc_device_data.hpp index 6c1d6a21..fde8158b 100644 --- a/src/xc_integrator/xc_data/device/xc_device_data.hpp +++ b/src/xc_integrator/xc_data/device/xc_device_data.hpp @@ -20,11 +20,26 @@ namespace GauXC { enum integrator_xc_approx : uint32_t { - _UNDEFINED = 0, - LDA = 1, - GGA = 2, - MGGA_TAU = 3, - MGGA_LAPL = 4 + _UNDEF_APPROX = 0, + LDA = 1, + GGA = 2, + MGGA_TAU = 3, + MGGA_LAPL = 4 +}; + +enum integrator_ks_scheme : uint32_t { + _UNDEF_SCHEME = 0, + RKS = 1, + UKS = 2, + GKS = 3 +}; + +enum density_id : uint32_t { + _UNDEF_DEN = 0, + DEN_S = 1, // RKS, UKS, GKS + DEN_Z = 2, // UKS, GKS + DEN_Y = 3, // GKS + DEN_X = 4 // GKS }; struct integrator_term_tracker { @@ -34,13 +49,14 @@ struct integrator_term_tracker { bool exc_grad = false; bool exx = false; bool exx_ek_screening = false; - integrator_xc_approx xc_approx = _UNDEFINED; + integrator_xc_approx xc_approx = _UNDEF_APPROX; + integrator_ks_scheme ks_scheme = _UNDEF_SCHEME; inline void reset() { std::memset( this, 0, sizeof(integrator_term_tracker) ); } }; -#define PRDVL(pred,val) (pred) ? (val) : 0ul; +#define PRDVL(pred,val) (pred) ? (val) : 0ul struct required_term_storage { bool grid_points = false; @@ -65,18 +81,42 @@ struct required_term_storage { bool grid_vtau = false; bool grid_vlapl = false; + + // Reference flags for memory management use + integrator_term_tracker ref_tracker; + inline size_t grid_den_size(size_t npts){ - return PRDVL(grid_den, npts); + // For RKS, only den_s_eval is used + if( grid_den ) { + if( ref_tracker.ks_scheme == RKS ) return npts; + if( ref_tracker.den ) return npts; + // 2*npts for S,Z densities, 2*npts for interleaved density + if( ref_tracker.ks_scheme == UKS ) return 4*npts; + // Same as above, but also X,Y densities + if( ref_tracker.ks_scheme == GKS ) return 6*npts; + } + return 0ul; } inline size_t grid_den_grad_size(size_t npts){ - return PRDVL(grid_den_grad, 3 * npts); + if( grid_den_grad ) { + // 3*npts for each density in play + if( ref_tracker.ks_scheme == RKS ) return 3*npts; + if( ref_tracker.ks_scheme == UKS ) return 6*npts; + if( ref_tracker.ks_scheme == GKS ) return 12*npts; + } + return 0ul; + } + inline size_t grid_gamma_size(size_t npts){ + if( grid_gamma ) { + if( ref_tracker.ks_scheme == RKS ) return npts; + if( ref_tracker.ks_scheme == UKS + or ref_tracker.ks_scheme == GKS ) return 6*npts; + } + return 0ul; } inline size_t grid_den_lapl_size(size_t npts){ return PRDVL(grid_den_lapl, npts); } - inline size_t grid_gamma_size(size_t npts){ - return PRDVL(grid_gamma, npts); - } inline size_t grid_tau_size(size_t npts){ return PRDVL(grid_tau, npts); } @@ -84,10 +124,27 @@ struct required_term_storage { return PRDVL(grid_eps, npts); } inline size_t grid_vrho_size(size_t npts){ - return PRDVL(grid_vrho, npts); + if( grid_vrho ) { + if( ref_tracker.ks_scheme == RKS ) return npts; + if( ref_tracker.ks_scheme == UKS + or ref_tracker.ks_scheme == GKS ) return 4*npts; + } + return 0ul; } inline size_t grid_vgamma_size(size_t npts){ - return PRDVL(grid_vgamma, npts); + if( grid_vgamma ) { + if( ref_tracker.ks_scheme == RKS ) return npts; + if( ref_tracker.ks_scheme == UKS + or ref_tracker.ks_scheme == GKS ) return 6*npts; + } + return 0ul; + } + inline size_t grid_HK_size(size_t npts){ + if( ref_tracker.ks_scheme == GKS ) { + if( ref_tracker.xc_approx == GGA ) return 6*npts; + if( ref_tracker.xc_approx == LDA ) return 3*npts; + } + return 0ul; } inline size_t grid_vtau_size(size_t npts){ return PRDVL(grid_vtau, npts); @@ -96,6 +153,8 @@ struct required_term_storage { return PRDVL(grid_vlapl, npts); } + + // Task-local matrices bool task_bfn = false; bool task_bfn_grad = false; @@ -245,11 +304,16 @@ struct required_term_storage { grid_to_parent_center = true; } - // Allocated terms for XC aclculations + // Allocated terms for XC calculations const bool is_xc = tracker.exc_vxc or tracker.exc_grad; + + ref_tracker = tracker; + if(is_xc) { - if( tracker.xc_approx == _UNDEFINED ) + if( tracker.xc_approx == _UNDEF_APPROX ) GAUXC_GENERIC_EXCEPTION("No XC Approx Set"); + if( tracker.ks_scheme == _UNDEF_SCHEME ) + GAUXC_GENERIC_EXCEPTION("No KS Scheme Set"); //const bool is_lda = is_xc and tracker.xc_approx == LDA; const bool is_gga = is_xc and tracker.xc_approx == GGA; const bool need_tau = tracker.xc_approx == MGGA_TAU; @@ -366,7 +430,7 @@ struct XCDeviceData { /// Allocate device memory for data that will persist on the device. virtual void reset_allocations() = 0; virtual void allocate_static_data_weights( int32_t natoms ) = 0; - virtual void allocate_static_data_exc_vxc( int32_t nbf, int32_t nshells, bool do_vxc ) = 0; + virtual void allocate_static_data_exc_vxc( int32_t nbf, int32_t nshells, integrator_term_tracker enabled_terms, bool do_vxc ) = 0; virtual void allocate_static_data_den( int32_t nbf, int32_t nshells ) = 0; virtual void allocate_static_data_exc_grad( int32_t nbf, int32_t nshells, int32_t natoms ) = 0; virtual void allocate_static_data_exx( int32_t nbf, int32_t nshells, size_t nshell_pairs, size_t nprim_pair_total, int32_t max_l ) = 0; @@ -374,7 +438,7 @@ struct XCDeviceData { // Send persistent data from host to device virtual void send_static_data_weights( const Molecule& mol, const MolMeta& meta ) = 0; - virtual void send_static_data_density_basis( const double* P, int32_t ldp, const BasisSet& basis ) = 0; + virtual void send_static_data_density_basis( const double* Ps, int32_t ldps, const double* Pz, int32_t ldpz, const double* Py, int32_t ldpy, const double* Px, int32_t ldpx, const BasisSet& basis ) = 0; virtual void send_static_data_shell_pairs( const BasisSet&, const ShellPairCollection& ) = 0; virtual void send_static_data_exx_ek_screening( const double* V_max, int32_t ldv, const BasisSetMap&, const ShellPairCollection& ) = 0; @@ -382,7 +446,7 @@ struct XCDeviceData { virtual void zero_den_integrands() = 0; /// Zero out the EXC / VXC integrands in device memory - virtual void zero_exc_vxc_integrands() = 0; + virtual void zero_exc_vxc_integrands(integrator_term_tracker enabled_terms) = 0; /// Zero out the EXC Gradient integrands in device memory virtual void zero_exc_grad_integrands() = 0; @@ -420,7 +484,8 @@ struct XCDeviceData { * @param[out[ VXC Integrated XC potential (host) for XC queue */ virtual void retrieve_exc_vxc_integrands( double* EXC, double* N_EL, - double* VXC, int32_t ldvxc ) = 0; + double* VXCs, int32_t ldvxcs, double* VXCz, int32_t ldvxcz, + double* VXCy, int32_t ldvxcy, double* VXCx, int32_t ldvxcx ) = 0; /** Retreive EXC Gradient integrands from device memory * @@ -444,7 +509,10 @@ struct XCDeviceData { virtual void copy_weights_to_tasks( host_task_iterator task_begin, host_task_iterator task_end ) = 0; virtual void populate_submat_maps ( size_t, host_task_iterator begin, host_task_iterator end, const BasisSetMap& ) = 0; - virtual double* vxc_device_data() = 0; + virtual double* vxc_z_device_data() = 0; + virtual double* vxc_s_device_data() = 0; + virtual double* vxc_y_device_data() = 0; + virtual double* vxc_x_device_data() = 0; virtual double* exc_device_data() = 0; virtual double* nel_device_data() = 0; virtual double* exx_k_device_data() = 0; diff --git a/src/xc_integrator/xc_data/device/xc_device_stack_data.cxx b/src/xc_integrator/xc_data/device/xc_device_stack_data.cxx index a2635f11..1aadd1ab 100644 --- a/src/xc_integrator/xc_data/device/xc_device_stack_data.cxx +++ b/src/xc_integrator/xc_data/device/xc_device_stack_data.cxx @@ -34,7 +34,10 @@ XCDeviceStackData::XCDeviceStackData(const DeviceRuntimeEnvironment& rt) : XCDeviceStackData::~XCDeviceStackData() noexcept = default; -double* XCDeviceStackData::vxc_device_data() { return static_stack.vxc_device; } +double* XCDeviceStackData::vxc_s_device_data() { return static_stack.vxc_s_device; } +double* XCDeviceStackData::vxc_z_device_data() { return static_stack.vxc_z_device; } +double* XCDeviceStackData::vxc_y_device_data() { return static_stack.vxc_y_device; } +double* XCDeviceStackData::vxc_x_device_data() { return static_stack.vxc_x_device; } double* XCDeviceStackData::exc_device_data() { return static_stack.exc_device; } double* XCDeviceStackData::nel_device_data() { return static_stack.nel_device; } double* XCDeviceStackData::exx_k_device_data() { return static_stack.exx_k_device; } @@ -80,10 +83,12 @@ void XCDeviceStackData::allocate_static_data_weights( int32_t natoms ) { allocated_terms.weights = true; } -void XCDeviceStackData::allocate_static_data_exc_vxc( int32_t nbf, int32_t nshells, bool do_vxc ) { +void XCDeviceStackData::allocate_static_data_exc_vxc( int32_t nbf, int32_t nshells, integrator_term_tracker enabled_terms, bool do_vxc ) { if( allocated_terms.exc_vxc ) GAUXC_GENERIC_EXCEPTION("Attempting to reallocate Stack EXC VXC"); + if( enabled_terms.ks_scheme == _UNDEF_SCHEME ) + GAUXC_GENERIC_EXCEPTION("Must have a KS Scheme set to allocate Stack EXC VXC"); // Save state global_dims.nshells = nshells; @@ -96,18 +101,35 @@ void XCDeviceStackData::allocate_static_data_exc_vxc( int32_t nbf, int32_t nshel static_stack.exc_device = mem.aligned_alloc( 1 , csl); static_stack.nel_device = mem.aligned_alloc( 1 , csl); static_stack.acc_scr_device = mem.aligned_alloc( 1 , csl); + + allocated_terms.ks_scheme = enabled_terms.ks_scheme; + static_stack.dmat_s_device = mem.aligned_alloc( nbf * nbf , csl ); + if( not (allocated_terms.ks_scheme == RKS) ) { + static_stack.dmat_z_device = mem.aligned_alloc( nbf * nbf , csl ); + if( allocated_terms.ks_scheme == GKS ) { + static_stack.dmat_y_device = mem.aligned_alloc( nbf * nbf , csl ); + static_stack.dmat_x_device = mem.aligned_alloc( nbf * nbf , csl ); + } + } - if(do_vxc) static_stack.vxc_device = mem.aligned_alloc( nbf * nbf , csl); - - static_stack.dmat_device = mem.aligned_alloc( nbf * nbf , csl); + if( do_vxc ) { + static_stack.vxc_s_device = mem.aligned_alloc( nbf * nbf , csl ); + if( not (allocated_terms.ks_scheme == RKS) ) { + static_stack.vxc_z_device = mem.aligned_alloc( nbf * nbf , csl ); + if( allocated_terms.ks_scheme == GKS ) { + static_stack.vxc_y_device = mem.aligned_alloc( nbf * nbf , csl ); + static_stack.vxc_x_device = mem.aligned_alloc( nbf * nbf , csl ); + } + } + } // Get current stack location dynmem_ptr = mem.stack(); dynmem_sz = mem.nleft(); + allocated_terms.exc_vxc = true; } - void XCDeviceStackData::allocate_static_data_den( int32_t nbf, int32_t nshells ) { if( allocated_terms.den ) @@ -124,7 +146,7 @@ void XCDeviceStackData::allocate_static_data_den( int32_t nbf, int32_t nshells ) static_stack.acc_scr_device = mem.aligned_alloc( 1 , csl); static_stack.nel_device = mem.aligned_alloc( 1 , csl); - static_stack.dmat_device = mem.aligned_alloc( nbf * nbf , csl); + static_stack.dmat_s_device = mem.aligned_alloc( nbf * nbf , csl); // Get current stack location dynmem_ptr = mem.stack(); @@ -151,7 +173,7 @@ void XCDeviceStackData::allocate_static_data_exc_grad( int32_t nbf, int32_t nshe static_stack.nel_device = mem.aligned_alloc( 1 , csl); static_stack.acc_scr_device = mem.aligned_alloc( 1 , csl); - static_stack.dmat_device = mem.aligned_alloc( nbf * nbf , csl); + static_stack.dmat_s_device = mem.aligned_alloc( nbf * nbf , csl); // Get current stack location dynmem_ptr = mem.stack(); @@ -181,7 +203,7 @@ void XCDeviceStackData::allocate_static_data_exx( int32_t nbf, int32_t nshells, mem.aligned_alloc>(nprim_pair_total, csl); static_stack.exx_k_device = mem.aligned_alloc( nbf * nbf , csl); - static_stack.dmat_device = mem.aligned_alloc( nbf * nbf , csl); + static_stack.dmat_s_device = mem.aligned_alloc( nbf * nbf , csl); // Get current stack location dynmem_ptr = mem.stack(); @@ -208,7 +230,7 @@ void XCDeviceStackData::allocate_static_data_exx_ek_screening( size_t ntasks, in buffer_adaptor mem( dynmem_ptr, dynmem_sz ); static_stack.shells_device = mem.aligned_alloc>( nshells , csl); - static_stack.dmat_device = mem.aligned_alloc( nbf * nbf , csl); + static_stack.dmat_s_device = mem.aligned_alloc( nbf * nbf , csl); static_stack.ek_max_bfn_sum_device = mem.aligned_alloc( ntasks , csl); static_stack.vshell_max_sparse_device = @@ -264,24 +286,39 @@ void XCDeviceStackData::send_static_data_weights( const Molecule& mol, const Mol device_backend_->master_queue_synchronize(); } -void XCDeviceStackData::send_static_data_density_basis( const double* P, int32_t ldp, +void XCDeviceStackData::send_static_data_density_basis( const double* Ps, int32_t ldps, const double* Pz, int32_t ldpz, const double* Py, int32_t ldpy, const double* Px, int32_t ldpx, const BasisSet& basis ) { + const bool is_gks = (Pz != nullptr) and (Py != nullptr) and (Px != nullptr); + const bool is_uks = (Pz != nullptr) and (Py == nullptr) and (Px == nullptr); + const bool is_rks = (Ps != nullptr) and (not is_uks and not is_gks); + if( not is_rks and not is_uks and not is_gks ) + GAUXC_GENERIC_EXCEPTION("Densities do not match RKS, UKS, or GKS schemes"); if( not (allocated_terms.exx or allocated_terms.exc_vxc or allocated_terms.exc_grad or allocated_terms.den or allocated_terms.exx_ek_screening) ) GAUXC_GENERIC_EXCEPTION("Density/Basis Not Stack Allocated"); - const auto nbf = global_dims.nbf; - if( ldp != (int)nbf ) GAUXC_GENERIC_EXCEPTION("LDP must bf NBF"); if( not device_backend_ ) GAUXC_GENERIC_EXCEPTION("Invalid Device Backend"); - // Copy Density - device_backend_->copy_async( nbf*nbf, P, static_stack.dmat_device, "P H2D" ); + + const auto nbf = global_dims.nbf; + // Check dimensions and copy density + if( ldps != (int)nbf ) GAUXC_GENERIC_EXCEPTION("LDPs must bf NBF"); + device_backend_->copy_async( nbf*nbf, Ps, static_stack.dmat_s_device, "P_scalar H2D" ); + if( not is_rks ) { + if( ldpz != (int)nbf ) GAUXC_GENERIC_EXCEPTION("LDPz must bf NBF"); + device_backend_->copy_async( nbf*nbf, Pz, static_stack.dmat_z_device, "P_z H2D" ); + if( is_gks ) { + if( ldpy != (int)nbf ) GAUXC_GENERIC_EXCEPTION("LDPy must bf NBF"); + if( ldpx != (int)nbf ) GAUXC_GENERIC_EXCEPTION("LDPx must bf NBF"); + device_backend_->copy_async( nbf*nbf, Py, static_stack.dmat_y_device, "P_y H2D" ); + device_backend_->copy_async( nbf*nbf, Px, static_stack.dmat_x_device, "P_x H2D" ); + } + } // Copy Basis Set device_backend_->copy_async( basis.nshells(), basis.data(), static_stack.shells_device, "Shells H2D" ); - device_backend_->master_queue_synchronize(); } @@ -424,12 +461,15 @@ void XCDeviceStackData::zero_den_integrands() { } -void XCDeviceStackData::zero_exc_vxc_integrands() { +void XCDeviceStackData::zero_exc_vxc_integrands(integrator_term_tracker enabled_terms) { if( not device_backend_ ) GAUXC_GENERIC_EXCEPTION("Invalid Device Backend"); const auto nbf = global_dims.nbf; - if(static_stack.vxc_device) device_backend_->set_zero( nbf*nbf, static_stack.vxc_device, "VXC Zero" ); + if(static_stack.vxc_s_device) device_backend_->set_zero( nbf*nbf, static_stack.vxc_s_device, "VXCs Zero" ); + if(static_stack.vxc_z_device) device_backend_->set_zero( nbf*nbf, static_stack.vxc_z_device, "VXCz Zero" ); + if(static_stack.vxc_y_device) device_backend_->set_zero( nbf*nbf, static_stack.vxc_y_device, "VXCy Zero" ); + if(static_stack.vxc_x_device) device_backend_->set_zero( nbf*nbf, static_stack.vxc_x_device, "VXCx Zero" ); device_backend_->set_zero( 1, static_stack.exc_device, "EXC Zero" ); device_backend_->set_zero( 1, static_stack.nel_device, "NEL Zero" ); @@ -466,21 +506,31 @@ void XCDeviceStackData::zero_exx_ek_screening_intermediates() { } - - void XCDeviceStackData::retrieve_exc_vxc_integrands( double* EXC, double* N_EL, - double* VXC, int32_t ldvxc ) { + double* VXCs, int32_t ldvxcs, double* VXCz, int32_t ldvxcz, + double* VXCy, int32_t ldvxcy, double* VXCx, int32_t ldvxcx ) { const auto nbf = global_dims.nbf; - if( ldvxc and ldvxc != (int)nbf ) GAUXC_GENERIC_EXCEPTION("LDVXC must bf NBF"); - if( not device_backend_ ) GAUXC_GENERIC_EXCEPTION("Invalid Device Backend"); - - if(VXC) - device_backend_->copy_async( nbf*nbf, static_stack.vxc_device, VXC, "VXC D2H" ); device_backend_->copy_async( 1, static_stack.nel_device, N_EL, "NEL D2H" ); device_backend_->copy_async( 1, static_stack.exc_device, EXC, "EXC D2H" ); + if( ldvxcs and (ldvxcs != (int)nbf) ) GAUXC_GENERIC_EXCEPTION("LDVXCs must be NBF"); + if( VXCs ) + device_backend_->copy_async( nbf*nbf, static_stack.vxc_s_device, VXCs, "VXCs D2H" ); + + if( ldvxcz and (ldvxcz != (int)nbf) ) GAUXC_GENERIC_EXCEPTION("LDVXCz must be NBF"); + if( VXCz ) + device_backend_->copy_async( nbf*nbf, static_stack.vxc_z_device, VXCz, "VXCz D2H" ); + + if( ldvxcy and (ldvxcy != (int)nbf) ) GAUXC_GENERIC_EXCEPTION("LDVXCy must be NBF"); + if( VXCy ) + device_backend_->copy_async( nbf*nbf, static_stack.vxc_y_device, VXCy, "VXCy D2H" ); + + if( ldvxcx and (ldvxcx != (int)nbf) ) GAUXC_GENERIC_EXCEPTION("LDVXCx must be NBF"); + if( VXCx ) + device_backend_->copy_async( nbf*nbf, static_stack.vxc_x_device, VXCx, "VXCx D2H" ); + } void XCDeviceStackData::retrieve_den_integrands( double* N_EL ) { @@ -580,26 +630,30 @@ size_t XCDeviceStackData::get_mem_req( const size_t npts = points.size(); required_term_storage reqt(terms); + size_t mem_req = // Grid - reqt.grid_points_size (npts) * sizeof(double) + - reqt.grid_weights_size(npts) * sizeof(double) + + reqt.grid_points_size (npts) * sizeof(double) + + reqt.grid_weights_size(npts) * sizeof(double) + // U Variables reqt.grid_den_size(npts) * sizeof(double) + reqt.grid_den_grad_size(npts) * sizeof(double) + reqt.grid_den_lapl_size(npts) * sizeof(double) + + // H/K Matrices (GKS) + reqt.grid_HK_size(npts) * sizeof(double) + + // V Variables - reqt.grid_gamma_size(npts) * sizeof(double) + - reqt.grid_tau_size(npts) * sizeof(double) + + reqt.grid_gamma_size(npts) * sizeof(double) + + reqt.grid_tau_size(npts) * sizeof(double) + // XC output - reqt.grid_eps_size(npts) * sizeof(double) + - reqt.grid_vrho_size(npts) * sizeof(double) + - reqt.grid_vgamma_size(npts) * sizeof(double) + - reqt.grid_vtau_size(npts) * sizeof(double) + - reqt.grid_vlapl_size(npts) * sizeof(double) ; + reqt.grid_eps_size(npts) * sizeof(double) + + reqt.grid_vrho_size(npts) * sizeof(double) + + reqt.grid_vgamma_size(npts) * sizeof(double) + + reqt.grid_vtau_size(npts) * sizeof(double) + + reqt.grid_vlapl_size(npts) * sizeof(double) ; return mem_req; } @@ -630,7 +684,15 @@ XCDeviceStackData::device_buffer_t XCDeviceStackData::allocate_dynamic_stack( required_term_storage reqt(terms); const size_t msz = total_npts_task_batch; const size_t aln = 256; + + const bool is_rks = terms.ks_scheme == RKS; + const bool is_uks = terms.ks_scheme == UKS; + const bool is_gks = terms.ks_scheme == GKS; + const bool is_pol = is_uks or is_gks; + const bool is_gga = terms.xc_approx == GGA; + const bool is_den = terms.den; + // Grid Points if( reqt.grid_points ) { base_stack.points_x_device = mem.aligned_alloc( msz, aln, csl); @@ -638,21 +700,37 @@ XCDeviceStackData::device_buffer_t XCDeviceStackData::allocate_dynamic_stack( base_stack.points_z_device = mem.aligned_alloc( msz, aln, csl); } + // Grid Weights if( reqt.grid_weights ) { base_stack.weights_device = mem.aligned_alloc(msz, csl); } // Grid function evaluations - if( reqt.grid_den ) { // Density - base_stack.den_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.den_s_eval_device = mem.aligned_alloc(msz, aln, csl); + if( is_pol ) { base_stack.den_eval_device = mem.aligned_alloc(2*msz, aln, csl); + base_stack.den_z_eval_device = mem.aligned_alloc(msz, aln, csl); + if( is_gks ){ base_stack.den_y_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.den_x_eval_device = mem.aligned_alloc(msz, aln, csl); } + } } if( reqt.grid_den_grad ) { // Density gradient - base_stack.den_x_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.den_y_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.den_z_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.dden_sx_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.dden_sy_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.dden_sz_eval_device = mem.aligned_alloc(msz, aln, csl); + + if( is_pol ) { base_stack.dden_zx_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.dden_zy_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.dden_zz_eval_device = mem.aligned_alloc(msz, aln, csl); + if( is_gks ) { base_stack.dden_yx_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.dden_yy_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.dden_yz_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.dden_xx_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.dden_xy_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.dden_xz_eval_device = mem.aligned_alloc(msz, aln, csl); } + } } if( reqt.grid_den_lapl ) { // Density Laplacian @@ -660,23 +738,45 @@ XCDeviceStackData::device_buffer_t XCDeviceStackData::allocate_dynamic_stack( } if( reqt.grid_gamma ) { // Gamma - base_stack.gamma_eval_device = mem.aligned_alloc(msz, aln, csl); + if( is_pol ) { base_stack.gamma_eval_device = mem.aligned_alloc(3 * msz, aln, csl); + base_stack.gamma_pp_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.gamma_pm_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.gamma_mm_eval_device = mem.aligned_alloc(msz, aln, csl); } + else base_stack.gamma_eval_device = mem.aligned_alloc(msz, aln, csl); } - if( reqt.grid_tau ) { // Tau - base_stack.tau_eval_device = mem.aligned_alloc(msz, aln, csl); + if( reqt.grid_vrho ) { // Vrho + if( is_pol ) { base_stack.vrho_eval_device = mem.aligned_alloc(2 * msz, aln, csl); + base_stack.vrho_pos_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.vrho_neg_eval_device = mem.aligned_alloc(msz, aln, csl); } + else base_stack.vrho_eval_device = mem.aligned_alloc(msz, aln, csl); } - if( reqt.grid_eps ) { // Energy density - base_stack.eps_eval_device = mem.aligned_alloc(msz, aln, csl); + if( reqt.grid_vgamma ) { // Vgamma + if( is_pol ) { base_stack.vgamma_eval_device = mem.aligned_alloc(3*msz, aln, csl); + base_stack.vgamma_pp_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.vgamma_pm_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.vgamma_mm_eval_device = mem.aligned_alloc(msz, aln, csl); } + else base_stack.vgamma_eval_device = mem.aligned_alloc(msz, aln, csl); } - if( reqt.grid_vrho ) { // Vrho - base_stack.vrho_eval_device = mem.aligned_alloc(msz, aln, csl); + if( reqt.grid_tau ) { // Tau + base_stack.tau_eval_device = mem.aligned_alloc(msz, aln, csl); } - if( reqt.grid_vgamma ) { // Vgamma - base_stack.vgamma_eval_device = mem.aligned_alloc(msz, aln, csl); + if( is_gks ) { // H, K matrices + base_stack.K_x_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.K_y_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.K_z_eval_device = mem.aligned_alloc(msz, aln, csl); + if( is_gga ) { + base_stack.H_x_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.H_y_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.H_z_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + if( reqt.grid_eps ) { // Energy density + base_stack.eps_eval_device = mem.aligned_alloc(msz, aln, csl); } if( reqt.grid_vtau ) { // Vtau @@ -687,11 +787,13 @@ XCDeviceStackData::device_buffer_t XCDeviceStackData::allocate_dynamic_stack( base_stack.vlapl_eval_device = mem.aligned_alloc(msz, aln, csl); } + + // Update dynmem data for derived impls return device_buffer_t{ mem.stack(), mem.nleft() }; } -void XCDeviceStackData::pack_and_send( integrator_term_tracker , +void XCDeviceStackData::pack_and_send( integrator_term_tracker terms, host_task_iterator task_begin, host_task_iterator task_end, const BasisSetMap& ) { if( not device_backend_ ) GAUXC_GENERIC_EXCEPTION("Invalid Device Backend"); diff --git a/src/xc_integrator/xc_data/device/xc_device_stack_data.hpp b/src/xc_integrator/xc_data/device/xc_device_stack_data.hpp index af0b2b0b..e1a72ec4 100644 --- a/src/xc_integrator/xc_data/device/xc_device_stack_data.hpp +++ b/src/xc_integrator/xc_data/device/xc_device_stack_data.hpp @@ -47,13 +47,11 @@ struct XCDeviceStackData : public XCDeviceData { Shell* shells_device = nullptr; ///< Array of static basis shells (nshells) PrimitivePair* prim_pairs_device = nullptr; - double* dmat_device = nullptr; ///< Static density matrix storage (nbf,nbf) double* rab_device = nullptr; ///< Static RAB matrix storage (*,natoms) double* coords_device = nullptr; ///< Static atomic positions (3 * natoms) double* exc_device = nullptr; ///< EXC storage (1) double* nel_device = nullptr; ///< N_EL storage (1) - double* vxc_device = nullptr; ///< VXC storage (nbf,nbf) double* exx_k_device = nullptr; ///< EXX K storage (nbf,nbf) double* acc_scr_device = nullptr; ///< Accumulaion scratch (1) double* exc_grad_device = nullptr; ///< EXC Gradient storage (3*natoms) @@ -66,6 +64,15 @@ struct XCDeviceStackData : public XCDeviceData { int32_t* shell_to_bf_device = nullptr; int32_t* shell_sizes_device = nullptr; + double* dmat_s_device = nullptr; ///< Static density matrix storage (nbf,nbf) + double* dmat_z_device = nullptr; /// Ditto for Z,Y,X densities + double* dmat_y_device = nullptr; + double* dmat_x_device = nullptr; + double* vxc_s_device = nullptr; ///< VXC storage (nbf, nbf) + double* vxc_z_device = nullptr; /// Ditto for Z,Y,X densities + double* vxc_y_device = nullptr; + double* vxc_x_device = nullptr; + inline void reset() { std::memset( this, 0, sizeof(static_data) ); } }; @@ -85,10 +92,28 @@ struct XCDeviceStackData : public XCDeviceData { double* weights_device = nullptr; ///< Grid weights for task batch // U variables - double* den_eval_device = nullptr; ///< density for task batch - double* den_x_eval_device = nullptr; ///< d/dx density for task batch - double* den_y_eval_device = nullptr; ///< d/dy density for task batch - double* den_z_eval_device = nullptr; ///< d/dz density for task batch + double* den_s_eval_device = nullptr; ///< scalar density for task batch + double* dden_sx_eval_device = nullptr; ///< d/dx scalar density for task batch + double* dden_sy_eval_device = nullptr; ///< d/dy scalar density for task batch + double* dden_sz_eval_device = nullptr; ///< d/dz scalar density for task batch + + double* den_z_eval_device = nullptr; ///< z density for task batch + double* dden_zx_eval_device = nullptr; ///< d/dx z density for task batch + double* dden_zy_eval_device = nullptr; ///< d/dy z density for task batch + double* dden_zz_eval_device = nullptr; ///< d/dz z density for task batch + + double* den_y_eval_device = nullptr; ///< y density for task batch + double* dden_yx_eval_device = nullptr; ///< d/dx y density for task batch + double* dden_yy_eval_device = nullptr; ///< d/dy y density for task batch + double* dden_yz_eval_device = nullptr; ///< d/dz y density for task batch + + double* den_x_eval_device = nullptr; ///< x density for task batch + double* dden_xx_eval_device = nullptr; ///< d/dx x density for task batch + double* dden_xy_eval_device = nullptr; ///< d/dy x density for task batch + double* dden_xz_eval_device = nullptr; ///< d/dz x density for task batch + + double* den_eval_device = nullptr; /// Storage for interleaved density (non-RKS only) + double* den_lapl_eval_device = nullptr; ///< density Laplacian for task batch // V variables / XC output @@ -100,6 +125,23 @@ struct XCDeviceStackData : public XCDeviceData { double* vtau_eval_device = nullptr; ///< Tau XC derivative for task batch double* vlapl_eval_device = nullptr; ///< Lapl XC derivative for task batch + double* vrho_pos_eval_device = nullptr; ///< Polarized Rho+ XC derivative for task batch + double* vrho_neg_eval_device = nullptr; ///< Polarized Rho+ XC derivative for task batch + + double* gamma_pp_eval_device = nullptr; ///< Polarized Gamma++ for task batch + double* gamma_pm_eval_device = nullptr; ///< Polarized Gamma+- for task batch + double* gamma_mm_eval_device = nullptr; ///< Polarized Gamma-- for task batch + double* vgamma_pp_eval_device = nullptr; ///< Polarized Gamma++ XC derivative for task batch + double* vgamma_pm_eval_device = nullptr; ///< Polarized Gamma+- XC derivative for task batch + double* vgamma_mm_eval_device = nullptr; ///< Polarized Gamma-- XC derivative for task batch + + double* H_x_eval_device = nullptr; ///< norm(m) dependent GGA X transformation factor for task batch + double* H_y_eval_device = nullptr; ///< norm(m) dependent GGA Y transformation factor for task batch + double* H_z_eval_device = nullptr; ///< norm(m) dependent GGA Z transformation factor for task batch + double* K_x_eval_device = nullptr; ///< norm(m) dependent LDA X transformation factor for task batch + double* K_y_eval_device = nullptr; ///< norm(m) dependent LDA Y transformation factor for task batch + double* K_z_eval_device = nullptr; ///< norm(m) dependent LDA Z transformation factor for task batch + inline void reset() { std::memset( this, 0, sizeof(base_stack_data) ); } }; @@ -118,31 +160,36 @@ struct XCDeviceStackData : public XCDeviceData { host_task_iterator generate_buffers( integrator_term_tracker, const BasisSetMap&, host_task_iterator, host_task_iterator) override final; void allocate_static_data_weights( int32_t natoms ) override final; - void allocate_static_data_exc_vxc( int32_t nbf, int32_t nshells, bool do_vxc ) override final; + void allocate_static_data_exc_vxc( int32_t nbf, int32_t nshells, integrator_term_tracker enabled_terms, bool do_vxc ) override final; void allocate_static_data_den( int32_t nbf, int32_t nshells ) override final; void allocate_static_data_exc_grad( int32_t nbf, int32_t nshells, int32_t natoms ) override final; void allocate_static_data_exx( int32_t nbf, int32_t nshells, size_t nshell_pairs, size_t nprim_pair_total, int32_t max_l ) override final; void allocate_static_data_exx_ek_screening( size_t ntasks, int32_t nbf, int32_t nshells, int nshell_pairs, int32_t max_l ) override final; void send_static_data_weights( const Molecule& mol, const MolMeta& meta ) override final; - void send_static_data_density_basis( const double* P, int32_t ldp, + void send_static_data_density_basis( const double* Ps, int32_t ldps, const double* Pz, int32_t ldpz, + const double* Py, int32_t ldpy, const double* Px, int32_t ldpx, const BasisSet& basis ) override final; void send_static_data_shell_pairs( const BasisSet&, const ShellPairCollection& ) override final; void send_static_data_exx_ek_screening( const double* V_max, int32_t ldv, const BasisSetMap&, const ShellPairCollection& ) override final; void zero_den_integrands() override final; - void zero_exc_vxc_integrands() override final; + void zero_exc_vxc_integrands(integrator_term_tracker t) override final; void zero_exc_grad_integrands() override final; void zero_exx_integrands() override final; void zero_exx_ek_screening_intermediates() override final; void retrieve_exc_vxc_integrands( double* EXC, double* N_EL, - double* VXC, int32_t ldvxc ) override final; + double* VXCscalar, int32_t ldvxcscalar, double* VXCz, int32_t ldvxcz, + double* VXCy , int32_t ldvxcy , double* VXCx, int32_t ldvxcx ) override final; void retrieve_exc_grad_integrands( double* EXC_GRAD, double* N_EL ) override final; void retrieve_den_integrands( double* N_EL ) override final; void retrieve_exx_integrands( double* K, int32_t ldk ) override final; void retrieve_exx_ek_max_bfn_sum( double* MBS, int32_t nt) override final; void copy_weights_to_tasks( host_task_iterator task_begin, host_task_iterator task_end ) override final; - double* vxc_device_data() override; + double* vxc_s_device_data() override; + double* vxc_z_device_data() override; + double* vxc_y_device_data() override; + double* vxc_x_device_data() override; double* exc_device_data() override; double* nel_device_data() override; double* exx_k_device_data() override; diff --git a/src/xc_integrator/xc_data/device/xc_device_task.hpp b/src/xc_integrator/xc_data/device/xc_device_task.hpp index 8b071f90..696ef185 100644 --- a/src/xc_integrator/xc_data/device/xc_device_task.hpp +++ b/src/xc_integrator/xc_data/device/xc_device_task.hpp @@ -50,17 +50,54 @@ struct XCDeviceTask { double* d2bfyy = nullptr; double* d2bfyz = nullptr; double* d2bfzz = nullptr; - double* d2bflapl = nullptr; - double* den = nullptr; - double* ddenx = nullptr; - double* ddeny = nullptr; - double* ddenz = nullptr; - double* denlapl = nullptr; double* eps = nullptr; - double* gamma = nullptr; - double* tau = nullptr; + + double* den = nullptr; + double* gamma = nullptr; double* vrho = nullptr; double* vgamma = nullptr; + + // (S,Z,Y,X) densities + double* den_s = nullptr; + double* den_z = nullptr; + double* den_y = nullptr; + double* den_x = nullptr; + // Del(S,Z,Y,X) Gradients + double* dden_sx = nullptr; + double* dden_sy = nullptr; + double* dden_sz = nullptr; + double* dden_zx = nullptr; + double* dden_zy = nullptr; + double* dden_zz = nullptr; + double* dden_yx = nullptr; + double* dden_yy = nullptr; + double* dden_yz = nullptr; + double* dden_xx = nullptr; + double* dden_xy = nullptr; + double* dden_xz = nullptr; + + // 2C U vars + double* vrho_pos = nullptr; + double* vrho_neg = nullptr; + double* gamma_pp = nullptr; + double* gamma_pm = nullptr; + double* gamma_mm = nullptr; + double* vgamma_pp = nullptr; + double* vgamma_pm = nullptr; + double* vgamma_mm = nullptr; + + // GKS K,H matrices + double* K_z = nullptr; + double* K_y = nullptr; + double* K_x = nullptr; + double* H_z = nullptr; + double* H_y = nullptr; + double* H_x = nullptr; + + // MGGA + double* d2bflapl = nullptr; + double* denlapl = nullptr; + double* tau = nullptr; double* vtau = nullptr; double* vlapl = nullptr; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2c76faf6..d3881e91 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,5 +1,5 @@ # -# GauXC Copyright (c) 2020-2023, The Regents of the University of California, +# GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of # any required approvals from the U.S. Dept. of Energy). All rights reserved. # diff --git a/tests/basis/parse_basis.cxx b/tests/basis/parse_basis.cxx index 7afb61d1..aef5d612 100644 --- a/tests/basis/parse_basis.cxx +++ b/tests/basis/parse_basis.cxx @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/basis/parse_basis.hpp b/tests/basis/parse_basis.hpp index 914e24a1..5815584c 100644 --- a/tests/basis/parse_basis.hpp +++ b/tests/basis/parse_basis.hpp @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/basisset_test.cxx b/tests/basisset_test.cxx index fd116dba..d8aec9d0 100644 --- a/tests/basisset_test.cxx +++ b/tests/basisset_test.cxx @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/cmake/discovery/CMakeLists.txt b/tests/cmake/discovery/CMakeLists.txt index 7ba3e00d..3a03749f 100644 --- a/tests/cmake/discovery/CMakeLists.txt +++ b/tests/cmake/discovery/CMakeLists.txt @@ -1,5 +1,5 @@ # -# GauXC Copyright (c) 2020-2023, The Regents of the University of California, +# GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of # any required approvals from the U.S. Dept. of Energy). All rights reserved. # diff --git a/tests/cmake/discovery/gauxc_link_tester.cxx b/tests/cmake/discovery/gauxc_link_tester.cxx index d4eabe39..2ba40e22 100644 --- a/tests/cmake/discovery/gauxc_link_tester.cxx +++ b/tests/cmake/discovery/gauxc_link_tester.cxx @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/cmake/subproject/CMakeLists.txt b/tests/cmake/subproject/CMakeLists.txt index af6cc42a..0bbf72b9 100644 --- a/tests/cmake/subproject/CMakeLists.txt +++ b/tests/cmake/subproject/CMakeLists.txt @@ -1,5 +1,5 @@ # -# GauXC Copyright (c) 2020-2023, The Regents of the University of California, +# GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of # any required approvals from the U.S. Dept. of Energy). All rights reserved. # diff --git a/tests/cmake/subproject/gauxc_link_tester.cxx b/tests/cmake/subproject/gauxc_link_tester.cxx index d4eabe39..2ba40e22 100644 --- a/tests/cmake/subproject/gauxc_link_tester.cxx +++ b/tests/cmake/subproject/gauxc_link_tester.cxx @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/collocation.cxx b/tests/collocation.cxx index 1ab9aafc..fb8a0393 100644 --- a/tests/collocation.cxx +++ b/tests/collocation.cxx @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/collocation_common.hpp b/tests/collocation_common.hpp index a8cf3bc1..91baa780 100644 --- a/tests/collocation_common.hpp +++ b/tests/collocation_common.hpp @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/collocation_cuda.hpp b/tests/collocation_cuda.hpp index 929fe47c..42ae0eb4 100644 --- a/tests/collocation_cuda.hpp +++ b/tests/collocation_cuda.hpp @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/collocation_hip.hpp b/tests/collocation_hip.hpp index 286dd09f..c6314aac 100644 --- a/tests/collocation_hip.hpp +++ b/tests/collocation_hip.hpp @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/collocation_host.hpp b/tests/collocation_host.hpp index fbe68a1c..a64ce8ee 100644 --- a/tests/collocation_host.hpp +++ b/tests/collocation_host.hpp @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/conv_cereal_to_hdf5.cxx b/tests/conv_cereal_to_hdf5.cxx index 7cb60381..d717cf63 100644 --- a/tests/conv_cereal_to_hdf5.cxx +++ b/tests/conv_cereal_to_hdf5.cxx @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/eigen3_matrix_serialization.hpp b/tests/eigen3_matrix_serialization.hpp index 9ed8a6ef..38537170 100644 --- a/tests/eigen3_matrix_serialization.hpp +++ b/tests/eigen3_matrix_serialization.hpp @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/grid_opt.cxx b/tests/grid_opt.cxx index 8d465a0e..ced74d20 100644 --- a/tests/grid_opt.cxx +++ b/tests/grid_opt.cxx @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/grid_test.cxx b/tests/grid_test.cxx index a0a0d37d..4e8782e6 100644 --- a/tests/grid_test.cxx +++ b/tests/grid_test.cxx @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/ini_input.cxx b/tests/ini_input.cxx index 2fc71a32..972eeaaa 100644 --- a/tests/ini_input.cxx +++ b/tests/ini_input.cxx @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/ini_input.hpp b/tests/ini_input.hpp index 00071a92..bd0de809 100644 --- a/tests/ini_input.hpp +++ b/tests/ini_input.hpp @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/load_balancer_test.cxx b/tests/load_balancer_test.cxx index 599367f9..1c55f3e8 100644 --- a/tests/load_balancer_test.cxx +++ b/tests/load_balancer_test.cxx @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/molgrid_test.cxx b/tests/molgrid_test.cxx index 9b657aae..191b8e7b 100644 --- a/tests/molgrid_test.cxx +++ b/tests/molgrid_test.cxx @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/moltypes_test.cxx b/tests/moltypes_test.cxx index 32c08376..f63dce91 100644 --- a/tests/moltypes_test.cxx +++ b/tests/moltypes_test.cxx @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/runtime.cxx b/tests/runtime.cxx index 189d5a34..1f8b6933 100644 --- a/tests/runtime.cxx +++ b/tests/runtime.cxx @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/standalone_driver.cxx b/tests/standalone_driver.cxx index 2b4ffc07..efc9ce59 100644 --- a/tests/standalone_driver.cxx +++ b/tests/standalone_driver.cxx @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/standards.cxx b/tests/standards.cxx index 3aa63c86..6ca6473a 100644 --- a/tests/standards.cxx +++ b/tests/standards.cxx @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/standards.hpp b/tests/standards.hpp index 89e57035..a9db6759 100644 --- a/tests/standards.hpp +++ b/tests/standards.hpp @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/ut_common.hpp.in b/tests/ut_common.hpp.in index e7590d6c..628ef5f4 100644 --- a/tests/ut_common.hpp.in +++ b/tests/ut_common.hpp.in @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/ut_main.cxx b/tests/ut_main.cxx index 972d23d0..0ccd3be1 100644 --- a/tests/ut_main.cxx +++ b/tests/ut_main.cxx @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/weights.cxx b/tests/weights.cxx index 7028d31e..a56df0fc 100644 --- a/tests/weights.cxx +++ b/tests/weights.cxx @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/weights_cuda.hpp b/tests/weights_cuda.hpp index c332435b..bd1561a5 100644 --- a/tests/weights_cuda.hpp +++ b/tests/weights_cuda.hpp @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/weights_generate.hpp b/tests/weights_generate.hpp index 9b5b65d5..7d586a8b 100644 --- a/tests/weights_generate.hpp +++ b/tests/weights_generate.hpp @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/weights_hip.hpp b/tests/weights_hip.hpp index 0db42a48..d5ad45f7 100644 --- a/tests/weights_hip.hpp +++ b/tests/weights_hip.hpp @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/weights_host.hpp b/tests/weights_host.hpp index 22707fcf..e78694d8 100644 --- a/tests/weights_host.hpp +++ b/tests/weights_host.hpp @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * diff --git a/tests/xc_integrator.cxx b/tests/xc_integrator.cxx index 7d458dff..88527fa8 100644 --- a/tests/xc_integrator.cxx +++ b/tests/xc_integrator.cxx @@ -1,5 +1,5 @@ /** - * GauXC Copyright (c) 2020-2023, The Regents of the University of California, + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of * any required approvals from the U.S. Dept. of Energy). All rights reserved. * @@ -125,9 +125,7 @@ void test_xc_integrator( ExecutionSpace ex, const RuntimeEnvironment& rt, } } - if(uks and ex == ExecutionSpace::Device) return; - if(gks and ex == ExecutionSpace::Device) return; - + if( (uks or gks) and ex == ExecutionSpace::Device and func.is_mgga() ) return; for( auto& sh : basis ) sh.set_shell_tolerance( std::numeric_limits::epsilon() ); @@ -302,19 +300,23 @@ void test_integrator(std::string reference_file, functional_type& func, PruningS #ifdef GAUXC_HAS_MAGMA SECTION( "Incore - MPI Reduction - MAGMA" ) { - test_xc_integrator( ExecutionSpace::Device, rt, - reference_file, func, pruning_scheme, - false, true, check_k, "Default", "Default", - "Scheme1-MAGMA" ); + if(not func.is_mgga() and not func.is_polarized()) { + test_xc_integrator( ExecutionSpace::Device, rt, + reference_file, func, pruning_scheme, + false, true, check_k, "Default", "Default", + "Scheme1-MAGMA" ); + } } #endif #ifdef GAUXC_HAS_CUTLASS SECTION( "Incore - MPI Reduction - CUTLASS" ) { - test_xc_integrator( ExecutionSpace::Device, rt, - reference_file, func, pruning_scheme, - false, true, false, "Default", "Default", - "Scheme1-CUTLASS" ); + if(not func.is_mgga() and not func.is_polarized()) { + test_xc_integrator( ExecutionSpace::Device, rt, + reference_file, func, pruning_scheme, + false, true, false, "Default", "Default", + "Scheme1-CUTLASS" ); + } } #endif