Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
81 commits
Select commit Hold shift + click to select a range
4093dee
add uks device data structures
mikovtun Nov 21, 2023
5781851
update data structures, compiling
mikovtun Nov 22, 2023
51a568f
add skeleton of incore eval_exc_vxc and eval_xmat implementation
mikovtun Nov 22, 2023
7b4a34c
enabled tests and more eval_den
mikovtun Nov 23, 2023
cb7377d
debug
mikovtun Nov 23, 2023
f8cf5ac
migrate working files from laptop to cluster
mikovtun Nov 27, 2023
55634c5
fixed uks gpu den eval bug and added some debug prints
Nov 27, 2023
0a3de39
fixed malloc bug, passes test now
Nov 28, 2023
d587a06
enable all tests
Nov 28, 2023
3eb6c3b
refactor for unification
Nov 28, 2023
9496ea6
fix memory-related bugs
Nov 28, 2023
b9e6bbb
unify shellbatched exc vxc
Nov 30, 2023
30a953b
removed overloaded eval_xmat
Nov 30, 2023
93ae05f
replace unlabelled RKS densities with scalar-labeled ones and removed…
Nov 30, 2023
eaff555
remove overloaded data handling functions
Nov 30, 2023
65da2f7
consolidate eval_uvvars
Nov 30, 2023
d55959b
zmat consistent naming scheme
Dec 1, 2023
54c02b3
unify inc_vxc and symmetrize_vxc
Dec 1, 2023
a33f10a
tabs->spaces and re-enabled tests (whoops)
Dec 1, 2023
a786f97
Cleaned up memory management for GGAs. Implemented gga lwd funcs up t…
Dec 2, 2023
4035d97
bugfixes and added awareness of job type to memory manager
Dec 4, 2023
e92977a
fix vgamma memory allocation bug and implement eval_zmat_gga_uks
Dec 5, 2023
0252afe
fix typo
Dec 5, 2023
27ee85e
bugfix and re-enable tests. Passes EXC/VXC tests now
Dec 5, 2023
c34610a
bugfix
Dec 6, 2023
971903b
implemented magma data handling
Dec 9, 2023
1630063
initial memory manager work
Dec 11, 2023
87757b1
more memory management
mikovtun Dec 11, 2023
3bbb595
enable gks tests
mikovtun Dec 11, 2023
5eff70c
further refactoring of density evaluation step for GKS
mikovtun Dec 11, 2023
661c243
bugfix
mikovtun Dec 11, 2023
46f5bcf
eval_uvvar uks and K,H matrices
mikovtun Dec 12, 2023
f5fb7a3
Merge branch 'GKS_device' into UKS_device
mikovtun Dec 12, 2023
c6c72a7
split uvar and vvar evaluation, removed eval_den
mikovtun Dec 12, 2023
abb574e
refactor eval_uvar device code
mikovtun Dec 14, 2023
c3631b0
bugfixes
mikovtun Dec 14, 2023
4371283
cleanup integrator hpps
mikovtun Dec 14, 2023
0fd8561
consolidate zmat_lda_<>
mikovtun Dec 14, 2023
6295271
bugfix and add GKS
mikovtun Dec 14, 2023
ed0aa93
correct nomenclature for polarized refs
mikovtun Jan 4, 2024
1b77672
bugfix
mikovtun Jan 4, 2024
f857d04
laptop->cluster wip
mikovtun Jan 8, 2024
0586f2c
bugfix thread handling in uvvars
Jan 9, 2024
27be499
migrate cudaMemcpy to device backend
Jan 10, 2024
d7bdff5
fix grid_gamma_size causing bug in WSL
mikovtun Jan 11, 2024
efc7dd6
update standalone_driver
mikovtun Feb 5, 2024
c51a4f1
Fix memory management bug
mikovtun Feb 13, 2024
66eaad5
device memory manager fix 2
mikovtun Feb 21, 2024
7b7fb03
remove underscore from eval_uvvars
mikovtun Feb 28, 2024
1741bd2
remove redundant 2d memcpy
mikovtun Feb 28, 2024
986a904
reduce divisions in uvvars
mikovtun Feb 29, 2024
be9dded
macro'd eval_uvars
mikovtun Feb 29, 2024
3b63747
cleanup zmat_vxc.cu
mikovtun Mar 1, 2024
6b883c1
remove eval_xmat default args
mikovtun Mar 1, 2024
645935b
replace single instance of send_dmat with function body
mikovtun Mar 5, 2024
805b339
make eval_exc_vxc and retrieve_integrands nullptr safe
mikovtun Mar 5, 2024
39a090a
make send_static_data_density_basis nullptr safe
mikovtun Mar 5, 2024
ee1dcd7
add do_grad descriptive bools
mikovtun Mar 5, 2024
ebbda65
cleanup eval_exc_vxc with lambdas
mikovtun Mar 5, 2024
c752e47
clarify stack data comments
mikovtun Mar 5, 2024
17a59b6
Merge changes from master
mikovtun Apr 3, 2024
0d9e5bf
Merge with upstream master
mikovtun Apr 4, 2024
03b5f87
Merge with b253ce5
mikovtun May 8, 2024
2e11cd0
Merge MGGAs in. Fails tests
mikovtun May 8, 2024
fb2ab31
Fixed memory allocation bug
mikovtun May 10, 2024
054fbd1
Fix UVar kernel
mikovtun May 10, 2024
36bbb0e
Merge branch 'master' of https://github.com/wavefunction91/GauXC into…
mikovtun May 10, 2024
792322a
Update device collocation code generation for laplacian
mikovtun May 15, 2024
683f0aa
Merge with #127
mikovtun May 20, 2024
e2951e8
Add polarized + mgga + device exception
mikovtun May 20, 2024
b70025f
Enable tests on all integrators except polarized mGGAs
mikovtun May 20, 2024
0249c34
Merge branch 'UKS_device' into UKS_shellbatch_refactor
mikovtun May 20, 2024
3b9e055
Merge with master
mikovtun May 21, 2024
6d29a00
Merged #121 and enabled EXC-only interface for UKS/GKS on device
mikovtun Jun 3, 2024
74515ec
Merge remote-tracking branch 'upstream/master' into UKS_device
mikovtun Jul 3, 2024
9f3242c
Fixed CUTLASS compilation. Error out if CUTLASS + U/GKS is attempted.
mikovtun Jul 10, 2024
1e0a6af
Disable CUTLASS MGGA and U/GKS Unit Tests
wavefunction91 Jul 11, 2024
bb7fed7
Disable MAGMA MGGA and U/GKS Unit Tests
wavefunction91 Jul 11, 2024
547adbc
Add comments for future mGGA+UKS/GKS work
mikovtun Jul 30, 2024
4c0d65b
Bump copyright ver.
mikovtun Jul 30, 2024
448bec8
Fixed kernel launch bounds degrading RKS performance
mikovtun Jul 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion include/gauxc/exceptions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class magma_exception;
class cutlass_exception;
#endif

/// C++ Excpetion for genertic GauXC errors
/// C++ Exception for generic GauXC errors
class generic_gauxc_exception : public std::exception {

std::string file_;
Expand Down
2 changes: 1 addition & 1 deletion src/xc_integrator/integrator_util/exx_screening.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ void exx_ek_screening(
device_data.reset_allocations();
device_data.allocate_static_data_exx_ek_screening( ntasks, nbf, nshells,
shpairs.npairs(), basis_map.max_l() );
device_data.send_static_data_density_basis( P_abs, ldp, basis );
device_data.send_static_data_density_basis( P_abs, ldp, nullptr, 0, nullptr, 0, nullptr, 0, basis );
device_data.send_static_data_exx_ek_screening( V_shell_max, ldv, basis_map,
shpairs );

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,5 +63,6 @@ void syr2k( device_blas_handle handle,
int M, int K, T ALPHA,
const T* A, int LDA, const T* B, int LDB,
T BETA, T* C, int LDC );

}

17 changes: 10 additions & 7 deletions src/xc_integrator/local_work_driver/device/common/uvvars.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,23 @@
*/
#pragma once
#include "device/xc_device_task.hpp"
#include "device/xc_device_data.hpp"
#include "device/device_queue.hpp"

namespace GauXC {

void eval_uvvars_lda( size_t ntasks, int32_t nbe_max, int32_t npts_max,

void eval_uvars_lda( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme,
XCDeviceTask* device_tasks, device_queue queue );

void eval_uvvars_gga( size_t ntasks, size_t npts_total, int32_t nbe_max,
int32_t npts_max, XCDeviceTask* device_tasks, const double* denx,
const double* deny, const double* denz, double* gamma, device_queue queue );
void eval_uvars_gga( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme,
XCDeviceTask* device_tasks, device_queue queue );

void eval_uvvars_mgga( size_t ntasks, size_t npts_total, int32_t nbe_max,
int32_t npts_max, XCDeviceTask* device_tasks, const double* denx,
const double* deny, const double* denz, double* gamma, bool do_lapl,
void eval_uvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max,
int32_t npts_max, bool do_lapl, XCDeviceTask* device_tasks,
device_queue queue );

void eval_vvar( size_t ntasks, int32_t nbf_max, int32_t npts_max, bool do_grad, density_id den_select,
XCDeviceTask* device_tasks, device_queue queue );

}
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
* See LICENSE.txt for details
*/
#include "device/xc_device_task.hpp"
#include "device/xc_device_data.hpp"
#include "device/device_queue.hpp"

namespace GauXC {
Expand All @@ -14,12 +15,16 @@ void zmat_lda_vxc( size_t ntasks,
int32_t max_nbf,
int32_t max_npts,
XCDeviceTask* tasks_device,
integrator_ks_scheme s,
density_id sel,
device_queue queue );

void zmat_gga_vxc( size_t ntasks,
int32_t max_nbf,
int32_t max_npts,
XCDeviceTask* tasks_device,
integrator_ks_scheme s,
density_id sel,
device_queue queue );

void zmat_mgga_vxc( size_t ntasks,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,88 @@ void eval_collocation_shell_to_task_hessian(
}


uint32_t max_threads_shell_to_task_collocation_laplacian( int32_t l, bool pure ) {
if( pure ) {
switch(l) {
case 0: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_0 );\
$for( L in range(1, L_max + 1) )
case $(L): return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_laplacian_$(L) );
$endfor
default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = $(L_max)");
}
} else {
switch(l) {\
$for( L in range(L_max + 1) )
case $(L): return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_$(L) );\
$endfor
default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = $(L_max)");
}
}
return 0;
}





template <typename... Args>
void dispatch_shell_to_task_collocation_laplacian( cudaStream_t stream, int32_t l,
bool pure, uint32_t ntask_average, uint32_t nshells, Args&&... args ) {

dim3 threads = max_threads_shell_to_task_collocation(l,pure);
int nwarp_per_block = threads.x / cuda::warp_size;
int n_task_blocks = util::div_ceil( ntask_average, nwarp_per_block );
dim3 block(n_task_blocks, 1, nshells);

if( pure ) {
switch(l) {
case 0:
collocation_device_shell_to_task_kernel_cartesian_laplacian_0<<<block,threads,0,stream>>>( nshells, std::forward<Args>(args)... );
break;
$for( L in range(1, L_max + 1) )
case $(L):
collocation_device_shell_to_task_kernel_spherical_laplacian_$(L)<<<block,threads,0,stream>>>( nshells, std::forward<Args>(args)... );
break;\
$endfor
default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = $(L_max)");
}
} else {
switch(l) {\
$for( L in range(0, L_max + 1) )
case $(L):
collocation_device_shell_to_task_kernel_cartesian_laplacian_$(L)<<<block,threads,0,stream>>>( nshells, std::forward<Args>(args)... );
break;\
$endfor
default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = $(L_max)");
}
}

}



void eval_collocation_shell_to_task_laplacian(
uint32_t max_l,
AngularMomentumShellToTaskBatch* l_batched_shell_to_task,
XCDeviceTask* device_tasks,
device_queue queue
) {

cudaStream_t stream = queue.queue_as<util::cuda_stream>() ;

for( auto l = 0u; l <= max_l; ++l ) {
auto pure = l_batched_shell_to_task[l].pure;
auto shell_to_task_device = l_batched_shell_to_task[l].shell_to_task_device;
auto nshells = l_batched_shell_to_task[l].nshells_in_batch;
auto ntask_average = std::max(1ul, l_batched_shell_to_task[l].ntask_average);
dispatch_shell_to_task_collocation_laplacian( stream, l, pure,
ntask_average, nshells, shell_to_task_device, device_tasks );
auto stat = cudaGetLastError();
GAUXC_CUDA_ERROR("LAP", stat);
}


}



Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@
#include "collocation/collocation_shell_to_task_kernels_cartesian_l$(L)_hessian.hpp"\
$endfor

$for( L in range(L_max + 1))
#include "collocation/collocation_shell_to_task_kernels_cartesian_l$(L)_laplacian.hpp"\
$endfor

$for( L in range(L_max + 1))
#include "collocation/collocation_shell_to_task_kernels_spherical_l$(L).hpp"\
$endfor
Expand All @@ -30,3 +34,7 @@
$for( L in range(L_max + 1))
#include "collocation/collocation_shell_to_task_kernels_spherical_l$(L)_hessian.hpp"\
$endfor

$for( L in range(L_max + 1))
#include "collocation/collocation_shell_to_task_kernels_spherical_l$(L)_laplacian.hpp"\
$endfor
Original file line number Diff line number Diff line change
Expand Up @@ -181,5 +181,6 @@ void syr2k( device_blas_handle generic_handle,

}


}

Original file line number Diff line number Diff line change
Expand Up @@ -165,9 +165,9 @@ __global__ __launch_bounds__(512,1) void increment_exc_grad_gga_kernel(
const auto* __restrict__ vrho = task->vrho;
const auto* __restrict__ vgamma = task->vgamma;

const auto* __restrict__ den_x = task->ddenx;
const auto* __restrict__ den_y = task->ddeny;
const auto* __restrict__ den_z = task->ddenz;
const auto* __restrict__ den_x = task->dden_sx;
const auto* __restrict__ den_y = task->dden_sy;
const auto* __restrict__ den_z = task->dden_sz;

#pragma unroll 1
for( uint32_t ipt = threadIdx.x % cuda::warp_size;
Expand Down
Loading