Skip to content

Commit ccb1298

Browse files
committed
Merge branch 'lowintelligence-shm'
PR#196
2 parents 7f2b337 + fad079a commit ccb1298

File tree

3 files changed: 12 additions (+12) and 10 deletions (-10).

3 files changed

+12
-10
lines changed

src/include/utils.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include <stdint.h>
1212

1313
ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
14+
uint64_t getHash(const char* string, int n);
1415
uint64_t getHostHash();
1516
uint64_t getPidHash();
1617

src/init.cc

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -308,12 +308,12 @@ static void showVersion() {
308308
}
309309
}
310310

311-
static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank) {
311+
static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank, uint64_t commHash) {
312312
info->rank = rank;
313313
CUDACHECK(cudaGetDevice(&info->cudaDev));
314314
NCCLCHECK(getNvmlDevice(info->cudaDev, &info->nvmlDev))
315-
info->hostHash=getHostHash();
316-
info->pidHash=getPidHash();
315+
info->hostHash=getHostHash()+commHash;
316+
info->pidHash=getPidHash()+commHash;
317317

318318
// Get PCI Bus Id. We need to get the bus ID through CUDA first, since the
319319
// cudaDev is a CUDA runtime dev number which could be different from the
@@ -691,7 +691,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
691691

692692
int rank = comm->rank;
693693
int nranks = comm->nRanks;
694-
TRACE(NCCL_INIT, "rank %d nranks %d - BEGIN", rank, nranks);
694+
uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
695+
TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
695696
NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));
696697

697698
// AllGather1 - begin
@@ -702,7 +703,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
702703

703704
NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
704705
allGather1Data[rank].comm = comm;
705-
NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank));
706+
NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank, commHash));
706707
NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));
707708

708709
NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks));
@@ -998,7 +999,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
998999
NCCLCHECK(ncclCalloc(&allInfo, nranks));
9991000
for (int rank=0; rank<nranks; rank++) {
10001001
CUDACHECK(cudaSetDevice(devs[rank]));
1001-
NCCLCHECK(fillInfo(allInfo+rank, rank));
1002+
NCCLCHECK(fillInfo(allInfo+rank, rank, 0));
10021003
}
10031004

10041005
int* connectTransport;

src/misc/utils.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -87,10 +87,10 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
8787
}
8888
}
8989

90-
uint64_t getHash(const char* string) {
90+
uint64_t getHash(const char* string, int n) {
9191
// Based on DJB2, result = result * 33 + char
9292
uint64_t result = 5381;
93-
for (int c = 0; string[c] != '\0'; c++) {
93+
for (int c = 0; c < n; c++) {
9494
result = ((result << 5) + result) + string[c];
9595
}
9696
return result;
@@ -120,7 +120,7 @@ uint64_t getHostHash(void) {
120120
uname[offset]='\0';
121121
TRACE(NCCL_INIT,"unique hostname '%s'", uname);
122122

123-
return getHash(uname);
123+
return getHash(uname, strlen(uname));
124124
}
125125

126126
/* Generate a hash of the unique identifying string for this process
@@ -140,7 +140,7 @@ uint64_t getPidHash(void) {
140140
pname[plen+len]='\0';
141141
TRACE(NCCL_INIT,"unique PID '%s'", pname);
142142

143-
return getHash(pname);
143+
return getHash(pname, strlen(pname));
144144
}
145145

146146
int parseStringList(const char* string, struct netIf* ifList, int maxList) {

0 commit comments

Comments
 (0)