@@ -308,12 +308,12 @@ static void showVersion() {
308308 }
309309}
310310
311- static ncclResult_t fillInfo (struct ncclPeerInfo * info, int rank) {
311+ static ncclResult_t fillInfo (struct ncclPeerInfo * info, int rank, uint64_t commHash ) {
312312 info->rank = rank;
313313 CUDACHECK (cudaGetDevice (&info->cudaDev ));
314314 NCCLCHECK (getNvmlDevice (info->cudaDev , &info->nvmlDev ))
315- info->hostHash =getHostHash ();
316- info->pidHash =getPidHash ();
315+ info->hostHash =getHostHash ()+commHash ;
316+ info->pidHash =getPidHash ()+commHash ;
317317
318318 // Get PCI Bus Id. We need to get the bus ID through CUDA first, since the
319319 // cudaDev is a CUDA runtime dev number which could be different from the
@@ -691,7 +691,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
691691
692692 int rank = comm->rank ;
693693 int nranks = comm->nRanks ;
694- TRACE (NCCL_INIT, " rank %d nranks %d - BEGIN" , rank, nranks);
694+ uint64_t commHash = getHash (commId->internal , NCCL_UNIQUE_ID_BYTES);
695+ TRACE (NCCL_INIT, " comm %p, commHash %lx, rank %d nranks %d - BEGIN" , comm, commHash, rank, nranks);
695696 NCCLCHECK (bootstrapInit (commId, rank, nranks, &comm->bootstrap ));
696697
697698 // AllGather1 - begin
@@ -702,7 +703,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
702703
703704 NCCLCHECK (ncclCalloc (&allGather1Data, nranks));
704705 allGather1Data[rank].comm = comm;
705- NCCLCHECK (fillInfo (&allGather1Data[rank].peerInfo , rank));
706+ NCCLCHECK (fillInfo (&allGather1Data[rank].peerInfo , rank, commHash ));
706707 NCCLCHECK (bootstrapAllGather (comm->bootstrap , allGather1Data, sizeof (*allGather1Data)));
707708
708709 NCCLCHECK (ncclCalloc (&comm->peerInfo , nranks));
@@ -998,7 +999,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
998999 NCCLCHECK (ncclCalloc (&allInfo, nranks));
9991000 for (int rank=0 ; rank<nranks; rank++) {
10001001 CUDACHECK (cudaSetDevice (devs[rank]));
1001- NCCLCHECK (fillInfo (allInfo+rank, rank));
1002+ NCCLCHECK (fillInfo (allInfo+rank, rank, 0 ));
10021003 }
10031004
10041005 int * connectTransport;
0 commit comments