Possible bug in the MPI_Waitall function #682

@Anet18

Description

New Issue for sst-macro

I tried to run LULESH (https://github.com/LLNL/LULESH) in SST Macro and got the error below. In the configuration file I set debug = [mpi] to print the MPI debugging lines. I first collected traces of LULESH with DUMPI and then simulated those traces on SST Macro 12.0.0. The traces were collected with the latest SST DUMPI (https://github.com/sstsimulator/sst-dumpi), using both OpenMPI/4.1.1 and MPICH/3.3.2; both MPI implementations produce the same error.

Another thing worth mentioning: for the miniVite proxy application (https://github.com/Exa-Graph/miniVite) we got lucky, and switching from OpenMPI/4.1.1 to MPICH/3.3.2 made the error go away. We would like to understand the reason behind this, since MPI_Request is an opaque handle and differences in the underlying MPI implementation should not change the replay behavior.

sstmac \
  --debug="" \
  --configfile="dragonfly.ini"

MPI Rank 0   : MPI_Init()
MPI Rank 1   : MPI_Init()
MPI Rank 2   : MPI_Init()
MPI Rank 4   : MPI_Init()
MPI Rank 3   : MPI_Init()
MPI Rank 5   : MPI_Init()
MPI Rank 6   : MPI_Init()
MPI Rank 7   : MPI_Init()
MPI Rank 0   : MPI_Init finished
MPI Rank 0   : MPI_Irecv(961,MPI_DOUBLE=7,4:4,1024,MPI_COMM_WORLD;REQ=2)
MPI Rank 0   : MPI_Irecv finished
MPI Rank 0   : MPI_Irecv(961,MPI_DOUBLE=7,2:2,1024,MPI_COMM_WORLD;REQ=3)
MPI Rank 0   : MPI_Irecv finished
MPI Rank 0   : MPI_Irecv(961,MPI_DOUBLE=7,1:1,1024,MPI_COMM_WORLD;REQ=4)
MPI Rank 0   : MPI_Irecv finished
MPI Rank 0   : MPI_Irecv(31,MPI_DOUBLE=7,3:3,1024,MPI_COMM_WORLD;REQ=5)
MPI Rank 0   : MPI_Irecv finished
MPI Rank 0   : MPI_Irecv(31,MPI_DOUBLE=7,6:6,1024,MPI_COMM_WORLD;REQ=6)
MPI Rank 0   : MPI_Irecv finished
DUMPI trace   1 percent complete: dumpi-2022.07.07.17.49.05-0000.bin
MPI Rank 0   : MPI_Irecv(31,MPI_DOUBLE=7,5:5,1024,MPI_COMM_WORLD;REQ=7)
MPI Rank 0   : MPI_Irecv finished
MPI Rank 0   : MPI_Irecv(1,MPI_DOUBLE=7,7:7,1024,MPI_COMM_WORLD;REQ=8)
MPI Rank 0   : MPI_Irecv finished
MPI Rank 0   : MPI_Isend(961,MPI_DOUBLE=7,4,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 0   : MPI_Isend finished
MPI Rank 0   : MPI_Isend(961,MPI_DOUBLE=7,2,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 0   : MPI_Isend finished
MPI Rank 0   : MPI_Isend(961,MPI_DOUBLE=7,1,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 0   : MPI_Isend finished
MPI Rank 0   : MPI_Isend(31,MPI_DOUBLE=7,3,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 0   : MPI_Isend finished
MPI Rank 0   : MPI_Isend(31,MPI_DOUBLE=7,6,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 0   : MPI_Isend finished
MPI Rank 4   : MPI_Init finished
MPI Rank 4   : MPI_Irecv(961,MPI_DOUBLE=7,0:0,1024,MPI_COMM_WORLD;REQ=2)
MPI Rank 4   : MPI_Irecv finished
MPI Rank 4   : MPI_Irecv(961,MPI_DOUBLE=7,6:6,1024,MPI_COMM_WORLD;REQ=3)
MPI Rank 4   : MPI_Irecv finished
MPI Rank 4   : MPI_Irecv(961,MPI_DOUBLE=7,5:5,1024,MPI_COMM_WORLD;REQ=4)
MPI Rank 4   : MPI_Irecv finished
MPI Rank 4   : MPI_Irecv(31,MPI_DOUBLE=7,7:7,1024,MPI_COMM_WORLD;REQ=5)
MPI Rank 4   : MPI_Irecv finished
MPI Rank 4   : MPI_Irecv(31,MPI_DOUBLE=7,2:2,1024,MPI_COMM_WORLD;REQ=6)
MPI Rank 4   : MPI_Irecv finished
MPI Rank 4   : MPI_Irecv(31,MPI_DOUBLE=7,1:1,1024,MPI_COMM_WORLD;REQ=7)
MPI Rank 4   : MPI_Irecv finished
MPI Rank 4   : MPI_Irecv(1,MPI_DOUBLE=7,3:3,1024,MPI_COMM_WORLD;REQ=8)
MPI Rank 4   : MPI_Irecv finished
MPI Rank 4   : MPI_Isend(961,MPI_DOUBLE=7,0,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 4   : MPI_Isend finished
MPI Rank 4   : MPI_Isend(961,MPI_DOUBLE=7,6,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 4   : MPI_Isend finished
MPI Rank 4   : MPI_Isend(961,MPI_DOUBLE=7,5,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 4   : MPI_Isend finished
MPI Rank 2   : MPI_Init finished
MPI Rank 2   : MPI_Irecv(961,MPI_DOUBLE=7,6:6,1024,MPI_COMM_WORLD;REQ=2)
MPI Rank 2   : MPI_Irecv finished
MPI Rank 2   : MPI_Irecv(961,MPI_DOUBLE=7,0:0,1024,MPI_COMM_WORLD;REQ=3)
MPI Rank 2   : MPI_Irecv finished
MPI Rank 2   : MPI_Irecv(961,MPI_DOUBLE=7,3:3,1024,MPI_COMM_WORLD;REQ=4)
MPI Rank 2   : MPI_Irecv finished
MPI Rank 2   : MPI_Irecv(31,MPI_DOUBLE=7,7:7,1024,MPI_COMM_WORLD;REQ=5)
MPI Rank 2   : MPI_Irecv finished
MPI Rank 2   : MPI_Irecv(31,MPI_DOUBLE=7,4:4,1024,MPI_COMM_WORLD;REQ=6)
MPI Rank 2   : MPI_Irecv finished
MPI Rank 2   : MPI_Irecv(31,MPI_DOUBLE=7,1:1,1024,MPI_COMM_WORLD;REQ=7)
MPI Rank 2   : MPI_Irecv finished
MPI Rank 2   : MPI_Irecv(1,MPI_DOUBLE=7,5:5,1024,MPI_COMM_WORLD;REQ=8)
MPI Rank 2   : MPI_Irecv finished
MPI Rank 2   : MPI_Isend(961,MPI_DOUBLE=7,6,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 2   : MPI_Isend finished
MPI Rank 2   : MPI_Isend(961,MPI_DOUBLE=7,0,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 2   : MPI_Isend finished
MPI Rank 2   : MPI_Isend(961,MPI_DOUBLE=7,3,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 2   : MPI_Isend finished
MPI Rank 1   : MPI_Init finished
MPI Rank 1   : MPI_Irecv(961,MPI_DOUBLE=7,5:5,1024,MPI_COMM_WORLD;REQ=2)
MPI Rank 1   : MPI_Irecv finished
MPI Rank 1   : MPI_Irecv(961,MPI_DOUBLE=7,3:3,1024,MPI_COMM_WORLD;REQ=3)
MPI Rank 1   : MPI_Irecv finished
MPI Rank 1   : MPI_Irecv(961,MPI_DOUBLE=7,0:0,1024,MPI_COMM_WORLD;REQ=4)
MPI Rank 1   : MPI_Irecv finished
MPI Rank 1   : MPI_Irecv(31,MPI_DOUBLE=7,7:7,1024,MPI_COMM_WORLD;REQ=5)
MPI Rank 1   : MPI_Irecv finished
MPI Rank 1   : MPI_Irecv(31,MPI_DOUBLE=7,2:2,1024,MPI_COMM_WORLD;REQ=6)
MPI Rank 1   : MPI_Irecv finished
MPI Rank 1   : MPI_Irecv(31,MPI_DOUBLE=7,4:4,1024,MPI_COMM_WORLD;REQ=7)
MPI Rank 1   : MPI_Irecv finished
MPI Rank 1   : MPI_Irecv(1,MPI_DOUBLE=7,6:6,1024,MPI_COMM_WORLD;REQ=8)
MPI Rank 1   : MPI_Irecv finished
MPI Rank 1   : MPI_Isend(961,MPI_DOUBLE=7,5,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 1   : MPI_Isend finished
MPI Rank 1   : MPI_Isend(961,MPI_DOUBLE=7,3,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 1   : MPI_Isend finished
MPI Rank 1   : MPI_Isend(961,MPI_DOUBLE=7,0,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 1   : MPI_Isend finished
MPI Rank 0   : MPI_Isend(31,MPI_DOUBLE=7,5,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 0   : MPI_Isend finished
MPI Rank 0   : MPI_Isend(1,MPI_DOUBLE=7,7,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 0   : MPI_Isend finished
DUMPI trace   2 percent complete: dumpi-2022.07.07.17.49.05-0000.bin
MPI Rank 0   : MPI_Waitall(26,...)
MPI Rank 0   :    MPI_Wait_nonnull(9)
MPI Rank 0   :    MPI_Wait_nonnull(9)
thread terminated with exception: sprockit::SpktError: could not find mpi request 9 for rank 0
../../sumi-mpi/mpi_api.cc 498
aborting
Aborted (core dumped)
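
For context, the pattern that hits the error is an ordinary nonblocking halo exchange followed by MPI_Waitall. The sketch below is not code from LULESH; it is a minimal standalone illustration of the pattern visible in the log, assuming an "every rank exchanges with every other rank" neighbor set in place of LULESH's real halo (LULESH itself waits on a fixed array of up to 26 requests, which is where the MPI_Waitall(26,...) above comes from), and reusing the count (961) and tag (1024) seen in the trace.

/* Minimal sketch (not LULESH code) of the nonblocking exchange pattern in
 * the log above: each rank posts MPI_Irecv and MPI_Isend toward every
 * neighbor, keeps all requests in one array, and calls MPI_Waitall on them.
 * The neighbor set, count, and tag are placeholders for illustration. */
#include <mpi.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    const int count = 961, tag = 1024;   /* values seen in the trace */
    int npeers = size - 1;

    MPI_Request *reqs = malloc(2 * npeers * sizeof(MPI_Request));
    double **recvbuf  = malloc(npeers * sizeof(double *));
    double **sendbuf  = malloc(npeers * sizeof(double *));

    /* Post one receive per neighbor; each gets its own request slot. */
    int r = 0;
    for (int peer = 0; peer < size; ++peer) {
        if (peer == rank) continue;
        recvbuf[r] = calloc(count, sizeof(double));
        MPI_Irecv(recvbuf[r], count, MPI_DOUBLE, peer, tag,
                  MPI_COMM_WORLD, &reqs[r]);
        ++r;
    }

    /* Post the matching sends.  In the replayed trace every MPI_Isend shows
     * up as REQ=9, and MPI_Waitall aborts after consuming request 9 once,
     * which is what this issue is about. */
    int s = 0;
    for (int peer = 0; peer < size; ++peer) {
        if (peer == rank) continue;
        sendbuf[s] = calloc(count, sizeof(double));
        MPI_Isend(sendbuf[s], count, MPI_DOUBLE, peer, tag,
                  MPI_COMM_WORLD, &reqs[npeers + s]);
        ++s;
    }

    /* The application only stores the opaque MPI_Request handles and passes
     * them back to MPI_Waitall; it never interprets them, which is why the
     * MPI implementation used while tracing should not matter for replay. */
    MPI_Waitall(2 * npeers, reqs, MPI_STATUSES_IGNORE);

    for (int i = 0; i < npeers; ++i) { free(recvbuf[i]); free(sendbuf[i]); }
    free(recvbuf); free(sendbuf); free(reqs);

    MPI_Finalize();
    return 0;
}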

I used the following dragonfly.ini configuration file to replay the trace:

debug=[mpi]
topology {
 name = dragonfly
 geometry = [8,9]
 #group_connections = 8
 concentration = 8
 #inter_group = alltoall
 h = 8
 #redundant = [1,2]
}

switch {
 router {
  #pb_latency = 0.0
  #name = dragonfly_valiant
  #name = dragonfly_minimal
  name = dragonfly_par
  #name = dragonfly_scatter
  #name = dragonfly_ugal
  #name = dragonfly_rotate
  seed = 14
 }
}
node {
 app1 {
    ftq {
     type = ftq_calendar
     epoch_length = 1ms
     output = ftq
     group = app1
    }
  name = parsedumpi
  #random_allocation_seed = 116
  #indexing = random
  #allocation = random
  allocation = first_available
  size = 8
  launch_cmd = aprun -n 8 -N 1
  dumpi_metaname = dumpi-2022.07.07.17.49.05.meta   
  #coordinate_file = coords.txt
  start = 0ms
 }
 nic {
  name = pisces
  injection {
   mtu = 4096
   arbitrator = cut_through
   bandwidth = 1.0GB/s
   latency = 50ns
   credits = 64KB
  }
  ejection {
   latency = 50ns
  }
 }
 memory {
  name = pisces
  total_bandwidth = 10GB/s
  latency = 10ns
  max_single_bandwidth = 10GB/s
 }
 proc {
  ncores = 1
  frequency = 2GHz
 }
 name = simple
}


switch {
 name = pisces
 arbitrator = cut_through
 mtu = 4096
 link {
  bandwidth = 1.0GB/s
  latency = 100ns
  credits = 64KB
 }
 xbar {
  bandwidth = 10GB/s
 }
 logp {
  bandwidth = 1GB/s
  hop_latency = 100ns
  out_in_latency = 100ns
 }
}
