Alex Park
2015-04-28 22:41:33 UTC
Hi,
I'm trying to use multiple GPUs with MPI and IPC handles, instead of the
built-in MPI primitives, for p2p communication.
I think I'm not quite understanding how contexts should be managed. For
example, I have two versions of a toy example that accesses data
between processes via an IPC handle. Both seem to work, in the sense that
process 1 can 'see' the data from process 0, but the first version completes
without any error, while the second version generates the following error:
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFree failed: invalid value
The two versions are attached below. Would appreciate any insight as to
what I'm doing wrong.
-Alex
Here are the two versions:
*VERSION 1*
from mpi4py import MPI
import numpy as np
import atexit
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray


class TestMGPU(object):
    def __init__(self):
        self.mpi_size = MPI.COMM_WORLD.size
        self.mpi_rank = MPI.COMM_WORLD.rank

    def proc(self):
        if self.mpi_rank == 0:
            ctx = drv.Device(self.mpi_rank).make_context()
            self.x_gpu = gpuarray.to_gpu(np.random.rand(8))
            h = drv.mem_get_ipc_handle(self.x_gpu.ptr)
            MPI.COMM_WORLD.send((h, self.x_gpu.shape, self.x_gpu.dtype), dest=1)
            print 'p1 self.x_gpu:', self.x_gpu
            ctx.detach()
        else:
            ctx = drv.Device(self.mpi_rank).make_context()
            h, s, d = MPI.COMM_WORLD.recv(source=0)
            ptr = drv.IPCMemoryHandle(h)
            xt_gpu = gpuarray.GPUArray(s, d, gpudata=ptr)
            print 'xt_gpu: ', xt_gpu
            ctx.detach()


if __name__ == '__main__':
    drv.init()
    atexit.register(MPI.Finalize)
    a = TestMGPU()
    a.proc()
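(In case it matters, I'm launching both versions the same way, one rank per
GPU on a single machine, along the lines of: mpirun -np 2 python test_mgpu.py;
the script name here is just a placeholder.)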
*VERSION 2 (Imports are the same)*
class TestMGPU(object):
    def __init__(self):
        self.mpi_size = MPI.COMM_WORLD.size
        self.mpi_rank = MPI.COMM_WORLD.rank
        self.x_gpu = gpuarray.to_gpu(np.random.rand(8))

    def proc(self):
        if self.mpi_rank == 0:
            h = drv.mem_get_ipc_handle(self.x_gpu.ptr)
            MPI.COMM_WORLD.send((h, self.x_gpu.shape, self.x_gpu.dtype), dest=1)
            print 'p1 self.x_gpu:', self.x_gpu
        else:
            h, s, d = MPI.COMM_WORLD.recv(source=0)
            ptr = drv.IPCMemoryHandle(h)
            xt_gpu = gpuarray.GPUArray(s, d, gpudata=ptr)
            print 'xt_gpu: ', xt_gpu


if __name__ == '__main__':
    drv.init()
    ctx = drv.Device(MPI.COMM_WORLD.rank).make_context()
    atexit.register(ctx.pop)
    atexit.register(MPI.Finalize)
    a = TestMGPU()
    a.proc()
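My current guess about version 2: the GPUArray and the IPC mapping only get
released at interpreter shutdown, after atexit has already popped the context,
so the cuMemFree runs against a dead context. To show what I mean, here is an
untested sketch of the explicit cleanup I had in mind; the cleanup() method,
the del calls, and the barrier are my own additions, not an established PyCUDA
pattern:

from mpi4py import MPI
import numpy as np
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray


class TestMGPU(object):
    def __init__(self):
        self.mpi_rank = MPI.COMM_WORLD.rank
        self.x_gpu = gpuarray.to_gpu(np.random.rand(8))

    def proc(self):
        if self.mpi_rank == 0:
            h = drv.mem_get_ipc_handle(self.x_gpu.ptr)
            MPI.COMM_WORLD.send((h, self.x_gpu.shape, self.x_gpu.dtype), dest=1)
            print 'p1 self.x_gpu:', self.x_gpu
        else:
            h, s, d = MPI.COMM_WORLD.recv(source=0)
            ptr = drv.IPCMemoryHandle(h)
            xt_gpu = gpuarray.GPUArray(s, d, gpudata=ptr)
            print 'xt_gpu: ', xt_gpu
            # drop the imported mapping while the owning context is still alive
            del xt_gpu
            del ptr

    def cleanup(self):
        # release this rank's own allocation before the context goes away
        del self.x_gpu


if __name__ == '__main__':
    drv.init()
    ctx = drv.Device(MPI.COMM_WORLD.rank).make_context()
    a = TestMGPU()
    a.proc()
    # make sure rank 1 is done with the mapping before rank 0 frees the memory
    MPI.COMM_WORLD.Barrier()
    a.cleanup()
    ctx.pop()

If that ordering issue really is the cause, is explicitly releasing everything
before popping the context the recommended way to handle it, or is there a
cleaner pattern?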