slegrand
2016-10-17 10:11:54 UTC
Hello everybody,
I'm currently using pycuda and scikit-cuda to parallelize a simple code.
Basically I repeat this structure inside a for loop:
1 - matrix/vector product (cublas.cublasDgemv)
2 - elementwise division (cumisc.divide)
3 - matrix/vector product
4 - elementwise division
5 - error calculation
and I leave the loop when the error is small enough (you can see the
code at the end of this mail). I want to compute the error on the GPU
and check with an if condition whether it is small enough before
breaking the loop. error_dev and error_min_dev are both (1,) arrays,
but when I try to compare them in the if condition, I get the following
error:
File "./lib/Solvers/IPFP_GPU/functionsGPU.py", line 109, in
solve_IPFP_simple_gpu
if(error_dev < error_min_dev):
TypeError: an integer is required
and if I try to access the only element of these arrays:
File "./lib/Solvers/IPFP_GPU/functionsGPU.py", line 129, in
solve_IPFP_simple_gpu
if(error_dev[0] < error_min_dev[0]):
File
"/home/slegrand/miniconda/lib/python2.7/site-packages/pycuda/gpuarray.py",
line 838, in __getitem__
array_shape = self.shape[array_axis]
IndexError: tuple index out of range
The only solution I found was to use get_async() and compare both
values on the CPU, but I guess this is not the best solution... I
wonder whether there is a way to compare these arrays without sending
them back to the CPU.
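For reference, here is a minimal sketch of that host-side workaround
(assuming error_dev and error_min_dev each hold a single float64 value;
the copy is only a few bytes, but get() synchronizes the host with the GPU):

    # Inside the iteration loop: bring both scalars back to the host
    # and compare them there. get() blocks until the GPU has produced them.
    host_error = error_dev.get()
    host_error_min = error_min_dev.get()
    if host_error < host_error_min:
        break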
On the other hand, I wonder how the for loop is controlled. How are the
iterations synchronized with the GPU calculations?
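To make that question concrete, here is a hypothetical sketch (the
explicit barrier pycuda.driver.Context.synchronize() is just an example;
I don't know whether anything like it is actually needed):

    import pycuda.driver as drv

    for i in xrange(niter_max):
        # ... the gemv / divide calls from the code below ...
        # Does the next iteration simply enqueue more work on the GPU,
        # or does the loop ever need an explicit barrier like this one?
        drv.Context.synchronize()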
Thanks for your time!
Best regards,
Simon Legrand
import numpy as np
import pycuda.autoinit  # creates the CUDA context
import pycuda.gpuarray as gpuarray
import skcuda.cublas as cublas
import skcuda.linalg as culinalg
import skcuda.misc as cumisc

def solve_IPFP_simple_gpu(Mu, Nu, epsilon):
    dtype = np.float64
    mu = np.reshape(Mu.values, (1, np.size(Mu.values)))  # row vector
    nu = np.reshape(Nu.values, (np.size(Nu.values), 1))  # column vector
    a = np.copy(nu)
    C = quad_cost_matrix(Mu.vertices, Nu.vertices)  # defined elsewhere in my code
    K = np.exp(-C/epsilon).astype(dtype)
    handle = cublas.cublasCreate()
    m = np.shape(K)[0]
    n = np.shape(K)[1]
    alpha = np.float64(1.0)
    beta = np.float64(0.0)
    mu_dev = gpuarray.to_gpu(mu)
    s1_dev = gpuarray.empty(mu.T.shape, dtype)
    nu_dev = gpuarray.to_gpu(nu)
    s2_dev = gpuarray.empty(nu.shape, dtype)
    K_dev = gpuarray.to_gpu(K)
    a_dev = gpuarray.to_gpu(a)
    an_dev = gpuarray.empty(a.shape, dtype)
    b_dev = gpuarray.to_gpu(mu)
    error_min_dev = gpuarray.to_gpu(np.array(1e-3).astype(np.float64))  # threshold
    niter_max = 1000
    culinalg.init()
    for i in xrange(0, niter_max):
        # 1 - matrix/vector product: s1 = K^T * a
        cublas.cublasDgemv(handle, 't', m, n, alpha, K_dev.gpudata, m,
                           a_dev.gpudata, 1, beta, s1_dev.gpudata, 1)
        # 2 - elementwise division: b = mu / s1^T
        b_dev = cumisc.divide(mu_dev, culinalg.transpose(s1_dev))
        # 3 - matrix/vector product: s2 = K * b
        cublas.cublasDgemv(handle, 'n', n, m, alpha, K_dev.gpudata, n,
                           b_dev.gpudata, 1, beta, s2_dev.gpudata, 1)
        # 4 - elementwise division: an = nu / s2
        an_dev = cumisc.divide(nu_dev, s2_dev)
        # 5 - error calculation: sum(an - a) / sum(a)
        error_dev = cumisc.divide(cumisc.sum(cumisc.subtract(an_dev, a_dev)),
                                  cumisc.sum(a_dev))
        a_dev = an_dev
        print(error_dev.get_async(), error_min_dev.get_async())
        if(error_dev < error_min_dev):  # <- the comparison that raises the TypeError
            break
    a = a_dev.get()
    b = b_dev.get()
    psi = np.reshape(epsilon*np.log(a), (np.size(a),))
    phi = np.reshape(epsilon*np.log(b), (np.size(b),))
    Gamma = K*a*b
    cublas.cublasDestroy(handle)
    return Gamma, phi, psi