Keith Brown
2015-11-09 03:23:18 UTC
I have several thousand matrices for which I need to calculate dot
products, so it seems PyCUDA should do the trick (I hope). I am
running into an issue with block sizes.
Here is my code:
#!/usr/bin/env python
import time
from string import Template

import numpy as np
from pycuda import compiler, gpuarray
import pycuda.autoinit  # initializes the CUDA context

def main():
    size = 4
    d = {'size': size}

    # One thread per output element: thread (tx, ty) accumulates the dot
    # product of row ty of a with column tx of b.
    src = Template("""
    __global__ void MatrixMulKernel(float *a, float *b, float *c)
    {
        int tx = threadIdx.x;
        int ty = threadIdx.y;
        float Pvalue = 0;
        for (int k = 0; k < $size; ++k) {
            float Aelement = a[ty * $size + k];
            float Belement = b[k * $size + tx];
            Pvalue += Aelement * Belement;
        }
        c[ty * $size + tx] = Pvalue;
    }
    """)

    a_cpu = np.random.randn(size, size).astype(np.float32)
    b_cpu = np.random.randn(size, size).astype(np.float32)

    a_gpu = gpuarray.to_gpu(a_cpu)
    b_gpu = gpuarray.to_gpu(b_cpu)
    c_gpu = gpuarray.empty((size, size), np.float32)

    mod = compiler.SourceModule(src.substitute(d))
    mm = mod.get_function("MatrixMulKernel")

    start = time.time()
    mm(a_gpu, b_gpu, c_gpu,
       block=(16, 16, 1),  # 256 threads for a 4x4 matrix -- is this right?
       )
    gpu_ans = c_gpu.get()  # get() synchronizes, so this times the launch plus the copy back
    stop = time.time()
    print "Gpu", stop - start

    start = time.time()
    cpu_ans = np.dot(a_cpu, b_cpu)
    stop = time.time()
    print "Cpu", stop - start

    print np.allclose(gpu_ans, cpu_ans, atol=1e-03)

if __name__ == "__main__":
    main()
A couple of issues:
When I increase the size of the matrix, the GPU result seems to get less
accurate than the CPU dot product, so I have to use np.allclose to compare.
I put a small illustration of what I mean just below.
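To show the accuracy drift I am talking about, here is a quick CPU-only
check of how a float32 product differs from a float64 one as the matrices
grow (the n = 512 here is just an arbitrary illustrative size, not my real
data):

import numpy as np

n = 512
a = np.random.randn(n, n).astype(np.float32)
b = np.random.randn(n, n).astype(np.float32)

single = np.dot(a, b)  # single-precision product
double = np.dot(a.astype(np.float64), b.astype(np.float64))  # double-precision reference

print "max abs difference:", np.abs(single - double).max()
print "allclose at atol=1e-03:", np.allclose(single, double, atol=1e-03)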
Also, what is the optimal block size I should be using?
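In case it helps to see my current thinking on the launch configuration,
here is a sketch reusing mm, a_gpu, b_gpu, c_gpu from the listing above
(I may well be wrong about this, which is part of my question):

size = 4
# One thread per output element, so a size x size block for a size x size
# matrix. The kernel only uses threadIdx, so I assume this only scales
# until size*size hits the per-block thread limit (1024 on current cards,
# I believe); past that the kernel would also need blockIdx and a grid.
mm(a_gpu, b_gpu, c_gpu,
   block=(size, size, 1),
   grid=(1, 1))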