Numba CUDA `vectorize` and `reduce` decorators slower than expected

Asked: 2017-02-02 18:35:44

Tags: python performance numpy numba reduction

I have been testing out some basic CUDA functions using the Numba package. My main goal is to implement a Richardson-Lucy algorithm on the GPU. The algorithm can be accelerated, and one of its main steps can be summarized by the following dummy function:

def dummy(arr1, arr2):
    return (arr1 * arr2).sum() / ((arr2**2).sum() + eps)
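# eps is assumed to be a small positive constant (e.g. 1e-7) defined elsewhere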

This function runs reasonably fast on the CPU, but I would like to keep everything on the GPU to avoid host <--> device copies.

To compare the speed of the different computations, I wrote a small set of functions:

import numpy as np
from numba import njit, jit
import numba
import numba.cuda as cuda
import timeit
import time


# define our functions
@numba.vectorize(["float32(float32, float32)", "float64(float64, float64)"], target="cuda")
def add_gpu(a, b):
    return a + b

@numba.vectorize(["float32(float32, float32)", "float64(float64, float64)"], target="cuda")
def mult_gpu(a, b):
    return a * b

@cuda.reduce
def sum_gpu(a, b):
    return a + b
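# (cuda.reduce wraps the binary operation above into a full array reduction:
#  sum_gpu(arr) returns the sum of all elements)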

@cuda.jit
def add_gpu_1d(a, b, c):
    x = cuda.grid(1)
    if x < c.size:
        c[x] = a[x] + b[x]

@cuda.jit
def mult_gpu_1d(a, b, c):
    x = cuda.grid(1)
    if x < c.size:
        c[x] = a[x] * b[x]

@cuda.jit
def mult_gpu_2d(a, b, c):
    x, y = cuda.grid(2)
    if x < c.shape[0] and y < c.shape[1]:
        c[x, y] = a[x, y] * b[x, y]

@cuda.jit
def add_gpu_2d(a, b, c):
    x, y = cuda.grid(2)
    if x < c.shape[0] and y < c.shape[1]:
        c[x, y] = a[x, y] + b[x, y]
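
With these building blocks, the dummy() step above can in principle stay entirely on the device. Below is a minimal sketch (the d_tmp scratch buffer and the eps value are my own assumptions; CUDA ufuncs created by @vectorize accept device arrays and an out= argument):

def dummy_gpu(d_arr1, d_arr2, d_tmp, eps=1e-7):
    mult_gpu(d_arr1, d_arr2, out=d_tmp)   # arr1 * arr2, result stays on the GPU
    num = sum_gpu(d_tmp)                  # sum via the @cuda.reduce kernel
    mult_gpu(d_arr2, d_arr2, out=d_tmp)   # arr2 ** 2
    den = sum_gpu(d_tmp)
    return num / (den + eps)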

I also wrote some timing helper functions:

def avg_t(t, num):
    return np.mean(t) / num

def format_t(t):
    """Turn t into nice formating"""
    if t < 1e-3:
        return "{:.1f} us".format(t * 1e6)
    elif t < 1:
        return "{:.1f} ms".format(t * 1e3)
    else:
        return "{:.1f} s".format(t)

def test_1d_times(data_len, dtype=np.float32):
    num_times = 10

    title = "Testing 1D Data, Data length = {}, data type = {}".format(data_len, dtype)
    print(len(title) * "=")
    print(title)
    print(len(title) * "=")

    t = time.time()
    arr1, arr2 = np.empty((2, data_len), dtype=dtype)
    d_arr1 = cuda.to_device(arr1)
    d_arr2 = cuda.to_device(arr2)
    d_result = cuda.device_array_like(d_arr1)
    print("Data generated in " + format_t(time.time() - t))
    print("d_arr1 dtype =", d_arr1.dtype)
    print("d_arr1 size = ", d_arr1.size)

    print()
    print("Testing multiplication times")
    print("----------------------------")

    t = timeit.repeat((lambda: arr1 * arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: mult_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: mult_gpu_1d(d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_mult_1d time = " + format_t(avg_t(t, num_times)))

    print()
    print("Testing sum times")
    print("------------------")

    t = timeit.repeat((lambda: arr1 + arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: add_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: add_gpu_1d(d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_add_1d time = " + format_t(avg_t(t, num_times)))

    print()
    print("Testing reduction times")
    print("-----------------------")

    t = timeit.repeat((lambda: arr1.sum()), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: add_gpu.reduce(d_arr1)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: sum_gpu(d_arr1)), number=num_times)
    print("sum_gpu time = " + format_t(avg_t(t, num_times)))
    print()
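
# NB: CUDA kernel launches are asynchronous, so the @cuda.jit timings above
# mostly measure launch overhead rather than kernel completion; timing to
# completion would need a synchronization, e.g.
#   lambda: (mult_gpu_1d(d_arr1, d_arr2, d_result), cuda.synchronize())
# (note also that the kernels are launched without an explicit
# [blocks, threads] configuration)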

def test_2d_times(data_len, dtype=np.float32):
    num_times = 10

    title = "Testing 2D Data, Data length = {}, data type = {}".format(data_len, dtype)
    print(len(title) * "=")
    print(title)
    print(len(title) * "=")

    t = time.time()
    arr1, arr2 = np.empty((2, data_len, data_len), dtype=dtype)
    d_arr1 = cuda.to_device(arr1)
    d_arr2 = cuda.to_device(arr2)
    d_result = cuda.device_array_like(d_arr1)
    print("Data generated in {} seconds".format(time.time() - t))
    print("d_arr1 dtype =", d_arr1.dtype)
    print("d_arr1 size = ", d_arr1.size)

    print()
    print("Testing multiplication times")
    print("----------------------------")

    t = timeit.repeat((lambda: arr1 * arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: mult_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: mult_gpu_2d(d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_mult_2d time = " + format_t(avg_t(t, num_times)))

    print()
    print("Testing sum times")
    print("------------------")

    t = timeit.repeat((lambda: arr1 + arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: add_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: add_gpu_2d(d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_add_2d time = " + format_t(avg_t(t, num_times)))

    print()
    print("Testing reduction times")
    print("-----------------------")

    t = timeit.repeat((lambda: arr1.sum()), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: add_gpu.reduce(d_arr1.ravel())), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: sum_gpu(d_arr1.ravel())), number=num_times)
    print("sum_gpu time = " + format_t(avg_t(t, num_times)))
    print()

Running the test functions

numba.cuda.detect()
test_1d_times(2**24)
test_2d_times(2**12)
test_1d_times(2**24, dtype=np.float64)
test_2d_times(2**12, dtype=np.float64)

gives the following output:

Found 1 CUDA devices
id 0    b'GeForce GTX TITAN X'                              [SUPPORTED]
                      compute capability: 5.2
                           pci device id: 0
                              pci bus id: 3
Summary:
    1/1 devices are supported
============================================================================
Testing 1D Data, Data length = 16777216, data type = <class 'numpy.float32'>
============================================================================
Data generated in 88.2 ms
d_arr1 dtype = float32
d_arr1 size =  16777216

Testing multiplication times
----------------------------
cpu/numpy time = 35.8 ms
cuda vectorize time = 122.8 ms
cuda_mult_1d time = 206.8 us

Testing sum times
------------------
cpu/numpy time = 35.8 ms
cuda vectorize time = 106.1 ms
cuda_add_1d time = 212.6 us

Testing reduction times
-----------------------
cpu/numpy time = 16.7 ms
cuda vectorize time = 11.1 ms
sum_gpu time = 127.3 ms

========================================================================
Testing 2D Data, Data length = 4096, data type = <class 'numpy.float32'>
========================================================================
Data generated in 0.0800013542175293 seconds
d_arr1 dtype = float32
d_arr1 size =  16777216

Testing multiplication times
----------------------------
cpu/numpy time = 35.4 ms
cuda vectorize time = 97.9 ms
cuda_mult_2d time = 208.9 us

Testing sum times
------------------
cpu/numpy time = 36.3 ms
cuda vectorize time = 94.5 ms
cuda_add_2d time = 250.8 us

Testing reduction times
-----------------------
cpu/numpy time = 16.4 ms
cuda vectorize time = 15.8 ms
sum_gpu time = 125.4 ms

============================================================================
Testing 1D Data, Data length = 16777216, data type = <class 'numpy.float64'>
============================================================================
Data generated in 171.0 ms
d_arr1 dtype = float64
d_arr1 size =  16777216

Testing multiplication times
----------------------------
cpu/numpy time = 73.2 ms
cuda vectorize time = 114.9 ms
cuda_mult_1d time = 201.9 us

Testing sum times
------------------
cpu/numpy time = 71.4 ms
cuda vectorize time = 71.0 ms
cuda_add_1d time = 217.2 us

Testing reduction times
-----------------------
cpu/numpy time = 29.0 ms
cuda vectorize time = 12.8 ms
sum_gpu time = 123.5 ms

========================================================================
Testing 2D Data, Data length = 4096, data type = <class 'numpy.float64'>
========================================================================
Data generated in 0.301849365234375 seconds
d_arr1 dtype = float64
d_arr1 size =  16777216

Testing multiplication times
----------------------------
cpu/numpy time = 73.7 ms
cuda vectorize time = 84.2 ms
cuda_mult_2d time = 226.2 us

Testing sum times
------------------
cpu/numpy time = 74.9 ms
cuda vectorize time = 84.3 ms
cuda_add_2d time = 208.7 us

Testing reduction times
-----------------------
cpu/numpy time = 29.9 ms
cuda vectorize time = 14.3 ms
sum_gpu time = 121.2 ms

It seems that the @vectorize(target="cuda")-decorated functions execute more slowly than both the CPU versions and the custom-written @cuda.jit functions, while the @cuda.jit functions deliver the expected order-of-magnitude speedup and run in nearly constant time (results not shown).

On the other hand, the @cuda.reduce function runs significantly slower than either the .reduce() method of the @vectorize ufunc or the CPU reduction.

Is there a reason why the @vectorize and @cuda.reduce functions perform so poorly? Is it possible to write a CUDA reduction kernel using only Numba?
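For context, the kind of hand-written reduction kernel I have in mind is roughly the following (a minimal sketch, assuming a fixed block size of 512 and float32 data; sum_gpu_manual and d_partial are names I made up, and the per-block partial sums still need a second pass):

@cuda.jit
def sum_gpu_manual(a, partial):
    # one shared-memory slot per thread in the block
    sm = cuda.shared.array(512, numba.float32)
    tid = cuda.threadIdx.x
    i = cuda.grid(1)
    sm[tid] = a[i] if i < a.size else 0.0
    cuda.syncthreads()
    # pairwise tree reduction within the block
    s = 256
    while s > 0:
        if tid < s:
            sm[tid] += sm[tid + s]
        cuda.syncthreads()
        s //= 2
    if tid == 0:
        partial[cuda.blockIdx.x] = sm[0]

# usage sketch:
#   threads = 512
#   blocks = (d_arr1.size + threads - 1) // threads
#   d_partial = cuda.device_array(blocks, dtype=np.float32)
#   sum_gpu_manual[blocks, threads](d_arr1, d_partial)
#   total = d_partial.copy_to_host().sum()  # or reduce d_partial again on the GPU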

EDIT:

It seems these are legitimate bugs in Numba: https://github.com/numba/numba/issues/2266 and https://github.com/numba/numba/issues/2268

1 Answer:

Answer 0 (score: 2):

I cannot explain the behavior of @vectorize with target="cuda" either; sometimes its results look a bit strange to me. For example, in Negative Speed Gain Using Numba Vectorize target='cuda', @vectorize slowed the computation down, while @cuda.jit sped it up. Here I would suggest trying PyCUDA (https://documen.tician.de/pycuda/). I tested the performance of a dot product (https://documen.tician.de/pycuda/array.html):

import numpy as np
from pycuda.curandom import rand as curand
import pycuda.gpuarray as gpuarray
import pycuda.driver as pycu
import pycuda.autoinit
from pycuda.reduction import ReductionKernel
import numba.cuda as cuda
from time import time

# map_expr computes the elementwise product, reduce_expr sums the results
dot = ReductionKernel(dtype_out=np.float32, neutral="0",
                      reduce_expr="a+b", map_expr="x[i]*y[i]",
                      arguments="float *x, float *y")

n = 2**24
x = curand((n), dtype=np.float32)
y = curand((n), dtype=np.float32)
x_cpu = np.random.random((n))
y_cpu = np.random.random((n))

st = time()
x_dot_y = dot(x, y).get()
gpu_time = (time() - st)
print("GPU: ", gpu_time)

st = time()
x_dot_y_cpu = np.dot(x_cpu, y_cpu)
cpu_time = (time() - st)
print("CPU: ", cpu_time)
print("speedup: ", cpu_time / gpu_time)

On my computer I got the following results:

GPU:  0.00191593170166
CPU:  0.0518710613251
speedup:  27.0735440518

CPU: Intel Core2 Quad 3GHz, GPU: NVIDIA GeForce GTX 580.

Note that the code above does not take into account the time needed for initialization and precompilation of the kernels. However, this time can be significant. Taking it into account, I obtained:

GPU:  0.316560029984
CPU:  0.0511090755463
speedup:  0.161451449031

So, in this case the GPU code turns out to be slower than the CPU code. At the same time, for most applications you only need to initialize the kernel once and then use it many times. In that case, using PyCUDA reduction kernels looks reasonable.
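For instance, the numerator and denominator of the dummy() step from the question could each be a ReductionKernel; a sketch along the lines of the dot kernel above (den_kernel and eps are my own names/assumptions):

den_kernel = ReductionKernel(dtype_out=np.float32, neutral="0",
                             reduce_expr="a+b", map_expr="y[i]*y[i]",
                             arguments="float *y")
eps = 1e-7  # assumed small constant, as in the question's dummy()
# dot (defined above) already computes sum(x * y) on the GPU
result = dot(x, y).get() / (den_kernel(y).get() + eps)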

Previously, I tested the performance of Numba (@cuda.jit), PyCUDA, and CUDA-C code by solving the 2D diffusion equation. I found that PyCUDA achieves almost the same performance as CUDA-C, while Numba performs worse. [figure: performance comparison of Numba, PyCUDA, and CUDA-C for the 2D diffusion equation]