如何提高此OpenCL减少内核代码的性能?

时间:2017-05-18 19:07:37

标签: c++ performance opencl

我编写的代码负责对大量数据执行归约。虽然代码在逻辑上是正确的,但事实证明它比对相同数据直接调用 std::accumulate 或 std::max_element 还要慢,因此我希望得到一些关于如何提高这段代码性能的见解。

这些是我得到的结果。请注意,即使执行内核的原始时间也比我的数据的简单CPU减少要慢。

Select which Device to use: 
0:                Cedar (AMD Accelerated P... - OpenCL 1.2 AMD-AP...)
1:                Cedar (AMD Accelerated P... - OpenCL 1.2 AMD-AP...)
2:         Intel(R) ... (AMD Accelerated P... - OpenCL 1.2 AMD-AP...)
3:         Intel(R) ... (Experimental Open... - OpenCL 2.0 (Build...)
Device: Cedar
Platform: AMD Accelerated Parallel Processing
Num of compute units: 8
Work Group Size: 128
i = 9419918
Internal Duration:    95609555ns //Time to run just the kernel, no setup
Num of Work Groups to sum up: 78125
Reduced Value was detected to be:    -5.06886
(Index):                             1008460
Value at index is:                   -5.06886
Kernel Duration:     153748214ns //Includes copying of data, excludes building of kernel
Counting manually, Reduced Value is: -5.06886
(Index of):                          1008460
Value at index is:                   -5.06886
Manual Duration:      48173322ns //CPU runtime using std::max_element.
Press any key to continue . . . 

通过连接所有这四个文件构建内核代码:

expand.cl

R"D(
// Expand a scalar type name into typedefs for the matching OpenCL vector
// widths (e.g. EXPAND(float) -> Scalar=float, Vector2=float2, ...,
// Vector16=float16). Uses ## token pasting; kept as a separate .cl fragment
// so the kernel source can be specialized for any scalar type at build time.
#define EXPAND(type) \
typedef     type        Scalar;\
typedef     type ## 2   Vector2;\
typedef     type ## 4   Vector4;\
typedef     type ## 8   Vector8;\
typedef     type ## 16  Vector16;
)D"

float.cl

R"D(
// Specialization of the reduction kernel source for float.
EXPAND(float);

// Extremal / zero constants for this scalar type.
// NOTE: these macros previously ended in a stray ';', which injected an
// empty declaration at every expansion site (e.g. "= -INFINITY;;").
// The semicolon belongs at the use site, not inside the macro.
#define SCALAR_MAXIMUM INFINITY
#define SCALAR_MINIMUM (-INFINITY)
#define SCALAR_ZERO 0
)D"

max.cl

R"D(
// Identity element for the max-reduction: every value compares greater
// than -infinity, so padding with it can never win.
constant Scalar IDENTITY = SCALAR_MINIMUM;

// Select the larger of (a, b) together with its index.
// Wrapped in do { ... } while(0) so the macro expands to a single
// statement and cannot break if/else nesting at the call site
// (the dangling-else hazard of a bare if/else macro).
#define REDUCE_IMPL(a, b, indexa, indexb, reduced_value, reduced_index) \
do {\
    if(a > b) {\
        reduced_value = a;\
        reduced_index = indexa;\
    } else {\
        reduced_value = b;\
        reduced_index = indexb;\
    }\
} while(0)
)D"

Reduction Main.cl:

R"D(
// Work-group max-reduction over `a`, tracking the winning element's index.
// Each work-group reduces work_group_size elements in local memory and
// writes one (value, index) pair per group; the host then finishes the
// reduction over the per-group partial results.
//
// a             input data (padded by the host to a whole number of groups)
// output        one partial maximum per work-group
// scratch       local memory, one Scalar per work-item
// index_output  one winning index per work-group
// index_scratch local memory, one long per work-item
// size          number of valid elements in `a`
kernel void reduce(global Scalar * a, global Scalar * output, local Scalar * scratch, global long * index_output, local long * index_scratch, long size) {
    size_t gid = get_global_id(0);
    size_t lid = get_local_id(0);
    size_t wid = get_group_id(0);
    size_t gsize = get_global_size(0);
    size_t lsize = get_local_size(0);
    size_t wsize = get_num_groups(0);

    // Stage one element per work-item into local memory; the tail of the
    // last group (gid >= size) is padded with the identity so it can
    // never be selected as the maximum.
    if(gid < size) {
        scratch[lid] = a[gid];
        index_scratch[lid] = gid;
    } else {
        scratch[lid] = IDENTITY;
        index_scratch[lid] = -1;
    }

    barrier(CLK_LOCAL_MEM_FENCE);
    // Tree reduction in local memory: halve the active range each pass,
    // with a barrier between passes. NOTE(review): every pass round-trips
    // through local memory and idles half the remaining work-items — each
    // work-item handling only one element is the main performance cost
    // identified in the answer below.
    for(size_t offset = lsize / 2; offset > 0; offset >>= 1) {
        if(lid < offset) {
            size_t indexa = index_scratch[lid];
            size_t indexb = index_scratch[lid + offset];
            Scalar a = scratch[lid];
            Scalar b = scratch[lid + offset];

            Scalar reduced_value;
            size_t reduced_index;

            REDUCE_IMPL(a, b, indexa, indexb, reduced_value, reduced_index);

            scratch[lid] = reduced_value;
            index_scratch[lid] = reduced_index;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Work-item 0 publishes this group's partial result.
    if(lid == 0) {
        output[wid] = scratch[0];
        index_output[wid] = index_scratch[0];
    }
}
)D"

CL Reduction.h perform_reduction

// Launch the GPU reduction over `values` and return a future that finishes
// the work: it uploads the input, runs the kernel (one partial result per
// work-group), reads the partials back, and folds them on the CPU.
// NOTE(review): the async lambda captures `this` and uses the shared
// `queue`/`kernel` members — concurrent calls would race on kernel.setArg;
// presumably single-threaded use is assumed — verify.
std::future<result> perform_reduction(std::vector<T> const& values) {
    cl_long size = values.size();
    // Round the global work size up to a whole number of work-groups.
    uint64_t num_of_work_groups = size / work_group_size;
    int64_t global_size = work_group_size * num_of_work_groups;
    if (global_size < size) {
        num_of_work_groups++;
        global_size = work_group_size * num_of_work_groups;
    }
    cl::Buffer input_buffer(context, CL_MEM_READ_ONLY, global_size * sizeof(T), nullptr);
    std::vector<cl::Event> write_events(1);
    // Non-blocking upload; the kernel launch below waits on write_events.
    queue.enqueueWriteBuffer(input_buffer, false, 0, size * sizeof(T), values.data(), nullptr, &write_events.back());
    if (global_size != size) {
        // Pad the rounded-up tail with the reduction identity so the extra
        // work-items cannot affect the result.
        write_events.emplace_back();
        queue.enqueueFillBuffer(input_buffer, reduction::identity<T>(), size * sizeof(T), (global_size - size) * sizeof(T), nullptr, &write_events.back());
    }
    return std::async([size, num_of_work_groups, global_size, input_buffer, write_events, this] {
        // One partial value and one partial index per work-group.
        cl::Buffer output_buffer( context, CL_MEM_WRITE_ONLY, num_of_work_groups * sizeof(T) );
        cl::Buffer output_index_buffer(context, CL_MEM_WRITE_ONLY, num_of_work_groups * sizeof(cl_long));
        kernel.setArg(0, input_buffer);
        kernel.setArg(1, output_buffer);
        kernel.setArg(2, sizeof(T) * work_group_size, nullptr);   // local scratch for values
        kernel.setArg(3, output_index_buffer);
        kernel.setArg(4, sizeof(cl_long) * work_group_size, nullptr); // local scratch for indexes
        kernel.setArg(5, size);

        std::vector<cl::Event> kernel_event;
        kernel_event.emplace_back();
        queue.enqueueNDRangeKernel(kernel, {}, { uint64_t(global_size) }, { work_group_size }, &write_events, &kernel_event.back());
        std::vector<T> results;
        std::vector<int64_t> indexes;
        results.resize(num_of_work_groups);
        indexes.resize(num_of_work_groups);
        // Non-blocking reads ordered after the kernel; finish() below makes
        // both results visible before they are used.
        queue.enqueueReadBuffer(output_buffer, false, 0, num_of_work_groups * sizeof(T), results.data(), &kernel_event);
        queue.enqueueReadBuffer(output_index_buffer, false, 0, num_of_work_groups * sizeof(cl_long), indexes.data(), &kernel_event);
        queue.finish();
        // Kernel-only time from the profiling counters (excludes transfers).
        std::cout << "Internal Duration: " << std::setw(11) << (kernel_event[0].getProfilingInfo<CL_PROFILING_COMMAND_END>() - kernel_event[0].getProfilingInfo<CL_PROFILING_COMMAND_START>()) << "ns" << std::endl;
        std::cout << "Num of Work Groups to sum up: " << num_of_work_groups << std::endl;
        // Fold the per-group partials on the CPU.
        result t{ reduction::identity<T>(), 0 };
        for (size_t i = 0; i < results.size(); i++) {
            T const& val = results[i];
            size_t const& index = indexes[i];
            t = reduction::reduce(t.reduced_value, val, t.reduced_index, index);
        }
        return t;
    });
}

Reduction Main.cpp:

#define _HAS_AUTO_PTR_ETC 1
#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <limits>
#include <list>
#include <memory>
#include <numeric>
#include <random>
#include <stdexcept>
#include <utility>
#include <vector>

#include "CL Reduction.h"

// Clip `text` to at most `max_length` characters, replacing the tail with
// "..." when it does not fit. Strings strictly shorter than the limit are
// returned unchanged (a string of exactly `max_length` is still clipped,
// matching the original behavior).
//
// Fixes an unsigned-underflow bug: the original computed `limit - 3`
// unconditionally, which wraps around for limits below 3 and produced a
// result LONGER than the requested limit.
std::string limit(std::string text, size_t max_length) {
    if (text.size() < max_length) return text; // fits; parameter is implicitly moved
    if (max_length <= 3) return text.substr(0, max_length); // no room for "..."
    return text.substr(0, max_length - 3) + "...";
}

// Enumerate every OpenCL device across all platforms, print a selection
// menu (name / platform / version, each clipped to 20 columns), and return
// the device the user picks.
// Fixes: the original indexed all_devices with unvalidated std::cin input —
// undefined behavior on a non-numeric or out-of-range entry — and would
// index an empty vector when no devices were found.
cl::Device choose_device() {
    std::vector<cl::Device> all_devices;
    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);
    for (cl::Platform const& platform : platforms) {
        std::vector<cl::Device> devices;
        platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
        all_devices.insert(all_devices.end(), devices.begin(), devices.end());
    }
    if (all_devices.empty())
        throw std::runtime_error("no OpenCL devices found");

    std::cout << "Select which Device to use: " << std::endl;
    for (size_t i = 0; i < all_devices.size(); i++) {
        cl::Device const& device = all_devices[i];
        std::cout << i;
        std::cout << ": ";
        std::cout << std::setw(20) << limit(device.getInfo<CL_DEVICE_NAME>(), 20);
        std::cout << " (";
        std::cout << std::setw(20) << limit(cl::Platform{ device.getInfo<CL_DEVICE_PLATFORM>() }.getInfo<CL_PLATFORM_NAME>(), 20);
        std::cout << " - ";
        std::cout << std::setw(20) << limit(device.getInfo<CL_DEVICE_VERSION>(), 20);
        std::cout << ")";
        std::cout << std::endl;
    }
    size_t chosen;
    // Re-prompt until the user enters a valid in-range index.
    while (!(std::cin >> chosen) || chosen >= all_devices.size()) {
        std::cin.clear();
        std::cin.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
        std::cout << "Invalid selection, enter 0-" << (all_devices.size() - 1) << ": ";
    }
    return all_devices[chosen];
}

// Benchmark driver: generates ten million floats that differ by one ULP
// each, shuffles them, runs the OpenCL max-reduction, and compares the
// result and timing against std::max_element on the CPU.
int main() {
    using type = float;
    using reduction_type = cl_reduction_type::reduction_type<cl_reduction_type::type::maximum>;
    using datatype = cl_datatype::datatype<type>;
    using context_t = cl_reduction::reduction_context<datatype, reduction_type>;
    std::ofstream err_log{ "err.txt" };

    cl::Device device = choose_device();

    try {
        cl_reduction::reduction_context<datatype, reduction_type> context{ { device }, err_log };
        std::vector<type> values;
        auto last_ping = std::chrono::steady_clock::now();
        std::default_random_engine engine{ std::random_device{}() };
        std::uniform_real_distribution<type> distribution{ -100.f, 100.f };
        //std::uniform_int_distribution<type> distribution(1, 500);
        values.resize(10'000'000ull);
        //values.resize(10'000);
        // Fill with strictly increasing floats (one ULP apart) starting from
        // a random base, so the maximum is unique and its index verifiable.
        type start = distribution(engine);
        for (size_t i = 0; i < values.size(); i++) {
            values[i] = start;
            start = std::nextafter(start, std::numeric_limits<type>::infinity());
            // Progress ping at most once per second while generating.
            if (std::chrono::steady_clock::now() - last_ping > std::chrono::seconds(1)) {
                std::cout << "i = " << i << '\r';
                last_ping += std::chrono::seconds(1);
            }
        }

        std::shuffle(values.begin(), values.end(), engine);

        // Time the full GPU path: buffer setup + kernel + readback + CPU fold.
        auto begin = std::chrono::steady_clock::now();
        auto future = context.perform_reduction(values);
        context_t::result t;
        try {
            t = future.get();
        }
        catch (cl::Error const& e) {
            err_log << e.what() << std::endl;
            err_log << e.err() << std::endl;

        }
        auto end = std::chrono::steady_clock::now();

        std::cout << "Reduced Value was detected to be:    " << t.reduced_value << std::endl;
        std::cout << "(Index):                             " << t.reduced_index << std::endl;
        std::cout << "Value at index is:                   " << values[t.reduced_index] << std::endl;
        std::cout << "Kernel Duration:   " << std::setw(11) << (end - begin).count() << "ns" << std::endl;
        // Reference CPU reduction for correctness and timing comparison.
        begin = std::chrono::steady_clock::now();
        //auto value = std::accumulate(values.begin(), values.end(), type(0));
        auto it = std::max_element(values.begin(), values.end());
        auto index = std::distance(values.begin(), it);
        auto value = values[index];
        end = std::chrono::steady_clock::now();
        std::cout << "Counting manually, Reduced Value is: " << value << std::endl;
        std::cout << "(Index of):                          " << index << std::endl;
        std::cout << "Value at index is:                   " << values[index] << std::endl;
        std::cout << "Manual Duration:   " << std::setw(11) << (end - begin).count() << "ns" << std::endl;
    }
    catch (cl::Error const& e) {
        std::cerr << e.what() << ':' << e.err() << std::endl;
        // A too-large buffer request is the most likely failure; report the
        // device's allocation ceiling to aid debugging.
        if (e.err() == CL_INVALID_BUFFER_SIZE)
            std::cerr << device.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>() << std::endl;
    }
    system("pause"); // NOTE(review): Windows-only; non-portable.
    return 0;
}

我已经把整个代码库放在了 here(Github Repo),其中包括所使用的三个头文件和主函数("CL Datatype.h"、"CL Reduction Type.h"、"CL Reduction.h"、"Reduction Main.cpp")。我在这篇文章中只包含了我认为相关的代码;如果您认为问题出在其他地方,可以结合 Github Repo 指出。

1 个答案:

答案 0 :(得分:1)

使用 Vector4 a = vload4(...) 读取您的输入,并通过 .xyzw 访问各分量。您也可以尝试使用 vload8 进行 8 路向量化。

不要使用 a > b,而应使用 isgreater(a, b),并配合 any、all 和 select 等内建函数。

每个循环执行多次减少以将其保留在寄存器中并减少本地内存的带宽。对于128的工作组大小和4的向量大小,第一个线程将使用512-515减少0-3,然后使用1024-1027等减少,然后使用vstore4写入本地内存。尝试不同的内环尺寸。

尽可能地,你不希望线程无所事事。内核应该只是从全局内存减少到寄存器一次,存储到本地内存然后在一个线程从内部减少到内核的单个值之前同步线程并将其存储在全局内存中。最后,您可以对CPU进行最后一次相对较小的减少。此级别仅包含每个工作组中的一个值:total_size / (work_group_size = 128) / (vector_size = 4) / (inner_loop_size = 16)