Question

我正在尝试利用计算着色器提供的并行性来操纵任意数据。我已经建立了一个示例场景，可以进行一些实验。特别是我想做的是以下事情：

我有一张黑白图片，以及与之对应的JSON序列化数据，其中像素存储在float数组中。如果像素为白色，则值为1.0，否则为0.0。
我使用计算缓冲区将float数组提供给计算着色器
着色器为数组中的每个单元分配一个线程（因此为图像中的每个像素分配一个线程）
每个线程读取其单元格/像素的值：
- 如果值为“1.0”，则必须重新遍历整个数组，统计所有值为“1.0”的单元格数量，并将计数结果存储到输出缓冲区中与该线程对应的单元格。

该算法在输入不超过某个特定大小时可以正常工作：图像尺寸最大为400x400（数组大小为160000），超过这个大小就会崩溃。

我的系统的规格是：

CPU：Intel Core i7-4700mq cpu @ 2.40ghz
GPU：具有GDDR5 2GB VRAM的NVIDIA GeForce GT 750M
RAM：8GB DDR3
HDD：256GB SSD
操作系统：Windows 10
DirectX11

我试图在一台配备GTX970的更强大的台式机上运行代码，虽然它可以处理更大的输入，最大图像尺寸为500x500（数组大小为250000），但超过这个大小后同样会崩溃。

我已经查看了Unity的日志文件，当它崩溃时，充满了以下错误消息：

d3d11：无法在GfxDeviceD3D11中创建2D纹理
d3d11：创建缓冲区失败（目标0x1模式0大小为960）[0x887A0005]
断言失败，表达式：“SUCCEEDED(hr)”

我也尝试过用RenderDoc启动该Unity项目（运行示例场景）的编译版本，并捕获执行Dispatch调用的那一帧，但它给出了以下错误：“RenderDoc无法打开捕获进行重放：在API级别重放捕获失败。”我认为它无法捕获该帧是因为DirectX11崩溃了。

这是分派计算着色器的C#脚本的相关部分。

using System.Collections;
using System.Collections.Generic;
using System.IO;
using UnityEngine;

/// <summary>
/// Test harness that uploads a serialized float array to the GPU, runs the
/// "CSMain" kernel of the "Shaders/ComputeShader1" compute shader over it with
/// one thread per cell, reads the per-cell results back and saves them.
/// </summary>
public class ComputeShaderTest1 : MonoBehaviour
{
    public TextAsset inputTextureData;
    private SerializableTextureData deserializedInputTextureData;
    private ComputeShader computeShader;
    private ComputeBuffer inputDataBuffer;
    private float[] outputValuesData;
    private ComputeBuffer outputDataBuffer;

    // Use this for initialization
    void Start()
    {
        // Fail with a readable message instead of a NullReferenceException
        // when the asset has not been assigned in the inspector.
        if (inputTextureData == null)
        {
            Debug.LogError("inputTextureData is not assigned");
            return;
        }

        deserializedInputTextureData = JsonUtility.FromJson<SerializableTextureData>(inputTextureData.text);

        computeShader = Resources.Load<ComputeShader>("Shaders/ComputeShader1");

        if (computeShader == null)
        {
            Debug.LogError("computeShader not found in the specified path");
        }
        else
        {
            compute();
        }
    }

    /// <summary>
    /// Allocates the input/output buffers, dispatches enough thread groups to
    /// cover every cell of the input, reads the results back into
    /// <c>outputValuesData</c> and hands them to the save routines.
    /// </summary>
    private void compute()
    {
        int width = deserializedInputTextureData.width;
        int height = deserializedInputTextureData.height;

        // A ComputeBuffer cannot be created with a non-positive element count.
        if (width <= 0 || height <= 0)
        {
            Debug.LogError("Input data has invalid dimensions: " + width + "x" + height);
            return;
        }

        int inputDataSize = width * height;

        int csMain = computeShader.FindKernel("CSMain");

        if (csMain < 0)
        {
            Debug.LogError("Initialization failed.");
            return;
        }

        uint threadGroupSizeX, threadGroupSizeY, threadGroupSizeZ;
        computeShader.GetKernelThreadGroupSizes(csMain, out threadGroupSizeX, out threadGroupSizeY, out threadGroupSizeZ);

        // Ceiling division: a partially filled group is still dispatched so the
        // right/bottom edges of the input are covered.
        int groupsX = (width + (int)threadGroupSizeX - 1) / (int)threadGroupSizeX;
        int groupsY = (height + (int)threadGroupSizeY - 1) / (int)threadGroupSizeY;
        int groupsZ = 1;

        // Do not leak GPU buffers if this method ever runs more than once.
        releaseBuffers();

        inputDataBuffer = new ComputeBuffer(inputDataSize, sizeof(float));
        inputDataBuffer.SetData(deserializedInputTextureData.data);
        computeShader.SetBuffer(csMain, "InputDataBuffer", inputDataBuffer);
        computeShader.SetInt("InputDataWidth", width);
        computeShader.SetInt("InputDataHeight", height);

        outputDataBuffer = new ComputeBuffer(inputDataSize, sizeof(float));
        computeShader.SetBuffer(csMain, "OutputDataBuffer", outputDataBuffer);

        Debug.Log("Dispatching [" + groupsX + "," + groupsY + "," + groupsZ + "] groups");

        outputValuesData = new float[inputDataSize];

        var watch = System.Diagnostics.Stopwatch.StartNew();

        computeShader.Dispatch(csMain, groupsX, groupsY, groupsZ);

        // Dispatch only queues the work on the GPU; the read-back is the point
        // where the CPU actually waits for the kernel to finish. The timer must
        // therefore be stopped after GetData, otherwise it measures only the
        // cost of submitting the command.
        outputDataBuffer.GetData(outputValuesData);

        watch.Stop();

        Debug.Log("Compute Shader Execution Completed. Time elapsed (ns): " + watch.Elapsed.TotalMilliseconds * 1000000);

        // NOTE(review): these two methods are not part of the visible excerpt;
        // presumably they are defined in the full script.
        saveOutDataAsJSON();
        saveOutDataAsTexture2D();
    }

    // Disposes both GPU buffers (if allocated) and clears the references so a
    // second call is harmless.
    private void releaseBuffers()
    {
        if (inputDataBuffer != null)
        {
            inputDataBuffer.Dispose();
            inputDataBuffer = null;
        }

        if (outputDataBuffer != null)
        {
            outputDataBuffer.Dispose();
            outputDataBuffer = null;
        }
    }

    void OnDestroy()
    {
        releaseBuffers();
    }
}

这是计算着色器本身：

// Ask the compiler to keep D3D11 debug symbols for this shader.
#pragma enable_d3d11_debug_symbols
// Expose CSMain as a compute kernel entry point.
#pragma kernel CSMain

// Read-only input: one float per cell, addressed as y * InputDataWidth + x.
StructuredBuffer<float> InputDataBuffer;
// Dimensions of the input grid; CSMain uses them for bounds checks and indexing.
uint InputDataWidth;
uint InputDataHeight;

// Read/write output: CSMain stores one float result per cell at the same index.
RWStructuredBuffer<float> OutputDataBuffer;

// One thread per input cell. A thread whose cell holds 1 counts every cell of
// the input that holds 1 and writes that count to its output cell; every other
// in-range thread writes 0. Threads outside the input grid do nothing.
//
// The count is kept in a local variable and written to OutputDataBuffer once,
// instead of doing a read-modify-write on the output buffer for every matching
// cell. The input is scanned in a single linear pass (indices 0 .. W*H-1),
// which visits exactly the same cells as the nested width/height loops.
// An integer counter is used so the count stays exact until the final
// conversion to float (a float incremented by 1 stops growing at 2^24).
//
// NOTE(review): every thread of a set cell still scans the whole input, so the
// total work grows quadratically with the input size. A dispatch that runs for
// too long may presumably be aborted by the OS GPU watchdog — confirm.
[numthreads(32, 32, 1)]
void CSMain(uint3 groupID : SV_GroupID,
   uint3 groupThreadID : SV_GroupThreadID,
   uint groupIndex : SV_GroupIndex,
   uint3 id : SV_DispatchThreadID)
{
    // Threads of partially filled edge groups fall outside the input.
    if (id.x >= InputDataWidth || id.y >= InputDataHeight)
    {
        return;
    }

    uint cellCount = InputDataWidth * InputDataHeight;

    // Each thread is mapped to a single "pixel" of the input
    uint index = id.y * InputDataWidth + id.x;

    uint matches = 0;

    if (InputDataBuffer[index] == 1)
    {
        for (uint k = 0; k < cellCount; k++)
        {
            if (InputDataBuffer[k] == 1)
            {
                matches++;
            }
        }
    }

    // Single write to global memory per thread.
    OutputDataBuffer[index] = (float)matches;
}

我知道这段代码完全没有经过优化，而且冗余、没有必要，但它只是一个测试用例，用于弄清楚为什么当输入增大时它会以这种方式崩溃。

由于每个线程都必须遍历整个数组，因此我预计性能会随输入规模呈二次方下降，再加上管理线程本身所需的资源也会给系统带来一些开销。

我不明白的是为什么它崩溃了，为什么只有在输入达到一定大小时才崩溃。

计算缓冲区是否有大小限制？我认为即使有限制，它们能够存储的数据量也应该远多于我要处理的数据。

如果您有兴趣查看完整的代码，我已经制作了一个开放的git repo：https://github.com/MichelangeloDiamanti/Compute-Shader-Tests

缓冲区太大时，计算着色器会崩溃

0 个答案: