Question

我正在使用 CUDA / Thrust 库进行蒙特卡罗模拟。在模拟次数达到某个数量之前，一切都运行良好；超过这个数量后，我会得到一个 bad_alloc 异常。这似乎合情合理，因为在我的代码中，模拟次数越多，需要处理的 device_vector 就越大。所以我预料到在某个规模上会出现这种异常。

我现在想做的是根据 GPU 上的可用内存为模拟次数设置一个上限。然后，我就可以把工作量拆分成若干批模拟。

所以我一直尝试在启动这组模拟之前估算问题的规模。不幸的是，当我试图通过简单的例子来理解内存的管理方式时，我得到了令人惊讶的结果。

以下是我一直在测试的代码示例：

#include <iostream>

#include <cuda.h>
#include <cuda_profiler_api.h>

#include <thrust/system_error.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>

// Queries the current device's memory and prints one "total, free" row (in bytes).
// Returns false, after reporting the CUDA error on stderr, when the query fails,
// so that uninitialised values are never printed as if they were measurements.
static bool printMemInfo()
{
    size_t freeMem = 0, totalMem = 0;
    const cudaError_t status = cudaMemGetInfo(&freeMem, &totalMem);
    if (status != cudaSuccess) {
        std::cerr << "cudaMemGetInfo failed: " << cudaGetErrorString(status) << std::endl;
        return false;
    }
    std::cout << totalMem << ", " << freeMem << std::endl;
    return true;
}

// Allocates three device vectors of increasing size (1e3, 1e5, 1e6 floats) and
// prints the device's total/free memory before the first allocation and after
// each one, so the free-memory delta per allocation can be read off the output.
// Returns 0 on success and 1 if any CUDA runtime call fails.
int main()
{
    // Same call order as before (reset, then select device 0), but the status
    // codes are no longer discarded.
    cudaError_t status = cudaDeviceReset();
    if (status == cudaSuccess)
        status = cudaSetDevice(0);
    if (status != cudaSuccess) {
        std::cerr << "CUDA device setup failed: " << cudaGetErrorString(status) << std::endl;
        return 1;
    }

    std::cout << "Total Memory | Free Memory "<< std::endl;
    if (!printMemInfo())
        return 1;

    // The vectors stay alive until main returns, so each row reflects the
    // cumulative effect of all allocations made so far.
    thrust::device_vector<float> vec1k(1000, 0.0f);
    if (!printMemInfo())
        return 1;

    thrust::device_vector<float> vec100k(100000, 0.0f);
    if (!printMemInfo())
        return 1;

    thrust::device_vector<float> vec1M(1000000, 0.0f);
    if (!printMemInfo())
        return 1;

    return 0;
}

以下是我得到的结果：

Total Memory | Free Memory
2147483648, 2080542720
2147483648, 2079494144
2147483648, 2078445568
2147483648, 2074382336

所以，基本上，

1,000个元素向量（加上所需的其他所有内容）使用1,048,576个字节
100,000元素向量也使用1,048,576字节！
1,000,000元素向量使用4,063,232字节。

我原本期望内存使用量大致随元素数量成比例增长，但在我预期“10x”的地方却只得到了“4x”，而且在1,000到100,000个元素之间这种比例关系根本不成立。

所以，我的两个问题是：

任何人都可以帮我理解这些数字吗？
如果我无法估计代码将使用的内存量，那么有什么好的策略可以确保我的程序不会超出可用内存？

修改

根据麦龙洞的评论，我尝试了两个向量：一个包含262144个浮点数（每个4字节），另一个包含262145个。不幸的是，结果看起来并不是简单的“按1MB页面分配”：

第一个向量的大小（262144个浮点数）：1048576个字节
第二个向量的大小（262145个浮点数）：1179648个字节

两者之间的增量为131072字节（或128 KB）。页面大小可变吗？这有意义吗？

Answer 1

Thrust 在内存管理上没有做任何特殊处理，默认分配器就是 cudaMalloc；您所看到的是驱动程序内存管理器的页面大小选择算法在起作用。这一行为没有文档说明，也没有任何迹象表明它在不同平台和硬件版本之间是一致的。

话虽如此，如果我把您的代码扩展成更有用的形式：

#include <iostream>
#include <vector>
#include <thrust/system_error.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>

// Prints one row "allocd, totalMem, freeMem" to stdout, where totalMem/freeMem
// are the values reported by cudaMemGetInfo for the current device.
//
//   allocd  byte count supplied by the caller; it is printed verbatim and is
//           not measured here.
//   first   when true, a column-header line is printed before the row.
//
// If cudaMemGetInfo fails, the error is written to stderr and no row is
// printed, rather than emitting uninitialised values.
void report_mem(size_t allocd, bool first=false)
{
    size_t freeMem = 0, totalMem = 0;
    const cudaError_t status = cudaMemGetInfo(&freeMem, &totalMem);
    if (status != cudaSuccess) {
        std::cerr << "cudaMemGetInfo failed: " << cudaGetErrorString(status) << std::endl;
        return;
    }

    if (first)
        std::cout << "Allocated | Total Memory | Free Memory "<< std::endl;
    std::cout << allocd << ", " << totalMem << ", " << freeMem << std::endl;
}

// Allocates batches of float device vectors (10 each of 2^14, 2^16, 2^18, 2^20
// and 2^22 elements) and, after every allocation, prints that vector's capacity
// in bytes next to the device's total/free memory via report_mem.
// Returns 0 on success and 1 if device 0 cannot be selected.
int main() 
{
    const cudaError_t status = cudaSetDevice(0);
    if (status != cudaSuccess) {
        std::cerr << "cudaSetDevice failed: " << cudaGetErrorString(status) << std::endl;
        return 1;
    }

    report_mem(0, true);

    // Element counts to allocate: nallocs copies of each power of two, in
    // increasing order (same sequence as five separate push_back loops).
    const int nallocs = 10;
    const int log2_sizes[] = {14, 16, 18, 20, 22};
    std::vector<size_t> asizes;
    for (int exponent : log2_sizes)
        asizes.insert(asizes.end(), nallocs, size_t(1) << exponent);

    typedef thrust::device_vector<float> dvecf_t;
    // Every vector is kept alive until the end of the run so that each printed
    // row reflects all allocations made so far.
    std::vector<dvecf_t*> allocs;
    allocs.reserve(asizes.size());
    for (size_t count : asizes) {
        dvecf_t* v = new dvecf_t(count);
        allocs.push_back(v);
        report_mem(v->capacity() * sizeof(float));
    }

    // Release the device vectors; previously they were never deleted.
    for (dvecf_t* v : allocs)
        delete v;
    return 0;
}

并在 Windows 64 位平台上一块计算能力为 2.1 的设备上运行它，我得到如下输出：

Allocated | Total Memory | Free Memory 
0, 1073741824, 1007849472
65536, 1073741824, 1006800896
65536, 1073741824, 1006800896
65536, 1073741824, 1006800896
65536, 1073741824, 1006800896
65536, 1073741824, 1006800896
65536, 1073741824, 1006800896
65536, 1073741824, 1006800896
65536, 1073741824, 1006800896
65536, 1073741824, 1006800896
65536, 1073741824, 1006800896
262144, 1073741824, 1005752320
262144, 1073741824, 1005752320
262144, 1073741824, 1005752320
262144, 1073741824, 1005752320
262144, 1073741824, 1004703744
262144, 1073741824, 1004703744
262144, 1073741824, 1004703744
262144, 1073741824, 1004703744
262144, 1073741824, 1003655168
262144, 1073741824, 1003655168
1048576, 1073741824, 1002606592
1048576, 1073741824, 1001558016
1048576, 1073741824, 1000509440
1048576, 1073741824, 999460864
1048576, 1073741824, 998412288
1048576, 1073741824, 997363712
1048576, 1073741824, 996315136
1048576, 1073741824, 995266560
1048576, 1073741824, 994217984
1048576, 1073741824, 993169408
4194304, 1073741824, 988975104
4194304, 1073741824, 984780800
4194304, 1073741824, 980586496
4194304, 1073741824, 976392192
4194304, 1073741824, 972197888
4194304, 1073741824, 968003584
4194304, 1073741824, 963809280
4194304, 1073741824, 959614976
4194304, 1073741824, 955420672
4194304, 1073741824, 951226368
16777216, 1073741824, 934449152
16777216, 1073741824, 917671936
16777216, 1073741824, 900894720
16777216, 1073741824, 884117504
16777216, 1073741824, 867340288
16777216, 1073741824, 850563072
16777216, 1073741824, 833785856
16777216, 1073741824, 817008640
16777216, 1073741824, 800231424

我对此的解读是：在我用来测试的这个平台上，分配粒度为 1MiB（1048576 即 2^20 字节）。您的平台可能会有所不同。

了解 Thrust（CUDA）内存使用情况

1 个答案: