Question

我针对背包问题实现了经典动态规划算法的两个版本：顺序的和并行的。当容量较小（<500 000）时，并行版本的速度快将近两倍。但是，随着容量的增加（> 1 000 000），并行版本变得比顺序版本慢。为什么？有办法解决吗？

#include <vector>
#include <thread>
#include <random>
#include <chrono>
#include <iostream>
#include <cstdint>

struct Item { int64_t w, p; };
struct Instance { std::vector<Item> items; int64_t c; };

int64_t opt_bellman_array(const Instance& ins)
{
    int64_t n = ins.items.size();
    std::vector<int64_t> values(ins.c + 1, 0);
    for (int64_t j=0; j<n; ++j)
        for (int64_t x=ins.c; x>=ins.items[j].w; x--)
            if (values[x] < values[x - ins.items[j].w] + ins.items[j].p)
                values[x] = values[x - ins.items[j].w] + ins.items[j].p;
    return values[ins.c];
}

/******************************************************************************/

void opts_bellmanpar_array(const Instance& ins,
        int64_t n1, int64_t n2, std::vector<int64_t>::iterator values)
{
    for (int64_t j=n1; j<n2; ++j)
        for (int64_t x=ins.c; x>=ins.items[j].w; x--)
            if (*(values + x) < *(values + x - ins.items[j].w) + ins.items[j].p)
                *(values + x) = *(values + x - ins.items[j].w) + ins.items[j].p;
}

int64_t opt_bellmanpar_array(const Instance& ins)
{
    int64_t n = ins.items.size();
    if (n == 0) {
        return 0;
    } else if (n == 1) {
        return ins.items[0].p;
    }

    int64_t k = (n - 1) / 2 + 1;

    std::vector<int64_t> values1(ins.c + 1, 0);
    std::thread thread(opts_bellmanpar_array, std::ref(ins), 0, k, values1.begin());
    std::vector<int64_t> values2(ins.c + 1, 0);
    opts_bellmanpar_array(std::ref(ins), k, n, values2.begin());
    thread.join();

    int64_t opt = -1;
    for (int64_t c1=0; c1<=ins.c; ++c1) {
        int64_t z = values1[c1] + values2[ins.c - c1];
        if (opt < z)
            opt = z;
    }
    return opt;
}

/******************************************************************************/

int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    int64_t n = 500;
    int64_t r = 10000;
    Instance ins;
    ins.c = 200000;
    //ins.c = 2000000;

    ins.items = std::vector<Item>(n);
    std::default_random_engine generator;
    std::uniform_int_distribution<int64_t> d(1, r);
    for (int64_t i=0; i<n; ++i) {
        ins.items[i].w = d(generator);
        ins.items[i].p = d(generator);
    }

    auto t1 = std::chrono::high_resolution_clock::now();
    opt_bellman_array(ins);
    auto t2 = std::chrono::high_resolution_clock::now();
    opt_bellmanpar_array(ins);
    auto t3 = std::chrono::high_resolution_clock::now();

    std::chrono::duration<double> timespan1 = std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1);
    std::chrono::duration<double> timespan2 = std::chrono::duration_cast<std::chrono::duration<double>>(t3 - t2);

    std::cout << "seq " << timespan1.count() << std::endl;
    std::cout << "par " << timespan2.count() << std::endl;
    return 0;
}

编译命令：

g++ -O3 test.cpp -o test -lpthread

ins.c的输出= 200000

seq 0.071924
par 0.0388827

ins.c = 2000000的输出

seq 0.975181
par 1.08307

大量并行执行会降低效率

0 个答案: