Why do pthreads make my code slower?

Date: 2018-06-21 09:19:53

Tags: c++ performance pthreads genetic-algorithm

I'm new to pthreads and I wrote this code as a test. I don't understand why the code runs faster with only 1 pthread than with multiple pthreads. This code is the setup part of a genetic algorithm for solving the TSP. I have 3 linear arrays (city_x, city_y, city_id) that hold the data:

  • one for x
  • one for y
  • one for the id of each city

These arrays are linearized: they represent the elements of the population, and each element has NUM_CITIES entries of x, y and id data. So if we have:

  • 3 elements in the population
  • NUM_CITIES = 10 per element
  • then each array holds 3 * 10 = 30 entries in total

The code takes the number of population elements as input, sets some coordinates in the city_set arrays, and fills the global arrays with the x, y and id data of every element of the whole population.

#include <pthread.h>

#include <limits> // std::numeric_limits<double>
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>
#include <time.h>      // time(), used to seed srand
#include <utility>
//#include <math.h>
#include <algorithm>    // std::lower_bound, std::find
#include <random>
#include <cmath> 
#include <cstring>
#include <iomanip>      // std::setprecision
#include <vector>       // std::vector

#define NUM_CITIES 10  // This is a tour for the LIN105. It has length 14379.
// #define SIZE_POP 100000000
#define SIZE_MATING 3
#define MUTATION_RATE 0.03
#define STALL_LIMIT 10

// shared variables
long size_pop = 0;
long tot_elem = 0;
const int num_threads = 24;
int tid[num_threads];
int start[num_threads];
int stop[num_threads];

// cities
int city_set_x[NUM_CITIES];
int city_set_y[NUM_CITIES];
int city_set_id[NUM_CITIES];

// population elements
int *city_x;
int *city_y;
int *city_id;

void *setup(void *p) {

    int id = *(int *)p;
    // std::cout << "id: " << id << "\n";

    int s = start[id];

    int perm[NUM_CITIES];
    for(int i = 0; i < NUM_CITIES; ++i) {
        perm[i] = i;
        // std::cout << perm[i] << ",";
    }

    for(long i = start[id]; i < stop[id]; i += NUM_CITIES) {
        std::random_shuffle ( perm, perm + NUM_CITIES );

        for(int j = 0; j < NUM_CITIES; ++j) {
            city_id[i + j] =  perm[j];
            city_x[i + j] =  city_set_x[perm[j]];
            city_y[i + j] =  city_set_y[perm[j]];
            // std::cout << "(" << city_x[i + j] << "," << city_y[i + j] << ") ";
        }
        // std::cout << "\n";
    }

    return NULL;
}


static inline double diffmsec(const struct timeval & a, 
                              const struct timeval & b) {
    long sec  = (a.tv_sec  - b.tv_sec);
    long usec = (a.tv_usec - b.tv_usec);

    if(usec < 0) {
        --sec;
        usec += 1000000;
    }
    return ((double)(sec*1000)+ (double)usec/1000.0);
}

int main(int argc, char *argv[]) {

    if(argc < 2) {
        fprintf(stderr, "usage: %s size_pop\n", argv[0]);
        return 1;
    }
    size_pop = atol(argv[1]);

    std::cout << size_pop << "\n";

    tot_elem = NUM_CITIES * size_pop;
    std::cout << "tot_elem: " << tot_elem << "\n";

    struct timeval program_start, program_end, setup_start, setup_end;

    std::vector<double> v_set;

    city_x = (int *)malloc(tot_elem * sizeof(int));
    // memset(city_x, -1, tot_elem * sizeof(int));
    city_y = (int *)malloc(tot_elem * sizeof(int));
    // memset(city_y, -1, tot_elem * sizeof(int));
    city_id = (int *)malloc(tot_elem * sizeof(int));
    for(int i = 0; i < tot_elem; ++i) {
        city_x[i] = -1;
        city_y[i] = -1;
        city_id[i] = -1;
    }

    srand(time(NULL));

    int x[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
    int y[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};


    // print
    std::cout << "[CITTA.X]\n";
    for(int i = 0; i < NUM_CITIES; ++i) {

        city_set_x[i] = x[i];
        // city_set[i].x = i + 1;
        std::cout << city_set_x[i] << " ";
    }
    std::cout << "\n";

    std::cout << "[CITTA.Y]\n";
    for(int i = 0; i < NUM_CITIES; ++i) {

        city_set_y[i] = y[i];
        // city_set[i].y = i + 1;
        std::cout << city_set_y[i] << " ";
    }
    std::cout << "\n";

    std::cout << "[CITTA.ID]\n";
    for(int i = 0; i < NUM_CITIES; ++i) {

        city_set_id[i] = i;
        std::cout << city_set_id[i] << " ";
    }
    std::cout << "\n";

    // std::cin.get() != '\n';

    pthread_t threads[num_threads];

    for(int i = 0; i < num_threads; ++i) {
        tid[i] = i;
        start[i] = i * NUM_CITIES * floor(size_pop/num_threads);
        // std::cout << "start: " << start << "\n";
        if(i != num_threads - 1) {
            stop[i] = start[i] + (floor(size_pop/num_threads) * NUM_CITIES);
            // std::cout << "stop: " << stop << "\n";
        }
        else {
            stop[i] = tot_elem;
            // std::cout << "stop: " << stop << "\n";
        }
        // std::cout << "\n";
    }

    for(int c = 0; c < 10; c++) {

        gettimeofday(&setup_start, NULL);

        for(int i = 0; i < num_threads; ++i) {
            if( pthread_create( &threads[i], NULL, &setup, (void *) &tid[i]) )
            {
              printf("Thread creation failed\n");
            }
        }

        for(int i = 0; i < num_threads; ++i) {
            pthread_join( threads[i], NULL);
        }

        gettimeofday(&setup_end, NULL);
        v_set.push_back(diffmsec(setup_end, setup_start) / 1000);
    }

    // // print
    // std::cout << "[SETUP]\n";
    // for(int i = 0; i < size_pop; ++i){
    //  long idx = i * NUM_CITIES;
    //  std::cout << "pop[" << i << "]: ";
    //  for(int j = 0; j < NUM_CITIES; ++j){
    //      std::cout << "(" << city_x[idx + j] << "," << city_y[idx + j] << ") ";
    //  }
    //  std::cout << "\n";
    // }

    double sum = 0;
    double mean;


    sum = 0;
    for (size_t i = 0; i < v_set.size(); ++i) {
        sum += v_set[i];
    }
    mean = sum / v_set.size();
    std::cout << "[SET]: " << mean << " s\n";

    free(city_x);
    free(city_y);
    free(city_id);

}

I ran the code with 1000000 elements and the number of threads set to 1, and the result was 0.332 s. Running with 1000000 elements but 4 threads, the result was 1.361 s. If I increase the number to 24, the result is 0.60 s, still twice the sequential time! Past 24 threads the result stays the same or increases again.

EDIT

Using: grep -c processor /proc/cpuinfo

I get 56.

Using: cat /proc/cpuinfo

processor       : 0
vendor_id       : GenuineIntel
cpu family      : 6
model           : 79
model name      : Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz
stepping        : 1
microcode       : 0xb00001e
cpu MHz         : 1967.906
cache size      : 35840 KB
physical id     : 0
siblings        : 28
core id         : 0
cpu cores       : 14
apicid          : 0
initial apicid  : 0
fpu             : yes
fpu_exception   : yes
cpuid level     : 20
wp              : yes
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm … dtes64 monitor ds_cpl vmx smx est tm2 ssse3 fma cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch arat epb pln pts dtherm intel_pt tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm cqm rdseed adx smap xsaveopt cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local
bogomips        : 4799.62
clflush size    : 64
cache_alignment : 64
address sizes   : 46 bits physical, 48 bits virtual

for each of the 56 processors.

2 Answers:

Answer 0 (score: 3)

std::random_shuffle uses a shared resource, and all of your threads use it, so your program has high contention: the threads mostly wait for each other. Use a separate random generator per thread (e.g. std::mt19937 together with std::shuffle; check out cppreference).

Additionally, you may want to increase NUM_CITIES so that each thread works on separate cache lines.

Answer 1 (score: 1)

Running the code with multiple threads requires the system to context-switch between them, so you pay a computational overhead without actually gaining anything from it. You also have a loop that computes the thread parameters, which grows more expensive as you spawn more threads, but this probably introduces the least delay since it doesn't require much computation.

Also note that the threads might all be running on a single physical core; check how your resources are being used while the program runs. If the program only runs on a single core, you are not actually getting any benefit from the hardware parallelism of multiple cores.

Finally, since this is C++, I suggest using the native std::thread.

In the end, I think the delay is mostly caused by context switching between threads and by the fact that they may be running on a single core. Try to verify that the program can run on multiple physical cores, and check how much time it takes then.
