缓存高效的多线程合并排序

时间:2012-06-27 08:13:27

标签: c++ multithreading caching merge

/**
 * Merges the source queues with indices in [startIndex, endIndex) into the
 * circular helper queue selected by helperIndex, ordered by timestamp.
 *
 * The per-queue state (heads/tails/ins/outs/timestamps_arr) is snapshotted
 * into local arrays once, at entry; only the snapshot is advanced afterwards.
 * On every round the queue whose next timestamp is the smallest contributes
 * one (timestamp, value) pair to the helper queue. Once only one source queue
 * is left, its remaining elements are moved in bulk with memcpy.
 *
 * The helper queue is a ring buffer that always keeps one slot free: the
 * function busy-waits while the ring is full and returns immediately when
 * _gIsStopThreads becomes set.
 *
 * @param startIndex  first source queue index (inclusive)
 * @param endIndex    one past the last source queue index; the local snapshot
 *                    arrays are indexed directly with these values, so this
 *                    presumably must not exceed _MAX_THREADS - confirm at the
 *                    call sites
 * @param helperIndex selects the helper ring buffer and its in/out indices
 *
 * NOTE(review): the ring indices are plain `volatile int`, which gives no
 * ordering guarantees under the C++ memory model; they are declared outside
 * this function, so that is not changed here.
 * NOTE(review): every source queue in the range is assumed to be non-empty at
 * entry (its head block is dereferenced without a null check) - confirm.
 */
void AFCQueue::ExtractValuesSecComplex(int startIndex, int endIndex,int helperIndex)
{
    // An empty range has nothing to merge. Without this guard the snapshot
    // loop below never runs and heads_local[index] is read uninitialized.
    if (startIndex >= endIndex)
        return;

    int i,index;
    TimeType min_timestamp;
    bool is_singleQueue = false;
    TimeType* local_queue_time = helper_queue_time[helperIndex];
    int* local_queue_value = helper_queue_value[helperIndex];
    volatile int& in_local_helper = in_sec[helperIndex];   // write index of the helper ring
    volatile int& out_local_helper = out_sec[helperIndex]; // read index of the helper ring

    // Local snapshots of the source queues; slots [startIndex, endIndex) are used.
    NodeArrayBlock * heads_local[_MAX_THREADS];
    NodeArrayBlock *tails_local[_MAX_THREADS];
    int outs_local[_MAX_THREADS];
    int ins_local[_MAX_THREADS];
    TimeType local_timearray[_MAX_THREADS];
    int min_index = 0;

    // Take the snapshot and locate the queue with the smallest timestamp.
    min_timestamp = timestamps_arr[startIndex];
    for (i=startIndex,index=startIndex ; i < endIndex; i++){
        heads_local[i] = (NodeArrayBlock *)heads[i];
        outs_local[i]  = outs[i];
        tails_local[i] = (NodeArrayBlock *)tails[i];
        ins_local[i]   = ins[i];
        local_timearray[i] = timestamps_arr[i];

        if (local_timearray[i] < min_timestamp){
            min_timestamp = local_timearray[i];
            index = i;
        }
    }

    do{
        // Busy-wait while the helper ring is full; bail out on shutdown.
        while((out_local_helper-1)==in_local_helper || 
            (out_local_helper==0 && in_local_helper == HELPERS_QUEUE_SIZE_1) || _gIsStopThreads){
            if (_gIsStopThreads)
                return;
        }

        // Move one element from the selected source queue into the helper ring.
        local_queue_time[in_local_helper] = heads_local[index]->_timestamp_arr[outs_local[index]];
        local_queue_value[in_local_helper] = heads_local[index]->_values_arr[outs_local[index]++];

        // Advance the write index with wrap-around.
        if (in_local_helper < HELPERS_QUEUE_SIZE_1)
            in_local_helper++;
        else
            in_local_helper = 0;    

        // Current block fully consumed: mark it free and step to the next block.
        if (outs_local[index] == _INIT_SIZE){
            heads_local[index]->_free = true;
            heads_local[index] = heads_local[index]->_next;
            if (heads_local[index]==null)
            {
                tails_local[index] = null;
                ins_local[index]=0;
            }
            outs_local[index] = 0;
        }
        if (ins_local[index] == outs_local[index] &&
            heads_local[index]==tails_local[index])
        {
            // This snapshot is exhausted: drop it by moving the last snapshot
            // of the range into its slot (unless it already is the last one).
            if (--endIndex != index){
                heads_local[index]                  = heads_local[endIndex];
                tails_local[index]                  = tails_local[endIndex];
                outs_local[index]                   = outs_local[endIndex];
                ins_local[index]                    = ins_local[endIndex];
                local_timearray[index]              = local_timearray[endIndex];
            }
            if ((endIndex-startIndex)==1)
                is_singleQueue = true;
            heads_local[endIndex]                   = null;
        }else{
            // Refresh the cached timestamp of the queue that was just advanced.
            local_timearray[index] = heads_local[index]->_timestamp_arr[outs_local[index]];
        }
        // If a single queue is left there is no need to compare timestamps:
        // drain it with bulk copies. This branch never falls through; it
        // only leaves via return.
        if (is_singleQueue){
            int out = outs_local[startIndex];
            int in  = ins_local[startIndex];
            NodeArrayBlock* he = heads_local[startIndex];
            NodeArrayBlock* ta = tails_local[startIndex];
            int* value_arr = he->_values_arr;
            TimeType* time_arr = he->_timestamp_arr;
            while (true){
                // Snapshot fully drained. (Resetting heads[startIndex] was
                // commented out in the original and stays disabled.)
                if ((in == out && he==ta))
                {
                    return;
                }
                if (out == _INIT_SIZE){
                    he->_free = true;
                    he = he->_next;
                    if (he==null)
                    {
                        return;
                    }
                    value_arr = he->_values_arr;
                    time_arr = he->_timestamp_arr;
                    out = 0;
                }   
                // Busy-wait while the helper ring is full; bail out on shutdown.
                while((out_local_helper-1)==in_local_helper || 
                    (out_local_helper==0 && in_local_helper == HELPERS_QUEUE_SIZE_1) || 
                    _gIsStopThreads){
                    if (_gIsStopThreads)
                        return;
                }

                // Read the consumer index exactly once. Branching on one read
                // and computing with another lets a concurrent wrap to 0 turn
                // "out - 1 - in" negative, i.e. a huge memcpy size. A stale
                // value is safe: the consumer only ever frees more space.
                const int out_helper = out_local_helper;
                const int in_helper  = in_local_helper;

                // Contiguous free slots starting at the write index.
                int free_slots;
                if (out_helper <= in_helper){
                    free_slots = HELPERS_QUEUE_SIZE - in_helper;
                    // With the reader at slot 0, filling up to the end would
                    // wrap the write index onto the read index and make the
                    // full ring look empty; keep the last slot free.
                    if (out_helper == 0)
                        free_slots--;
                }else{
                    free_slots = out_helper - 1 - in_helper;
                }

                // Elements still readable in the current source block: up to
                // the write position in the tail block, the whole block otherwise.
                const int available = (he==ta) ? (in-out) : (_INIT_SIZE-out);

                min_index = Math::Min(free_slots,available);

                memcpy(&local_queue_time[in_helper],&time_arr[out],min_index * sizeof(*time_arr));
                memcpy(&local_queue_value[in_helper],&value_arr[out],min_index * sizeof(*value_arr));
                in_local_helper+=min_index;
                out+=min_index;
                if (in_local_helper == HELPERS_QUEUE_SIZE)
                    in_local_helper = 0;
            }
        }
        // Every source queue has been drained.
        if (endIndex==startIndex)
            break;

        // Select the queue with the smallest next timestamp for the next round.
        min_timestamp = local_timearray[startIndex];
        for(i = startIndex+1,index=startIndex; i < endIndex ;i++){
            if (local_timearray[i] < min_timestamp){
                min_timestamp = local_timearray[i];
                index = i;
            }
        }
    }while(true);
}

这是我的算法中的一个片段。这个函数由一个专门的线程执行,该线程负责遍历多个队列(每个队列由一个时间戳队列和一个值队列组成)。

执行此方法的每个线程都会遍历 X 个队列,并把它们合并成一个由时间戳和值组成的循环队列。

此函数存在大量缓存未命中。

应该如何改进才能减少缓存未命中?(多个线程会同时执行此方法,每个线程使用不同的 id,即 helperIndex。)

2 个答案:

答案 0 :(得分:1)

当然,你有很多缓存未命中。你必须记住的是CPU和L3缓存和RAM之间只有一条总线。因此,如果线程1忙于读取内存然后被挂起以允许另一个线程执行相同的操作但使用不同的内存,则需要重新加载缓存。当进程被挂起时也会发生这种情况 - 当进程恢复执行时,缓存需要重新加载。

要限制缓存未命中,请将线程数限制为物理核心数(忽略超线程核心)。如果线程数多于核心数,则每次进行线程切换时都需要重新加载缓存。如果线程数与核心数相同,就可以降低缓存被另一个线程/进程挤占的可能性。你也可以尝试使用 num_cores - 1 个线程,看看是否有帮助。

此外,您的代码非常庞大且未注释。很难看出你正在做什么。

答案 1 :(得分:1)

这其实不是代码层面的问题,因此贴出代码并没有太大帮助。

要限制缓存未命中,请调整数据的组织方式,使每个线程一次只处理属于自己的、大小为[L1缓存大小]的数据块。对合并排序来说,这相当容易做到。

一个典型的、高效的合并排序会使用一个线程池和若干合并任务:这些任务不断拆分自己的输入分区并产生子合并任务,直到某个任务得到的分区小于[L1缓存大小],然后使用像快速排序这样的原地(in-place)排序来完成最后的部分。

分割可以通过一个额外的[数据大小]缓冲区来完成,任务在快速排序完成后进行插入排序之间移动数据。不需要任何memcopying。

仅仅把单线程的内联代码拿来,在不考虑数据的情况下让它在多个线程上运行,是行不通的。