Question

我在 C++ 中有一个从 short 到 float 的转换，它是我代码中的瓶颈。

这段代码对一个硬件设备的缓冲区进行转换，缓冲区中的数据是原生的 short 类型，表示来自一台高级光子计数器的输入。

float factor=  1.0f/value;
for (int i = 0; i < W*H; i++)//25% of time is spent doing this
{
    int value = source[i];//ushort -> int
    destination[i] = value*factor;//int*float->float
}

一些细节

值应从0到2 ^ 16-1，它代表高灵敏度相机的像素值
我在配备i7处理器的多核x86机器上（i7 960是SSE 4.2和4.1）。
源与8位边界对齐（硬件设备的要求）
W * H总是可被8整除，大部分时间W和H可被8整除

这让我感到难过，有什么我可以做的吗？

我正在使用Visual Studios 2012 ...

Answer 1

这是一个基本的SSE4.1实现：

__m128 factor = _mm_set1_ps(1.0f / value);
for (int i = 0; i < W*H; i += 8)
{
    //  Load 8 16-bit ushorts.
    //  vi = {a,b,c,d,e,f,g,h}
    __m128i vi = _mm_load_si128((const __m128i*)(source + i));

    //  Convert to 32-bit integers
    //  vi0 = {a,0,b,0,c,0,d,0}
    //  vi1 = {e,0,f,0,g,0,h,0}
    __m128i vi0 = _mm_cvtepu16_epi32(vi);
    __m128i vi1 = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(vi,vi));

    //  Convert to float
    __m128 vf0 = _mm_cvtepi32_ps(vi0);
    __m128 vf1 = _mm_cvtepi32_ps(vi1);

    //  Multiply
    vf0 = _mm_mul_ps(vf0,factor);
    vf1 = _mm_mul_ps(vf1,factor);

    //  Store
    _mm_store_ps(destination + i + 0,vf0);
    _mm_store_ps(destination + i + 4,vf1);
}

这假定：

source和destination都与16个字节对齐。
W*H是8的倍数。

通过进一步展开此循环可以做得更好。（见下文）

这里的想法如下：

将 8 个 short 加载到单个 SSE 寄存器中。
将寄存器拆分为两个：一个包含低 4 个 short，另一个包含高 4 个 short。
将两个寄存器零扩展为32位整数。
将它们转换为float s。
乘以因子。
将它们存储到destination。

编辑：

我已经进行了一段时间的优化，所以我继续展开循环。

Core i7 920 @ 3.5 GHz
Visual Studio 2012 - 发布x64：

Original Loop      : 4.374 seconds
Vectorize no unroll: 1.665
Vectorize unroll 2 : 1.416

进一步展开导致收益递减。

这是测试代码：

#include <smmintrin.h>
#include <time.h>
#include <iostream>
#include <malloc.h>
using namespace std;


/// Scalar reference implementation: destination[i] = source[i] * (1 / value).
///
/// The samples are 16-bit unsigned pixel values stored in a `short` buffer, so
/// each one is reinterpreted as `unsigned short` before the conversion. This
/// matches the SSE kernels in this file, which zero-extend with
/// `_mm_cvtepu16_epi32`; a plain `short` -> `int` conversion would sign-extend
/// and produce negative results for samples >= 32768.
///
/// @param destination output buffer with room for `size` floats
/// @param source      input buffer holding `size` 16-bit samples
/// @param value       divisor applied to every sample
/// @param size        number of elements to convert
void default_loop(float *destination,const short* source,float value,int size){
    const float factor = 1.0f / value;
    for (int i = 0; i < size; i++)
    {
        // Zero-extend: treat the stored bits as an unsigned 16-bit value.
        const int sample = static_cast<unsigned short>(source[i]);
        destination[i] = sample*factor;
    }
}
/// SSE4.1 kernel: converts 8 samples per iteration.
///
/// Every sample is zero-extended from 16 to 32 bits, converted to float and
/// multiplied by 1/value. Uses aligned 16-byte loads and stores, and always
/// processes whole groups of 8 starting at index 0 while the index is < size.
void vectorize8_unroll1(float *destination,const short* source,float value,int size){
    const __m128 scale = _mm_set1_ps(1.0f / value);
    for (int base = 0; base < size; base += 8)
    {
        //  Eight packed 16-bit samples.
        const __m128i packed = _mm_load_si128((const __m128i*)(source + base));

        //  Upper four samples moved into the low 64 bits.
        const __m128i upper = _mm_unpackhi_epi64(packed, packed);

        //  Zero-extend each group of four to 32-bit integers.
        const __m128i lo32 = _mm_cvtepu16_epi32(packed);
        const __m128i hi32 = _mm_cvtepu16_epi32(upper);

        //  Integer -> float, then scale.
        const __m128 lo = _mm_mul_ps(_mm_cvtepi32_ps(lo32), scale);
        const __m128 hi = _mm_mul_ps(_mm_cvtepi32_ps(hi32), scale);

        //  Write back eight floats.
        _mm_store_ps(destination + base + 0, lo);
        _mm_store_ps(destination + base + 4, hi);
    }
}
/// SSE4.1 kernel, unrolled twice: converts 16 samples per main-loop iteration.
///
/// Every sample is zero-extended from 16 to 32 bits, converted to float and
/// multiplied by 1/value. The main loop only runs while a full group of 16
/// remains; one leftover group of 8 (size a multiple of 8 but not of 16) is
/// handled by a single-vector step, and anything left after that by a scalar
/// loop. The function therefore never reads or writes outside [0, size).
///
/// @param destination output buffer, 16-byte aligned, room for `size` floats
/// @param source      input buffer, 16-byte aligned, `size` 16-bit samples
/// @param value       divisor applied to every sample
/// @param size        number of elements to convert
void vectorize8_unroll2(float *destination,const short* source,float value,int size){
    const float scalar_factor = 1.0f / value;
    const __m128 factor = _mm_set1_ps(scalar_factor);
    int i = 0;

    //  Main loop: two vectors of 8 samples per iteration.
    for (; size - i >= 16; i += 16)
    {
        __m128i a0 = _mm_load_si128((const __m128i*)(source + i + 0));
        __m128i a1 = _mm_load_si128((const __m128i*)(source + i + 8));

        //  Split into two registers
        __m128i b0 = _mm_unpackhi_epi64(a0,a0);
        __m128i b1 = _mm_unpackhi_epi64(a1,a1);

        //  Convert to 32-bit integers
        a0 = _mm_cvtepu16_epi32(a0);
        b0 = _mm_cvtepu16_epi32(b0);
        a1 = _mm_cvtepu16_epi32(a1);
        b1 = _mm_cvtepu16_epi32(b1);

        //  Convert to float
        __m128 c0 = _mm_cvtepi32_ps(a0);
        __m128 d0 = _mm_cvtepi32_ps(b0);
        __m128 c1 = _mm_cvtepi32_ps(a1);
        __m128 d1 = _mm_cvtepi32_ps(b1);

        //  Multiply
        c0 = _mm_mul_ps(c0,factor);
        d0 = _mm_mul_ps(d0,factor);
        c1 = _mm_mul_ps(c1,factor);
        d1 = _mm_mul_ps(d1,factor);

        //  Store
        _mm_store_ps(destination + i +  0,c0);
        _mm_store_ps(destination + i +  4,d0);
        _mm_store_ps(destination + i +  8,c1);
        _mm_store_ps(destination + i + 12,d1);
    }

    //  At most one leftover group of 8 samples.
    for (; size - i >= 8; i += 8)
    {
        __m128i a0 = _mm_load_si128((const __m128i*)(source + i));
        __m128i b0 = _mm_unpackhi_epi64(a0,a0);

        a0 = _mm_cvtepu16_epi32(a0);
        b0 = _mm_cvtepu16_epi32(b0);

        const __m128 c0 = _mm_mul_ps(_mm_cvtepi32_ps(a0),factor);
        const __m128 d0 = _mm_mul_ps(_mm_cvtepi32_ps(b0),factor);

        _mm_store_ps(destination + i + 0,c0);
        _mm_store_ps(destination + i + 4,d0);
    }

    //  Scalar tail (fewer than 8 samples), zero-extended like the SIMD path.
    for (; i < size; i++)
    {
        const int sample = static_cast<unsigned short>(source[i]);
        destination[i] = sample*scalar_factor;
    }
}
/// Adds up destination[0..size) in index order using a float accumulator and
/// prints the total on its own line (the stream is flushed by endl).
void print_sum(const float *destination,int size){
    float total = 0;
    const float *cursor = destination;
    int remaining = size;
    while (remaining-- > 0){
        total += *cursor++;
    }
    cout << total << endl;
}

// Benchmark driver: runs each of the three conversion kernels `iterations`
// times over the same buffers, printing the elapsed clock() time in seconds
// followed by the sum of the output produced by the last run.
int main(){

    // 8000 elements: a multiple of 16, which both vectorized kernels step by
    // at most.
    int size = 8000;

    // 16-byte aligned buffers, as needed by the aligned SSE loads and stores.
    short *source       = (short*)_mm_malloc(size * sizeof(short), 16);
    float *destination  = (float*)_mm_malloc(size * sizeof(float), 16);

    // Input ramp 0..size-1.
    for (int i = 0; i < size; i++){
        source[i] = i;
    }

    // Divisor passed to every kernel.
    float value = 1.1;

    int iterations = 1000000;
    clock_t start;

    //  Default Loop
    start = clock();
    for (int it = 0; it < iterations; it++){
        default_loop(destination,source,value,size);
    }
    cout << (double)(clock() - start) / CLOCKS_PER_SEC << endl;
    // The sum is printed after each timed section so the three kernels'
    // outputs can be compared.
    print_sum(destination,size);

    //  Vectorize 8, no unroll
    start = clock();
    for (int it = 0; it < iterations; it++){
        vectorize8_unroll1(destination,source,value,size);
    }
    cout << (double)(clock() - start) / CLOCKS_PER_SEC << endl;
    print_sum(destination,size);

    //  Vectorize 8, unroll 2
    start = clock();
    for (int it = 0; it < iterations; it++){
        vectorize8_unroll2(destination,source,value,size);
    }
    cout << (double)(clock() - start) / CLOCKS_PER_SEC << endl;
    print_sum(destination,size);

    // Buffers from _mm_malloc are released with _mm_free.
    _mm_free(source);
    _mm_free(destination);

    // Runs the shell command "pause" before the program exits.
    system("pause");
}

Answer 2

我相信我有最好的答案。我的结果比 Mysticial 的快得多。它们只需要 SSE2，但可以利用 SSE3、SSE4、AVX 甚至 AVX2（如果有的话）。您不必更改任何代码，只需要重新编译。

我测试了三种大小：8008、64000 和 2560 * 1920 = 4915200。我尝试了几种不同的版本，下面列出了其中最重要的一些。函数 vectorize8_unroll2 是 Mysticial 的函数。我为它做了一个名为 vectorize8_unroll2_parallel 的改进版本。函数 vec16_loop_unroll2_fix 和 vec16_loop_unroll2_parallel_fix 是我的函数，我认为它们比 Mysticial 的更好。如果使用 AVX 进行编译，这些函数将自动使用 AVX，但在 SSE4 甚至 SSE2 上也能正常工作。

另外，你写道“W * H 总是可被 8 整除，大部分时间 W 和 H 可以被 8 整除”。所以我们不能假设在所有情况下 W * H 都可被 16 整除。当大小不是 16 的倍数时，Mysticial 的函数 vectorize8_unroll2 有一个错误（在他的代码中尝试 size = 8008，你就会明白我的意思）。我的代码没有这样的错误。

我正在使用 Agner Fog 的 vectorclass 进行矢量化。它不是 lib 或 dll 文件，只是一些头文件。我使用 OpenMP 进行并行化。以下是一些结果：

Intel Xeon E5630 @2.53GHz (supports upto SSE4.2)    
size 8008, size2 8032, iterations 1000000

                        default_loop time: 7.935 seconds, diff 0.000000
                  vectorize8_unroll2 time: 1.875 seconds, diff 0.000000
              vec16_loop_unroll2_fix time: 1.878 seconds, diff 0.000000
         vectorize8_unroll2_parallel time: 1.253 seconds, diff 0.000000
     vec16_loop_unroll2_parallel_fix time: 1.151 seconds, diff 0.000000

size 64000, size2 64000, iterations 100000
                        default_loop time: 6.387 seconds, diff 0.000000
                  vectorize8_unroll2 time: 1.875 seconds, diff 0.000000
              vec16_loop_unroll2_fix time: 2.195 seconds, diff 0.000000
         vectorize8_unroll2_parallel time: 0.439 seconds, diff 0.000000
     vec16_loop_unroll2_parallel_fix time: 0.432 seconds, diff 0.000000

size 4915200, size2 4915200, iterations 1000
                        default_loop time: 5.125 seconds, diff 0.000000
                  vectorize8_unroll2 time: 3.496 seconds, diff 0.000000
              vec16_loop_unroll2_fix time: 3.490 seconds, diff 0.000000
         vectorize8_unroll2_parallel time: 3.119 seconds, diff 0.000000
     vec16_loop_unroll2_parallel_fix time: 3.127 seconds, diff 0.000000

编辑：我在本回答结尾处使用GCC在AVX系统上添加了结果。

以下是代码。代码看起来很长，因为我做了很多交叉检查并测试了很多变体。从 http://www.agner.org/optimize/#vectorclass 下载 vectorclass，并将头文件（vectorclass.h，instrset.h，vectorf128.h，vectorf256.h，vectorf256e.h，vectori128.h，vectori256.h，vectori256e.h）复制到编译目录中。在 Visual Studio 的 C++ / 命令行 选项下添加 /D__SSE4_2__，并在 Release 模式下编译。如果您的 CPU 支持 AVX，请改用 /arch:AVX。在 C++ / 语言 选项下启用 OpenMP 支持。

In GCC
SSE4.2: g++ foo.cpp -o foo_gcc -O3 -msse4.2 -fopenmp
AVX: g++ foo.cpp -o foo_gcc -O3 -mavx -fopenmp

在下面的代码中，函数vec16_loop_unroll2_parallel要求数组为32的倍数。您可以将数组大小更改为32的倍数（即size2指的是）或者如果不可能，则可以使用没有这种限制的函数vec16_loop_unroll2_parallel_fix。反正它一样快。

#include <stdio.h>
#include "vectorclass.h"
#include "omp.h"

// Rounds x down to a multiple of s by clearing the low bits of x.
// The mask ~((s)-1) only produces a multiple of s when s is a power of two.
#define ROUND_DOWN(x, s) ((x) & ~((s)-1))

/// Allocates `size` bytes aligned to `align` bytes.
/// Uses _aligned_malloc under MSVC and posix_memalign elsewhere; on the POSIX
/// path a null pointer is returned when posix_memalign reports an error.
inline void* aligned_malloc(size_t size, size_t align) {
#ifdef _MSC_VER
    return _aligned_malloc(size, align);
#else
    void *block = 0;
    if (posix_memalign(&block, align, size) != 0) {
        block = 0;
    }
    return block;
#endif
}

/// Releases a block obtained from aligned_malloc, using the deallocator that
/// matches the platform's allocator (_aligned_free under MSVC, free otherwise).
inline void aligned_free(void *ptr) {
#if defined(_MSC_VER)
    _aligned_free(ptr);
#else
    free(ptr);
#endif
}

/// Scalar reference: destination[i] = source[i] * (1 / value) for i in [0, size).
void default_loop(float *destination, const unsigned short* source, float value, int size){
    const float scale = 1.0f/value;
    const unsigned short *in = source;
    float *out = destination;
    for (int remaining = size; remaining > 0; --remaining) {
        const int sample = *in++;   // ushort -> int
        *out++ = sample*scale;      // int * float -> float
    }
}


/// Same computation as the scalar reference loop, with the iterations
/// distributed by an OpenMP `parallel for`.
void default_loop_parallel(float *destination, const unsigned short* source, float value, int size){
    const float scale = 1.0f / value;
    #pragma omp parallel for
    for (int idx = 0; idx < size; ++idx) {
        destination[idx] = static_cast<int>(source[idx])*scale;
    }
}

void vec8_loop(float *destination, const unsigned short* source, float value, int size) {
  float factor=  1.0f/value;
  for (int i = 0; i < size; i += 8) {
    Vec8us vi = Vec8us().load(source + i);
    Vec4ui vi0 = extend_low(vi);
    Vec4ui vi1 = extend_high(vi);
    Vec4f vf0 = to_float(vi0);
    Vec4f vf1 = to_float(vi1);
    vf0*=factor;
    vf1*=factor;
    vf0.store(destination + i);
    vf1.store(destination + i + 4);
  }
}

void vec8_loop_unroll2(float *destination, const unsigned short* source, float value, int size) {
  float factor=  1.0f/value;
  for (int i = 0; i < size; i += 16) {
    Vec8us vi = Vec8us().load(source + i);
    Vec4ui vi0 = extend_low(vi);
    Vec4ui vi1 = extend_high(vi);
    Vec4f vf0 = to_float(vi0);
    Vec4f vf1 = to_float(vi1);
    vf0*=factor;
    vf1*=factor;
    vf0.store(destination + i + 0);
    vf1.store(destination + i + 4);

    Vec8us vi_new = Vec8us().load(source + i + 8);
    Vec4ui vi2 = extend_low(vi_new);
    Vec4ui vi3 = extend_high(vi_new);
    Vec4f vf2 = to_float(vi2);
    Vec4f vf3 = to_float(vi3);
    vf2*=factor;
    vf3*=factor;
    vf2.store(destination + i + 8);
    vf3.store(destination + i + 12);
  }
}

void vec8_loop_parallel(float *destination, const unsigned short* source, float value, int size) {
  float factor=  1.0f/value;
  #pragma omp parallel for
  for (int i = 0; i < size; i += 8) {
    Vec8us vi = Vec8us().load(source + i);
    Vec4ui vi0 = extend_low(vi);
    Vec4ui vi1 = extend_high(vi);
    Vec4f vf0 = to_float(vi0);
    Vec4f vf1 = to_float(vi1);
    vf0*=factor;
    vf1*=factor;
    vf0.store(destination + i);
    vf1.store(destination + i + 4);
  }
}

void vec8_loop_unroll2_parallel(float *destination, const unsigned short* source, float value, int size) {
  float factor=  1.0f/value;
  #pragma omp parallel for
  for (int i = 0; i < size; i += 16) {
    Vec8us vi = Vec8us().load(source + i);
    Vec4ui vi0 = extend_low(vi);
    Vec4ui vi1 = extend_high(vi);
    Vec4f vf0 = to_float(vi0);
    Vec4f vf1 = to_float(vi1);
    vf0*=factor;
    vf1*=factor;
    vf0.store(destination + i + 0);
    vf1.store(destination + i + 4);

    Vec8us vi_new = Vec8us().load(source + i + 8);
    Vec4ui vi2 = extend_low(vi_new);
    Vec4ui vi3 = extend_high(vi_new);
    Vec4f vf2 = to_float(vi2);
    Vec4f vf3 = to_float(vi3);
    vf2*=factor;
    vf3*=factor;
    vf2.store(destination + i + 8);
    vf3.store(destination + i + 12);
  }
}

void vec16_loop(float *destination, const unsigned short* source, float value, int size) {
  float factor=  1.0f/value;
  for (int i = 0; i < size; i += 16) {
    Vec16us vi = Vec16us().load(source + i);
    Vec8ui vi0 = extend_low(vi);
    Vec8ui vi1 = extend_high(vi);
    Vec8f vf0 = to_float(vi0);
    Vec8f vf1 = to_float(vi1);
    vf0*=factor;
    vf1*=factor;
    vf0.store(destination + i);
    vf1.store(destination + i + 8);
  }
}

void vec16_loop_unroll2(float *destination, const unsigned short* source, float value, int size) {
  float factor=  1.0f/value;
  for (int i = 0; i < size; i += 32) {
    Vec16us vi = Vec16us().load(source + i);

    Vec8ui vi0 = extend_low(vi);
    Vec8ui vi1 = extend_high(vi);
    Vec8f vf0 = to_float(vi0);
    Vec8f vf1 = to_float(vi1);
    vf0*=factor;
    vf1*=factor;
    vf0.store(destination + i + 0);
    vf1.store(destination + i + 8);

    Vec16us vi_new = Vec16us().load(source + i + 16);

    Vec8ui vi2 = extend_low(vi_new);
    Vec8ui vi3 = extend_high(vi_new);
    Vec8f vf2 = to_float(vi2);
    Vec8f vf3 = to_float(vi3);
    vf2*=factor;
    vf3*=factor;
    vf2.store(destination + i + 16);
    vf3.store(destination + i + 24);

  }
}

void vec16_loop_unroll2_fix(float *destination, const unsigned short* source, float value, int size) {
    float factor=  1.0f/value;
    int i = 0;
    for (; i <ROUND_DOWN(size, 32); i += 32) {
    Vec16us vi = Vec16us().load(source + i);

    Vec8ui vi0 = extend_low(vi);
    Vec8ui vi1 = extend_high(vi);
    Vec8f vf0 = to_float(vi0);
    Vec8f vf1 = to_float(vi1);
    vf0*=factor;
    vf1*=factor;
    vf0.store(destination + i + 0);
    vf1.store(destination + i + 8);

    Vec16us vi_new = Vec16us().load(source + i + 16);

    Vec8ui vi2 = extend_low(vi_new);
    Vec8ui vi3 = extend_high(vi_new);
    Vec8f vf2 = to_float(vi2);
    Vec8f vf3 = to_float(vi3);
    vf2*=factor;
    vf3*=factor;
    vf2.store(destination + i + 16);
    vf3.store(destination + i + 24);

    }
    for (; i < size; i++) {
        int value = source[i];
        destination[i] = value*factor;
    }

}

void vec16_loop_parallel(float *destination, const unsigned short* source, float value, int size) {
  float factor=  1.0f/value;
  #pragma omp parallel for
  for (int i = 0; i < size; i += 16) {
    Vec16us vi = Vec16us().load(source + i);
    Vec8ui vi0 = extend_low(vi);
    Vec8ui vi1 = extend_high(vi);
    Vec8f vf0 = to_float(vi0);
    Vec8f vf1 = to_float(vi1);
    vf0*=factor;
    vf1*=factor;
    vf0.store(destination + i);
    vf1.store(destination + i + 8);
  }
}

void vec16_loop_unroll2_parallel(float *destination, const unsigned short* source, float value, int size) {
    float factor=  1.0f/value;
    #pragma omp parallel for
    for (int i = 0; i < size; i += 32) {
        Vec16us vi = Vec16us().load(source + i); 
        Vec8ui vi0 = extend_low(vi);
        Vec8ui vi1 = extend_high(vi);
        Vec8f vf0 = to_float(vi0);
        Vec8f vf1 = to_float(vi1);
        vf0*=factor;
        vf1*=factor;
        vf0.store(destination + i + 0);
        vf1.store(destination + i + 8);

        Vec16us vi_new = Vec16us().load(source + i + 16);
        Vec8ui vi2 = extend_low(vi_new);
        Vec8ui vi3 = extend_high(vi_new);
        Vec8f vf2 = to_float(vi2);
        Vec8f vf3 = to_float(vi3);
        vf2*=factor;
        vf3*=factor;
        vf2.store(destination + i + 16);
        vf3.store(destination + i + 24);
    }
}

/// OpenMP-parallel, twice-unrolled Vec16us kernel (32 samples per iteration)
/// that works for any size: the leading ROUND_DOWN(size, 32) elements are
/// converted by the vector loop and the rest by a serial scalar loop.
///
/// @param destination output buffer with room for `size` floats
/// @param source      input buffer holding `size` unsigned 16-bit samples
/// @param value       divisor applied to every sample
/// @param size        number of elements to convert
void vec16_loop_unroll2_parallel_fix(float *destination, const unsigned short* source, float value, int size) {
    const float factor = 1.0f/value;
    // End of the vectorized range, computed once and shared by both loops.
    const int vec_end = ROUND_DOWN(size, 32);
    #pragma omp parallel for
    for (int i = 0; i < vec_end; i += 32) {
        Vec16us vi = Vec16us().load(source + i);
        Vec8ui vi0 = extend_low(vi);
        Vec8ui vi1 = extend_high(vi);
        Vec8f vf0 = to_float(vi0);
        Vec8f vf1 = to_float(vi1);
        vf0*=factor;
        vf1*=factor;
        vf0.store(destination + i + 0);
        vf1.store(destination + i + 8);

        Vec16us vi_new = Vec16us().load(source + i + 16);
        Vec8ui vi2 = extend_low(vi_new);
        Vec8ui vi3 = extend_high(vi_new);
        Vec8f vf2 = to_float(vi2);
        Vec8f vf3 = to_float(vi3);
        vf2*=factor;
        vf3*=factor;
        vf2.store(destination + i + 16);
        vf3.store(destination + i + 24);
    }

    // Scalar tail: at most 31 elements, not worth parallelizing.
    for (int i = vec_end; i < size; i++) {
        const int sample = source[i];
        destination[i] = sample*factor;
    }
}

/// SSE4.1 intrinsic kernel: converts 8 unsigned 16-bit samples per iteration
/// to float and multiplies them by 1/value. Uses aligned 16-byte loads and
/// stores and always processes whole groups of 8 while the index is < size.
void vectorize8_unroll1(float *destination,const unsigned short* source,float value,int size){
    const __m128 scale = _mm_set1_ps(1.0f / value);
    for (int base = 0; base < size; base += 8)
    {
        //  Eight packed 16-bit samples.
        const __m128i packed = _mm_load_si128((const __m128i*)(source + base));

        //  Upper four samples moved into the low 64 bits.
        const __m128i upper = _mm_unpackhi_epi64(packed, packed);

        //  Zero-extend each group of four to 32-bit integers.
        const __m128i lo32 = _mm_cvtepu16_epi32(packed);
        const __m128i hi32 = _mm_cvtepu16_epi32(upper);

        //  Integer -> float, then scale.
        const __m128 lo = _mm_mul_ps(_mm_cvtepi32_ps(lo32), scale);
        const __m128 hi = _mm_mul_ps(_mm_cvtepi32_ps(hi32), scale);

        //  Write back eight floats.
        _mm_store_ps(destination + base + 0, lo);
        _mm_store_ps(destination + base + 4, hi);
    }
}

/// SSE4.1 intrinsic kernel, unrolled twice: converts 16 unsigned 16-bit
/// samples per iteration to float and multiplies them by 1/value.
/// Uses aligned 16-byte loads and stores. It always processes whole groups of
/// 16 while the index is < size, so the buffers must extend to the next
/// multiple of 16 at or above size.
void vectorize8_unroll2(float *destination,const unsigned short* source,float value,int size){
    const __m128 scale = _mm_set1_ps(1.0f / value);
    for (int base = 0; base < size; base += 16)
    {
        //  Two vectors of eight packed samples each.
        const __m128i first  = _mm_load_si128((const __m128i*)(source + base + 0));
        const __m128i second = _mm_load_si128((const __m128i*)(source + base + 8));

        //  Upper halves moved into the low 64 bits.
        const __m128i first_upper  = _mm_unpackhi_epi64(first, first);
        const __m128i second_upper = _mm_unpackhi_epi64(second, second);

        //  Zero-extend to 32-bit integers.
        const __m128i i0 = _mm_cvtepu16_epi32(first);
        const __m128i i1 = _mm_cvtepu16_epi32(first_upper);
        const __m128i i2 = _mm_cvtepu16_epi32(second);
        const __m128i i3 = _mm_cvtepu16_epi32(second_upper);

        //  Integer -> float, then scale.
        const __m128 f0 = _mm_mul_ps(_mm_cvtepi32_ps(i0), scale);
        const __m128 f1 = _mm_mul_ps(_mm_cvtepi32_ps(i1), scale);
        const __m128 f2 = _mm_mul_ps(_mm_cvtepi32_ps(i2), scale);
        const __m128 f3 = _mm_mul_ps(_mm_cvtepi32_ps(i3), scale);

        //  Write back sixteen floats.
        _mm_store_ps(destination + base +  0, f0);
        _mm_store_ps(destination + base +  4, f1);
        _mm_store_ps(destination + base +  8, f2);
        _mm_store_ps(destination + base + 12, f3);
    }
}

/// OpenMP `parallel for` version of the 8-samples-per-iteration SSE4.1 kernel.
/// Uses aligned 16-byte loads and stores and always processes whole groups of
/// 8 while the index is < size.
void vectorize8_unroll1_parallel(float *destination,const unsigned short* source,float value,int size){
    const __m128 scale = _mm_set1_ps(1.0f / value);
    #pragma omp parallel for
    for (int base = 0; base < size; base += 8)
    {
        //  Eight packed 16-bit samples.
        const __m128i packed = _mm_load_si128((const __m128i*)(source + base));

        //  Upper four samples moved into the low 64 bits.
        const __m128i upper = _mm_unpackhi_epi64(packed, packed);

        //  Zero-extend each group of four to 32-bit integers.
        const __m128i lo32 = _mm_cvtepu16_epi32(packed);
        const __m128i hi32 = _mm_cvtepu16_epi32(upper);

        //  Integer -> float, then scale.
        const __m128 lo = _mm_mul_ps(_mm_cvtepi32_ps(lo32), scale);
        const __m128 hi = _mm_mul_ps(_mm_cvtepi32_ps(hi32), scale);

        //  Write back eight floats.
        _mm_store_ps(destination + base + 0, lo);
        _mm_store_ps(destination + base + 4, hi);
    }
}



/// OpenMP `parallel for` version of the twice-unrolled SSE4.1 kernel
/// (16 samples per iteration). Uses aligned 16-byte loads and stores. It
/// always processes whole groups of 16 while the index is < size, so the
/// buffers must extend to the next multiple of 16 at or above size.
void vectorize8_unroll2_parallel(float *destination,const unsigned short* source,float value,int size){
    const __m128 scale = _mm_set1_ps(1.0f / value);
    #pragma omp parallel for
    for (int base = 0; base < size; base += 16)
    {
        //  Two vectors of eight packed samples each.
        const __m128i first  = _mm_load_si128((const __m128i*)(source + base + 0));
        const __m128i second = _mm_load_si128((const __m128i*)(source + base + 8));

        //  Upper halves moved into the low 64 bits.
        const __m128i first_upper  = _mm_unpackhi_epi64(first, first);
        const __m128i second_upper = _mm_unpackhi_epi64(second, second);

        //  Zero-extend to 32-bit integers.
        const __m128i i0 = _mm_cvtepu16_epi32(first);
        const __m128i i1 = _mm_cvtepu16_epi32(first_upper);
        const __m128i i2 = _mm_cvtepu16_epi32(second);
        const __m128i i3 = _mm_cvtepu16_epi32(second_upper);

        //  Integer -> float, then scale.
        const __m128 f0 = _mm_mul_ps(_mm_cvtepi32_ps(i0), scale);
        const __m128 f1 = _mm_mul_ps(_mm_cvtepi32_ps(i1), scale);
        const __m128 f2 = _mm_mul_ps(_mm_cvtepi32_ps(i2), scale);
        const __m128 f3 = _mm_mul_ps(_mm_cvtepi32_ps(i3), scale);

        //  Write back sixteen floats.
        _mm_store_ps(destination + base +  0, f0);
        _mm_store_ps(destination + base +  4, f1);
        _mm_store_ps(destination + base +  8, f2);
        _mm_store_ps(destination + base + 12, f3);
    }
}

/// Copies the first `size` floats of `a` into `b`.
/// Note the argument order: the source comes first, the destination second.
void copy_arrays(float* a, float*b, const int size) {
    for(int i=0; i<size; i++) {
        b[i] = a[i];
    }
}

/// Compares the first `size` elements of `a` and `b`.
///
/// Stops at the first index where a[i] - b[i] is not zero, prints that index
/// with both values, and returns the difference. Returns 0 when no such index
/// exists. (Previously the function broke out of the loop before adding the
/// mismatch to its accumulator, so it returned 0 even when the arrays
/// differed.)
float compare_arrays(float* a, float*b, const int size) {
    for(int i=0; i<size; i++) {
        const float diff = a[i] - b[i];
        if(diff!=0)  {
            printf("i %d, a[i] %f, b[i] %f, diff %f\n", i, a[i], b[i], diff);
            return diff;
        }
    }
    return 0;
}

/// Fills a[0..size) with pseudo-random 16-bit values: each element is
/// rand() scaled to [0, 1] as a float, multiplied by 65536, truncated to int
/// and then narrowed to unsigned short.
void randomize_array(unsigned short* a, const int size) {
    unsigned short *out = a;
    for (int remaining = size; remaining > 0; --remaining) {
        const float unit = static_cast<float>(rand())/RAND_MAX;
        *out++ = static_cast<int>(65536*unit);
    }
}

void run(int size, int iterations) {
    int rd = ROUND_DOWN(size, 32);
    int size2 = rd == size ? size : rd + 32;
    float value = 1.1f;

    printf("size %d, size2 %d, iterations %d\n", size, size2, iterations);
    unsigned short* source = (unsigned short*)aligned_malloc(size2*sizeof(short), 16);
    float* destination = (float*)aligned_malloc(size2*sizeof(float), 16);
    float* destination_old = (float*)aligned_malloc(size2*sizeof(float), 16);
    float* destination_ref = (float*)aligned_malloc(size2*sizeof(float), 16);

    void (*fp[16])(float *destination, const unsigned short* source, float value, int size);
    fp[0] = default_loop;
    fp[1] = vec8_loop;
    fp[2] = vec8_loop_unroll2;
    fp[3] = vec16_loop;
    fp[4] = vec16_loop_unroll2;
    fp[5] = vec16_loop_unroll2_fix;
    fp[6] = vectorize8_unroll1;
    fp[7] = vectorize8_unroll2;

    fp[8] = default_loop_parallel;
    fp[9] = vec8_loop_parallel;
    fp[10] = vec8_loop_unroll2_parallel;
    fp[11] = vec16_loop_parallel;
    fp[12] = vec16_loop_unroll2_parallel;
    fp[13] = vec16_loop_unroll2_parallel_fix;
    fp[14] = vectorize8_unroll1_parallel;
    fp[15] = vectorize8_unroll2_parallel;

    char* func_str[] = {"default_loop", "vec8_loop", "vec8_loop_unrool2", "vec16_loop", "vec16_loop_unroll2", "vec16_loop_unroll2_fix", "vectorize8_unroll1", "vectorize8_unroll2",
        "default_loop_parallel", "vec8_loop_parallel", "vec8_loop_unroll2_parallel","vec16_loop_parallel", "vec16_loop_unroll2_parallel", "vec16_loop_unroll2_parallel_fix",
        "vectorize8_unroll1_parallel", "vectorize8_unroll2_parallel"};

    randomize_array(source, size2);

    copy_arrays(destination_old, destination_ref, size);
    fp[0](destination_ref, source, value, size);

    for(int i=0; i<16; i++) {
        copy_arrays(destination_old, destination, size);
        double dtime = omp_get_wtime();
        for (int it = 0; it < iterations; it++){
            fp[i](destination, source, value, size);
        }
        dtime = omp_get_wtime() - dtime;
        float diff = compare_arrays(destination, destination_ref, size);
        printf("%40s time: %.3f seconds, diff %f\n", func_str[i], dtime, diff);
    }
    printf("\n");
    aligned_free(source);
    aligned_free(destination);
    aligned_free(destination_old);
    aligned_free(destination_ref);
}
// Runs the benchmark for three array sizes; the iteration count shrinks as
// the array grows.
int main() {
    const int sizes[]      = {8008, 64000, 2560*1920};
    const int iterations[] = {1000000, 100000, 1000};
    for (int k = 0; k < 3; ++k) {
        run(sizes[k], iterations[k]);
    }
}

以下是在支持 AVX 的系统上使用 GCC 得到的结果。GCC 会自动向量化该循环（Visual Studio 因为 short 类型而失败，但如果换成 int 就可以）。手写矢量化代码带来的收益很小。不过，使用多线程是否有帮助取决于数组大小：对于 8008 这样的小数组，OpenMP 的结果反而更差；对于较大的数组（128000），使用 OpenMP 的结果要好得多；对于最大的数组 4915200，程序完全受内存带宽限制，OpenMP 没有帮助。

i7-2600k @ 4.4GHz
size 8008, size2 8032, iterations 1000000
                        default_loop time: 1.319 seconds, diff 0.000000          
              vec16_loop_unroll2_fix time: 1.167 seconds, diff 0.000000
                  vectorize8_unroll2 time: 1.227 seconds, diff 0.000000                
         vec16_loop_unroll2_parallel time: 1.528 seconds, diff 0.000000
         vectorize8_unroll2_parallel time: 1.381 seconds, diff 0.000000

size 128000, size2 128000, iterations 100000
                        default_loop time: 2.902 seconds, diff 0.000000                     
              vec16_loop_unroll2_fix time: 2.838 seconds, diff 0.000000
                  vectorize8_unroll2 time: 2.844 seconds, diff 0.000000         
     vec16_loop_unroll2_parallel_fix time: 0.706 seconds, diff 0.000000
         vectorize8_unroll2_parallel time: 0.672 seconds, diff 0.000000

size 4915200, size2 4915200, iterations 1000
                        default_loop time: 2.313 seconds, diff 0.000000
              vec16_loop_unroll2_fix time: 2.309 seconds, diff 0.000000    
                  vectorize8_unroll2 time: 2.318 seconds, diff 0.000000                
     vec16_loop_unroll2_parallel_fix time: 2.353 seconds, diff 0.000000         
         vectorize8_unroll2_parallel time: 2.349 seconds, diff 0.000000

Answer 3

在我的机器上使用SSE内在函数[Quad Core Athlon，3.3GHz，16GB RAM]和g++ -O2优化[1]可以提供大约2.5-3倍的速度。我还编写了一个函数来在内联汇编程序中执行相同的操作，但它并没有明显更快（再次，这适用于我的机器，随意在其他机器上运行）。

我尝试了各种尺寸的H * W，它们都给出了大致相同的结果。

[1] 使用 g++ -O3 时，四个函数的耗时全部相同，因为 -O3 显然启用了“自动矢量化”。因此，如果您的编译器支持类似的自动矢量化功能，那么手写这些代码就有点浪费时间了。

**结果**

convert_naive                  sum=4373.98 t=7034751 t/n=7.03475
convert_naive                  sum=4373.98 t=7266738 t/n=7.26674
convert_naive                  sum=4373.98 t=7006154 t/n=7.00615
convert_naive                  sum=4373.98 t=6815329 t/n=6.81533
convert_naive                  sum=4373.98 t=6820318 t/n=6.82032
convert_unroll4                sum=4373.98 t=8103193 t/n=8.10319
convert_unroll4                sum=4373.98 t=7276156 t/n=7.27616
convert_unroll4                sum=4373.98 t=7028181 t/n=7.02818
convert_unroll4                sum=4373.98 t=7074258 t/n=7.07426
convert_unroll4                sum=4373.98 t=7081518 t/n=7.08152
convert_sse_intrinsic          sum=4373.98 t=3377290 t/n=3.37729
convert_sse_intrinsic          sum=4373.98 t=3227018 t/n=3.22702
convert_sse_intrinsic          sum=4373.98 t=3007898 t/n=3.0079
convert_sse_intrinsic          sum=4373.98 t=3253366 t/n=3.25337
convert_sse_intrinsic          sum=4373.98 t=5576068 t/n=5.57607
convert_sse_inlineasm          sum=4373.98 t=3470887 t/n=3.47089
convert_sse_inlineasm          sum=4373.98 t=2838492 t/n=2.83849
convert_sse_inlineasm          sum=4373.98 t=2828556 t/n=2.82856
convert_sse_inlineasm          sum=4373.98 t=2789052 t/n=2.78905
convert_sse_inlineasm          sum=4373.98 t=3176522 t/n=3.17652

**代码**

#include <iostream>
#include <iomanip>
#include <cstdlib> 
#include <cstring>
#include <xmmintrin.h>
#include <emmintrin.h>


#define W 1000
#define H 1000

// Executes the x86 RDTSC instruction and returns its 64-bit result,
// assembled from the EDX (high half) and EAX (low half) outputs.
static __inline__ unsigned long long rdtsc(void)
{
    unsigned low_bits, high_bits;
    __asm__ __volatile__ ("rdtsc" : "=a"(low_bits), "=d"(high_bits));
    const unsigned long long upper = (unsigned long long)high_bits << 32;
    return upper | (unsigned long long)low_bits;
}

// Scalar baseline: converts W*H signed 16-bit samples to float, scaling each
// by 1/32767.
void convert_naive(short *source, float *destination)
{
    const float scale = 1.0f/32767;
    const int count = W*H;
    for (int idx = 0; idx < count; ++idx)
    {
        destination[idx] = static_cast<int>(source[idx])*scale;
    }
}


// Scalar conversion manually unrolled by four: each iteration reads four
// samples, then writes the four scaled results. Steps through W*H in groups
// of 4.
void convert_unroll4(short *source, float *destination)
{
    const float scale = 1.0f/32767;
    for (int base = 0; base < W*H; base += 4)
    {
        const short *in = source + base;
        float *out = destination + base;

        // All four loads happen before any store, as in the original order.
        const int s0 = in[0];
        const int s1 = in[1];
        const int s2 = in[2];
        const int s3 = in[3];

        out[0] = s0*scale;
        out[1] = s1*scale;
        out[2] = s2*scale;
        out[3] = s3*scale;
    }
}


// SSE2 conversion: four signed 16-bit samples per iteration are sign-extended
// to 32 bits, converted to float and multiplied by 1/32767.
//
// Written with portable intrinsics only: the 64-bit load uses
// _mm_loadl_epi64 rather than the MMX-typed __m64 / _mm_set_epi64 pair (which
// needs an EMMS before x87 code and is unavailable in 64-bit MSVC builds), and
// constants are built with _mm_set1_ps / _mm_setzero_si128 rather than
// GCC-specific brace initialisers.
//
// destination must be 16-byte aligned (aligned stores); W*H is processed in
// groups of 4.
void convert_sse_intrinsic(short *source, float *destination)
{
    const __m128 factor = _mm_set1_ps(1.0f/32767);
    const __m128i zero = _mm_setzero_si128();
    for (int i = 0; i < W*H; i+=4)
    {
        // Load four 16-bit samples (64 bits) into the low half of the register.
        const __m128i packed = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(source + i));
        // Interleaving with zero in the low position puts each sample in the
        // upper 16 bits of a 32-bit lane; the arithmetic right shift by 16
        // then sign-extends it.
        const __m128i value = _mm_srai_epi32(_mm_unpacklo_epi16(zero, packed), 16);
        const __m128 fval = _mm_cvtepi32_ps(value);
        // destination[i..i+3] = value[0..3] * factor
        _mm_store_ps(destination + i, _mm_mul_ps(fval, factor));
    }
}

// Hand-written SSE2 loop: each iteration converts 16 signed 16-bit samples
// (32 bytes of input, 64 bytes of output) to float scaled by 1/32767.
//
// rax is the byte offset into source; the output offset is rax * 2 because a
// float is twice as wide as a short. The loop ends as soon as rax reaches the
// total input size in bytes (`jb`); the earlier `jbe` ran one extra iteration
// past the end of both buffers. The input size must be a multiple of 32 bytes
// and destination must be 16-byte aligned (movaps).
//
// The clobber list names every register the code writes (rax, xmm0-xmm5), the
// flags, and "memory", since the asm stores through `destination` without
// declaring an output operand.
void convert_sse_inlineasm(short *source, float *destination)
{
    __m128 factor =  { 1.0f/32767, 1.0f/32767, 1.0f/32767, 1.0f/32767 };
    __asm__ __volatile__(
    "\t pxor       %%xmm1, %%xmm1\n"
    "\t movaps     %3, %%xmm2\n"
    "\t mov        $0, %%rax\n"
    "1:"
    "\t movq       (%1, %%rax), %%xmm0\n"
    "\t movq       8(%1, %%rax), %%xmm3\n"
    "\t movq       16(%1, %%rax), %%xmm4\n"
    "\t movq       24(%1, %%rax), %%xmm5\n"
    "\t punpcklwd  %%xmm1, %%xmm0\n"
    "\t pslld      $16, %%xmm0\n"
    "\t psrad      $16, %%xmm0\n"
    "\t cvtdq2ps   %%xmm0, %%xmm0\n"
    "\t mulps      %%xmm2, %%xmm0\n"
    "\t punpcklwd  %%xmm1, %%xmm3\n"
    "\t pslld      $16, %%xmm3\n"
    "\t psrad      $16, %%xmm3\n"
    "\t cvtdq2ps   %%xmm3, %%xmm3\n"
    "\t mulps      %%xmm2, %%xmm3\n"
    "\t punpcklwd  %%xmm1, %%xmm4\n"
    "\t pslld      $16, %%xmm4\n"
    "\t psrad      $16, %%xmm4\n"
    "\t cvtdq2ps   %%xmm4, %%xmm4\n"
    "\t mulps      %%xmm2, %%xmm4\n"
    "\t punpcklwd  %%xmm1, %%xmm5\n"
    "\t pslld      $16, %%xmm5\n"
    "\t psrad      $16, %%xmm5\n"
    "\t cvtdq2ps   %%xmm5, %%xmm5\n"
    "\t mulps      %%xmm2, %%xmm5\n"
    "\t movaps     %%xmm0, (%0, %%rax, 2)\n"
    "\t movaps     %%xmm3, 16(%0, %%rax, 2)\n"
    "\t movaps     %%xmm4, 32(%0, %%rax, 2)\n"
    "\t movaps     %%xmm5, 48(%0, %%rax, 2)\n"
    "\t addq       $32, %%rax\n"
    "\t cmpq       %2, %%rax\n"
    "\t jb         1b\n"
    : /* no outputs */ 
    : "r" (destination), "r" (source), "i"(sizeof(*source) * H * W), "m"(factor):
      "rax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory", "cc");
}




// Benchmark input: W * H samples, 16-byte aligned.
short inbuffer[W * H] __attribute__ ((aligned (16)));
// Benchmark output: 16-byte aligned, declared with 16 elements more than W * H.
// NOTE(review): the extra 16 floats are presumably slack for kernels that may
// store slightly past W * H elements — confirm.
float outbuffer[W * H + 16] __attribute__ ((aligned (16)));
#ifdef DEBUG
// Second output buffer, only present in DEBUG builds.
float outbuffer2[W * H];
#endif


// Signature shared by all conversion routines under test.
typedef void (*func)(short *source, float *destination);

// One benchmark table row: a display name and the routine to run.
struct BmEntry
{
    const char *name;
    func  fn;
};

// Runs one benchmark entry once: clears outbuffer, times e.fn over
// inbuffer -> outbuffer using rdtsc(), sums the first W * H outputs and prints
// the name, the sum, the elapsed tick count and ticks per element.
//
// In DEBUG builds the output is additionally compared element by element with
// convert_naive and every mismatch is printed. The guard is `#ifdef DEBUG`,
// matching the guard on the declaration of outbuffer2.
void bm(BmEntry& e)
{
    memset(outbuffer, 0, sizeof(outbuffer));
    unsigned long long t = rdtsc();
    e.fn(inbuffer, outbuffer);
    t = rdtsc() - t; 

    float sum = 0;
    for(int i = 0; i < W * H; i++)
    {
    sum += outbuffer[i]; 
    }

#ifdef DEBUG
    convert_naive(inbuffer, outbuffer2);
    for(int i = 0; i < W * H; i++)
    {
    if (outbuffer[i] != outbuffer2[i])
    {
        std::cout << i << ":: " << inbuffer[i] << ": " 
              << outbuffer[i] << " != " << outbuffer2[i] 
              << std::endl;
    }
    }
#endif

    std::cout << std::left << std::setw(30) << e.name << " sum=" << sum << " t=" << t << 
    " t/n=" << (double)t / (W * H) << std::endl;
}


// Builds a BmEntry initialiser from a function name: the stringised name
// followed by the function itself.
#define BM(x) { #x, x }


// Routines to benchmark, in the order they are run.
BmEntry table[] = 
{
    BM(convert_naive),
    BM(convert_unroll4),
    BM(convert_sse_intrinsic),
    BM(convert_sse_inlineasm),
};


// Fills the input buffer with the element index narrowed to short, then runs
// every table entry five times.
int main()
{
    for(int idx = 0; idx < W * H; idx++)
    {
        inbuffer[idx] = static_cast<short>(idx);
    }

    const int entry_count = sizeof(table)/sizeof(table[0]);
    for(int entry = 0; entry < entry_count; entry++)
    {
        for(int repeat = 0; repeat < 5; repeat++)
        {
            bm(table[entry]);
        }
    }
    return 0;
}

Answer 4

不确定循环中的条件表达式是否仅计算一次。你可以尝试：

float factor=  1.0f/value;
for (int i = 0, count = W*H; i < count; ++i)//25% of time is spent doing this
{
    int value = source[i];//short -> int
    destination[i] = value*factor;//int->float
}

Answer 5

这不算是一个正式的答案，请不要把它当作答案，但我确实想知道使用一个 256k 的查找表时代码的表现如何。（基本上是一个包含 65536 个条目的“short 到 float”对照表。）

我认为CoreI7有大约8兆字节的缓存，因此查找表适合数据缓存。

我真的很想知道这会如何影响表现：）

Answer 6

并且您可以使用OpenMP来雇用CPU的每个核心，只需执行以下操作即可：

#include <omp.h>
float factor=  1.0f/value;
#pragma omp parallel for 
for (int i = 0; i < W*H; i++)//25% of time is spent doing this
{
    int value = source[i];//ushort -> int
    destination[i] = value*factor;//int*float->float
}

这是基于以前程序的结果，只需添加如下：

#pragma omp parallel for 
for (int it = 0; it < iterations; it++){
 ...
}

然后是结果

beta@beta-PC ~
$ g++ -o opt.exe opt.c -msse4.1 -fopenmp

beta@beta-PC ~
$ opt
0.748
2.90873e+007
0.484
2.90873e+007
0.796
2.90873e+007


beta@beta-PC ~
$ g++ -o opt.exe opt.c -msse4.1 -O3


beta@beta-PC ~
$ opt
1.404
2.90873e+007
1.404
2.90873e+007
1.404
2.90873e+007

。

结果显示openmp提高了100％。 Visual C ++也支持openmp。

Answer 7

您可以尝试近似表达式

float factor = 1.0f/value;

用分数 numerator/denominator 来近似，其中 numerator 和 denominator 都是 int。可以按照您的应用程序所需的精度来选择，例如

// Fixed-point scale: the fraction numerator/denominator approximates factor.
int denominator = 10000;
// The float -> int conversion truncates factor * denominator.
int numerator = factor * denominator;

然后你可以在整数算术中进行计算，比如

int value = source[i];
// Multiply first, then divide by the denominator (not the numerator), so the
// result approximates value * factor using integer arithmetic only.
destination[i] = (value * numerator) / denominator;

您必须处理溢出，或许需要改用 64 位系统上的 long（甚至 long long）来进行计算。

加速 short 到 float 的转换？

7 个答案: