我在程序中使用 SSE 指令来提升性能,但在调用 _mm_shuffle_ps 时
经常会崩溃。
我知道这很可能是因为数据需要按 16 字节对齐,但我一直无法解决这个问题。
下面是我使用的代码(程序使用 Visual Studio 2017 以 32 位模式编译):
// Builds the 8-bit immediate used by shuffle intrinsics from four 2-bit lane
// selectors: A0 occupies bits 0-1, A1 bits 2-3, B2 bits 4-5 and B3 bits 6-7.
// Produces the same value as _MM_SHUFFLE(B3, B2, A1, A0).
#define SHUFFLEMASK(A0,A1,B2,B3) ( ((B3)<<6) | ((B2)<<4) | ((A1)<<2) | (A0) )
// Rotates `vector` by `quaternion`; only the first step is shown here, the
// remainder of the body is elided in this excerpt.
// Both operands are taken by const reference, so the shuffle below reads the
// quaternion through that reference.
// NOTE(review): if the referenced __m128 object is not 16-byte aligned, an
// aligned SSE load from it would presumably fault here — confirm the alignment
// of whatever the caller passes in.
inline __m128 RotateVector(const __m128& quaternion, const __m128& vector)
{
// Selector 3 for every output lane: broadcast lane 3 of the input to all lanes.
// NOTE(review): uint32 is a typedef declared outside this excerpt.
const uint32 shuffleMask = SHUFFLEMASK(3, 3, 3, 3);
// THE NEXT LINE IS THE ONE CRASHING
// qw holds lane 3 of `quaternion` replicated four times (presumably the
// quaternion's w component — confirm the lane layout used by the project).
const __m128 qw = _mm_shuffle_ps(quaternion, quaternion, shuffleMask);
// The rest isn't useful since it crashes before even getting there
...
}
// Packs four scalars into a single SSE register, with X in the lowest lane
// and W in the highest lane.
inline __m128 MakeVectorRegister(float X, float Y, float Z, float W)
{
    // _mm_set_ps expects the highest lane first, hence the reversed order.
    const __m128 packed = _mm_set_ps(W, Z, Y, X);
    return packed;
}
// Three-component float vector whose components can be read either as the
// array `vec` or as the named fields x, y, z.
// The class declares no alignment requirement beyond that of its float members.
class Vertex
{
public:
// The array and the named fields overlay the same storage.
// NOTE(review): the unnamed struct inside the union is a compiler extension,
// not standard C++.
union
{
float vec[3];
struct
{
float x, y, z;
};
};
// Rest of class (only methods, no other attributes)
...
};
// Transform that stores a scale and a rotation as SSE registers and applies
// them to Vertex values. Parts of the class are elided in this excerpt.
//
// NOTE(review): __declspec(align(16)) is honoured for objects the compiler
// places itself (globals, locals, members of aligned types). Memory obtained
// from the heap in a 32-bit MSVC build is presumably only 8-byte aligned, so an
// instance created with plain `new`/malloc could leave _scale/_rotation
// misaligned — confirm how instances of this class are actually allocated.
__declspec(align(16)) class X
{
    ...
    __m128 _scale;     // multiplied lane-wise with the input vector
    __m128 _rotation;  // passed to RotateVector as the quaternion operand
    ...

    // Scales `vector` by _scale, then rotates the result by _rotation.
    // The tail of the function (including its return) is elided in this excerpt.
    Vertex TransformVector(const Vertex& vector) const
    {
        // Copy the three components into a 16-byte-aligned scratch buffer.
        // The values are handed to MakeVectorRegister as individual scalars,
        // so the alignment of this buffer does not affect the register build.
        float __declspec(align(16)) vectorData[3];
        memcpy(vectorData, &vector.x, sizeof(float) * 3);
        // The next line was originally this: const __m128 inputVectorW0 = MakeVectorRegister(((const float*)(&vector.x))[0], ((const float*)(&vector.x))[1], ((const float*)(&vector.x))[2], 0.0f)
        // Fourth lane is set to 0.
        const __m128 inputVectorW0 = MakeVectorRegister(vectorData[0], vectorData[1], vectorData[2], 0.0f);
        const __m128 scaledVec = _mm_mul_ps(_scale, inputVectorW0);
        const __m128 rotatedVec = RotateVector(_rotation, scaledVec);
        // The rest isn't useful since it crashes before
        ...
    }
};
// Example of usage
int main(...)
{
Vertex v;
X x;
// This crashes calling _mm_shuffle_ps inside RotateVector
Vertex result = x.TransformVector(v);
}