NEON:优化代码

时间:2017-07-25 13:11:26

标签: c arm neon

我目前正在玩ARM Neon并编写了以下函数,一个用C语言编写,一个用NEON Intrinsics来比较速度。这些函数比较两个数组。参数cb是字节数除以8:

inline uint32_t is_not_zero(uint32x4_t v)
{
        uint32x2_t tmp = vorr_u32(vget_low_u32(v), vget_high_u32(v));
        return vget_lane_u32(vpmax_u32(tmp, tmp), 0);
}

uint32_t sum_neon(const uint8_t *s1, const uint8_t *s2, uint32_t cb)
{
        const uint32_t *s1_cmp = (uint32_t *)s1;
        const uint32_t *s2_cmp = (uint32_t *)s2;

        cb *= 2;

        while (cb--)
        {
                uint32x4x2_t cmp1 = vld2q_u32(s1_cmp);
                uint32x4x2_t cmp2 = vld2q_u32(s2_cmp);

                uint32x4_t res1 = vceqq_u32(cmp1.val[0], cmp2.val[0]);
                uint32x4_t res2 = vceqq_u32(cmp1.val[1], cmp2.val[1]);

                if (!is_not_zero(res1)) return 1;
                if (!is_not_zero(res2)) return 1;

                s1_cmp += 8;
                s2_cmp += 8;
        }
        return 0;
}

uint32_t sum_c(const uint8_t *s1, const uint8_t *s2, uint32_t cb)
{
    const uint64_t *p1 = (uint64_t *)s1;
    const uint64_t *p2 = (uint64_t *)s2;
    uint32_t n = 0;
    while (cb--) {
        if ((p1[n  ] != p2[n  ]) ||
                (p1[n+1] != p2[n+1]) ||
                (p1[n+2] != p2[n+2]) ||
                (p1[n+3] != p2[n+3])) return 1;
        ++n;
    }
    return 0; 
}

我不明白为什么C实现比NEON变体更快。代码是在raspberry pi上编译的 -O3 -mcpu=cortex-a7 -mfpu=neon-vfpv4 -mfloat-abi=hard作为CFlags。

0 个答案:

没有答案