c – SSE优化平方差之和
发布时间:2020-12-16 10:05:46 所属栏目:百科 来源:网络整理
导读:我最近发现我的程序花费大部分时间在以下简单函数中: void SumOfSquaredDifference( const uint8_t * a,size_t aStride,const uint8_t * b,size_t bStride,size_t width,size_t height,uint64_t * sum){ *sum = 0; for(size_t row = 0; row < height; ++row) …
我最近发现我的程序花费大部分时间在以下简单函数中:
void SumOfSquaredDifference(
    const uint8_t * a,size_t aStride,const uint8_t * b,size_t bStride,
    size_t width,size_t height,uint64_t * sum)
{
    *sum = 0;
    for(size_t row = 0; row < height; ++row)
    {
        int rowSum = 0;
        for(size_t col = 0; col < width; ++col)
        {
            int d = a[col] - b[col];
            rowSum += d*d;
        }
        *sum += rowSum;
        a += aStride;
        b += bStride;
    }
}

此函数计算两幅8位灰度图像的平方差之和。

解决方法
当然,您可以改进您的代码.
这是使用SSE2优化该函数的示例:

const __m128i Z = _mm_setzero_si128();
const size_t A = sizeof(__m128i);

inline __m128i SquaredDifference(__m128i a,__m128i b)
{
    const __m128i aLo = _mm_unpacklo_epi8(a,Z);
    const __m128i bLo = _mm_unpacklo_epi8(b,Z);
    const __m128i dLo = _mm_sub_epi16(aLo,bLo);
    const __m128i aHi = _mm_unpackhi_epi8(a,Z);
    const __m128i bHi = _mm_unpackhi_epi8(b,Z);
    const __m128i dHi = _mm_sub_epi16(aHi,bHi);
    return _mm_add_epi32(_mm_madd_epi16(dLo,dLo),_mm_madd_epi16(dHi,dHi));
}

inline __m128i HorizontalSum32(__m128i a)
{
    return _mm_add_epi64(_mm_unpacklo_epi32(a,Z),_mm_unpackhi_epi32(a,Z));
}

inline uint64_t ExtractSum64(__m128i a)
{
    uint64_t _a[2];
    _mm_storeu_si128((__m128i*)_a,a);
    return _a[0] + _a[1];
}

void SumOfSquaredDifference(
    const uint8_t *a,size_t aStride,const uint8_t *b,size_t bStride,
    size_t width,size_t height,uint64_t * sum)
{
    assert(width%A == 0 && width < 0x10000);
    __m128i fullSum = Z;
    for(size_t row = 0; row < height; ++row)
    {
        __m128i rowSum = Z;
        for(size_t col = 0; col < width; col += A)
        {
            const __m128i a_ = _mm_loadu_si128((__m128i*)(a + col));
            const __m128i b_ = _mm_loadu_si128((__m128i*)(b + col));
            rowSum = _mm_add_epi32(rowSum,SquaredDifference(a_,b_));
        }
        fullSum = _mm_add_epi64(fullSum,HorizontalSum32(rowSum));
        a += aStride;
        b += bStride;
    }
    *sum = ExtractSum64(fullSum);
}

这个例子做了一些简化(如果图像宽度不是16的倍数则不起作用)。

下面是使用了一点“魔法”的SSSE3版本:

const __m128i K_1FF = _mm_set1_epi16(0x1FF);

inline __m128i SquaredDifference(__m128i a,__m128i b)
{
    const __m128i lo = _mm_maddubs_epi16(_mm_unpacklo_epi8(a,b),K_1FF);
    const __m128i hi = _mm_maddubs_epi16(_mm_unpackhi_epi8(a,b),K_1FF);
    return _mm_add_epi32(_mm_madd_epi16(lo,lo),_mm_madd_epi16(hi,hi));
}

“魔法”的说明(参见 _mm_maddubs_epi16):

K_1FF -> {-1,1,-1,1,...};
_mm_unpacklo_epi8(a,b) -> {a0,b0,a1,b1,...};
_mm_maddubs_epi16(_mm_unpacklo_epi8(a,b),K_1FF) -> {b0 - a0,b1 - a1,...};

(编辑:李大同)

【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容!