c – 如何更有效地乘以转置矩阵?
我一直在为此而奋斗.我有一个基于SSE的算法,用于将矩阵A乘以矩阵B.我还需要实现A,B或两者都被转置的操作.我做了一个天真的实现,下面表示的4×4矩阵代码(我认为这是非常标准的SSE操作),但A * B ^ T操作大约是A * B的两倍. ATLAS实现为A * B返回类似的值,并且与转置相乘的结果几乎相同,这表明有一种有效的方法可以做到这一点.
MM-乘法: m1 = (mat1.m_>>2)<<2; n2 = (mat2.n_>>2)<<2; n = (mat1.n_>>2)<<2; for (k=0; k<n; k+=4) { for (i=0; i<m1; i+=4) { // fetch: get 4x4 matrix from mat1 // row-major storage,so get 4 rows Float* a0 = mat1.el_[i]+k; Float* a1 = mat1.el_[i+1]+k; Float* a2 = mat1.el_[i+2]+k; Float* a3 = mat1.el_[i+3]+k; for (j=0; j<n2; j+=4) { // fetch: get 4x4 matrix from mat2 // row-major storage,so get 4 rows Float* b0 = mat2.el_[k]+j; Float* b1 = mat2.el_[k+1]+j; Float* b2 = mat2.el_[k+2]+j; Float* b3 = mat2.el_[k+3]+j; __m128 b0r = _mm_loadu_ps(b0); __m128 b1r = _mm_loadu_ps(b1); __m128 b2r = _mm_loadu_ps(b2); __m128 b3r = _mm_loadu_ps(b3); { // first row of result += first row of mat1 * 4x4 of mat2 __m128 cX1 = _mm_add_ps(_mm_mul_ps(_mm_load_ps1(a0+0),b0r),_mm_mul_ps(_mm_load_ps1(a0+1),b1r)); __m128 cX2 = _mm_add_ps(_mm_mul_ps(_mm_load_ps1(a0+2),b2r),_mm_mul_ps(_mm_load_ps1(a0+3),b3r)); Float* c0 = this->el_[i]+j; _mm_storeu_ps(c0,_mm_add_ps(_mm_add_ps(cX1,cX2),_mm_loadu_ps(c0))); } { // second row of result += second row of mat1 * 4x4 of mat2 __m128 cX1 = _mm_add_ps(_mm_mul_ps(_mm_load_ps1(a1+0),_mm_mul_ps(_mm_load_ps1(a1+1),b1r)); __m128 cX2 = _mm_add_ps(_mm_mul_ps(_mm_load_ps1(a1+2),_mm_mul_ps(_mm_load_ps1(a1+3),b3r)); Float* c1 = this->el_[i+1]+j; _mm_storeu_ps(c1,_mm_loadu_ps(c1))); } { // third row of result += third row of mat1 * 4x4 of mat2 __m128 cX1 = _mm_add_ps(_mm_mul_ps(_mm_load_ps1(a2+0),_mm_mul_ps(_mm_load_ps1(a2+1),b1r)); __m128 cX2 = _mm_add_ps(_mm_mul_ps(_mm_load_ps1(a2+2),_mm_mul_ps(_mm_load_ps1(a2+3),b3r)); Float* c2 = this->el_[i+2]+j; _mm_storeu_ps(c2,_mm_loadu_ps(c2))); } { // fourth row of result += fourth row of mat1 * 4x4 of mat2 __m128 cX1 = _mm_add_ps(_mm_mul_ps(_mm_load_ps1(a3+0),_mm_mul_ps(_mm_load_ps1(a3+1),b1r)); __m128 cX2 = _mm_add_ps(_mm_mul_ps(_mm_load_ps1(a3+2),_mm_mul_ps(_mm_load_ps1(a3+3),b3r)); Float* c3 = this->el_[i+3]+j; _mm_storeu_ps(c3,_mm_loadu_ps(c3))); } } // Code omitted to handle remaining rows and columns } 对于MT乘法(矩阵乘以转置矩阵),我用以下命令将b0r存储到b3r并适当地改变了循环变量: __m128 b0r = _mm_set_ps(b3[0],b2[0],b1[0],b0[0]); __m128 b1r = _mm_set_ps(b3[1],b2[1],b1[1],b0[1]); __m128 b2r = _mm_set_ps(b3[2],b2[2],b1[2],b0[2]); __m128 b3r = _mm_set_ps(b3[3],b2[3],b1[3],b0[3]); 我怀疑减速的部分原因是一次拉动一次并且每次都要存储4个值来获得色谱柱之间的区别,但我觉得另一种方式是这样做,拉入B行和然后乘以As的列,只需将成本转移到存储4列结果. 我也尝试将B的行拉成行然后使用_MM_TRANSPOSE4_PS(b0r,b1r,b2r,b3r);做转置(我认为在那个宏中可能会有一些额外的优化),但是没有真正的改进. 从表面上看,我觉得这应该更快……所涉及的点数产品会连续一行,这看起来本质上更有效率,但试图直接做点产品只会导致必须做同样的事情存储结果. 我在这里错过了什么? 补充:为了澄清,我试图不转置矩阵.我宁愿沿着它们进行迭代.我能说的最好的问题是_mm_set_ps命令比_mm_load_ps慢得多. 我还尝试了一种变体,其中我存储了A矩阵的4行,然后替换了包含1个加载,4个乘法和4个加法的4个卷曲括号的段,4个乘法指令和3个加号,但是没有用.时间保持不变(是的,我尝试使用调试语句来验证代码在我的测试编译中是否已更改.当然,在分析之前删除了所述调试语句): { // first row of result += first row of mat1 * 4x4 of mat2 __m128 cX1 = _mm_hadd_ps(_mm_mul_ps(a0r,_mm_mul_ps(a0r,b1r)); __m128 cX2 = _mm_hadd_ps(_mm_mul_ps(a0r,b3r)); Float* c0 = this->el_[i]+j; _mm_storeu_ps(c0,_mm_add_ps(_mm_hadd_ps(cX1,_mm_loadu_ps(c0))); } { // second row of result += second row of mat1 * 4x4 of mat2 __m128 cX1 = _mm_hadd_ps(_mm_mul_ps(a1r,_mm_mul_ps(a1r,b1r)); __m128 cX2 = _mm_hadd_ps(_mm_mul_ps(a1r,b3r)); Float* c0 = this->el_[i+1]+j; _mm_storeu_ps(c0,_mm_loadu_ps(c0))); } { // third row of result += third row of mat1 * 4x4 of mat2 __m128 cX1 = _mm_hadd_ps(_mm_mul_ps(a2r,_mm_mul_ps(a2r,b1r)); __m128 cX2 = _mm_hadd_ps(_mm_mul_ps(a2r,b3r)); Float* c0 = this->el_[i+2]+j; _mm_storeu_ps(c0,_mm_loadu_ps(c0))); } { // fourth row of result += fourth row of mat1 * 4x4 of mat2 __m128 cX1 = _mm_hadd_ps(_mm_mul_ps(a3r,_mm_mul_ps(a3r,b1r)); __m128 cX2 = _mm_hadd_ps(_mm_mul_ps(a3r,b3r)); Float* c0 = this->el_[i+3]+j; _mm_storeu_ps(c0,_mm_loadu_ps(c0))); } 更新: 解决方法
我认为这是水平添加有用的少数情况.你想要C = AB ^ T但B不作为转置存储在内存中.那就是问题所在.它的存储就像AoS而不是SoA.在这种情况下,采用B的转置并进行垂直加法比使用水平加法慢.这对于Matrixvector
Efficient 4×4 matrix vector multiplication with SSE: horizontal add and dot product – what’s the point?至少也是如此.在下面的代码中,函数m4x4是非SSE 4×4矩阵乘积,m4x4_vec使用SSE,
m4x4T在没有SSE的情况下进行C = AB ^ T,并且m4x4T_vec进行C = AB ^ T usisg SSE.最后一个是你想要的那个. 注意:对于较大的矩阵,我不会使用这种方法.在这种情况下,首先采用转置并使用垂直添加更快(使用SSE / AVX,您可以执行更复杂的操作,转换具有SSE / AVX宽度的条带).这是因为转置为O(n ^ 2),矩阵乘积为O(n ^ 3),因此对于大型矩阵,转置是无关紧要的.然而,对于4×4,转置是重要的,因此水平加入获胜. 编辑: C = A*B in Einstein notation is C_i,j = A_i,k * B_k,j. Since (A*B)^T = B^T*A^T we can write C = (A*B)^T in Einstein notation is C_i,j = B^T_i,k * A^T_k,j = A_j,i 如果你比较两者,唯一改变的是我们交换j和i的角色.我在这个答案的最后给了一些代码来做这件事. #include "stdio.h" #include <nmmintrin.h> void m4x4(const float *A,const float *B,float *C) { for(int i=0; i<4; i++) { for(int j=0; j<4; j++) { float sum = 0.0f; for(int k=0; k<4; k++) { sum += A[i*4+k]*B[k*4+j]; } C[i*4 + j] = sum; } } } void m4x4T(const float *A,float *C) { for(int i=0; i<4; i++) { for(int j=0; j<4; j++) { float sum = 0.0f; for(int k=0; k<4; k++) { sum += A[i*4+k]*B[j*4+k]; } C[i*4 + j] = sum; } } } void m4x4_vec(const float *A,float *C) { __m128 Brow[4],Mrow[4]; for(int i=0; i<4; i++) { Brow[i] = _mm_load_ps(&B[4*i]); } for(int i=0; i<4; i++) { Mrow[i] = _mm_set1_ps(0.0f); for(int j=0; j<4; j++) { __m128 a = _mm_set1_ps(A[4*i +j]); Mrow[i] = _mm_add_ps(Mrow[i],_mm_mul_ps(a,Brow[j])); } } for(int i=0; i<4; i++) { _mm_store_ps(&C[4*i],Mrow[i]); } } void m4x4T_vec(const float *A,float *C) { __m128 Arow[4],Brow[4],Mrow[4]; for(int i=0; i<4; i++) { Arow[i] = _mm_load_ps(&A[4*i]); Brow[i] = _mm_load_ps(&B[4*i]); } for(int i=0; i<4; i++) { __m128 prod[4]; for(int j=0; j<4; j++) { prod[j] = _mm_mul_ps(Arow[i],Brow[j]); } Mrow[i] = _mm_hadd_ps(_mm_hadd_ps(prod[0],prod[1]),_mm_hadd_ps(prod[2],prod[3])); } for(int i=0; i<4; i++) { _mm_store_ps(&C[4*i],Mrow[i]); } } float compare_4x4(const float* A,const float*B) { float diff = 0.0f; for(int i=0; i<4; i++) { for(int j=0; j<4; j++) { diff += A[i*4 +j] - B[i*4+j]; printf("A %f,B %fn",A[i*4 +j],B[i*4 +j]); } } return diff; } int main() { float *A = (float*)_mm_malloc(sizeof(float)*16,16); float *B = (float*)_mm_malloc(sizeof(float)*16,16); float *C1 = (float*)_mm_malloc(sizeof(float)*16,16); float *C2 = (float*)_mm_malloc(sizeof(float)*16,16); for(int i=0; i<4; i++) { for(int j=0; j<4; j++) { A[i*4 +j] = i*4+j; B[i*4 +j] = i*4+j; C1[i*4 +j] = 0.0f; C2[i*4 +j] = 0.0f; } } m4x4T(A,B,C1); m4x4T_vec(A,C2); printf("compare %fn",compare_4x4(C1,C2)); } 编辑: 这是标量和SSE函数,它做C =(AB)^ T.它们应该和它们的AB版本一样快. void m4x4TT(const float *A,float *C) { for(int i=0; i<4; i++) { for(int j=0; j<4; j++) { float sum = 0.0f; for(int k=0; k<4; k++) { sum += A[j*4+k]*B[k*4+i]; } C[i*4 + j] = sum; } } } void m4x4TT_vec(const float *A,Crow[4]; for(int i=0; i<4; i++) { Arow[i] = _mm_load_ps(&A[4*i]); } for(int i=0; i<4; i++) { Crow[i] = _mm_set1_ps(0.0f); for(int j=0; j<4; j++) { __m128 a = _mm_set1_ps(B[4*i +j]); Crow[i] = _mm_add_ps(Crow[i],Arow[j])); } } for(int i=0; i<4; i++) { _mm_store_ps(&C[4*i],Crow[i]); } } (编辑:李大同) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容! |