long valor = 0, i=0; __m128i vsum, vecPi, vecCi, vecQCi; vsum = _mm_set1_epi32(0); int32_t * const pA = A->data; int32_t * const pB = B->data; int sumDot[1]; for( ; i<SIZE-3 ;i+=4){ vecPi = _mm_loadu_si128((__m128i *)&(pA)[i] ); vecCi = _mm_loadu_si128((__m128i *)&(pB)[i] ); vecQCi = _mm_mullo_epi32(vecPi,vecCi); vsum = _mm_add_epi32(vsum,vecQCi); } vsum = _mm_hadd_epi32(vsum, vsum); vsum = _mm_hadd_epi32(vsum, vsum); _mm_storeu_si128((__m128i *)&(sumDot), vsum); for( ; i<SIZE; i++) valor += A->data[i] * B->data[i]; valor += sumDot[0];