_mm_mpsadbw_epu8
Microsoft Specific
Emits the Streaming SIMD Extensions 4 (SSE4) instruction mpsadbw. This instruction computes eight packed sums of absolute differences (SAD) between groups of four unsigned 8-bit integers taken from the two source parameters.
__m128i _mm_mpsadbw_epu8( __m128i a, __m128i b, const int mask );
A 128-bit result that contains eight 16-bit unsigned integers. The values of these integers can be computed as follows:
i = mask2 * 4
j = (mask1 * 2 + mask0) * 4
for (k = 0; k < 8; k = k + 1) {
t0 = abs(a[i + k + 0] - b[j + 0])
t1 = abs(a[i + k + 1] - b[j + 1])
t2 = abs(a[i + k + 2] - b[j + 2])
t3 = abs(a[i + k + 3] - b[j + 3])
r[k] = t0 + t1 + t2 + t3
}
a[n] and b[n] indicate the nth ordered unsigned 8-bit integer of parameters a and b where a[0] and b[0] are the lowest 8 bits. r[n] is the nth ordered unsigned 16-bit element of result r, where r[0] refers to the lowest 16 bits. mask0, mask1, and mask2 are the three least significant bits of parameter mask.
Before you use this intrinsic, software must ensure that the processor supports the instruction.
#include <stdio.h>
#include <stdlib.h>
#include <smmintrin.h>
// Demonstrates _mm_mpsadbw_epu8 by running the intrinsic on two fixed 16-byte
// inputs and printing each 16-bit result next to a scalar reference value
// computed from the same inputs.
//
// Microsoft specific: relies on the m128i_u8 / m128i_u16 members of __m128i
// and on printf_s.
//
// Returns 0.
int main(void)
{
    // Input bytes, listed from element 0 (lowest 8 bits) upward.
    static const unsigned char a_bytes[16] =
    {
        15, 60, 55, 31, 0, 1, 2, 4, 8, 16, 32, 64, 128, 255, 1, 17
    };
    static const unsigned char b_bytes[16] =
    {
        2, 4, 8, 64, 255, 0, 1, 16, 32, 64, 128, 255, 75, 31, 42, 11
    };

    // A mask value of 0101 (5) will add four to each index:
    // bit 2 is set, so the window into a starts at byte 4, and
    // bits 1:0 are 01, so the 4-byte group of b starts at byte 4.
    const int mask = 5;

    // Starting offsets derived from mask, so the reference computation below
    // stays in step with the intrinsic if mask is changed.
    const int a_off = ((mask >> 2) & 1) * 4;
    const int b_off = (mask & 3) * 4;

    __m128i a, b, res, final;
    int n, index;

    for (n = 0; n < 16; n++)
    {
        a.m128i_u8[n] = a_bytes[n];
        b.m128i_u8[n] = b_bytes[n];
    }

    res = _mm_mpsadbw_epu8(a, b, mask);

    // Scalar reference: for each of the eight results, sum the absolute
    // differences between four consecutive bytes of a (sliding by one byte
    // per result) and the fixed four-byte group of b.
    for (index = 0; index < 8; index++)
    {
        int sum = 0;
        int t;

        for (t = 0; t < 4; t++)
        {
            sum += abs(a.m128i_u8[a_off + index + t] - b.m128i_u8[b_off + t]);
        }
        final.m128i_u16[index] = (unsigned short)sum;
    }

    // Each line shows the expected (reference) value followed by the
    // value produced by the intrinsic.
    printf_s("Res0 should be %d: %d\nRes1 should be %d: %d\n",
             final.m128i_u16[0], res.m128i_u16[0],
             final.m128i_u16[1], res.m128i_u16[1]);
    printf_s("Res2 should be %d: %d\nRes3 should be %d: %d\n",
             final.m128i_u16[2], res.m128i_u16[2],
             final.m128i_u16[3], res.m128i_u16[3]);
    printf_s("Res4 should be %d: %d\nRes5 should be %d: %d\n",
             final.m128i_u16[4], res.m128i_u16[4],
             final.m128i_u16[5], res.m128i_u16[5]);
    printf_s("Res6 should be %d: %d\nRes7 should be %d: %d\n",
             final.m128i_u16[6], res.m128i_u16[6],
             final.m128i_u16[7], res.m128i_u16[7]);

    return 0;
}