// umc_h264_sad.cpp — SAD/SATD distortion primitives for the UMC H.264 encoder.
_p_3 = _mm_loadl_epi64((__m128i*)(&diff[7][4]));
_p_0 = _mm_unpacklo_epi16(_p_0, _p_1);
_p_2 = _mm_unpacklo_epi16(_p_2, _p_3);
_p_1 = _mm_unpackhi_epi32(_p_0, _p_2);
_p_0 = _mm_unpacklo_epi32(_p_0, _p_2);
_mm_storel_epi64((__m128i*)&tmp[4][4], _p_0);
_mm_storel_epi64((__m128i*)&tmp[6][4], _p_1);
_p_0 = _mm_srli_si128(_p_0, 8);
_p_1 = _mm_srli_si128(_p_1, 8);
_mm_storel_epi64((__m128i*)&tmp[5][4], _p_0);
_mm_storel_epi64((__m128i*)&tmp[7][4], _p_1);
_p_0 = _mm_load_si128((__m128i*)(tmp[0]));
_p_4 = _mm_sub_epi16(_p_0, *(__m128i*)(tmp[4]));
_p_0 = _mm_add_epi16(_p_0, *(__m128i*)(tmp[4]));
_p_1 = _mm_load_si128((__m128i*)(tmp[1]));
_p_5 = _mm_sub_epi16(_p_1, *(__m128i*)(tmp[5]));
_p_1 = _mm_add_epi16(_p_1, *(__m128i*)(tmp[5]));
_p_2 = _mm_load_si128((__m128i*)(tmp[2]));
_p_6 = _mm_sub_epi16(_p_2, *(__m128i*)(tmp[6]));
_p_2 = _mm_add_epi16(_p_2, *(__m128i*)(tmp[6]));
_p_3 = _mm_load_si128((__m128i*)(tmp[3]));
_p_7 = _mm_sub_epi16(_p_3, *(__m128i*)(tmp[7]));
_p_3 = _mm_add_epi16(_p_3, *(__m128i*)(tmp[7]));
_b_2 = _mm_sub_epi16(_p_0, _p_2);
_p_0 = _mm_add_epi16(_p_0, _p_2);
_b_3 = _mm_sub_epi16(_p_1, _p_3);
_p_1 = _mm_add_epi16(_p_1, _p_3);
_b_6 = _mm_sub_epi16(_p_4, _p_6);
_p_4 = _mm_add_epi16(_p_4, _p_6);
_b_7 = _mm_sub_epi16(_p_5, _p_7);
_p_5 = _mm_add_epi16(_p_5, _p_7);
_p_s = _mm_sub_epi16(_p_0, _p_1);
_p_t = _mm_srai_epi16(_p_s, 15);
_p_s = _mm_xor_si128(_p_s, _p_t);
_p_s = _mm_sub_epi16(_p_s, _p_t);
_p_0 = _mm_add_epi16(_p_0, _p_1);
_p_t = _mm_srai_epi16(_p_0, 15);
_p_0 = _mm_xor_si128(_p_0, _p_t);
_p_0 = _mm_sub_epi16(_p_0, _p_t);
_p_0 = _mm_add_epi16(_p_0, _p_s);
_p_s = _mm_sub_epi16(_b_2, _b_3);
_p_t = _mm_srai_epi16(_p_s, 15);
_p_s = _mm_xor_si128(_p_s, _p_t);
_p_s = _mm_sub_epi16(_p_s, _p_t);
_p_0 = _mm_add_epi16(_p_0, _p_s);
_b_2 = _mm_add_epi16(_b_2, _b_3);
_p_t = _mm_srai_epi16(_b_2, 15);
_b_2 = _mm_xor_si128(_b_2, _p_t);
_b_2 = _mm_sub_epi16(_b_2, _p_t);
_p_0 = _mm_add_epi16(_p_0, _b_2);
_p_s = _mm_sub_epi16(_p_4, _p_5);
_p_t = _mm_srai_epi16(_p_s, 15);
_p_s = _mm_xor_si128(_p_s, _p_t);
_p_s = _mm_sub_epi16(_p_s, _p_t);
_p_0 = _mm_add_epi16(_p_0, _p_s);
_p_4 = _mm_add_epi16(_p_4, _p_5);
_p_t = _mm_srai_epi16(_p_4, 15);
_p_4 = _mm_xor_si128(_p_4, _p_t);
_p_4 = _mm_sub_epi16(_p_4, _p_t);
_p_0 = _mm_add_epi16(_p_0, _p_4);
_p_s = _mm_sub_epi16(_b_6, _b_7);
_p_t = _mm_srai_epi16(_p_s, 15);
_p_s = _mm_xor_si128(_p_s, _p_t);
_p_s = _mm_sub_epi16(_p_s, _p_t);
_p_0 = _mm_add_epi16(_p_0, _p_s);
_b_6 = _mm_add_epi16(_b_6, _b_7);
_p_t = _mm_srai_epi16(_b_6, 15);
_b_6 = _mm_xor_si128(_b_6, _p_t);
_b_6 = _mm_sub_epi16(_b_6, _p_t);
_p_0 = _mm_add_epi16(_p_0, _b_6);
_p_t = _mm_srli_si128(_p_0, 8);
_p_0 = _mm_add_epi16(_p_0, _p_t);
_p_t = _mm_srli_si128(_p_0, 4);
_p_0 = _mm_add_epi16(_p_0, _p_t);
s = _mm_cvtsi128_si32(_p_0);
satd += (s >> 16) + (Ipp16s)s;
#endif
return satd>> 2;
}
Ipp32u SAT8x8D(const Ipp16u *pSrc1, Ipp32s src1Step, const Ipp16u *pSrc2, Ipp32s src2Step)
{
    /* 8x8 SATD for 16-bit sources: take the pixel-difference block, apply a
       1-D Hadamard transform to every row, then to every column, and sum the
       absolute transform coefficients.  The unnormalized transform gains a
       factor of 4 overall, hence the final >> 2. */
    __ALIGN16 Ipp32s tmp[8][8];
    __ALIGN16 Ipp16s diff[8][8];
    Ipp32s i, k;
    Ipp32u satd = 0;
    /* NOTE(review): dstStep of 32 vs. the 16-byte rows of diff[8][8] — confirm
       the step convention expected by ippiSub8x8_16u16s_C1R. */
    ippiSub8x8_16u16s_C1R(pSrc1, src1Step, pSrc2, src2Step, &diff[0][0], 32);
    /* Horizontal pass: three radix-2 butterfly stages over each row. */
    for (i = 0; i < 8; i++) {
        Ipp32s s1[8], s2[8];
        for (k = 0; k < 4; k++) {               /* stage 1: stride-4 pairs */
            s1[k]     = diff[i][k] + diff[i][k + 4];
            s1[k + 4] = diff[i][k] - diff[i][k + 4];
        }
        for (k = 0; k < 8; k += 4) {            /* stage 2: stride-2 pairs */
            s2[k]     = s1[k]     + s1[k + 2];
            s2[k + 2] = s1[k]     - s1[k + 2];
            s2[k + 1] = s1[k + 1] + s1[k + 3];
            s2[k + 3] = s1[k + 1] - s1[k + 3];
        }
        for (k = 0; k < 8; k += 2) {            /* stage 3: adjacent pairs */
            tmp[i][k]     = s2[k] + s2[k + 1];
            tmp[i][k + 1] = s2[k] - s2[k + 1];
        }
    }
    /* Vertical pass: the same butterflies down each column, accumulating the
       absolute value of every resulting coefficient. */
    for (i = 0; i < 8; i++) {
        Ipp32s s1[8], s2[8];
        for (k = 0; k < 4; k++) {
            s1[k]     = tmp[k][i] + tmp[k + 4][i];
            s1[k + 4] = tmp[k][i] - tmp[k + 4][i];
        }
        for (k = 0; k < 8; k += 4) {
            s2[k]     = s1[k]     + s1[k + 2];
            s2[k + 2] = s1[k]     - s1[k + 2];
            s2[k + 1] = s1[k + 1] + s1[k + 3];
            s2[k + 3] = s1[k + 1] - s1[k + 3];
        }
        for (k = 0; k < 8; k += 2) {
            satd += ABS(s2[k] + s2[k + 1]);
            satd += ABS(s2[k] - s2[k + 1]);
        }
    }
    return satd >> 2;
}
// SATD (sum of absolute 4x4-Hadamard-transformed differences) between two
// 8-bit images over a width x height region, processed in 4x4 blocks.
// pSrc1/pSrc2: top-left pixels of each image; src1Step/src2Step: row strides
// in bytes; width/height are assumed multiples of 4 (loops step by 4 —
// TODO confirm with callers).  Result is normalized by >> 1.
// Two compile-time paths:
//   default          — IPP 4x4 subtract + scalar Hadamard (same math as the
//                      16u variant below);
//   H264_SATD_OPT    — SSE2 intrinsics doing the whole 4x4 transform in
//                      registers.
Ipp32u SATD_8u_C1R(const Ipp8u *pSrc1, Ipp32s src1Step, const Ipp8u *pSrc2, Ipp32s src2Step, Ipp32s width, Ipp32s height)
{
#ifndef H264_SATD_OPT
__ALIGN16 Ipp16s tmpBuff[4][4];
__ALIGN16 Ipp16s diffBuff[4][4];
#endif
Ipp32s x, y;
Ipp32u satd = 0;
for( y = 0; y < height; y += 4 ) {
for( x = 0; x < width; x += 4 ) {
#ifndef H264_SATD_OPT
Ipp32s b;
/* 4x4 pixel difference into diffBuff (dst step 8 bytes = one row of 4 Ipp16s). */
ippiSub4x4_8u16s_C1R(pSrc1 + x, src1Step, pSrc2 + x, src2Step, &diffBuff[0][0], 8);
/* Horizontal 1-D Hadamard on each row of the difference block. */
for (b = 0; b < 4; b ++) {
Ipp32s a01, a23, b01, b23;
a01 = diffBuff[b][0] + diffBuff[b][1];
a23 = diffBuff[b][2] + diffBuff[b][3];
b01 = diffBuff[b][0] - diffBuff[b][1];
b23 = diffBuff[b][2] - diffBuff[b][3];
tmpBuff[b][0] = a01 + a23;
tmpBuff[b][1] = a01 - a23;
/* Note the deliberate order: column 2 gets b01-b23, column 3 gets b01+b23. */
tmpBuff[b][2] = b01 - b23;
tmpBuff[b][3] = b01 + b23;
}
/* Vertical 1-D Hadamard on each column, accumulating |coefficients|. */
for (b = 0; b < 4; b ++) {
Ipp32s a01, a23, b01, b23;
a01 = tmpBuff[0][b] + tmpBuff[1][b];
a23 = tmpBuff[2][b] + tmpBuff[3][b];
b01 = tmpBuff[0][b] - tmpBuff[1][b];
b23 = tmpBuff[2][b] - tmpBuff[3][b];
satd += ABS(a01 + a23) + ABS(a01 - a23) + ABS(b01 - b23) + ABS(b01 + b23);
}
#else
/* SSE2 path.  Each _mm_cvtsi32_si128 pulls one 4-pixel row as a 32-bit load;
   NOTE(review): *(int*) on a pixel pointer is an unaligned, type-punned read —
   works on x86 but violates strict aliasing; confirm build flags tolerate it. */
__ALIGN16 __m128i _p_0, _p_1, _p_2, _p_3, _p_4, _p_5, _p_7, _p_zero;
const Ipp8u *pS1, *pS2;
Ipp32s s;
pS1 = pSrc1 + x;
pS2 = pSrc2 + x;
_p_zero = _mm_setzero_si128();
/* Rows 0-1: load from both sources, widen 8u -> 16s, subtract. */
_p_0 = _mm_cvtsi32_si128(*(int*)(pS1));
_p_4 = _mm_cvtsi32_si128(*(int*)(pS2));
_p_1 = _mm_cvtsi32_si128(*(int*)(pS1+src1Step));
_p_5 = _mm_cvtsi32_si128(*(int*)(pS2+src2Step));
_p_0 = _mm_unpacklo_epi8(_p_0, _p_zero);
_p_4 = _mm_unpacklo_epi8(_p_4, _p_zero);
_p_1 = _mm_unpacklo_epi8(_p_1, _p_zero);
_p_5 = _mm_unpacklo_epi8(_p_5, _p_zero);
_p_0 = _mm_sub_epi16(_p_0, _p_4);
_p_1 = _mm_sub_epi16(_p_1, _p_5);
pS1 += 2 * src1Step;
pS2 += 2 * src2Step;
/* Rows 2-3: same load/widen/subtract. */
_p_2 = _mm_cvtsi32_si128(*(int*)(pS1));
_p_4 = _mm_cvtsi32_si128(*(int*)(pS2));
_p_3 = _mm_cvtsi32_si128(*(int*)(pS1+src1Step));
_p_5 = _mm_cvtsi32_si128(*(int*)(pS2+src2Step));
_p_2 = _mm_unpacklo_epi8(_p_2, _p_zero);
_p_4 = _mm_unpacklo_epi8(_p_4, _p_zero);
_p_3 = _mm_unpacklo_epi8(_p_3, _p_zero);
_p_5 = _mm_unpacklo_epi8(_p_5, _p_zero);
_p_2 = _mm_sub_epi16(_p_2, _p_4);
_p_3 = _mm_sub_epi16(_p_3, _p_5);
/* Vertical 4-point Hadamard across rows (saturating ops; 8-bit differences
   stay well inside the 16-bit range, so no clamping occurs in practice). */
_p_5 = _mm_subs_epi16(_p_0, _p_1);
_p_0 = _mm_adds_epi16(_p_0, _p_1);
_p_7 = _mm_subs_epi16(_p_2, _p_3);
_p_2 = _mm_adds_epi16(_p_2, _p_3);
_p_1 = _mm_subs_epi16(_p_0, _p_2);
_p_0 = _mm_adds_epi16(_p_0, _p_2);
_p_3 = _mm_adds_epi16(_p_5, _p_7);
_p_5 = _mm_subs_epi16(_p_5, _p_7);
/* Transpose the 4x4 of 16-bit values via unpack so the next butterflies run
   along the other dimension. */
_p_0 = _mm_unpacklo_epi16(_p_0, _p_1);
_p_5 = _mm_unpacklo_epi16(_p_5, _p_3);
_p_7 = _mm_unpackhi_epi32(_p_0, _p_5);
_p_0 = _mm_unpacklo_epi32(_p_0, _p_5);
_p_1 = _mm_srli_si128(_p_0, 8);
_p_3 = _mm_srli_si128(_p_7, 8);
/* Horizontal 4-point Hadamard on the transposed data. */
_p_5 = _mm_subs_epi16(_p_0, _p_1);
_p_0 = _mm_adds_epi16(_p_0, _p_1);
_p_2 = _mm_subs_epi16(_p_7, _p_3);
_p_7 = _mm_adds_epi16(_p_7, _p_3);
_p_1 = _mm_subs_epi16(_p_0, _p_7);
_p_0 = _mm_adds_epi16(_p_0, _p_7);
_p_3 = _mm_adds_epi16(_p_5, _p_2);
_p_5 = _mm_subs_epi16(_p_5, _p_2);
/* Re-interleave so all 16 coefficients sit in _p_0/_p_2. */
_p_0 = _mm_unpacklo_epi16(_p_0, _p_1);
_p_5 = _mm_unpacklo_epi16(_p_5, _p_3);
_p_2 = _mm_unpackhi_epi32(_p_0, _p_5);
_p_0 = _mm_unpacklo_epi32(_p_0, _p_5);
/* Absolute value per 16-bit lane: sign mask (srai 15), xor, subtract. */
_p_3 = _mm_srai_epi16(_p_2, 15);
_p_1 = _mm_srai_epi16(_p_0, 15);
_p_2 = _mm_xor_si128(_p_2, _p_3);
_p_0 = _mm_xor_si128(_p_0, _p_1);
_p_2 = _mm_sub_epi16(_p_2, _p_3);
_p_0 = _mm_sub_epi16(_p_0, _p_1);
_p_0 = _mm_add_epi16(_p_0, _p_2);
/* Horizontal reduction: fold 8 lanes -> 4 -> 2; the low 32 bits then hold two
   16-bit partial sums, added together below. */
_p_2 = _mm_srli_si128(_p_0, 8);
_p_0 = _mm_add_epi16(_p_0, _p_2);
_p_2 = _mm_srli_si128(_p_0, 4);
_p_0 = _mm_add_epi16(_p_0, _p_2);
s = _mm_cvtsi128_si32(_p_0);
satd += (s >> 16) + (Ipp16s)s;
#endif
}
pSrc1 += 4 * src1Step;
pSrc2 += 4 * src2Step;
}
return satd >> 1;
}
Ipp32u SATD_16u_C1R(const Ipp16u *pSrc1, Ipp32s src1Step, const Ipp16u *pSrc2, Ipp32s src2Step, Ipp32s width, Ipp32s height)
{
    /* SATD of a width x height region of 16-bit pixels, computed per 4x4
       block: pixel differences -> horizontal 4-point Hadamard -> vertical
       4-point Hadamard -> sum of absolute coefficients, normalized by >> 1
       (matching the 8u variant above). */
    __ALIGN16 Ipp32s ht[4][4];     /* rows after the horizontal transform */
    __ALIGN16 Ipp16s resid[4][4];  /* 4x4 pixel differences */
    Ipp32s bx, by, r;
    Ipp32u acc = 0;
    /* Incoming steps are in bytes; halve them to Ipp16u element counts for
       the pointer arithmetic below. */
    src1Step >>= 1;
    src2Step >>= 1;
    for (by = 0; by < height; by += 4) {
        for (bx = 0; bx < width; bx += 4) {
            ippiSub4x4_16u16s_C1R(pSrc1 + bx, src1Step, pSrc2 + bx, src2Step, &resid[0][0], 16);
            /* Horizontal 1-D Hadamard on each residual row.  Column order is
               deliberate: index 2 takes dif01-dif23, index 3 dif01+dif23. */
            for (r = 0; r < 4; r++) {
                Ipp32s sum01 = resid[r][0] + resid[r][1];
                Ipp32s sum23 = resid[r][2] + resid[r][3];
                Ipp32s dif01 = resid[r][0] - resid[r][1];
                Ipp32s dif23 = resid[r][2] - resid[r][3];
                ht[r][0] = sum01 + sum23;
                ht[r][1] = sum01 - sum23;
                ht[r][2] = dif01 - dif23;
                ht[r][3] = dif01 + dif23;
            }
            /* Vertical 1-D Hadamard on each column; accumulate |coefficient|. */
            for (r = 0; r < 4; r++) {
                Ipp32s sum01 = ht[0][r] + ht[1][r];
                Ipp32s sum23 = ht[2][r] + ht[3][r];
                Ipp32s dif01 = ht[0][r] - ht[1][r];
                Ipp32s dif23 = ht[2][r] - ht[3][r];
                acc += ABS(sum01 + sum23) + ABS(sum01 - sum23) + ABS(dif01 - dif23) + ABS(dif01 + dif23);
            }
        }
        pSrc1 += 4 * src1Step;
        pSrc2 += 4 * src2Step;
    }
    return acc >> 1;
}
} //namespace UMC_H264_ENCODER