// File: umc_h264_sad.cpp
//
// INTEL CORPORATION PROPRIETARY INFORMATION
// This software is supplied under the terms of a license agreement or
// nondisclosure agreement with Intel Corporation and may not be copied
// or disclosed except in accordance with the terms of that agreement.
// Copyright (c) 2004 - 2007 Intel Corporation. All Rights Reserved.
//
#include <string.h>
#include "umc_h264_video_encoder.h"
#include "umc_h264_tables.h"
#include "umc_h264_to_ipp.h"
#include "umc_h264_bme.h"
#include "ippvc.h"
namespace UMC_H264_ENCODER
{
// Compile-time selection of the SSE2 intrinsic code path (H264_SATD_OPT).
// When the macro ends up undefined, SAT8x8D uses its plain-C butterfly loops
// instead of the __m128i implementation.
//
// Step 1: request the optimized path when WIN32 or _WIN32 is defined.
#if defined (WIN32) || defined (_WIN32)
#define H264_SATD_OPT
#endif
// Step 2: withdraw the request when WIN64 or _WIN64 is defined, so a Windows
// build keeps the macro only when neither 64-bit macro is present.
#if defined (WIN64) || defined (_WIN64)
#undef H264_SATD_OPT
#endif
// Step 3: the optimized path additionally requires the Intel compiler or
// MSVC with _MSC_VER >= 1300; with any other compiler the macro is dropped.
// An undefined _MSC_VER is evaluated as 0 inside #if, so the comparison is
// simply false there.
#ifdef H264_SATD_OPT
#if defined(__INTEL_COMPILER) || (_MSC_VER >= 1300)
// NOTE(review): this header is included while namespace UMC_H264_ENCODER is
// already open, so unless an earlier include has pulled it in, its
// declarations land inside that namespace — confirm this is intended.
#include "emmintrin.h"
#else
#undef H264_SATD_OPT
#endif
#endif
Ipp32u SAT8x8D(const Ipp8u *pSrc1, Ipp32s src1Step, const Ipp8u *pSrc2, Ipp32s src2Step)
{
__ALIGN16 Ipp16s diff[8][8];
Ipp32u satd = 0;
ippiSub8x8_8u16s_C1R(pSrc1, src1Step, pSrc2, src2Step, &diff[0][0], 16);
#ifndef H264_SATD_OPT
Ipp32s i;
for (i = 0; i < 8; i++) {
Ipp32s t0 = diff[i][0] + diff[i][4];
Ipp32s t4 = diff[i][0] - diff[i][4];
Ipp32s t1 = diff[i][1] + diff[i][5];
Ipp32s t5 = diff[i][1] - diff[i][5];
Ipp32s t2 = diff[i][2] + diff[i][6];
Ipp32s t6 = diff[i][2] - diff[i][6];
Ipp32s t3 = diff[i][3] + diff[i][7];
Ipp32s t7 = diff[i][3] - diff[i][7];
Ipp32s s0 = t0 + t2;
Ipp32s s2 = t0 - t2;
Ipp32s s1 = t1 + t3;
Ipp32s s3 = t1 - t3;
Ipp32s s4 = t4 + t6;
Ipp32s s6 = t4 - t6;
Ipp32s s5 = t5 + t7;
Ipp32s s7 = t5 - t7;
diff[i][0] = s0 + s1;
diff[i][1] = s0 - s1;
diff[i][2] = s2 + s3;
diff[i][3] = s2 - s3;
diff[i][4] = s4 + s5;
diff[i][5] = s4 - s5;
diff[i][6] = s6 + s7;
diff[i][7] = s6 - s7;
}
for (i = 0; i < 8; i++) {
Ipp32s t0 = diff[0][i] + diff[4][i];
Ipp32s t4 = diff[0][i] - diff[4][i];
Ipp32s t1 = diff[1][i] + diff[5][i];
Ipp32s t5 = diff[1][i] - diff[5][i];
Ipp32s t2 = diff[2][i] + diff[6][i];
Ipp32s t6 = diff[2][i] - diff[6][i];
Ipp32s t3 = diff[3][i] + diff[7][i];
Ipp32s t7 = diff[3][i] - diff[7][i];
Ipp32s s0 = t0 + t2;
Ipp32s s2 = t0 - t2;
Ipp32s s1 = t1 + t3;
Ipp32s s3 = t1 - t3;
Ipp32s s4 = t4 + t6;
Ipp32s s6 = t4 - t6;
Ipp32s s5 = t5 + t7;
Ipp32s s7 = t5 - t7;
satd += ABS(s0 + s1);
satd += ABS(s0 - s1);
satd += ABS(s2 + s3);
satd += ABS(s2 - s3);
satd += ABS(s4 + s5);
satd += ABS(s4 - s5);
satd += ABS(s6 + s7);
satd += ABS(s6 - s7);
}
#else
#if 0
__ALIGN16 __m128i _p_0, _p_1, _p_2, _p_3, _p_4, _p_5, _p_6, _p_7, _b_2, _b_3, _b_6, _b_7, _p_t, _p_s;
Ipp32s s;
/*
for (i = 0; i < 8; i++) {
_p_0 = _mm_loadl_epi64((__m128i*)(&diff[i][0]));
_p_2 = _mm_loadl_epi64((__m128i*)(&diff[i][4]));
_p_1 = _mm_sub_epi16(_p_0, _p_2); // 0, 0, 0, 0, a7, a6, a5, a4
_p_0 = _mm_add_epi16(_p_0, _p_2); // 0, 0, 0, 0, a3, a2, a1, a0
_p_5 = _mm_srli_si128(_p_1, 4); // 0, 0, 0, 0, 0, 0, a7, a6
_p_4 = _mm_srli_si128(_p_0, 4); // 0, 0, 0, 0, 0, 0, a3, a2
_p_3 = _mm_sub_epi16(_p_1, _p_5); // 0, 0, 0, 0, xx, xx, b7, b6
_p_1 = _mm_add_epi16(_p_1, _p_5); // 0, 0, 0, 0, xx, xx, b5, b4
_p_2 = _mm_sub_epi16(_p_0, _p_4); // 0, 0, 0, 0, xx, xx, b3, b2
_p_0 = _mm_add_epi16(_p_0, _p_4); // 0, 0, 0, 0, xx, xx, b1, b0
_p_0 = _mm_unpacklo_epi16(_p_0, _p_2); // xx, xx, xx, xx, b3, b1, b2, b0
_p_1 = _mm_unpacklo_epi16(_p_1, _p_3); // xx, xx, xx, xx, b7, b5, b6, b4
_p_0 = _mm_unpacklo_epi32(_p_0, _p_1); // b7, b5, b3, b1, b6, b4, b2, b0
_p_2 = _mm_srli_si128(_p_0, 8); // 0, 0, 0, 0, b7, b5, b3, b1
_p_1 = _mm_sub_epi16(_p_0, _p_2);
_p_0 = _mm_add_epi16(_p_0, _p_2);
_p_0 = _mm_unpacklo_epi16(_p_0, _p_1);
_mm_store_si128((__m128i*)diff[i], _p_0);
}
*/
for (i = 0; i < 8; i++) {
Ipp32s t0 = diff[i][0] + diff[i][4];
Ipp32s t4 = diff[i][0] - diff[i][4];
Ipp32s t1 = diff[i][1] + diff[i][5];
Ipp32s t5 = diff[i][1] - diff[i][5];
Ipp32s t2 = diff[i][2] + diff[i][6];
Ipp32s t6 = diff[i][2] - diff[i][6];
Ipp32s t3 = diff[i][3] + diff[i][7];
Ipp32s t7 = diff[i][3] - diff[i][7];
Ipp32s s0 = t0 + t2;
Ipp32s s2 = t0 - t2;
Ipp32s s1 = t1 + t3;
Ipp32s s3 = t1 - t3;
Ipp32s s4 = t4 + t6;
Ipp32s s6 = t4 - t6;
Ipp32s s5 = t5 + t7;
Ipp32s s7 = t5 - t7;
diff[i][0] = s0 + s1;
diff[i][1] = s0 - s1;
diff[i][2] = s2 + s3;
diff[i][3] = s2 - s3;
diff[i][4] = s4 + s5;
diff[i][5] = s4 - s5;
diff[i][6] = s6 + s7;
diff[i][7] = s6 - s7;
}
_p_0 = _mm_load_si128((__m128i*)(diff[0]));
_p_4 = _mm_sub_epi16(_p_0, *(__m128i*)(diff[4]));
_p_0 = _mm_add_epi16(_p_0, *(__m128i*)(diff[4]));
_p_1 = _mm_load_si128((__m128i*)(diff[1]));
_p_5 = _mm_sub_epi16(_p_1, *(__m128i*)(diff[5]));
_p_1 = _mm_add_epi16(_p_1, *(__m128i*)(diff[5]));
_p_2 = _mm_load_si128((__m128i*)(diff[2]));
_p_6 = _mm_sub_epi16(_p_2, *(__m128i*)(diff[6]));
_p_2 = _mm_add_epi16(_p_2, *(__m128i*)(diff[6]));
_p_3 = _mm_load_si128((__m128i*)(diff[3]));
_p_7 = _mm_sub_epi16(_p_3, *(__m128i*)(diff[7]));
_p_3 = _mm_add_epi16(_p_3, *(__m128i*)(diff[7]));
_b_2 = _mm_sub_epi16(_p_0, _p_2);
_p_0 = _mm_add_epi16(_p_0, _p_2);
_b_3 = _mm_sub_epi16(_p_1, _p_3);
_p_1 = _mm_add_epi16(_p_1, _p_3);
_b_6 = _mm_sub_epi16(_p_4, _p_6);
_p_4 = _mm_add_epi16(_p_4, _p_6);
_b_7 = _mm_sub_epi16(_p_5, _p_7);
_p_5 = _mm_add_epi16(_p_5, _p_7);
_p_s = _mm_sub_epi16(_p_0, _p_1);
_p_t = _mm_srai_epi16(_p_s, 15);
_p_s = _mm_xor_si128(_p_s, _p_t);
_p_s = _mm_sub_epi16(_p_s, _p_t);
_p_0 = _mm_add_epi16(_p_0, _p_1);
_p_t = _mm_srai_epi16(_p_0, 15);
_p_0 = _mm_xor_si128(_p_0, _p_t);
_p_0 = _mm_sub_epi16(_p_0, _p_t);
_p_0 = _mm_add_epi16(_p_0, _p_s);
_p_s = _mm_sub_epi16(_b_2, _b_3);
_p_t = _mm_srai_epi16(_p_s, 15);
_p_s = _mm_xor_si128(_p_s, _p_t);
_p_s = _mm_sub_epi16(_p_s, _p_t);
_p_0 = _mm_add_epi16(_p_0, _p_s);
_b_2 = _mm_add_epi16(_b_2, _b_3);
_p_t = _mm_srai_epi16(_b_2, 15);
_b_2 = _mm_xor_si128(_b_2, _p_t);
_b_2 = _mm_sub_epi16(_b_2, _p_t);
_p_0 = _mm_add_epi16(_p_0, _b_2);
_p_s = _mm_sub_epi16(_p_4, _p_5);
_p_t = _mm_srai_epi16(_p_s, 15);
_p_s = _mm_xor_si128(_p_s, _p_t);
_p_s = _mm_sub_epi16(_p_s, _p_t);
_p_0 = _mm_add_epi16(_p_0, _p_s);
_p_4 = _mm_add_epi16(_p_4, _p_5);
_p_t = _mm_srai_epi16(_p_4, 15);
_p_4 = _mm_xor_si128(_p_4, _p_t);
_p_4 = _mm_sub_epi16(_p_4, _p_t);
_p_0 = _mm_add_epi16(_p_0, _p_4);
_p_s = _mm_sub_epi16(_b_6, _b_7);
_p_t = _mm_srai_epi16(_p_s, 15);
_p_s = _mm_xor_si128(_p_s, _p_t);
_p_s = _mm_sub_epi16(_p_s, _p_t);
_p_0 = _mm_add_epi16(_p_0, _p_s);
_b_6 = _mm_add_epi16(_b_6, _b_7);
_p_t = _mm_srai_epi16(_b_6, 15);
_b_6 = _mm_xor_si128(_b_6, _p_t);
_b_6 = _mm_sub_epi16(_b_6, _p_t);
_p_0 = _mm_add_epi16(_p_0, _b_6);
_p_t = _mm_srli_si128(_p_0, 8);
_p_0 = _mm_add_epi16(_p_0, _p_t);
_p_t = _mm_srli_si128(_p_0, 4);
_p_0 = _mm_add_epi16(_p_0, _p_t);
s = _mm_cvtsi128_si32(_p_0);
satd += (s >> 16) + (Ipp16s)s;
#endif
__ALIGN16 __m128i _p_0, _p_1, _p_2, _p_3, _p_4, _p_5, _p_6, _p_7, _b_2, _b_3, _b_6, _b_7, _p_t, _p_s;
Ipp32s s;
__ALIGN16 Ipp16s tmp[8][8];
_p_0 = _mm_load_si128((__m128i*)(diff[0]));
_p_4 = _mm_sub_epi16(_p_0, *(__m128i*)(diff[4]));
_p_0 = _mm_add_epi16(_p_0, *(__m128i*)(diff[4]));
_p_1 = _mm_load_si128((__m128i*)(diff[1]));
_p_5 = _mm_sub_epi16(_p_1, *(__m128i*)(diff[5]));
_p_1 = _mm_add_epi16(_p_1, *(__m128i*)(diff[5]));
_p_2 = _mm_load_si128((__m128i*)(diff[2]));
_p_6 = _mm_sub_epi16(_p_2, *(__m128i*)(diff[6]));
_p_2 = _mm_add_epi16(_p_2, *(__m128i*)(diff[6]));
_p_3 = _mm_load_si128((__m128i*)(diff[3]));
_p_7 = _mm_sub_epi16(_p_3, *(__m128i*)(diff[7]));
_p_3 = _mm_add_epi16(_p_3, *(__m128i*)(diff[7]));
_b_2 = _mm_sub_epi16(_p_0, _p_2);
_p_0 = _mm_add_epi16(_p_0, _p_2);
_b_3 = _mm_sub_epi16(_p_1, _p_3);
_p_1 = _mm_add_epi16(_p_1, _p_3);
_b_6 = _mm_sub_epi16(_p_4, _p_6);
_p_4 = _mm_add_epi16(_p_4, _p_6);
_b_7 = _mm_sub_epi16(_p_5, _p_7);
_p_5 = _mm_add_epi16(_p_5, _p_7);
_p_s = _mm_sub_epi16(_p_0, _p_1);
_p_0 = _mm_add_epi16(_p_0, _p_1);
_mm_store_si128((__m128i*)diff[1], _p_s);
_mm_store_si128((__m128i*)diff[0], _p_0);
_p_s = _mm_sub_epi16(_b_2, _b_3);
_b_2 = _mm_add_epi16(_b_2, _b_3);
_mm_store_si128((__m128i*)diff[3], _p_s);
_mm_store_si128((__m128i*)diff[2], _b_2);
_p_s = _mm_sub_epi16(_p_4, _p_5);
_p_4 = _mm_add_epi16(_p_4, _p_5);
_mm_store_si128((__m128i*)diff[5], _p_s);
_mm_store_si128((__m128i*)diff[4], _p_4);
_p_s = _mm_sub_epi16(_b_6, _b_7);
_b_6 = _mm_add_epi16(_b_6, _b_7);
_mm_store_si128((__m128i*)diff[7], _p_s);
_mm_store_si128((__m128i*)diff[6], _b_6);
_p_0 = _mm_loadl_epi64((__m128i*)(&diff[0][0]));
_p_1 = _mm_loadl_epi64((__m128i*)(&diff[1][0]));
_p_2 = _mm_loadl_epi64((__m128i*)(&diff[2][0]));
_p_3 = _mm_loadl_epi64((__m128i*)(&diff[3][0]));
_p_0 = _mm_unpacklo_epi16(_p_0, _p_1);
_p_2 = _mm_unpacklo_epi16(_p_2, _p_3);
_p_1 = _mm_unpackhi_epi32(_p_0, _p_2);
_p_0 = _mm_unpacklo_epi32(_p_0, _p_2);
_mm_storel_epi64((__m128i*)&tmp[0][0], _p_0);
_mm_storel_epi64((__m128i*)&tmp[2][0], _p_1);
_p_0 = _mm_srli_si128(_p_0, 8);
_p_1 = _mm_srli_si128(_p_1, 8);
_mm_storel_epi64((__m128i*)&tmp[1][0], _p_0);
_mm_storel_epi64((__m128i*)&tmp[3][0], _p_1);
_p_0 = _mm_loadl_epi64((__m128i*)(&diff[0][4]));
_p_1 = _mm_loadl_epi64((__m128i*)(&diff[1][4]));
_p_2 = _mm_loadl_epi64((__m128i*)(&diff[2][4]));
_p_3 = _mm_loadl_epi64((__m128i*)(&diff[3][4]));
_p_0 = _mm_unpacklo_epi16(_p_0, _p_1);
_p_2 = _mm_unpacklo_epi16(_p_2, _p_3);
_p_1 = _mm_unpackhi_epi32(_p_0, _p_2);
_p_0 = _mm_unpacklo_epi32(_p_0, _p_2);
_mm_storel_epi64((__m128i*)&tmp[4][0], _p_0);
_mm_storel_epi64((__m128i*)&tmp[6][0], _p_1);
_p_0 = _mm_srli_si128(_p_0, 8);
_p_1 = _mm_srli_si128(_p_1, 8);
_mm_storel_epi64((__m128i*)&tmp[5][0], _p_0);
_mm_storel_epi64((__m128i*)&tmp[7][0], _p_1);
_p_0 = _mm_loadl_epi64((__m128i*)(&diff[4][0]));
_p_1 = _mm_loadl_epi64((__m128i*)(&diff[5][0]));
_p_2 = _mm_loadl_epi64((__m128i*)(&diff[6][0]));
_p_3 = _mm_loadl_epi64((__m128i*)(&diff[7][0]));
_p_0 = _mm_unpacklo_epi16(_p_0, _p_1);
_p_2 = _mm_unpacklo_epi16(_p_2, _p_3);
_p_1 = _mm_unpackhi_epi32(_p_0, _p_2);
_p_0 = _mm_unpacklo_epi32(_p_0, _p_2);
_mm_storel_epi64((__m128i*)&tmp[0][4], _p_0);
_mm_storel_epi64((__m128i*)&tmp[2][4], _p_1);
_p_0 = _mm_srli_si128(_p_0, 8);
_p_1 = _mm_srli_si128(_p_1, 8);
_mm_storel_epi64((__m128i*)&tmp[1][4], _p_0);
_mm_storel_epi64((__m128i*)&tmp[3][4], _p_1);
_p_0 = _mm_loadl_epi64((__m128i*)(&diff[4][4]));
_p_1 = _mm_loadl_epi64((__m128i*)(&diff[5][4]));
_p_2 = _mm_loadl_epi64((__m128i*)(&diff[6][4]));
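/*
 * [Residue from the web code viewer; not part of umc_h264_sad.cpp.
 *  The source listing is truncated above this point.]
 *
 * Keyboard shortcuts
 *   Copy code            Ctrl + C
 *   Search code          Ctrl + F
 *   Full-screen mode     F11
 *   Toggle theme         Ctrl + Shift + D
 *   Show shortcuts       ?
 *   Increase font size   Ctrl + =
 *   Decrease font size   Ctrl + -
 */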