⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 umc_h264_sad.cpp

📁 audio-video-codecs.rar语音编解码器
💻 CPP
📖 第 1 页 / 共 2 页
字号:
//
//               INTEL CORPORATION PROPRIETARY INFORMATION
//  This software is supplied under the terms of a license agreement or
//  nondisclosure agreement with Intel Corporation and may not be copied
//  or disclosed except in accordance with the terms of that agreement.
//        Copyright (c) 2004 - 2007 Intel Corporation. All Rights Reserved.
//

#include <string.h>
#include "umc_h264_video_encoder.h"
#include "umc_h264_tables.h"
#include "umc_h264_to_ipp.h"
#include "umc_h264_bme.h"
#include "ippvc.h"

namespace UMC_H264_ENCODER
{

/* H264_SATD_OPT selects the hand-written SSE2 intrinsics path in the SATD
 * routines below. It is enabled in three stages:
 *   1) turn it on for 32-bit Windows builds only,              */
#if defined (WIN32) || defined (_WIN32)
#define H264_SATD_OPT
#endif

/*   2) force it back off on 64-bit Windows (where WIN32/_WIN32 may also be
 *      defined by the toolchain, so an explicit #undef is required),        */
#if defined (WIN64) || defined (_WIN64)
#undef H264_SATD_OPT
#endif

/*   3) keep it only when the compiler provides SSE2 intrinsics: the Intel
 *      compiler, or MSVC 7.0+ (_MSC_VER >= 1300). On any other compiler
 *      _MSC_VER is undefined (treated as 0 by #if), so the flag is dropped
 *      and the portable scalar fallback is used instead.                    */
#ifdef H264_SATD_OPT
    #if defined(__INTEL_COMPILER) || (_MSC_VER >= 1300)
        #include "emmintrin.h"
    #else
        #undef H264_SATD_OPT
    #endif
#endif

Ipp32u SAT8x8D(const Ipp8u *pSrc1, Ipp32s src1Step, const Ipp8u *pSrc2, Ipp32s src2Step)
{
    __ALIGN16 Ipp16s diff[8][8];
    Ipp32u satd = 0;

    ippiSub8x8_8u16s_C1R(pSrc1, src1Step, pSrc2, src2Step, &diff[0][0], 16);
#ifndef H264_SATD_OPT
    Ipp32s i;
    for (i = 0; i < 8; i++) {
        Ipp32s t0 = diff[i][0] + diff[i][4];
        Ipp32s t4 = diff[i][0] - diff[i][4];
        Ipp32s t1 = diff[i][1] + diff[i][5];
        Ipp32s t5 = diff[i][1] - diff[i][5];
        Ipp32s t2 = diff[i][2] + diff[i][6];
        Ipp32s t6 = diff[i][2] - diff[i][6];
        Ipp32s t3 = diff[i][3] + diff[i][7];
        Ipp32s t7 = diff[i][3] - diff[i][7];
        Ipp32s s0 = t0 + t2;
        Ipp32s s2 = t0 - t2;
        Ipp32s s1 = t1 + t3;
        Ipp32s s3 = t1 - t3;
        Ipp32s s4 = t4 + t6;
        Ipp32s s6 = t4 - t6;
        Ipp32s s5 = t5 + t7;
        Ipp32s s7 = t5 - t7;
        diff[i][0] = s0 + s1;
        diff[i][1] = s0 - s1;
        diff[i][2] = s2 + s3;
        diff[i][3] = s2 - s3;
        diff[i][4] = s4 + s5;
        diff[i][5] = s4 - s5;
        diff[i][6] = s6 + s7;
        diff[i][7] = s6 - s7;
    }
    for (i = 0; i < 8; i++) {
        Ipp32s t0 = diff[0][i] + diff[4][i];
        Ipp32s t4 = diff[0][i] - diff[4][i];
        Ipp32s t1 = diff[1][i] + diff[5][i];
        Ipp32s t5 = diff[1][i] - diff[5][i];
        Ipp32s t2 = diff[2][i] + diff[6][i];
        Ipp32s t6 = diff[2][i] - diff[6][i];
        Ipp32s t3 = diff[3][i] + diff[7][i];
        Ipp32s t7 = diff[3][i] - diff[7][i];
        Ipp32s s0 = t0 + t2;
        Ipp32s s2 = t0 - t2;
        Ipp32s s1 = t1 + t3;
        Ipp32s s3 = t1 - t3;
        Ipp32s s4 = t4 + t6;
        Ipp32s s6 = t4 - t6;
        Ipp32s s5 = t5 + t7;
        Ipp32s s7 = t5 - t7;
        satd += ABS(s0 + s1);
        satd += ABS(s0 - s1);
        satd += ABS(s2 + s3);
        satd += ABS(s2 - s3);
        satd += ABS(s4 + s5);
        satd += ABS(s4 - s5);
        satd += ABS(s6 + s7);
        satd += ABS(s6 - s7);
    }
#else
#if 0
    __ALIGN16 __m128i  _p_0, _p_1, _p_2, _p_3, _p_4, _p_5, _p_6, _p_7, _b_2, _b_3, _b_6, _b_7, _p_t, _p_s;
    Ipp32s  s;
/*
    for (i = 0; i < 8; i++) {
        _p_0 = _mm_loadl_epi64((__m128i*)(&diff[i][0]));
        _p_2 = _mm_loadl_epi64((__m128i*)(&diff[i][4]));
        _p_1 = _mm_sub_epi16(_p_0, _p_2);   //  0,  0,  0,  0, a7, a6, a5, a4
        _p_0 = _mm_add_epi16(_p_0, _p_2);   //  0,  0,  0,  0, a3, a2, a1, a0
        _p_5 = _mm_srli_si128(_p_1, 4);     //  0,  0,  0,  0,  0,  0, a7, a6
        _p_4 = _mm_srli_si128(_p_0, 4);     //  0,  0,  0,  0,  0,  0, a3, a2
        _p_3 = _mm_sub_epi16(_p_1, _p_5);   //  0,  0,  0,  0, xx, xx, b7, b6
        _p_1 = _mm_add_epi16(_p_1, _p_5);   //  0,  0,  0,  0, xx, xx, b5, b4
        _p_2 = _mm_sub_epi16(_p_0, _p_4);   //  0,  0,  0,  0, xx, xx, b3, b2
        _p_0 = _mm_add_epi16(_p_0, _p_4);   //  0,  0,  0,  0, xx, xx, b1, b0
        _p_0 = _mm_unpacklo_epi16(_p_0, _p_2);   // xx, xx, xx, xx, b3, b1, b2, b0
        _p_1 = _mm_unpacklo_epi16(_p_1, _p_3);   // xx, xx, xx, xx, b7, b5, b6, b4
        _p_0 = _mm_unpacklo_epi32(_p_0, _p_1);   // b7, b5, b3, b1, b6, b4, b2, b0
        _p_2 = _mm_srli_si128(_p_0, 8);          //  0,  0,  0,  0, b7, b5, b3, b1
        _p_1 = _mm_sub_epi16(_p_0, _p_2);
        _p_0 = _mm_add_epi16(_p_0, _p_2);
        _p_0 = _mm_unpacklo_epi16(_p_0, _p_1);
        _mm_store_si128((__m128i*)diff[i], _p_0);
    }
*/
    for (i = 0; i < 8; i++) {
        Ipp32s t0 = diff[i][0] + diff[i][4];
        Ipp32s t4 = diff[i][0] - diff[i][4];
        Ipp32s t1 = diff[i][1] + diff[i][5];
        Ipp32s t5 = diff[i][1] - diff[i][5];
        Ipp32s t2 = diff[i][2] + diff[i][6];
        Ipp32s t6 = diff[i][2] - diff[i][6];
        Ipp32s t3 = diff[i][3] + diff[i][7];
        Ipp32s t7 = diff[i][3] - diff[i][7];
        Ipp32s s0 = t0 + t2;
        Ipp32s s2 = t0 - t2;
        Ipp32s s1 = t1 + t3;
        Ipp32s s3 = t1 - t3;
        Ipp32s s4 = t4 + t6;
        Ipp32s s6 = t4 - t6;
        Ipp32s s5 = t5 + t7;
        Ipp32s s7 = t5 - t7;
        diff[i][0] = s0 + s1;
        diff[i][1] = s0 - s1;
        diff[i][2] = s2 + s3;
        diff[i][3] = s2 - s3;
        diff[i][4] = s4 + s5;
        diff[i][5] = s4 - s5;
        diff[i][6] = s6 + s7;
        diff[i][7] = s6 - s7;
    }
    _p_0 = _mm_load_si128((__m128i*)(diff[0]));
    _p_4 = _mm_sub_epi16(_p_0, *(__m128i*)(diff[4]));
    _p_0 = _mm_add_epi16(_p_0, *(__m128i*)(diff[4]));
    _p_1 = _mm_load_si128((__m128i*)(diff[1]));
    _p_5 = _mm_sub_epi16(_p_1, *(__m128i*)(diff[5]));
    _p_1 = _mm_add_epi16(_p_1, *(__m128i*)(diff[5]));
    _p_2 = _mm_load_si128((__m128i*)(diff[2]));
    _p_6 = _mm_sub_epi16(_p_2, *(__m128i*)(diff[6]));
    _p_2 = _mm_add_epi16(_p_2, *(__m128i*)(diff[6]));
    _p_3 = _mm_load_si128((__m128i*)(diff[3]));
    _p_7 = _mm_sub_epi16(_p_3, *(__m128i*)(diff[7]));
    _p_3 = _mm_add_epi16(_p_3, *(__m128i*)(diff[7]));

    _b_2 = _mm_sub_epi16(_p_0, _p_2);
    _p_0 = _mm_add_epi16(_p_0, _p_2);
    _b_3 = _mm_sub_epi16(_p_1, _p_3);
    _p_1 = _mm_add_epi16(_p_1, _p_3);
    _b_6 = _mm_sub_epi16(_p_4, _p_6);
    _p_4 = _mm_add_epi16(_p_4, _p_6);
    _b_7 = _mm_sub_epi16(_p_5, _p_7);
    _p_5 = _mm_add_epi16(_p_5, _p_7);

    _p_s = _mm_sub_epi16(_p_0, _p_1);
    _p_t = _mm_srai_epi16(_p_s, 15);
    _p_s = _mm_xor_si128(_p_s, _p_t);
    _p_s = _mm_sub_epi16(_p_s, _p_t);
    _p_0 = _mm_add_epi16(_p_0, _p_1);
    _p_t = _mm_srai_epi16(_p_0, 15);
    _p_0 = _mm_xor_si128(_p_0, _p_t);
    _p_0 = _mm_sub_epi16(_p_0, _p_t);
    _p_0 = _mm_add_epi16(_p_0, _p_s);

    _p_s = _mm_sub_epi16(_b_2, _b_3);
    _p_t = _mm_srai_epi16(_p_s, 15);
    _p_s = _mm_xor_si128(_p_s, _p_t);
    _p_s = _mm_sub_epi16(_p_s, _p_t);
    _p_0 = _mm_add_epi16(_p_0, _p_s);
    _b_2 = _mm_add_epi16(_b_2, _b_3);
    _p_t = _mm_srai_epi16(_b_2, 15);
    _b_2 = _mm_xor_si128(_b_2, _p_t);
    _b_2 = _mm_sub_epi16(_b_2, _p_t);
    _p_0 = _mm_add_epi16(_p_0, _b_2);

    _p_s = _mm_sub_epi16(_p_4, _p_5);
    _p_t = _mm_srai_epi16(_p_s, 15);
    _p_s = _mm_xor_si128(_p_s, _p_t);
    _p_s = _mm_sub_epi16(_p_s, _p_t);
    _p_0 = _mm_add_epi16(_p_0, _p_s);
    _p_4 = _mm_add_epi16(_p_4, _p_5);
    _p_t = _mm_srai_epi16(_p_4, 15);
    _p_4 = _mm_xor_si128(_p_4, _p_t);
    _p_4 = _mm_sub_epi16(_p_4, _p_t);
    _p_0 = _mm_add_epi16(_p_0, _p_4);

    _p_s = _mm_sub_epi16(_b_6, _b_7);
    _p_t = _mm_srai_epi16(_p_s, 15);
    _p_s = _mm_xor_si128(_p_s, _p_t);
    _p_s = _mm_sub_epi16(_p_s, _p_t);
    _p_0 = _mm_add_epi16(_p_0, _p_s);
    _b_6 = _mm_add_epi16(_b_6, _b_7);
    _p_t = _mm_srai_epi16(_b_6, 15);
    _b_6 = _mm_xor_si128(_b_6, _p_t);
    _b_6 = _mm_sub_epi16(_b_6, _p_t);
    _p_0 = _mm_add_epi16(_p_0, _b_6);

    _p_t = _mm_srli_si128(_p_0, 8);
    _p_0 = _mm_add_epi16(_p_0, _p_t);
    _p_t = _mm_srli_si128(_p_0, 4);
    _p_0 = _mm_add_epi16(_p_0, _p_t);
    s = _mm_cvtsi128_si32(_p_0);
    satd += (s >> 16) + (Ipp16s)s;
#endif
    __ALIGN16 __m128i  _p_0, _p_1, _p_2, _p_3, _p_4, _p_5, _p_6, _p_7, _b_2, _b_3, _b_6, _b_7, _p_t, _p_s;
    Ipp32s  s;
    __ALIGN16 Ipp16s tmp[8][8];

    _p_0 = _mm_load_si128((__m128i*)(diff[0]));
    _p_4 = _mm_sub_epi16(_p_0, *(__m128i*)(diff[4]));
    _p_0 = _mm_add_epi16(_p_0, *(__m128i*)(diff[4]));
    _p_1 = _mm_load_si128((__m128i*)(diff[1]));
    _p_5 = _mm_sub_epi16(_p_1, *(__m128i*)(diff[5]));
    _p_1 = _mm_add_epi16(_p_1, *(__m128i*)(diff[5]));
    _p_2 = _mm_load_si128((__m128i*)(diff[2]));
    _p_6 = _mm_sub_epi16(_p_2, *(__m128i*)(diff[6]));
    _p_2 = _mm_add_epi16(_p_2, *(__m128i*)(diff[6]));
    _p_3 = _mm_load_si128((__m128i*)(diff[3]));
    _p_7 = _mm_sub_epi16(_p_3, *(__m128i*)(diff[7]));
    _p_3 = _mm_add_epi16(_p_3, *(__m128i*)(diff[7]));

    _b_2 = _mm_sub_epi16(_p_0, _p_2);
    _p_0 = _mm_add_epi16(_p_0, _p_2);
    _b_3 = _mm_sub_epi16(_p_1, _p_3);
    _p_1 = _mm_add_epi16(_p_1, _p_3);
    _b_6 = _mm_sub_epi16(_p_4, _p_6);
    _p_4 = _mm_add_epi16(_p_4, _p_6);
    _b_7 = _mm_sub_epi16(_p_5, _p_7);
    _p_5 = _mm_add_epi16(_p_5, _p_7);

    _p_s = _mm_sub_epi16(_p_0, _p_1);
    _p_0 = _mm_add_epi16(_p_0, _p_1);
    _mm_store_si128((__m128i*)diff[1], _p_s);
    _mm_store_si128((__m128i*)diff[0], _p_0);
    _p_s = _mm_sub_epi16(_b_2, _b_3);
    _b_2 = _mm_add_epi16(_b_2, _b_3);
    _mm_store_si128((__m128i*)diff[3], _p_s);
    _mm_store_si128((__m128i*)diff[2], _b_2);
    _p_s = _mm_sub_epi16(_p_4, _p_5);
    _p_4 = _mm_add_epi16(_p_4, _p_5);
    _mm_store_si128((__m128i*)diff[5], _p_s);
    _mm_store_si128((__m128i*)diff[4], _p_4);
    _p_s = _mm_sub_epi16(_b_6, _b_7);
    _b_6 = _mm_add_epi16(_b_6, _b_7);
    _mm_store_si128((__m128i*)diff[7], _p_s);
    _mm_store_si128((__m128i*)diff[6], _b_6);

    _p_0 = _mm_loadl_epi64((__m128i*)(&diff[0][0]));
    _p_1 = _mm_loadl_epi64((__m128i*)(&diff[1][0]));
    _p_2 = _mm_loadl_epi64((__m128i*)(&diff[2][0]));
    _p_3 = _mm_loadl_epi64((__m128i*)(&diff[3][0]));
    _p_0 = _mm_unpacklo_epi16(_p_0, _p_1);
    _p_2 = _mm_unpacklo_epi16(_p_2, _p_3);
    _p_1 = _mm_unpackhi_epi32(_p_0, _p_2);
    _p_0 = _mm_unpacklo_epi32(_p_0, _p_2);
    _mm_storel_epi64((__m128i*)&tmp[0][0], _p_0);
    _mm_storel_epi64((__m128i*)&tmp[2][0], _p_1);
    _p_0 = _mm_srli_si128(_p_0, 8);
    _p_1 = _mm_srli_si128(_p_1, 8);
    _mm_storel_epi64((__m128i*)&tmp[1][0], _p_0);
    _mm_storel_epi64((__m128i*)&tmp[3][0], _p_1);

    _p_0 = _mm_loadl_epi64((__m128i*)(&diff[0][4]));
    _p_1 = _mm_loadl_epi64((__m128i*)(&diff[1][4]));
    _p_2 = _mm_loadl_epi64((__m128i*)(&diff[2][4]));
    _p_3 = _mm_loadl_epi64((__m128i*)(&diff[3][4]));
    _p_0 = _mm_unpacklo_epi16(_p_0, _p_1);
    _p_2 = _mm_unpacklo_epi16(_p_2, _p_3);
    _p_1 = _mm_unpackhi_epi32(_p_0, _p_2);
    _p_0 = _mm_unpacklo_epi32(_p_0, _p_2);
    _mm_storel_epi64((__m128i*)&tmp[4][0], _p_0);
    _mm_storel_epi64((__m128i*)&tmp[6][0], _p_1);
    _p_0 = _mm_srli_si128(_p_0, 8);
    _p_1 = _mm_srli_si128(_p_1, 8);
    _mm_storel_epi64((__m128i*)&tmp[5][0], _p_0);
    _mm_storel_epi64((__m128i*)&tmp[7][0], _p_1);

    _p_0 = _mm_loadl_epi64((__m128i*)(&diff[4][0]));
    _p_1 = _mm_loadl_epi64((__m128i*)(&diff[5][0]));
    _p_2 = _mm_loadl_epi64((__m128i*)(&diff[6][0]));
    _p_3 = _mm_loadl_epi64((__m128i*)(&diff[7][0]));
    _p_0 = _mm_unpacklo_epi16(_p_0, _p_1);
    _p_2 = _mm_unpacklo_epi16(_p_2, _p_3);
    _p_1 = _mm_unpackhi_epi32(_p_0, _p_2);
    _p_0 = _mm_unpacklo_epi32(_p_0, _p_2);
    _mm_storel_epi64((__m128i*)&tmp[0][4], _p_0);
    _mm_storel_epi64((__m128i*)&tmp[2][4], _p_1);
    _p_0 = _mm_srli_si128(_p_0, 8);
    _p_1 = _mm_srli_si128(_p_1, 8);
    _mm_storel_epi64((__m128i*)&tmp[1][4], _p_0);
    _mm_storel_epi64((__m128i*)&tmp[3][4], _p_1);

    _p_0 = _mm_loadl_epi64((__m128i*)(&diff[4][4]));
    _p_1 = _mm_loadl_epi64((__m128i*)(&diff[5][4]));
    _p_2 = _mm_loadl_epi64((__m128i*)(&diff[6][4]));

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -