⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 nic_postprocess.cpp

📁 从FFMPEG转换而来的H264解码程序,VC下编译..
💻 CPP
📖 第 1 页 / 共 5 页
字号:
#include "stdafx.h"
#include "nic_postProcess.h"
#include "postproc/postprocFilters.h"
#include "simd.h"
#include "Tconfig.h"

/* MSVC: save the current warning state, then silence two diagnostics.
 *   C4309 - truncation of constant value
 *   C4799 - function has no EMMS instruction
 * NOTE(review): the matching "#pragma warning(pop)" is not visible in this
 * part of the file - confirm it exists further down. */
#pragma warning(push)
#pragma warning(disable:4309)
#pragma warning(disable:4799)

/******************* general, useful macros ****************/

/* Build-time switches. All three are commented out here, i.e. disabled;
 * uncomment one to enable the corresponding group of macros below. */
//#define DEBUGMODE
//#define SELFCHECK
//#define PREFETCH

/* PREFETCH: defines the look-ahead distances and PREFETCH_ENABLE, which
 * guards the prefetch instruction in deblock_horiz_lpf9. */
#ifdef PREFETCH
#define PREFETCH_AHEAD_V 8
#define PREFETCH_AHEAD_H 8
#define PREFETCH_ENABLE
#endif

/* SELFCHECK: pulls in the debug header and defines PP_SELF_CHECK, which
 * enables the scalar reference computations that are compared against the
 * MMX results (see deblock_horiz_useDC / deblock_horiz_lpf9).
 * SELF_CHECK and SIGN are not referenced in the visible part of this file. */
#ifdef SELFCHECK
#include "ffdebug.h"
#define PP_SELF_CHECK
#define SELF_CHECK
#define SIGN(a)    ( (a)<0 ? -1 : 1 )
#endif

/* DEBUGMODE: defines the SHOW*DECISIONS* flags.
 * They are not referenced in the visible part of this file. */
#ifdef DEBUGMODE
#define SHOWDECISIONS_H
#define SHOW_DECISIONS
#define SHOWDECISIONS_V
#endif

/*
 * Decide DC mode or default mode for the horizontal filter.
 *
 * For each of four rows (v, v+stride, v+2*stride, v+3*stride) the eight
 * bytes starting at v[1] are loaded and every adjacent pair
 * (v[j], v[j+1]), j = 1..7, is tested for |difference| <= 1.  The number of
 * such "nearly equal" pairs over all four rows (0..28) is eq_cnt; the scalar
 * loop under PP_SELF_CHECK computes the same count as a cross-check.
 *
 * v                        pointer to the first row; bytes v[1]..v[8] of
 *                          each of the four rows are read
 * stride                   distance between rows
 * DEBLOCK_HORIZ_USEDC_THR  threshold for the pair count
 *
 * Returns non-zero (use DC mode) when eq_cnt >= DEBLOCK_HORIZ_USEDC_THR,
 * zero otherwise.
 *
 * The movq/psubusb/por/... calls are macros from simd.h; the comments below
 * assume they behave like the MMX instructions of the same name - confirm
 * against simd.h.
 *
 * NOTE(review): pmm1 is an __m64*, so "pmm1 += stride/8" advances by exactly
 * one row only when stride is a multiple of 8 - confirm all callers
 * guarantee that.
 */
static inline int deblock_horiz_useDC(uint8_t *v, stride_t stride, int DEBLOCK_HORIZ_USEDC_THR)
{
 /* 0xfe clears bit 0 of each absolute difference, so differences of 0 and 1
    both become 0; the top byte is masked to 0 entirely because it has no
    right-hand neighbour inside the 8-byte load. */
 __m64 mm64_mask = _mm_set_pi8(0,-2,-2,-2,-2,-2,-2,-2);// (0x00,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe);
 int32_t mm32_result;
 __m64 *pmm1;
 int eq_cnt, useDC;

 #ifdef PP_SELF_CHECK
 int eq_cnt2, j, k;
 #endif

 pmm1 = (__m64*)(&(v[1])); /* this is a 32-bit aligned pointer, not 64-aligned */

 //        mov eax, pmm1

         /* first load some constants into mm4, mm5, mm6, mm7 */
 __m64 mm6=mm64_mask;         //        movq mm6, mm64_mask          /*mm6 = 0x00fefefefefefefe       */
 __m64 mm4=_mm_setzero_si64();//        pxor mm4, mm4                /*mm4 = 0x0000000000000000       */

 /* ---- row 0: load, then |x - (x>>8)| via two saturating subtractions ---- */
 __m64 mm1=*pmm1; //       movq mm1, qword ptr [eax]    /* mm1 = *pmm            0 1 2 3 4 5 6 7    */
 pmm1+=stride/8;  //        add eax, stride              /* eax += stride/8      0 1 2 3 4 5 6 7    */

 __m64 mm5=mm1;//        movq mm5, mm1                /* mm5 = mm1             0 1 2 3 4 5 6 7    */
 mm1=_mm_srli_si64(mm1,8);//        psrlq mm1, 8                 /* mm1 >>= 8             0 1 2 3 4 5 6 7    */

 __m64 mm2=mm5;//        movq mm2, mm5                /* mm2 = mm5             0 1 2 3 4 5 6 7    */
 mm5=_mm_subs_pu8(mm5,mm1);//        psubusb mm5, mm1             /* mm5 -= mm1            0 1 2 3 4 5 6 7    */

 /* ---- row 1 is loaded here, interleaved with the rest of row 0 ---- */
 __m64 mm3;
         movq(mm3, *pmm1);    /* mm3 = *pmm            0 1 2 3 4 5 6 7    */
         psubusb(mm1, mm2);             /* mm1 -= mm2            0 1 2 3 4 5 6 7    */

         pmm1+=stride/8;//add eax, stride              /* eax += stride/8      0 1 2 3 4 5 6 7    */
         por(mm5, mm1);               /* mm5 |= mm1            0 1 2 3 4 5 6 7    */
 __m64 mm0;
         movq( mm0, mm3);              /* mm0 = mm3             0 1 2 3 4 5 6 7    */
         pand( mm5, mm6);              /* mm5 &= 0x00fefefefefefefe     */
 __m64 mm7;
         pxor( mm7, mm7);                /*mm7 = 0x0000000000000000       */
         pcmpeqb( mm5, mm4);             /* are the bytes of mm5 == 0 ?   */

         /* ---- row 2 is loaded here ---- */
         movq( mm1, *pmm1);    /* mm1 = *pmm            0 1 2 3 4 5 6 7    */
         /* matching bytes are 0xff (-1), so subtracting them adds 1 per match */
         psubb( mm7, mm5);               /* mm7 has running total of eqcnts */

         psrlq (mm3, 8);                 /* mm3 >>= 8             0 1 2 3 4 5 6 7    */
         movq( mm5, mm0);                /* mm5 = mm0             0 1 2 3 4 5 6 7    */

         psubusb( mm0, mm3);             /* mm0 -= mm3            0 1 2 3 4 5 6 7    */

         pmm1+=stride/8;//add eax, stride              /* eax += stride/8      0 1 2 3 4 5 6 7    */
         psubusb (mm3, mm5);             /* mm3 -= mm5            0 1 2 3 4 5 6 7    */

         /* ---- row 3 is loaded here ---- */
         movq (mm5, *pmm1);//    /* mm5 = *pmm            0 1 2 3 4 5 6 7    */
         por (mm0, mm3);                 /* mm0 |= mm3            0 1 2 3 4 5 6 7    */

         movq( mm3, mm1);                /* mm3 = mm1             0 1 2 3 4 5 6 7    */
         pand( mm0, mm6);                /* mm0 &= 0x00fefefefefefefe     */

         psrlq(   mm1, 8);               /* mm1 >>= 8             0 1 2 3 4 5 6 7    */
         pcmpeqb( mm0, mm4);             /* are the bytes of mm0 == 0 ?   */

         movq( mm2, mm3);                /* mm2 = mm3             0 1 2 3 4 5 6 7    */
         psubb( mm7, mm0);               /* mm7 has running total of eqcnts */

         psubusb( mm3, mm1);             /* mm3 -= mm1            0 1 2 3 4 5 6 7    */

         psubusb( mm1, mm2);             /* mm1 -= mm2            0 1 2 3 4 5 6 7    */

         por( mm3, mm1);                 /* mm3 |= mm1            0 1 2 3 4 5 6 7    */
         movq( mm1, mm5);                /* mm1 = mm5             0 1 2 3 4 5 6 7    */

         pand(    mm3, mm6);             /* mm3 &= 0x00fefefefefefefe     */
         psrlq(   mm5, 8);               /* mm5 >>= 8             0 1 2 3 4 5 6 7    */

         pcmpeqb( mm3, mm4);             /* are the bytes of mm3 == 0 ?   */
         movq   ( mm0, mm1);             /* mm0 = mm1             0 1 2 3 4 5 6 7    */

         psubb  ( mm7, mm3);             /* mm7 has running total of eqcnts */
         psubusb( mm1, mm5);             /* mm1 -= mm5            0 1 2 3 4 5 6 7    */

         psubusb( mm5, mm0);             /* mm5 -= mm0            0 1 2 3 4 5 6 7    */
         por    ( mm1, mm5);             /* mm1 |= mm5            0 1 2 3 4 5 6 7    */

         pand   ( mm1, mm6);             /* mm1 &= 0x00fefefefefefefe     */

         pcmpeqb( mm1, mm4);             /* are the bytes of mm1 == 0 ?   */

         psubb  ( mm7, mm1);             /* mm7 has running total of eqcnts */

         /* ---- horizontal add: fold the per-byte counts of mm7 into byte 0.
            The shift-left by 8 drops byte 7 (whose mask is 0, so it always
            "matches"), leaving the sum of bytes 0..6 in the low byte. ---- */
         movq    (mm1, mm7);             /* mm1 = mm7             0 1w2 3 4 5 6 7r   */
         psllq   (mm7, 8);               /* mm7 <<= 8             0 1 2 3 4 5 6 7m   */

         psrlq   (mm1, 24);              /* mm1 >>= 24            0 1 2 3 4 5 6 7m   */

         paddb   (mm7, mm1);             /* mm7 has running total of eqcnts */

         movq (mm1, mm7);                /* mm1 = mm7             0 1w2 3 4 5 6 7r   */
         psrlq (mm7, 16);                /* mm7 >>= 16            0 1 2 3 4 5 6 7m   */

         paddb   (mm7, mm1);             /* mm7 has running total of eqcnts */

         movq (mm1, mm7);                /* mm1 = mm7             0 1w2 3 4 5 6 7r   */
         psrlq (  mm7, 8);               /* mm7 >>= 8             0 1 2 3 4 5 6 7m   */

         paddb (mm7, mm1);               /* mm7 has running total of eqcnts */

         movd (mm32_result, mm7);

         //pop eax

 /* only the low byte holds the total; the upper bytes are partial sums */
 eq_cnt = mm32_result & 0xff;

 #ifdef PP_SELF_CHECK
 /* scalar reference: same count computed pair by pair over the four rows */
 eq_cnt2 = 0;
 for (k=0; k<4; k++)
 {
         for (j=1; j<=7; j++)
         {
                 if (abs(v[j+k*stride]-v[1+j+k*stride]) <= 1) eq_cnt2++;
         }
 }
 if (eq_cnt2 != eq_cnt)
         DPRINTF(_l("ERROR: MMX version of useDC is incorrect"));
 #endif

 useDC = eq_cnt >= DEBLOCK_HORIZ_USEDC_THR;

 return useDC;
}

/*
 * Decide, according to QP, whether the DC filter should be turned on.
 *
 * Four consecutive rows are examined, starting at v and spaced stride
 * apart.  In every row five sample pairs are compared:
 * (0,5), (1,8), (1,4), (2,7) and (3,6).
 *
 * v       pointer to the first row; bytes [0]..[8] of each row may be read
 * stride  distance between rows
 * QP      quantiser value; the acceptance limit is 2*QP
 *
 * Returns false as soon as any pair differs by 2*QP or more, true when all
 * pairs in all four rows differ by less than 2*QP.
 */
static inline int deblock_horiz_DC_on(uint8_t *v, stride_t stride, int QP)
{
 /* Original author's note: 99% of the time this test turns out the same as
    the |max-min| strategy in the standard.  An earlier, single-pair form of
    the test was: abs(v[1]-v[8]) < 2*QP. */
 const int limit = 2*QP;
 uint8_t *row = v;

 for (int rowsLeft = 4; rowsLeft > 0; --rowsLeft, row += stride)
  {
   /* pairs are evaluated left to right; the first failing pair rejects */
   if (abs(row[0]-row[5]) >= limit ||
       abs(row[1]-row[8]) >= limit ||
       abs(row[1]-row[4]) >= limit ||
       abs(row[2]-row[7]) >= limit ||
       abs(row[3]-row[6]) >= limit)
    return false;
  }

 return true;
}

/* The 9-tap low pass filter used in "DC" regions */
/* I'm not sure that I like this implementation any more...! */
static inline void deblock_horiz_lpf9(uint8_t *v, stride_t stride, int QP)
{
        int y, p1, p2;

        #ifdef PP_SELF_CHECK
        uint8_t selfcheck[9];
        int psum;
        uint8_t *vv;
        int i;
        #endif
        uint8_t *pmm1;
        uint32_t mm32_p1p2;
        static const uint64_t mm64_coefs[18] =
         {
          0x0001000200040006LL, /* p1 left */ 0x0000000000000001LL, /* v1 right */
          0x0001000200020004LL, /* v1 left */ 0x0000000000010001LL, /* v2 right */
          0x0002000200040002LL, /* v2 left */ 0x0000000100010002LL, /* v3 right */
          0x0002000400020002LL, /* v3 left */ 0x0001000100020002LL, /* v4 right */
          0x0004000200020001LL, /* v4 left */ 0x0001000200020004LL, /* v5 right */
          0x0002000200010001LL, /* v5 left */ 0x0002000200040002LL, /* v6 right */
          0x0002000100010000LL, /* v6 left */ 0x0002000400020002LL, /* v7 right */
          0x0001000100000000LL, /* v7 left */ 0x0004000200020001LL, /* v8 right */
          0x0001000000000000LL, /* v8 left */ 0x0006000400020001LL  /* p2 right */
         };
        __m64 mm64_0008 = _mm_set1_pi16(0x0008);
        __m64 mm0,mm2,mm7,mm6,mm5,mm1,mm3,mm4;
        __m64 mm64_temp;
        for (y=0; y<4; y++)
        {
                p1 = (abs(v[0+y*stride]-v[1+y*stride]) < QP ) ?  v[0+y*stride] : v[1+y*stride];
                p2 = (abs(v[8+y*stride]-v[9+y*stride]) < QP ) ?  v[9+y*stride] : v[8+y*stride];

                mm32_p1p2 = 0x0101 * ((p2 << 16) + p1);

                #ifdef PP_SELF_CHECK
                /* generate a self-check version of the filter result in selfcheck[9] */
                /* low pass filtering (LPF9: 1 1 2 2 4 2 2 1 1) */
                vv = &(v[y*stride]);
                psum = p1 + p1 + p1 + vv[1] + vv[2] + vv[3] + vv[4] + 4;
                selfcheck[1] = (((psum + vv[1]) << 1) - (vv[4] - vv[5])) >> 4;
                psum += vv[5] - p1;
                selfcheck[2] = (((psum + vv[2]) << 1) - (vv[5] - vv[6])) >> 4;
                psum += vv[6] - p1;
                selfcheck[3] = (((psum + vv[3]) << 1) - (vv[6] - vv[7])) >> 4;
                psum += vv[7] - p1;
                selfcheck[4] = (((psum + vv[4]) << 1) + p1 - vv[1] - (vv[7] - vv[8])) >> 4;
                psum += vv[8] - vv[1];
                selfcheck[5] = (((psum + vv[5]) << 1) + (vv[1] - vv[2]) - vv[8] + p2) >> 4;
                psum += p2 - vv[2];
                selfcheck[6] = (((psum + vv[6]) << 1) + (vv[2] - vv[3])) >> 4;
                psum += p2 - vv[3];
                selfcheck[7] = (((psum + vv[7]) << 1) + (vv[3] - vv[4])) >> 4;
                psum += p2 - vv[4];
                selfcheck[8] = (((psum + vv[8]) << 1) + (vv[4] - vv[5])) >> 4;
                #endif

                pmm1 = (&(v[y*stride-3])); /* this is 64-aligned */

                /* mm7 = 0, mm6 is left hand accumulator, mm5 is right hand acc */
                        unsigned char *eax=(unsigned char*)pmm1;
                        unsigned char *ebx=(unsigned char*)&mm64_coefs[0];

                        #ifdef PREFETCH_ENABLE
                        prefetcht0 32[ebx]
                        #endif

                        movd   (mm0,   mm32_p1p2);            /* mm0 = ________p2p2p1p1    0w1 2 3 4 5 6 7    */
                        punpcklbw (mm0, mm0);                 /* mm0 = p2p2p2p2p1p1p1p1    0m1 2 3 4 5 6 7    */

                        movq    (mm2, eax);                   /* mm2 = v4v3v2v1xxxxxxxx    0 1 2w3 4 5 6 7    */
                        pxor    (mm7, mm7);                   /* mm7 = 0000000000000000    0 1 2 3 4 5 6 7w   */

                        movq     (mm6, mm64_0008);            /* mm6 = 0008000800080008    0 1 2 3 4 5 6w7    */
                        punpckhbw (mm2, mm2);                 /* mm2 = v4__v3__v2__v1__    0 1 2m3 4 5 6 7    */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -