⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 loopf_asm.cpp

📁 从FFMPEG转换而来的H264解码程序,VC下编译..
💻 CPP
📖 第 1 页 / 共 2 页
字号:
//==========================================================================
//
//  THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY
//  KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
//  IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR
//  PURPOSE.
//
//  Copyright (c) 1999 - 2001  On2 Technologies Inc. All Rights Reserved.
//
//--------------------------------------------------------------------------


/****************************************************************************
*
*   Module Title :     loopf_asm.c
*
*   Description  :     Optimized version of the loop filter.
*
*
*****************************************************************************
*/

/****************************************************************************
*  Header Frames
*****************************************************************************
*/


#pragma warning (disable:4799)
#pragma warning (disable:4731)


#define STRICT              /* Strict type checking. */
#include <memory.h>

#include "ogg/ogg.h"
#include "codec_internal.h"
#include "inttypes.h"
#include "simd.h"

#define LIMIT_OFFSET        0
#define FOURONES_OFFSET     8
#define LFABS_OFFSET        16
#define TRANS_OFFSET        24

/****************************************************************************
*  Module constants.
*****************************************************************************
*/

/****************************************************************************
*  Explicit Imports
*****************************************************************************
*/

/****************************************************************************
*  Exported Global Variables
*****************************************************************************
*/

/****************************************************************************
*  Exported Functions
*****************************************************************************
*/

/****************************************************************************
*  Module Statics
*****************************************************************************
*/

/****************************************************************************
 *
 *  ROUTINE       :     SetupBoundingValueArray_ForMMX
 *
 *  INPUTS        :
 *
 *  OUTPUTS       :     None
 *
 *  RETURNS       :     None
 *
 *  FUNCTION      :     Applies a loop filter to the edge pixels of coded blocks.
 *
 *  SPECIAL NOTES :
 *
 *
 *  ERRORS        :     None.
 *
 ****************************************************************************/
extern "C" void SetupBoundingValueArray_ForMMX(PB_INSTANCE *pbi, ogg_int32_t FLimit)
{
    ogg_int32_t * BoundingValuePtr;


    //    Since the FiltBoundingValue array is currently only used in the generic version, we are going
    //    to reuse this memory for our own purposes.
    //    2 longs for limit, 2 longs for _4ONES, 2 longs for LFABS_MMX, and 8 longs for temp work storage

    BoundingValuePtr = (ogg_int32_t *)((ogg_uint32_t)(&pbi->FiltBoundingValue[256]) & 0xffffffe0);

    //expand for mmx code
    BoundingValuePtr[0] = BoundingValuePtr[1] = FLimit * 0x00010001;
    BoundingValuePtr[2] = BoundingValuePtr[3] = 0x00010001;
    BoundingValuePtr[4] = BoundingValuePtr[5] = 0x00040004;

    pbi->BoundingValuePtr=BoundingValuePtr;
}
/****************************************************************************
 *
 *  ROUTINE       :     FilterHoriz_MMX
 *
 *  INPUTS        :     None
 *
 *  OUTPUTS       :     None
 *
 *  RETURNS       :     None
 *
 *  FUNCTION      :     Applies a loop filter to the vertical edge horizontally
 *
 *  SPECIAL NOTES :
 *
 *
 *  ERRORS        :     None.
 *
 ****************************************************************************/
// this version attempts to fix the DC_misalign stalls
extern "C" void FilterHoriz_MMX(unsigned char * PixelPtr, ogg_int32_t LineLength, ogg_int32_t *BoundingValuePtr)
{
    ogg_int32_t ms = -LineLength;
    ogg_int32_t ms2 = ms + ms;

    /* A somewhat optimized MMX version of the left edge filter.*/
        unsigned char *eax=(unsigned char*)BoundingValuePtr;
        int         edx=LineLength;            //stride

        unsigned char *ebx=PixelPtr;
        int         ecx=LineLength;            //stride

        __m64 mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7;

        movd        (mm0,ebx + -2              );//xx xx xx xx 01 00 xx xx
    //-

        movd        (mm4,ebx + 2               );//xx xx xx xx xx xx 03 02
        psrld       (mm0,16                      );//xx xx xx xx 00 00 01 00

        movd        (mm1,ebx + ecx + -2        );//xx xx xx xx 11 10 xx xx
        punpcklwd   (mm0,mm4                     );//xx xx xx xx 03 02 01 00

        movd        (mm4,ebx + ecx + 2         );//xx xx xx xx xx xx 13 12
        psrld       (mm1,16                      );//xx xx xx xx 00 00 11 10

        punpcklwd   (mm1,mm4                     );//xx xx xx xx 13 12 11 10
        edx=edx*3           ;//stride * 3

        movd        (mm2,ebx + ecx*2 + -2      );//xx xx xx xx 21 20 xx xx
        punpcklbw   (mm0,mm1                     );//13 03 12 02 11 01 10 00

        movd        (mm4,ebx + ecx*2 + 2       );//xx xx xx xx xx xx 23 22
        psrld       (mm2,16                      );//xx xx xx xx 00 00 21 20

        movd        (mm1,ebx + edx + -2        );//xx xx xx xx 31 30 xx xx
        punpcklwd   (mm2,mm4                     );//xx xx xx xx 23 22 21 20

        movd        (mm4,ebx + edx + 2         );//xx xx xx xx xx xx 33 32
        psrld       (mm1,16                      );//xx xx xx xx 00 00 31 30

        punpcklwd   (mm1,mm4                     );//xx xx xx xx 33 32 31 30
        pxor        (mm4,mm4);

        punpcklbw   (mm2,mm1                     );//33 23 32 22 31 21 30 20
        movq        (mm1,mm0);

        punpcklwd   (mm0,mm2                     );//31 21 11 01 30 20 10 00
        ebx=ebx + ecx*4;//base + (stride * 4)

        punpckhwd   (mm1,mm2                     );//33 23 13 03 32 22 12 02
        movq        (mm6,mm0                     );//xx xx xx xx 30 20 10 00

        movq        (eax + TRANS_OFFSET + 0,mm0);
        movq        (mm2,mm1);

        movq        (eax + TRANS_OFFSET + 8,mm1);
        psrlq       (mm0,32                      );//xx xx xx xx 31 21 11 01

//-----------
        movd        (mm7,ebx + -2              );//xx xx xx xx 41 40 xx xx
        punpcklbw   (mm1,mm4                     );//convert to words

        movd        (mm4,ebx + 2               );//xx xx xx xx xx xx 43 42
        psrld       (mm7,16                      );//xx xx xx xx 00 00 41 40

        movd        (mm5,ebx + ecx + -2        );//xx xx xx xx 51 50 xx xx
        punpcklwd   (mm7,mm4                     );//xx xx xx xx 43 42 41 40

        movd        (mm4,ebx + ecx + 2         );//xx xx xx xx xx xx 53 52
        psrld       (mm5,16);

        punpcklwd   (mm5,mm4);
        pxor        (mm4,mm4);

        punpcklbw   (mm0,mm4);
//-

        psrlq       (mm2,32                      );//xx xx xx xx 33 23 13 03
        psubw       (mm1,mm0                     );//x = p0 - pms

        punpcklbw   (mm7,mm5                     );//53 43 52 42 51 41 50 40
        movq        (mm3,mm1);
//-------------------
        punpcklbw   (mm6,mm4);
        paddw       (mm3,mm1);

        punpcklbw   (mm2,mm4);
        paddw       (mm1,mm3);

        paddw       (mm1,eax + LFABS_OFFSET    );//x += LoopFilterAdjustBeforeShift
        psubw       (mm6,mm2);

        movd        (mm2,ebx + ecx*2 + -2      );//xx xx xx xx 61 60 xx xx
        paddw       (mm6,mm1);

        movd        (mm4,ebx + ecx*2 + 2       );//xx xx xx xx xx xx 63 62
        psrld       (mm2,16);

        movd        (mm5,ebx + edx + -2        );//xx xx xx xx 71 70 xx xx
        punpcklwd   (mm2,mm4                     );//xx xx xx xx 63 62 61 60

        movd        (mm4,ebx + edx + 2         );//xx xx xx xx xx xx 73 72
        psrld       (mm5,16                      );//xx xx xx xx 00 00 71 70

        ebx=(unsigned char*)PixelPtr;              //restore PixelPtr
        punpcklwd   (mm5,mm4                     );//xx xx xx xx 73 72 71 70

        psraw       (mm6,3                       );//values to be clipped
        pxor        (mm4,mm4);

        punpcklbw   (mm2,mm5                     );//73 63 72 62 71 61 70 60
        movq        (mm5,mm7                     );//53 43 52 42 51 41 50 40

        movq        (mm1,mm6);
        punpckhwd   (mm5,mm2                     );//73 63 53 43 72 62 52 42


        movq        (eax + TRANS_OFFSET + 24,mm5   );//save for later
        punpcklwd   (mm7,mm2                     );//71 61 51 41 70 60 50 40

        movq        (eax + TRANS_OFFSET + 16,mm7   );//save for later
        psraw       (mm6,15);

        movq        (mm2,eax + LIMIT_OFFSET        );//get the limit value
        movq        (mm0,mm7                         );//xx xx xx xx 70 60 50 41

        psrlq       (mm7,32                          );//xx xx xx xx 71 61 51 41
        pxor        (mm1,mm6);

        psubsw      (mm1,mm6                         );//abs(i)
        punpcklbw   (mm5,mm4);

        por         (mm6,eax + FOURONES_OFFSET     );//now have -1 or 1
        movq        (mm3,mm2);

        punpcklbw   (mm7,mm4);
        psubw       (mm3,mm1                         );//limit - abs(i)

        movq        (mm4,mm3);
        psraw       (mm3,15);

        //push        ebp
    //-

        psubw       (mm5,mm7                         );//x = p0 - pms
        pxor        (mm4,mm3);

        psubsw      (mm4,mm3                         );//abs(limit - abs(i))
        pxor        (mm3,mm3);

//        movd        mm1,eax + TRANS_OFFSET + 28  );//xx xx xx xx 73 63 53 43
        movq        (mm1,eax + TRANS_OFFSET + 28  );//xx xx xx xx 73 63 53 43
        psubusw     (mm2,mm4                     );//limit - abs(limit - abs(i))

        punpcklbw   (mm0,mm3);
        movq        (mm7,mm5);

        paddw       (mm7,mm5);
        pmullw      (mm2,mm6                     );//new y -- wait 3 cycles

        punpcklbw   (mm1,mm3);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -