📄 idct_ap922sse_rawcode.cpp

📁 这是一组DCT和iDCT的代码
💻 CPP
📖 第 1 页 / 共 3 页
字号:
  NCOS4_16*RS7,  NCOS4_16*RS7,  NCOS4_16*RS7,  NCOS4_16*RS7, 
  NCOS2_16*RS7,  NCOS6_16*RS7, -NCOS6_16*RS7, -NCOS2_16*RS7, 
  NCOS4_16*RS7, -NCOS4_16*RS7, -NCOS4_16*RS7,  NCOS4_16*RS7, 
  NCOS6_16*RS7, -NCOS2_16*RS7,  NCOS2_16*RS7, -NCOS6_16*RS7, 
  NCOS1_16*RS7,  NCOS3_16*RS7,  NCOS5_16*RS7,  NCOS7_16*RS7, 
  NCOS3_16*RS7, -NCOS7_16*RS7, -NCOS1_16*RS7, -NCOS5_16*RS7, 
  NCOS5_16*RS7, -NCOS1_16*RS7,  NCOS7_16*RS7,  NCOS3_16*RS7, 
  NCOS7_16*RS7, -NCOS5_16*RS7,  NCOS3_16*RS7, -NCOS1_16*RS7, 

 // iDCT column coefficients
  NCOS4_16, NCOS4_16, NCOS4_16, NCOS4_16,
  TANG1_16, TANG1_16, TANG1_16, TANG1_16,
  TANG2_16, TANG2_16, TANG2_16, TANG2_16,
  TANG3_16, TANG3_16, TANG3_16, TANG3_16,
  RND_FLOAT_OFFSET, RND_FLOAT_OFFSET, RND_FLOAT_OFFSET, RND_FLOAT_OFFSET,
  0       , 0       , 0       , 0,        // not used, padding
  0       , 0       , 0       , 0,        // not used, padding
  0       , 0       , 0       , 0         // not used, padding
};

/*
const static float cos_4_16f= NCOS4_16;
const static float tg_1_16f = TANG1_16;
const static float tg_2_16f = TANG2_16;
const static float tg_3_16f = TANG3_16;

; The original, unscaled idct coefficient table
; static const short w[32] = {
; FIX(cos_4_16), FIX(cos_2_16), FIX(cos_4_16), FIX(cos_6_16),
; FIX(cos_4_16), FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16),
; FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16), FIX(cos_2_16),
; FIX(cos_4_16), -FIX(cos_2_16), FIX(cos_4_16), -FIX(cos_6_16),
; FIX(cos_1_16), FIX(cos_3_16), FIX(cos_5_16), FIX(cos_7_16),
; FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16),
; FIX(cos_5_16), -FIX(cos_1_16), FIX(cos_7_16), FIX(cos_3_16),
; FIX(cos_7_16), -FIX(cos_5_16), FIX(cos_3_16), -FIX(cos_1_16) };
;
; #define DCT_8_INV_ROW(x, y)



#define BITS_INV_ACC	4	//; 4 or 5 for IEEE
	// 5 yields higher accuracy, but lessens dynamic range on the input matrix
#define SHIFT_INV_ROW	(16 - BITS_INV_ACC)
#define SHIFT_INV_COL	(1 + BITS_INV_ACC )

#define RND_INV_ROW		(1 << (SHIFT_INV_ROW-1))
#define RND_INV_COL		(1 << (SHIFT_INV_COL-1)) 
#define RND_INV_CORR	(RND_INV_COL - 1)		//; correction -1.0 and round
//#define RND_INV_ROW		(1024 * (6 - BITS_INV_ACC)) //; 1 << (SHIFT_INV_ROW-1)
//#define RND_INV_COL		(16 * (BITS_INV_ACC - 3)) //; 1 << (SHIFT_INV_COL-1)
#define SHIFT_ROUND_COL( x ) ( ( x + RND_INV_COL ) >> (SHIFT_INV_COL) )
#define SHIFT_ROUND_ROW( x ) ( ( x + RND_INV_ROW ) >> (SHIFT_INV_ROW) )

//.data
//Align 16
//const static long r_inv_row[2] = { RND_INV_ROW, RND_INV_ROW};

const static short tg_1_16[4] = {13036, 13036, 13036, 13036 }; //tg * (2<<16) + 0.5
const static short tg_2_16[4] = {27146, 27146, 27146, 27146 }; //tg * (2<<16) + 0.5
const static short tg_3_16[4] = {-21746, -21746, -21746, -21746 }; //tg * (2<<16) + 0.5
//const static short cos_4_16[4] = {-19195, -19195, -19195, -19195 }; //cos * (2<<16) + 0.5
const static short ocos_4_16[4] = {23170, 23170, 23170, 23170 }; //cos * (2<<15) + 0.5



//#define SHIFT_ROUND_COLF( x ) ( floor( (x)*0.125*0.5 + 0.5 ) )
#define SHIFT_ROUND_COLF( x ) ( floor( (x)*0.5 + 0.5 ) )

#define SHIFT_ROUND_ROWF( x ) ( x )
*/

// externally allocated array of 64 floats, for temporary storage
extern void *fTempArray;


void
idct_ap922float_sse( short *data )
{
  static const __int64 mmAndmask = 0xFFFF0000FFFF0000; // [ XX __ XX __ ]

  /*
  static float a[8], b[8], e[8];
  static float dp[8], tmp[2];

 
  short *x; // pointer to input
  short *y; // pointer to final output
  float *yr, *xc; // pointer to output row
  float *ti; // pointer to IDCT row coefficient table

  int i;
  */

  // AP-922 inverse_DCT row transform (floating point version)
  //
  // Row IDCT operator :A_T*M_T*P_T
  // Let Y=[output column data, 8 elements] 32-bit IEEE-754 float
  //     X=[input column data, 8 elements] 16-bit short integer
  //
  //     Y= [ A_T*M_T*P_T ] * X
  //
  //   (Y and X are both column vectors)

#define INPR eax
#define TABLE ecx
#define TABLEP1 esi  // starting address of (TABLE +32 elements)
#define OUTR edx

__asm {
    mov OUTR, dword ptr [fTempArray];
     mov edi, 0x00; // i = 0

    mov INPR, dword ptr [data];   ;// row 0
     sub OUTR, 32;    // precompensate OUT
 
    lea TABLEP1, dword ptr [tab_i_01234567ssef]; // row 0

    movq mm7, [mmAndmask];      // mm7 <= [ FFFF 0000 FFFF 0000]
     mov TABLE, TABLEP1;

    add TABLEP1, 128;   // initialize 
;//   mov OUT, INP;  // algorithm writes data in-place  -> row 0

  ;// for ( x = 0; x < 8; ++x )  // transform one row per iteration
ALIGN 16 ;// align jump address to 16 byte offset
  acc_idct_rowloop1:

;//
;// begin (0)- Convert short -> float data
;// --------------------------------------
;//  Step 0a) : pshufw, generate 64-bit vectors [x0 x0 x0 x0],[x1 x1 x1 x1], ...
;//  Step 0b) : pand, mask out lower-word of each DWORD,
;//             example [x0 x0 x0 x0] -> [x0 __ x0 __]
;//             This effectively prescales all x[] inputs by 65536
;//               (denoted ROW_PRESCALE_SHIFT)
;//  Step 0c)   Convert DWORD vectors [X X] into ssefloat vectors [ X X X X]
;//             (use cvtpi2ps and movlhps)
;//
    movq mm2, [INPR] ;           // (0) mm2 <= x3 x2 x1 x0
     ;//

    movq mm4, [INPR+8] ;         // (0) mm4 <= x7 x6 x5 x4
     pshufw mm0, mm2, 00000000b ;// (0a) mm0 <= x0 x0 x0 x0

    pshufw mm1, mm2, 10101010b  ;// (0a) mm1 <= x2 x2 x2 x2
     pand mm0, mm7;             ;// (0b) mm0 = [ x0 x0 ]*SCALEUP (dword)

    cvtpi2ps xmm0, mm0;         ;// (0c) xmm0 <= [__ __ x0 x0] float
;// mm0 free
     pand mm1, mm7;             ;// (0b) mm1 = [ x2 x2 ]*SCALEUP (dword)

    prefetcht0 [TABLEP1 + 0*32] ;// prefetch coeff +1row, [w7..0]
     pshufw mm0, mm4, 00000000b ;// (0a)  mm0 <= x4 x4 x4 x4

    cvtpi2ps xmm1, mm1;         ;// (0c) xmm1 <= [__ __ x2 x2] float
;// mm1 free
     pand mm0, mm7;             ;// (0b) mm4 <= [x4 x4]*SCALEUP (dword)

    movlhps xmm0, xmm0;         ;// (0c) xmm0 <= [x0 x0 x0 x0] float
     cvtpi2ps xmm2, mm0;        ;// (0c) xmm2 <= [__ __ x4 x4] float
;// mm0 free

;// 1) Apply M_T*P_T operator (P_T is implicit with table/element order)
;//    (table order has been changed for optimum SSE execution)
;//       1a) mulps to generate a_v0, a_v1, a_v2, a_v3, b_v0, b_v1, b_v2, b_v3
;//       1b) addps to generate a_v[] = a_v0 + a_v1 + a_v2 + a_v3
;//                             b_v[] = b_v0 + b_v1 + b_v2 + b_v3

    mulps xmm0, [TABLE + 0*16]  ;// (1a) xmm0 <= a_v0 = [w3..w0] * x0
     pshufw mm1, mm4, 10101010b; // (0a) mm1 <= [x6 x6 x6 x6]

    pand mm1, mm7;              ;// (0b) mm1 <= [x6 x6]*SCALEUP (dword)
     movlhps xmm1, xmm1;        ;// (0c) xmm1 <=[ x2 x2 x2 x2] float

    mulps xmm1, [TABLE + 1*16]  ;// (1a) xmm1 <= a_v1 = [w7..w4] * x2
     cvtpi2ps xmm3, mm1;        ;// (0c) xmm3 <=[ __ __ x6 x6] float
;// mm1 free

    movlhps xmm2, xmm2;         ;// (0c) xmm2 <= [x4 x4 x4 x4] float
     pshufw mm0, mm2, 01010101b; // (0a) mm1 <=[x1 x1 x1 x1]

    mulps xmm2, [TABLE + 2*16];  // (1a) xmm2 <= a_v2 = [w11..w8] * x4
     pshufw mm1, mm2, 11111111b; // (0a) mm1 <=[x3 x3 x3 x3]
;// mm2 free

    pand mm0, mm7;               // (0b) mm0 <=[x1 x1]*SCALEUP (dword)
     movlhps xmm3, xmm3;        ;// (0) xmm3 <= [x6 x6 x6 x6]

    mulps xmm3, [TABLE + 3*16] ; // (1a) xmm3 <= a_v3 = [w15..w12] * x6
     cvtpi2ps xmm4, mm0;         // (0c) xmm4 <=[__ __ x1 x1] float
;// mm0 free

    addps xmm1, xmm0;           ;// (1b) xmm1 <= a_v0 + a_v1
;// xmm0 free
     pand mm1, mm7;              // (0b) mm1 <=[x3 x3]*SCALEUP (dword)

    prefetcht0 [TABLEP1 + 1*32] ;// prefetch coeff +1row [w15..8]
     pshufw mm0, mm4, 01010101b; // (0a) mm0 <= [x5 x5 x5 x5]

    cvtpi2ps xmm5, mm1;          // (0c) xmm5 <=[__ __ x3 x3] float
     pand mm0, mm7;              // (0b) mm0 <= [x5 x5]*SCALEUP (dword)
;// mm1 free

    cvtpi2ps xmm6, mm0;          // (0c) xmm6 <=[__ __ x5 x5] float
;// mm0 free
     movlhps xmm4, xmm4;         // (0c) xmm4 <= [x1 x1 x1 x1]

    mulps xmm4, [TABLE + 4*16]  ;// (1a) xmm4 <= b_v0
     pshufw mm1, mm4, 11111111b; // (0a) mm1 <= [x7 x7 x7 x7]
;// mm4 free

    addps xmm2, xmm1;           ;// (1b) xmm2 + a_v2 <= (a_v0+a_v1) + a_v2
;// xmm1 free
     movlhps xmm5, xmm5;        ;// (0c) xmm5 <=[x3 x3 x3 x3] float

    mulps xmm5, [TABLE + 5*16]  ;// (1a) xmm5 <= b_v1
     pand mm1, mm7;             ;// (0b) mm1 <= [x7 x7]*SCALEUP (dword)

    cvtpi2ps xmm7, mm1;         ;// (0c) xmm7 <= [__ __ x7 x7]
;// mm1 free
     movlhps xmm6, xmm6;         // (0c) xmm6 <= [x5 x5 x5 x5] float

    mulps xmm6, [TABLE + 6*16]  ;// (1a) xmm6 <= b_v2
     add OUTR, 32;              ;// OUT <= OUT + 32 (increment row) 

    addps xmm3, xmm2;           ;// (1b) xmm3 + a_v3 <=(a_v0+a_v1+a_v2) + a_v3
;// xmm3 = complete a-vector ( a_v[] )
;// xmm2 free
     add INPR, 16;               // increment INP(psrc) + 1row 

    prefetcht0 [TABLEP1 + 2*32] ;// prefetch coeff +1row [w23..15]
     movlhps xmm7, xmm7;

    mulps xmm7, [TABLE + 7*16]  ;// (1a) xmm7 <= b_v3
     addps xmm5, xmm4;          ;// (1b) xmm5 <= b_v0 + b_v1
;// xmm4 free

    add TABLE, 128;              // move TABLE to w[32]
     add edi, 1;                 // i = i + 1 (loop index)

    prefetcht0 [TABLEP1 + 3*32] ;// prefetch coeff +1row [w31..24]
     add TABLEP1, 128;           // move TABLEP1 to w[32]

    movaps xmm2, xmm3;           // (1b) xmm2 <= a_v[3..0] 



    addps xmm6, xmm5;            // (1b) xmm6 + b_v2 <=(b_v0+b_v1) + b_v2
;// xmm5 free

    addps xmm7, xmm6;            // (1b) xmm7 + b_v3 <=(b_v0+b_v1+b_v2) + b_v3
;// xmm6 free
;// xmm7 = complete b-vector ( b_v[] )

;// 2) Apply A_T operator to generate outputs y[7..0]
;//     2a) y[3..0] = a_v[] + b_v[]
;//     2b) *y[4..7] = a_v[] - b_v[]   (* requires shufps to invert order)
;//     2c) store y[] to temporary memory


;// output *y[4..7] = a[7..4] + b[7..4], output order must be reversed!
    subps xmm3, xmm7;      ;// 2b) xmm3 <= a_v[3..0]-b_v[3..0] (y4 y5 y6 y7)

;// output y[3..0] = a[3..0] + b[3..0]
    addps xmm2, xmm7;      ;// 2a) xmm2 <= a_v[3..0]+b_v[3..0] (y3 y2 y1 y0)
;// xmm7 free

;// fix output order, y[4..7] -> y[7..4]
    shufps xmm3, xmm3, 00011011b;// 2b) xmm3  [y7 y6 y5 y4] <- [y4 y5 y6 y7]

;// !*@&#@ Visual C++ refuses to align data to 16-byte offset, SIGH

;//    movups [OUT + 0*16], xmm2; ;// 2c) store [y3 y2 y1 y0]
  movaps [OUTR + 0*16], xmm2; ;// 2c) store [y3 y2 y1 y0]
;// xmm2 free

;//    movups [OUT + 1*16], xmm3;  ;// 2c) store [y7 y6 y5 y4]
  movaps [OUTR + 1*16], xmm3;  ;// 2c) store [y7 y6 y5 y4]
;// xmm3 free

    cmp edi, 0x08;

       ;// end for ( x = 0; x < 8; ++x )  // transform one row per iteration
    jl  acc_idct_rowloop1;

;/////////////////////////////////////////////////////////////////////
;//
;//
;// iDCT column transform
;//
;//
;/////////////////////////////////////////////////////////////////////

    ;// 1) Apply (D_T * P_T) - the cos() coefficients of D_T are implicit
    ;//    in the idct_row operation.  But we still need to apply the
    ;//    shuffling operation of D_T.
    ;//
    ;//    1  0  0  0   0  0  0  0
    ;//    0  0  0  0   1  0  0  0
    ;//    0  0  1  0   0  0  0  0
    ;//    0  0  0  0   0  0  1  0
    ;//    0  1  0  0   0  0  0  0
    ;//    0  0  0  0   0  0  0  1
    ;//    0  0  0  1   0  0  0  0
    ;//    0  0  0  0   0  1  0  0

    ;// dp[0] = xc[ 0 *8];
    ;// dp[1] = xc[ 4 *8];
    ;// dp[2] = xc[ 2 *8];
    ;// dp[3] = xc[ 6 *8];

    ;// dp[4] = xc[ 1 *8];
    ;// dp[5] = xc[ 7 *8];
    ;// dp[6] = xc[ 3 *8];
    ;// dp[7] = xc[ 5 *8];

#define SCRATCH esi
#define INPC edx
#define OUTC eax
    mov OUTC, dword ptr [data];   ;// row 0
     mov edi, 0x00; // i = 0


    mov INPC, dword ptr [fTempArray];
 //    sub OUT, 16;    // precompensate OUT


#define _PCOS4        TABLE   
#define _PTAN1        TABLE+16
#define _PTAN2        TABLE+32
#define _PTAN3        TABLE+48
#define _PRND_INV_ROW TABLE+64

    lea TABLE, [tab_i_01234567ssef];
    mov SCRATCH, INPC; // after we read the first input column, use as scratch

    add TABLE, 1024; // increment TABLE to the COS4 entry
//
ALIGN 16
acc_idct_colloop1:
    movaps xmm0, [INPC + 1*32] ; // (1) xmm0 <= dp[4]
     add edi, 1;

    ;

    movaps xmm2, [INPC + 3*32];  // (1) xmm2 <= dp[6]

    ;

    movaps xmm1, [INPC + 7*32];  // (1) xmm1 <= dp[5]
     movaps xmm4, xmm0;          // (1) xmm4 <= dp[4]

    ;

    movaps xmm3, [INPC + 5*32] ; // (1) xmm3 <= dp[7]
     movaps xmm6, xmm2;         ;// (2a) xmm6 <= dp[6]

    // 2) Apply B_T
    //
    //    b[4] =   dp[4] + ( dp[5]*tg_1_16f );
    //    b[5] = - dp[5] + ( dp[4]*tg_1_16f );
    //    b[6] =   dp[6] + ( dp[7]*tg_3_16f );
    //    b[7] = - dp[7] + ( dp[6]*tg_3_16f );

    ;

    mulps xmm4, [_PTAN1];    ;// (2a) xmm4 <= (dp[4]*tg_1_16f)
     movaps xmm5, xmm1       ;// (2a) xmm5 <= dp[5] 

    ;
💿 文件大小 151 K
👤 上传用户 a1102882595
📂 所属分类压缩解压
🏷️ 相关标签

#iDCT #DCT #代码
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -