idctmm32.cpp
来自「这是一组DCT和iDCT的代码」· C++ 代码 · 共 1,707 行 · 第 1/4 页
CPP
1,707 行
// store DWORD -> [ cols (1,0) "row 3"];
// store DWORD -> [ cols (1,0) "row 5"];
// store DWORD -> [ cols (1,0) "row 7"];
// store DWORD -> [ cols (1,0) "row 4"];
// store DWORD -> [ cols (1,0) "row 6"];
// }
//
// Note that we have DOUBLED the # store instructions (because we can
// only store one dword per instruction, instead of a qword.) But
// we eliminate the output-matrix post-transpose requirement, so overall
// the algorithm is faster.
//
// You'll notice that this assembly block contains two copies of the same
// row-transformation algorithm. The loops are nearly identical. They
// use different shift&round constants.
__asm {
//;------------------------------------------------------
//DCT_8_INV_ROW_8 MACRO INP:REQ, OUT:REQ, TABLE:REQ
mov INP, dword ptr [blk]; ;// input data is row 0 of blk[]
mov edi, 0x00; //yx = 0
lea TABLE, dword ptr [tab_i_01234567]; // row 0 of the tab_i...[]
lea OUT, dword ptr [qwTemp]; // write output to qwTemp[]
lea round_inv_row, dword ptr [r_inv_row]
// for ( y = 0; y<8; y=y+2 ) // transform two rows per iteration
lp_mmx32_rows:
movq mm0, qword ptr [INP] ; // 0 ; x3 x2 x1 x0
movq mm1, qword ptr [INP+8] ; // 1 ; x7 x6 x5 x4
movq mm2, mm0 ; // 2 ; x3 x2 x1 x0
movq mm3, qword ptr [TABLE] ; // 3 ; w06 w04 w02 w00
punpcklwd mm0, mm1 ; // x5 x1 x4 x0
// ---------- processing row #y+0
movq mm5, mm0 ; // 5 ; x5 x1 x4 x0
punpckldq mm0, mm0 ; // x4 x0 x4 x0
movq mm4, qword ptr [TABLE+8] ; // 4 ; w07 w05 w03 w01
punpckhwd mm2, mm1 ; // 1 ; x7 x3 x6 x2
pmaddwd mm3, mm0 ; // x4*w06+x0*w04 x4*w02+x0*w00
movq mm6, mm2 ; // 6 ; x7 x3 x6 x2
movq mm1, qword ptr [TABLE+32] ;// 1 ; w22 w20 w18 w16
punpckldq mm2, mm2 ; // x6 x2 x6 x2
pmaddwd mm4, mm2 ; // x6*w07+x2*w05 x6*w03+x2*w01
punpckhdq mm5, mm5 ; // x5 x1 x5 x1
pmaddwd mm0, qword ptr [TABLE+16] ;// x4*w14+x0*w12 x4*w10+x0*w08
punpckhdq mm6, mm6 ; // x7 x3 x7 x3
movq mm7, qword ptr [TABLE+40] ;// 7 ; w23 w21 w19 w17
pmaddwd mm1, mm5 ; // x5*w22+x1*w20 x5*w18+x1*w16
paddd mm3, qword ptr [round_inv_row];// +rounder
pmaddwd mm7, mm6 ; // x7*w23+x3*w21 x7*w19+x3*w17
pmaddwd mm2, qword ptr [TABLE+24] ;// x6*w15+x2*w13 x6*w11+x2*w09
paddd mm3, mm4 ; // 4 ; a1=sum(even1) a0=sum(even0)
pmaddwd mm5, qword ptr [TABLE+48] ;// x5*w30+x1*w28 x5*w26+x1*w24
movq mm4, mm3 ; // 4 ; a1 a0
pmaddwd mm6, qword ptr [TABLE+56] ;// x7*w31+x3*w29 x7*w27+x3*w25
paddd mm1, mm7 ; // 7 ; b1=sum(odd1) b0=sum(odd0)
paddd mm0, qword ptr [round_inv_row];// +rounder
psubd mm3, mm1 ; // a1-b1 a0-b0
psrad mm3, SHIFT_INV_ROW ; // y6=a1-b1 y7=a0-b0
paddd mm1, mm4 ; // 4 ; a1+b1 a0+b0
paddd mm0, mm2 ; // 2 ; a3=sum(even3) a2=sum(even2)
psrad mm1, SHIFT_INV_ROW ; // y1=a1+b1 y0=a0+b0
paddd mm5, mm6 ; // 6 ; b3=sum(odd3) b2=sum(odd2)
movq mm4, mm0 ; // 4 ; a3 a2
paddd mm0, mm5 ; // a3+b3 a2+b2
psubd mm4, mm5 ; // 5 ; a3-b3 a2-b2
movq mm2, qword ptr [INP+16] ; // row+1; 0; x3 x2 x1 x0
psrad mm4, SHIFT_INV_ROW ; // y4=a3-b3 y5=a2-b2
movq mm7, qword ptr [INP+24] ; // row+1; 7 ; x7 x6 x5 x4
psrad mm0, SHIFT_INV_ROW ; // y3=a3+b3 y2=a2+b2
add INP, 32; // increment INPUT pointer +2 rows
packssdw mm4, mm3 ; // 3 ; y6 y7 y4 y5
movq mm3, qword ptr [TABLE] ; // 3 ; w06 w04 w02 w00
packssdw mm1, mm0 ; // 0 ; y3 y2 y1 y0
movq qword ptr [scratch2], mm4 ; // 7 ; save y6 y7 y4 y5
movq mm0, mm2 ; // row+1; 2 ; x3 x2 x1 x0
movq qword ptr [scratch1], mm1 ; // 1 ; save y3 y2 y1 y0
punpcklwd mm0, mm7 ; // row+1; 0 x5 x1 x4 x0
// begin processing row #y+1
movq mm5, mm0 ; // 5 ; x5 x1 x4 x0
punpckldq mm0, mm0 ; // x4 x0 x4 x0
movq mm4, qword ptr [TABLE+8] ; // 4 ; w07 w05 w03 w01
punpckhwd mm2, mm7 ; // 1 ; x7 x3 x6 x2
pmaddwd mm3, mm0 ; // x4*w06+x0*w04 x4*w02+x0*w00
movq mm6, mm2 ; // 6 ; x7 x3 x6 x2
movq mm1, qword ptr [TABLE+32] ;// 1 ; w22 w20 w18 w16
punpckldq mm2, mm2 ; // x6 x2 x6 x2
pmaddwd mm4, mm2 ; // x6*w07+x2*w05 x6*w03+x2*w01
punpckhdq mm5, mm5 ; // x5 x1 x5 x1
pmaddwd mm0, qword ptr [TABLE+16] ;// x4*w14+x0*w12 x4*w10+x0*w08
punpckhdq mm6, mm6 ; // x7 x3 x7 x3
movq mm7, qword ptr [TABLE+40] ;// 7 ; w23 w21 w19 w17
pmaddwd mm1, mm5 ; // x5*w22+x1*w20 x5*w18+x1*w16
paddd mm3, qword ptr [round_inv_row];// +rounder
pmaddwd mm7, mm6 ; // x7*w23+x3*w21 x7*w19+x3*w17
pmaddwd mm2, qword ptr [TABLE+24] ;// x6*w15+x2*w13 x6*w11+x2*w09
paddd mm3, mm4 ; // 4 ; a1=sum(even1) a0=sum(even0)
pmaddwd mm5, qword ptr [TABLE+48] ;// x5*w30+x1*w28 x5*w26+x1*w24
movq mm4, mm3 ; // 4 ; a1 a0
pmaddwd mm6, qword ptr [TABLE+56] ;// x7*w31+x3*w29 x7*w27+x3*w25
paddd mm1, mm7 ; // 7 ; b1=sum(odd1) b0=sum(odd0)
paddd mm0, qword ptr [round_inv_row];// +rounder
psubd mm3, mm1 ; // a1-b1 a0-b0
psrad mm3, SHIFT_INV_ROW ; // y6=a1-b1 y7=a0-b0
paddd mm1, mm4 ; // 4 ; a1+b1 a0+b0
paddd mm0, mm2 ; // 2 ; a3=sum(even3) a2=sum(even2)
psrad mm1, SHIFT_INV_ROW ; // y1=a1+b1 y0=a0+b0
paddd mm5, mm6 ; // 6 ; b3=sum(odd3) b2=sum(odd2)
movq mm4, mm0 ; // 4 ; a3 a2
paddd mm0, mm5 ; // a3+b3 a2+b2
psubd mm4, mm5 ; // 5 ; a3-b3 a2-b2
// retrieve row#(y+0) data
movq mm2, qword ptr [scratch1]; // mm2 = row(0) y3 y2 y1 y0
psrad mm0, SHIFT_INV_ROW ; // y3=a3+b3 y2=a2+b2
packssdw mm1, mm0 ; // 0 ; y3 y2 y1 y0
movq mm0, mm2; // mm0 = copy of row(0) y3 y2 y1 y0
movq mm7, qword ptr [scratch2]; // mm7 = row(0) y6 y7 y4 y5
punpcklwd mm0, mm1; // mm0 = [ (r1)y1 (r0)y1 (r1)y0 (r0)y0 ]
punpckhwd mm2, mm1; // mm2 = [ (r1)y3 (r0)y3 (r1)y2 (r0)y2 ]
movq mm6, mm7; // mm6 = copy of row(0) y6 y7 y4 y5
movd dword ptr [OUT+ROW_STRIDE*0], mm0; // store row(0) [ y1 y0 ]
psrlq mm0, 32;
movd dword ptr [OUT+ROW_STRIDE*2], mm2; // store row(2) [ y1 y0 ]
psrlq mm2, 32;
movd dword ptr [OUT+ROW_STRIDE*1], mm0; // store row(1) [ y1 y0 ]
psrad mm4, SHIFT_INV_ROW ; // y4=a3-b3 y5=a2-b2
movd dword ptr [OUT+ROW_STRIDE*3], mm2; // store row(3) [ y1 y0 ]
packssdw mm4, mm3 ; // 3 ; y6 y7 y4 y5
add OUT, 4; // increment OUTPUT pointer +2 cols
punpcklwd mm6, mm4; // mm6 = [ (r1)y4 (r0)y4 (r1)y5 (r0)y5 ]
add edi, 0x02; // y = y + 2
punpckhwd mm7, mm4; // mm7 = [ (r1)y6 (r0)y7 (r1)y6 (r0)y7 ]
movd dword ptr [OUT+ROW_STRIDE*5 - 4], mm6 ; // store row(5) [ y1 y0]
psrlq mm6, 32;
movd dword ptr [OUT+ROW_STRIDE*7 - 4], mm7 ; // store row(7) [ y1 y0]
psrlq mm7, 32;
movd dword ptr [OUT+ROW_STRIDE*4 - 4], mm6 ; // store row(4) [ y1 y0]
movd dword ptr [OUT+ROW_STRIDE*6 - 4], mm7 ; // store row(6) [ y1 y0]
cmp edi, 0x08; // compare ( y <> 8 )
jl lp_mmx32_rows; // end for ( y = 0; y < 8; y = y + 2 )
// done with row-transformation
//
// qwTemp[] contains the output matrix (already transposed)
//
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
// the following subroutine repeats the row-transform operation,
// except with different shift&round constants.
//
// The output is stored into blk[], which destroys the original
// input data.
lea INP, dword ptr [qwTemp]; ;// input is qwTemp[]
mov edi, 0x00; //yx = 0
lea TABLE, dword ptr [tab_i_01234567]; // row 0 of table
mov OUT, dword ptr [blk]; // final output is to blk[]
lea round_inv_col, dword ptr [r_inv_col];
// for ( y = 0; y<8; y=y+2 ) // transform two rows per iteration
lp_mmx32_cols:
movq mm0, qword ptr [INP] ; // 0 ; x3 x2 x1 x0
movq mm1, qword ptr [INP+8] ; // 1 ; x7 x6 x5 x4
movq mm2, mm0 ; // 2 ; x3 x2 x1 x0
movq mm3, qword ptr [TABLE] ; // 3 ; w06 w04 w02 w00
punpcklwd mm0, mm1 ; // x5 x1 x4 x0
// ---------- processing row #y+0
movq mm5, mm0 ; // 5 ; x5 x1 x4 x0
punpckldq mm0, mm0 ; // x4 x0 x4 x0
movq mm4, qword ptr [TABLE+8] ; // 4 ; w07 w05 w03 w01
punpckhwd mm2, mm1 ; // 1 ; x7 x3 x6 x2
pmaddwd mm3, mm0 ; // x4*w06+x0*w04 x4*w02+x0*w00
movq mm6, mm2 ; // 6 ; x7 x3 x6 x2
movq mm1, qword ptr [TABLE+32] ;// 1 ; w22 w20 w18 w16
punpckldq mm2, mm2 ; // x6 x2 x6 x2
pmaddwd mm4, mm2 ; // x6*w07+x2*w05 x6*w03+x2*w01
punpckhdq mm5, mm5 ; // x5 x1 x5 x1
pmaddwd mm0, qword ptr [TABLE+16] ;// x4*w14+x0*w12 x4*w10+x0*w08
punpckhdq mm6, mm6 ; // x7 x3 x7 x3
movq mm7, qword ptr [TABLE+40] ;// 7 ; w23 w21 w19 w17
pmaddwd mm1, mm5 ; // x5*w22+x1*w20 x5*w18+x1*w16
paddd mm3, qword ptr [round_inv_col];// +rounder
pmaddwd mm7, mm6 ; // x7*w23+x3*w21 x7*w19+x3*w17
pmaddwd mm2, qword ptr [TABLE+24] ;// x6*w15+x2*w13 x6*w11+x2*w09
paddd mm3, mm4 ; // 4 ; a1=sum(even1) a0=sum(even0)
pmaddwd mm5, qword ptr [TABLE+48] ;// x5*w30+x1*w28 x5*w26+x1*w24
movq mm4, mm3 ; // 4 ; a1 a0
pmaddwd mm6, qword ptr [TABLE+56] ;// x7*w31+x3*w29 x7*w27+x3*w25
paddd mm1, mm7 ; // 7 ; b1=sum(odd1) b0=sum(odd0)
paddd mm0, qword ptr [round_inv_col];// +rounder
psubd mm3, mm1 ; // a1-b1 a0-b0
psrad mm3, SHIFT_INV_COL ; // y6=a1-b1 y7=a0-b0
paddd mm1, mm4 ; // 4 ; a1+b1 a0+b0
paddd mm0, mm2 ; // 2 ; a3=sum(even3) a2=sum(even2)
psrad mm1, SHIFT_INV_COL; // y1=a1+b1 y0=a0+b0
paddd mm5, mm6 ; // 6 ; b3=sum(odd3) b2=sum(odd2)
movq mm4, mm0 ; // 4 ; a3 a2
paddd mm0, mm5 ; // a3+b3 a2+b2
psubd mm4, mm5 ; // 5 ; a3-b3 a2-b2
movq mm2, qword ptr [INP+16] ; // row+1; 0; x3 x2 x1 x0
psrad mm4, SHIFT_INV_COL ; // y4=a3-b3 y5=a2-b2
movq mm7, qword ptr [INP+24] ; // row+1; 7 ; x7 x6 x5 x4
psrad mm0, SHIFT_INV_COL ; // y3=a3+b3 y2=a2+b2
add INP, 32; // increment INPUT pointer +2 rows
packssdw mm4, mm3 ; // 3 ; y6 y7 y4 y5
movq mm3, qword ptr [TABLE] ; // 3 ; w06 w04 w02 w00
packssdw mm1, mm0 ; // 0 ; y3 y2 y1 y0
movq qword ptr [scratch2], mm4 ; // 7 ; save y6 y7 y4 y5
movq mm0, mm2 ; // row+1; 2 ; x3 x2 x1 x0
movq qword ptr [scratch1], mm1 ; // 1 ; save y3 y2 y1 y0
punpcklwd mm0, mm7 ; // row+1; 0 x5 x1 x4 x0
// begin processing row #y+1
movq mm5, mm0 ; // 5 ; x5 x1 x4 x0
punpckldq mm0, mm0 ; // x4 x0 x4 x0
movq mm4, qword ptr [TABLE+8] ; // 4 ; w07 w05 w03 w01
punpckhwd mm2, mm7 ; // 1 ; x7 x3 x6 x2
pmaddwd mm3, mm0 ; // x4*w06+x0*w04 x4*w02+x0*w00
movq mm6, mm2 ; // 6 ; x7 x3 x6 x2
movq mm1, qword ptr [TABLE+32] ;// 1 ; w22 w20 w18 w16
punpckldq mm2, mm2 ; // x6 x2 x6 x2
pmaddwd mm4, mm2 ; // x6*w07+x2*w05 x6*w03+x2*w01
punpckhdq mm5, mm5 ; // x5 x1 x5 x1
pmaddwd mm0, qword ptr [TABLE+16] ;// x4*w14+x0*w12 x4*w10+x0*w08
punpckhdq mm6, mm6 ; // x7 x3 x7 x3
movq mm7, qword ptr [TABLE+40] ;// 7 ; w23 w21 w19 w17
pmaddwd mm1, mm5 ; // x5*w22+x1*w20 x5*w18+x1*w16
paddd mm3, qword ptr [round_inv_col];// +rounder
pmaddwd mm7, mm6 ; // x7*w23+x3*w21 x7*w19+x3*w17
pmaddwd mm2, qword ptr [TABLE+24] ;// x6*w15+x2*w13 x6*w11+x2*w09
paddd mm3, mm4 ; // 4 ; a1=sum(even1) a0=sum(even0)
pmaddwd mm5, qword ptr [TABLE+48] ;// x5*w30+x1*w28 x5*w26+x1*w24
movq mm4, mm3 ; // 4 ; a1 a0
pmaddwd mm6, qword ptr [TABLE+56] ;// x7*w31+x3*w29 x7*w27+x3*w25
paddd mm1, mm7 ; // 7 ; b1=sum(odd1) b0=sum(odd0)
paddd mm0, qword ptr [round_inv_col];// +rounder
psubd mm3, mm1 ; // a1-b1 a0-b0
psrad mm3, SHIFT_INV_COL ; // y6=a1-b1 y7=a0-b0
paddd mm1, mm4 ; // 4 ; a1+b1 a0+b0
paddd mm0, mm2 ; // 2 ; a3=sum(even3) a2=sum(even2)
psrad mm1, SHIFT_INV_COL ; // y1=a1+b1 y0=a0+b0
paddd mm5, mm6 ; // 6 ; b3=sum(odd3) b2=sum(odd2)
movq mm4, mm0 ; // 4 ; a3 a2
paddd mm0, mm5 ; // a3+b3 a2+b2
psubd mm4, mm5 ; // 5 ; a3-b3 a2-b2
// retrieve row#(y+0) data
movq mm2, qword ptr [scratch1]; // mm2 = row(0) y3 y2 y1 y0
psrad mm0, SHIFT_INV_COL ; // y3=a3+b3 y2=a2+b2
packssdw mm1, mm0 ; // 0 ; y3 y2 y1 y0
movq mm0, mm2; // mm0 = copy of row(0) y3 y2 y1 y0
movq mm7, qword ptr [scratch2]; // mm7 = row(0) y6 y7 y4 y5
punpcklwd mm0, mm1; // mm0 = [ (r1)y1 (r0)y1 (r1)y0 (r0)y0 ]
punpckhwd mm2, mm1; // mm2 = [ (r1)y3 (r0)y3 (r1)y2 (r0)y2 ]
movq mm6, mm7; // mm6 = copy of row(0) y6 y7 y4 y5
movd dword ptr [OUT+ROW_STRIDE*0], mm0; // store row(0) [ y1 y0 ]
psrlq mm0, 32;
movd dword ptr [OUT+ROW_STRIDE*2], mm2; // store row(2) [ y1 y0 ]
psrlq mm2, 32;
movd dword ptr [OUT+ROW_STRIDE*1], mm0; // store row(1) [ y1 y0 ]
psrad mm4, SHIFT_INV_COL ; // y4=a3-b3 y5=a2-b2
movd dword ptr [OUT+ROW_STRIDE*3], mm2; // store row(3) [ y1 y0 ]
packssdw mm4, mm3 ; // 3 ; y6 y7 y4 y5
add OUT, 4; // increment OUTPUT pointer +2 cols
punpcklwd mm6, mm4; // mm6 = [ (r1)y4 (r0)y4 (r1)y5 (r0)y5 ]
add edi, 0x02; // y = y + 2
punpckhwd mm7, mm4; // mm7 = [ (r1)y6 (r0)y7 (r1)y6 (r0)y7 ]
movd dword ptr [OUT+ROW_STRIDE*5 - 4], mm6 ; // store row(5) [ y1 y0]
psrlq mm6, 32;
movd dword ptr [OUT+ROW_STRIDE*7 - 4], mm7 ; // store row(7) [ y1 y0]
psrlq mm7, 32;
movd dword ptr [OUT+ROW_STRIDE*4 - 4], mm6 ; // store row(4) [ y1 y0]
movd dword ptr [OUT+ROW_STRIDE*6 - 4], mm7 ; // store row(6) [ y1 y0]
cmp edi, 0x08; // compare ( y <> 8 )
jl lp_mmx32_cols; // end for ( y = 0; y < 8; y = y + 2 )
} // end __asm
}
// v0.16b34 no longer in use
// the old implementation is here for educational purposes.
/*
static void
idct_mmx32_cols( short *blk ) // transform all 8 cols of 8x8 iDCT block
{
// Despite the function's name, the matrix is transformed
// row by row. This function is identical to idct_mmx32_rows(),
// except for the SHIFT amount and ROUND_INV amount.
// this subroutine performs two operations
// 1) iDCT row transform
// for( i = 0; i < 8; ++ i)
// DCT_8_INV_ROW_1( blk[i*8], qwTemp[i] );
//
// 2) transpose the matrix (which was stored in qwTemp[])
// qwTemp[] -> [8x8 matrix transpose] -> blk[]
__asm {
//;------------------------------------------------------
//DCT_8_INV_ROW_1 MACRO INP:REQ, OUT:REQ, TABLE:REQ
mov INP, dword ptr [blk]; ;// row 0
mov edi, 0x00; // x = 0
lea TABLE, dword ptr [tab_i_01234567]; // row 0
lea OUT, dword ptr [qwTemp];
// mov OUT, INP; // algorithm writes data in-place -> row 0
lea round_inv_col, dword ptr [r_inv_col]
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?