// idctmm32.cpp
// From the collection "A set of DCT and iDCT code" · C++ source · 1,707 lines total · page 1 of 4
// Language: C++
// 1,707 lines
// for ( x = 0; x < 8; ++x ) // transform one row per iteration
acc_idct_colloop1:
movq mm0, qword ptr [INP] ; // 0 ; x3 x2 x1 x0
movq mm1, qword ptr [INP+8] ; // 1 ; x7 x6 x5 x4
movq mm2, mm0 ; // 2 ; x3 x2 x1 x0
movq mm3, qword ptr [TABLE] ; // 3 ; w06 w04 w02 w00
punpcklwd mm0, mm1 ; // x5 x1 x4 x0
// ----------
movq mm5, mm0 ; // 5 ; x5 x1 x4 x0
punpckldq mm0, mm0 ; // x4 x0 x4 x0
movq mm4, qword ptr [TABLE+8] ; // 4 ; w07 w05 w03 w01
punpckhwd mm2, mm1 ; // 1 ; x7 x3 x6 x2
pmaddwd mm3, mm0 ; // x4*w06+x0*w04 x4*w02+x0*w00
movq mm6, mm2 ; // 6 ; x7 x3 x6 x2
movq mm1, qword ptr [TABLE+32] ;// 1 ; w22 w20 w18 w16
punpckldq mm2, mm2 ; // x6 x2 x6 x2
pmaddwd mm4, mm2 ; // x6*w07+x2*w05 x6*w03+x2*w01
punpckhdq mm5, mm5 ; // x5 x1 x5 x1
pmaddwd mm0, qword ptr [TABLE+16] ;// x4*w14+x0*w12 x4*w10+x0*w08
punpckhdq mm6, mm6 ; // x7 x3 x7 x3
movq mm7, qword ptr [TABLE+40] ;// 7 ; w23 w21 w19 w17
pmaddwd mm1, mm5 ; // x5*w22+x1*w20 x5*w18+x1*w16
paddd mm3, qword ptr [round_inv_col] ;// +rounder
pmaddwd mm7, mm6 ; // x7*w23+x3*w21 x7*w19+x3*w17
pmaddwd mm2, qword ptr [TABLE+24] ;// x6*w15+x2*w13 x6*w11+x2*w09
paddd mm3, mm4 ; // 4 ; a1=sum(even1) a0=sum(even0)
pmaddwd mm5, qword ptr [TABLE+48] ;// x5*w30+x1*w28 x5*w26+x1*w24
movq mm4, mm3 ; // 4 ; a1 a0
pmaddwd mm6, qword ptr [TABLE+56] ;// x7*w31+x3*w29 x7*w27+x3*w25
paddd mm1, mm7 ; // 7 ; b1=sum(odd1) b0=sum(odd0)
paddd mm0, qword ptr [round_inv_col] ;// +rounder
psubd mm3, mm1 ; // a1-b1 a0-b0
psrad mm3, SHIFT_INV_COL; // y6=a1-b1 y7=a0-b0
paddd mm1, mm4 ; // 4 ; a1+b1 a0+b0
paddd mm0, mm2 ; // 2 ; a3=sum(even3) a2=sum(even2)
psrad mm1, SHIFT_INV_COL; // y1=a1+b1 y0=a0+b0
paddd mm5, mm6 ; // 6 ; b3=sum(odd3) b2=sum(odd2)
movq mm4, mm0 ; // 4 ; a3 a2
paddd mm0, mm5 ; // a3+b3 a2+b2
psubd mm4, mm5 ; // 5 ; a3-b3 a2-b2
add INP, 16; // increment INPUT pointer -> row 1
psrad mm4, SHIFT_INV_COL; // y4=a3-b3 y5=a2-b2
add TABLE, 0; // adds 0, so TABLE is unchanged: the same table is used on every iteration
psrad mm0, SHIFT_INV_COL; // y3=a3+b3 y2=a2+b2
// movq mm2, qword ptr [INP] ; // row+1; 0; x3 x2 x1 x0
packssdw mm4, mm3 ; // 3 ; y6 y7 y4 y5
packssdw mm1, mm0 ; // 0 ; y3 y2 y1 y0
movq mm7, mm4 ; // 7 ; y6 y7 y4 y5
// movq mm0, mm2 ; // row+1; 2 ; x3 x2 x1 x0
// por mm1, qword ptr one_corr ; // correction y2 +0.5
psrld mm4, 16 ; // 0 y6 0 y4
movq qword ptr [OUT], mm1 ; // 1 ; save y3 y2 y1 y0
pslld mm7, 16 ; // y7 0 y5 0
// movq mm1, qword ptr [INP+8] ; // row+1; 1 ; x7 x6 x5 x4
// por mm7, qword ptr one_corr ; // correction y2 +0.5
por mm7, mm4 ; // 4 ; y7 y6 y5 y4
// movq mm3, qword ptr [TABLE] ; // 3 ; w06 w04 w02 w00
// punpcklwd mm0, mm1 ; // row+1; x5 x1 x4 x0
// begin processing row 1
movq qword ptr [OUT+8], mm7 ; // 7 ; save y7 y6 y5 y4
add edi, 0x01;
add OUT, 16;
cmp edi, 0x08; // compare x <> 8
jl acc_idct_colloop1; // end for ( x = 0; x < 8; ++x )
// done with the iDCT column-transformation
// now we have to transpose the output 8x8 matrix
// 8x8 (OUT) -> 8x8't' (IN)
// the transposition is implemented as 4 sub-operations.
// 1) transpose upper-left quad
// 2) transpose lower-right quad
// 3) transpose lower-left quad
// 4) transpose upper-right quad
// mm0 = 1st row [ A B C D ] row1
// mm1 = 2nd row [ E F G H ] 2
// mm2 = 3rd row [ I J K L ] 3
// mm3 = 4th row [ M N O P ] 4
// 1) transpose upper-left quad
lea OUT, dword ptr [qwTemp];
movq mm0, qword ptr [OUT + ROW_STRIDE * 0 ]
movq mm1, qword ptr [OUT + ROW_STRIDE * 1 ]
movq mm4, mm0; // mm4 = copy of row1[A B C D]
movq mm2, qword ptr [OUT + ROW_STRIDE * 2 ]
punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
movq mm3, qword ptr [OUT + ROW_STRIDE * 3]
punpckhwd mm4, mm1; // mm4 = [ 2 6 3 7]
movq mm6, mm2;
punpcklwd mm2, mm3; // mm2 = [ 8 12 9 13]
punpckhwd mm6, mm3; // mm6 = 10 14 11 15]
movq mm1, mm0; // mm1 = [ 0 4 1 5]
mov INP, dword ptr [blk]; // load input address
punpckldq mm0, mm2; // final result mm0 = row1 [0 4 8 12]
movq mm3, mm4; // mm3 = [ 2 6 3 7]
punpckhdq mm1, mm2; // mm1 = final result mm1 = row2 [1 5 9 13]
movq qword ptr [ INP + ROW_STRIDE * 0 ], mm0; // store row 1
punpckldq mm4, mm6; // final result mm4 = row3 [2 6 10 14]
// begin reading next quadrant (lower-right)
movq mm0, qword ptr [OUT + ROW_STRIDE*4 + 8];
punpckhdq mm3, mm6; // final result mm3 = row4 [3 7 11 15]
movq qword ptr [ INP +ROW_STRIDE * 2], mm4; // store row 3
movq mm4, mm0; // mm4 = copy of row1[A B C D]
movq qword ptr [ INP +ROW_STRIDE * 1], mm1; // store row 2
movq mm1, qword ptr [OUT + ROW_STRIDE*5 + 8]
movq qword ptr [ INP +ROW_STRIDE * 3], mm3; // store row 4
punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
// 2) transpose lower-right quadrant
// movq mm0, qword ptr [OUT + ROW_STRIDE*4 + 8]
// movq mm1, qword ptr [OUT + ROW_STRIDE*5 + 8]
// movq mm4, mm0; // mm4 = copy of row1[A B C D]
movq mm2, qword ptr [OUT + ROW_STRIDE*6 + 8]
// punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
punpckhwd mm4, mm1; // mm4 = [ 2 6 3 7]
movq mm3, qword ptr [OUT + ROW_STRIDE*7 + 8]
movq mm6, mm2;
punpcklwd mm2, mm3; // mm2 = [ 8 12 9 13]
movq mm1, mm0; // mm1 = [ 0 4 1 5]
punpckhwd mm6, mm3; // mm6 = 10 14 11 15]
movq mm3, mm4; // mm3 = [ 2 6 3 7]
punpckldq mm0, mm2; // final result mm0 = row1 [0 4 8 12]
punpckhdq mm1, mm2; // mm1 = final result mm1 = row2 [1 5 9 13]
; // slot
movq qword ptr [ INP + ROW_STRIDE*4 + 8], mm0; // store row 1
punpckldq mm4, mm6; // final result mm4 = row3 [2 6 10 14]
movq mm0, qword ptr [OUT + ROW_STRIDE * 4 ]
punpckhdq mm3, mm6; // final result mm3 = row4 [3 7 11 15]
movq qword ptr [ INP +ROW_STRIDE*6 + 8], mm4; // store row 3
movq mm4, mm0; // mm4 = copy of row1[A B C D]
movq qword ptr [ INP +ROW_STRIDE*5 + 8], mm1; // store row 2
; // slot
movq mm1, qword ptr [OUT + ROW_STRIDE * 5 ]
; // slot
movq qword ptr [ INP +ROW_STRIDE*7 + 8], mm3; // store row 4
punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
// 3) transpose lower-left
// movq mm0, qword ptr [OUT + ROW_STRIDE * 4 ]
// movq mm1, qword ptr [OUT + ROW_STRIDE * 5 ]
// movq mm4, mm0; // mm4 = copy of row1[A B C D]
movq mm2, qword ptr [OUT + ROW_STRIDE * 6 ]
// punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
punpckhwd mm4, mm1; // mm4 = [ 2 6 3 7]
movq mm3, qword ptr [OUT + ROW_STRIDE * 7 ]
movq mm6, mm2;
punpcklwd mm2, mm3; // mm2 = [ 8 12 9 13]
movq mm1, mm0; // mm1 = [ 0 4 1 5]
punpckhwd mm6, mm3; // mm6 = 10 14 11 15]
movq mm3, mm4; // mm3 = [ 2 6 3 7]
punpckldq mm0, mm2; // final result mm0 = row1 [0 4 8 12]
punpckhdq mm1, mm2; // mm1 = final result mm1 = row2 [1 5 9 13]
;//slot
movq qword ptr [ INP + ROW_STRIDE * 0 + 8 ], mm0; // store row 1
punpckldq mm4, mm6; // final result mm4 = row3 [2 6 10 14]
// begin reading next quadrant (upper-right)
movq mm0, qword ptr [OUT + ROW_STRIDE*0 + 8];
punpckhdq mm3, mm6; // final result mm3 = row4 [3 7 11 15]
movq qword ptr [ INP +ROW_STRIDE * 2 + 8], mm4; // store row 3
movq mm4, mm0; // mm4 = copy of row1[A B C D]
movq qword ptr [ INP +ROW_STRIDE * 1 + 8 ], mm1; // store row 2
movq mm1, qword ptr [OUT + ROW_STRIDE*1 + 8]
movq qword ptr [ INP +ROW_STRIDE * 3 + 8], mm3; // store row 4
punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
// 4) transpose upper-right quadrant
// movq mm0, qword ptr [OUT + ROW_STRIDE*0 + 8]
// movq mm1, qword ptr [OUT + ROW_STRIDE*1 + 8]
// movq mm4, mm0; // mm4 = copy of row1[A B C D]
movq mm2, qword ptr [OUT + ROW_STRIDE*2 + 8]
// punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
punpckhwd mm4, mm1; // mm4 = [ 2 6 3 7]
movq mm3, qword ptr [OUT + ROW_STRIDE*3 + 8]
movq mm6, mm2;
punpcklwd mm2, mm3; // mm2 = [ 8 12 9 13]
movq mm1, mm0; // mm1 = [ 0 4 1 5]
punpckhwd mm6, mm3; // mm6 = 10 14 11 15]
movq mm3, mm4; // mm3 = [ 2 6 3 7]
punpckldq mm0, mm2; // final result mm0 = row1 [0 4 8 12]
punpckhdq mm1, mm2; // mm1 = final result mm1 = row2 [1 5 9 13]
; // slot
movq qword ptr [ INP + ROW_STRIDE*4 ], mm0; // store row 1
punpckldq mm4, mm6; // final result mm4 = row3 [2 6 10 14]
movq qword ptr [ INP +ROW_STRIDE*5 ], mm1; // store row 2
punpckhdq mm3, mm6; // final result mm3 = row4 [3 7 11 15]
movq qword ptr [ INP +ROW_STRIDE*6 ], mm4; // store row 3
; // slot
movq qword ptr [ INP +ROW_STRIDE*7 ], mm3; // store row 4
; // slot
} // end __asm
}
static void
idct_mmx32_rows( short *blk ) // transform all 8 rows of 8x8 iDCT block
{
// this subroutine performs two operations
// 1) iDCT row transform
// for( i = 0; i < 8; ++ i)
// DCT_8_INV_ROW_1( blk[i*8], qwTemp[i] );
//
// 2) transpose the matrix (which was stored in qwTemp[])
// qwTemp[] -> [8x8 matrix transpose] -> blk[]
__asm {
//;------------------------------------------------------
//DCT_8_INV_ROW_1 MACRO INP:REQ, OUT:REQ, TABLE:REQ
mov INP, dword ptr [blk]; ;// row 0
mov edi, 0x00; //x = 0
lea TABLE, dword ptr [tab_i_01234567]; // row 0
// mov OUT, INP; // algorithm writes data in-place -> row 0
lea OUT, dword ptr [qwTemp];
lea round_inv_row, dword ptr [r_inv_row]
// for ( x = 0; x < 8; ++x ) // transform one row per iteration
lpa:
movq mm0, qword ptr [INP] ; // 0 ; x3 x2 x1 x0
movq mm1, qword ptr [INP+8] ; // 1 ; x7 x6 x5 x4
movq mm2, mm0 ; // 2 ; x3 x2 x1 x0
movq mm3, qword ptr [TABLE] ; // 3 ; w06 w04 w02 w00
punpcklwd mm0, mm1 ; // x5 x1 x4 x0
// ----------
movq mm5, mm0 ; // 5 ; x5 x1 x4 x0
punpckldq mm0, mm0 ; // x4 x0 x4 x0
movq mm4, qword ptr [TABLE+8] ; // 4 ; w07 w05 w03 w01
punpckhwd mm2, mm1 ; // 1 ; x7 x3 x6 x2
pmaddwd mm3, mm0 ; // x4*w06+x0*w04 x4*w02+x0*w00
movq mm6, mm2 ; // 6 ; x7 x3 x6 x2
movq mm1, qword ptr [TABLE+32] ;// 1 ; w22 w20 w18 w16
punpckldq mm2, mm2 ; // x6 x2 x6 x2
pmaddwd mm4, mm2 ; // x6*w07+x2*w05 x6*w03+x2*w01
punpckhdq mm5, mm5 ; // x5 x1 x5 x1
pmaddwd mm0, qword ptr [TABLE+16] ;// x4*w14+x0*w12 x4*w10+x0*w08
punpckhdq mm6, mm6 ; // x7 x3 x7 x3
movq mm7, qword ptr [TABLE+40] ;// 7 ; w23 w21 w19 w17
pmaddwd mm1, mm5 ; // x5*w22+x1*w20 x5*w18+x1*w16
paddd mm3, qword ptr [round_inv_row];// +rounder
pmaddwd mm7, mm6 ; // x7*w23+x3*w21 x7*w19+x3*w17
pmaddwd mm2, qword ptr [TABLE+24] ;// x6*w15+x2*w13 x6*w11+x2*w09
paddd mm3, mm4 ; // 4 ; a1=sum(even1) a0=sum(even0)
pmaddwd mm5, qword ptr [TABLE+48] ;// x5*w30+x1*w28 x5*w26+x1*w24
movq mm4, mm3 ; // 4 ; a1 a0
pmaddwd mm6, qword ptr [TABLE+56] ;// x7*w31+x3*w29 x7*w27+x3*w25
paddd mm1, mm7 ; // 7 ; b1=sum(odd1) b0=sum(odd0)
paddd mm0, qword ptr [round_inv_row];// +rounder
psubd mm3, mm1 ; // a1-b1 a0-b0
psrad mm3, SHIFT_INV_ROW ; // y6=a1-b1 y7=a0-b0
paddd mm1, mm4 ; // 4 ; a1+b1 a0+b0
paddd mm0, mm2 ; // 2 ; a3=sum(even3) a2=sum(even2)
psrad mm1, SHIFT_INV_ROW ; // y1=a1+b1 y0=a0+b0
paddd mm5, mm6 ; // 6 ; b3=sum(odd3) b2=sum(odd2)
movq mm4, mm0 ; // 4 ; a3 a2
paddd mm0, mm5 ; // a3+b3 a2+b2
psubd mm4, mm5 ; // 5 ; a3-b3 a2-b2
add INP, 16; // increment INPUT pointer -> row 1
psrad mm4, SHIFT_INV_ROW ; // y4=a3-b3 y5=a2-b2
// add TABLE, 0; // ptr TABLE += 64 -> row 1
psrad mm0, SHIFT_INV_ROW ; // y3=a3+b3 y2=a2+b2
// movq mm2, qword ptr [INP] ; // row+1; 0; x3 x2 x1 x0
packssdw mm4, mm3 ; // 3 ; y6 y7 y4 y5
packssdw mm1, mm0 ; // 0 ; y3 y2 y1 y0
movq mm7, mm4 ; // 7 ; y6 y7 y4 y5
// movq mm0, mm2 ; // row+1; 2 ; x3 x2 x1 x0
psrld mm4, 16 ; // 0 y6 0 y4
movq qword ptr [OUT], mm1 ; // 1 ; save y3 y2 y1 y0
pslld mm7, 16 ; // y7 0 y5 0
// movq mm1, qword ptr [INP+8] ; // row+1; 1 ; x7 x6 x5 x4
por mm7, mm4 ; // 4 ; y7 y6 y5 y4
movq mm3, qword ptr [TABLE] ; // 3 ; w06 w04 w02 w00
// punpcklwd mm0, mm1 ; // row+1; x5 x1 x4 x0
// begin processing row 1
movq qword ptr [OUT+8], mm7 ; // 7 ; save y7 y6 y5 y4
add edi, 0x01;
add OUT, 16; // increment OUTPUT pointer -> row 1
cmp edi, 0x08;
jl lpa; // end for ( x = 0; x < 8; ++x )
// done with the iDCT row-transformation
// now we have to transpose the output 8x8 matrix
// 8x8 (OUT) -> 8x8't' (IN)
// the transposition is implemented as 4 sub-operations.
// 1) transpose upper-left quad
// 2) transpose lower-right quad
// 3) transpose lower-left quad
// 4) transpose upper-right quad
// mm0 = 1st row [ A B C D ] row1
// mm1 = 2nd row [ E F G H ] 2
// mm2 = 3rd row [ I J K L ] 3
// mm3 = 4th row [ M N O P ] 4
// 1) transpose upper-left quad
lea OUT, dword ptr [qwTemp];
movq mm0, qword ptr [OUT + ROW_STRIDE * 0 ]
movq mm1, qword ptr [OUT + ROW_STRIDE * 1 ]
movq mm4, mm0; // mm4 = copy of row1[A B C D]
movq mm2, qword ptr [OUT + ROW_STRIDE * 2 ]
punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
movq mm3, qword ptr [OUT + ROW_STRIDE * 3]
punpckhwd mm4, mm1; // mm4 = [ 2 6 3 7]
movq mm6, mm2;
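// [code-viewer page footer, not part of the source] Keyboard shortcuts:
//   Copy code: Ctrl + C
//   Search code: Ctrl + F
//   Full-screen mode: F11
//   Increase font size: Ctrl + =
//   Decrease font size: Ctrl + -
//   Show shortcuts: ?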