📄 idct_ap528mmx.cpp
字号:
movq mm4, mm6
punpckldq mm7, mm5 ; tmt0
punpckhdq mm2, mm5 ; tmt2 ; free mm5
;slot
; shuffle the rest of the data, and write it with 2 qword ptr writes
punpckldq mm6, mm3 ; tmt4
;moved up from the next block
movq mm5, mm2 ; duplicate tmt2
punpckhdq mm4, mm3 ; tmt6 ; free mm3
;moved up from the next block
movq mm3, mm0 ; duplicate tmt10
; column 0: odd part (after transpose)
;moved up to prev block
;movq mm3, mm0 ; duplicate tmt10
;movq mm5, mm2 ; duplicate tmt2
psubsw mm0, mm4 ; V110
paddsw mm3, mm4 ; V113 ; free mm4
movq mm4, mm0 ; duplicate V110
paddsw mm2, mm1 ; V111
pmulhw mm0, qword ptr x539f539f539f539f ; 21407-> V117
psubsw mm5, mm1 ; V112 ; free mm1
psubsw mm4, mm5 ; V116
movq mm1, mm2 ; duplicate V111
pmulhw mm5, qword ptr x4546454645464546 ; 17734-> V119
psubsw mm2, mm3 ; V114
pmulhw mm4, qword ptr x61f861f861f861f8 ; 25080-> V120
paddsw mm1, mm3 ; V115 ; free mm3
pmulhw mm2, qword ptr x5a825a825a825a82 ; 23170-> V118
psllw mm0, 2 ; t266
movq qword ptr[esi+8*0], mm1 ; save V115
psllw mm5, 1 ; t268
psubsw mm5, mm4 ; V122
psubsw mm4, mm0 ; V121 ; free mm0
psllw mm5, 1 ; t270
;slot
psubsw mm5, mm1 ; V123 ; free mm1
psllw mm2, 2 ; t272
psubsw mm2, mm5 ; V124 (keep V123)
psllw mm4, 1 ; t274
movq qword ptr[esi+8*2], mm5 ; save V123 ; free mm5
paddsw mm4, mm2 ; V125 (keep V124)
; column 0: even part (after transpose)
movq mm0, qword ptr[esi+8*12] ; tmt12
movq mm3, mm6 ; duplicate tmt4
psubsw mm6, mm0 ; V100
paddsw mm3, mm0 ; V101 ; free mm0
pmulhw mm6, qword ptr x5a825a825a825a82 ; 23170 ->V102
movq mm5, mm7 ; duplicate tmt0
movq mm1, qword ptr[esi+8*8] ; tmt8
;slot
paddsw mm7, mm1 ; V103
psubsw mm5, mm1 ; V104 ; free mm1
movq mm0, mm7 ; duplicate V103
psllw mm6, 2 ; t245
paddsw mm7, mm3 ; V106
movq mm1, mm5 ; duplicate V104
psubsw mm6, mm3 ; V105
psubsw mm0, mm3 ; V109; free mm3
paddsw mm5, mm6 ; V107
psubsw mm1, mm6 ; V108 ; free mm6
; column 0: output butterfly (after transform)
movq mm3, mm1 ; duplicate V108
paddsw mm1, mm2 ; out4
psraw mm1, 4
psubsw mm3, mm2 ; out10 ; free mm2
psraw mm3, 4
movq mm6, mm0 ; duplicate V109
movq qword ptr[esi+8*4], mm1 ; out4 ; free mm1
psubsw mm0, mm4 ; out6
movq qword ptr[esi+8*10], mm3 ; out10 ; free mm3
psraw mm0, 4
paddsw mm6, mm4 ; out8 ; free mm4
movq mm1, mm7 ; duplicate V106
movq qword ptr[esi+8*6], mm0 ; out6 ; free mm0
psraw mm6, 4
movq mm4, qword ptr[esi+8*0] ; V115
;slot
movq qword ptr[esi+8*8], mm6 ; out8 ; free mm6
movq mm2, mm5 ; duplicate V107
movq mm3, qword ptr[esi+8*2] ; V123
paddsw mm7, mm4 ; out0
;moved up from next block
movq mm0, qword ptr scratch3
psraw mm7, 4
;moved up from next block
movq mm6, qword ptr scratch5
psubsw mm1, mm4 ; out14 ; free mm4
paddsw mm5, mm3 ; out2
psraw mm1, 4
movq qword ptr[esi], mm7 ; out0 ; free mm7
psraw mm5, 4
movq qword ptr[esi+8*14], mm1 ; out14 ; free mm1
psubsw mm2, mm3 ; out12 ; free mm3
movq qword ptr[esi+8*2], mm5 ; out2 ; free mm5
psraw mm2, 4
;moved up to the prev block
movq mm4, qword ptr scratch7
;moved up to the prev block
psraw mm0, 4
movq qword ptr[esi+8*12], mm2 ; out12 ; free mm2
;moved up to the prev block
psraw mm6, 4
;move back the data to its correct place
;moved up to the prev block
;movq mm0, qword ptr scratch3
;movq mm6, qword ptr scratch5
;movq mm4, qword ptr scratch7
;psraw mm0, 4
;psraw mm6, 4
movq mm1, qword ptr scratch1
psraw mm4, 4
movq qword ptr [esi+8*3], mm0 ; out3
psraw mm1, 4
movq qword ptr [esi+8*5], mm6 ; out5
;slot
movq qword ptr [esi+8*7], mm4 ; out7
;slot
movq qword ptr [esi+8*1], mm1 ; out1
;slot
// done with the iDCT column-transformation
// now we have to transpose the output 8x8 matrix
// 8x8 (OUT) -> 8x8't' (IN)
// the transposition is implemented as 5 sub-operations.
// 1) transpose upper-left quad (in-place)
// 2) transpose lower-right quad (in-place)
// 3) transpose lower-left quad (result kept in scratch1/3/5/7)
// 4) transpose upper-right quad (result stored into the lower-left quad)
// 5) copy scratch1/3/5/7 into the upper-right quad
// mm0 = 1st row [ A B C D ] row1
// mm1 = 2nd row [ E F G H ] 2
// mm2 = 3rd row [ I J K L ] 3
// mm3 = 4th row [ M N O P ] 4
// 1) transpose upper-left quad
movq mm0, qword ptr [esi + ROW_STRIDE * 0 ]
movq mm1, qword ptr [esi + ROW_STRIDE * 1 ]
movq mm4, mm0; // mm4 = copy of row1[A B C D]
movq mm2, qword ptr [esi + ROW_STRIDE * 2 ]
punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
movq mm3, qword ptr [esi + ROW_STRIDE * 3]
punpckhwd mm4, mm1; // mm4 = [ 2 6 3 7]
movq mm6, mm2;
punpcklwd mm2, mm3; // mm2 = [ 8 12 9 13]
punpckhwd mm6, mm3; // mm6 = 10 14 11 15]
movq mm1, mm0; // mm1 = [ 0 4 1 5]
punpckldq mm0, mm2; // final result mm0 = row1 [0 4 8 12]
movq mm3, mm4; // mm3 = [ 2 6 3 7]
punpckhdq mm1, mm2; // mm1 = final result mm1 = row2 [1 5 9 13]
movq qword ptr [ esi + ROW_STRIDE * 0 ], mm0; // store row 1
punpckldq mm4, mm6; // final result mm4 = row3 [2 6 10 14]
// begin reading next quadrant (lower-right)
movq mm0, qword ptr [esi + ROW_STRIDE*4 + 8];
punpckhdq mm3, mm6; // final result mm3 = row4 [3 7 11 15]
movq qword ptr [ esi +ROW_STRIDE * 2], mm4; // store row 3
movq mm4, mm0; // mm4 = copy of row1[A B C D]
movq qword ptr [ esi +ROW_STRIDE * 1], mm1; // store row 2
movq mm1, qword ptr [esi + ROW_STRIDE*5 + 8]
movq qword ptr [ esi +ROW_STRIDE * 3], mm3; // store row 4
punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
// 2) transpose lower-right quadrant
// movq mm0, qword ptr [OUT + ROW_STRIDE*4 + 8]
// movq mm1, qword ptr [OUT + ROW_STRIDE*5 + 8]
// movq mm4, mm0; // mm4 = copy of row1[A B C D]
movq mm2, qword ptr [esi + ROW_STRIDE*6 + 8]
// punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
punpckhwd mm4, mm1; // mm4 = [ 2 6 3 7]
movq mm3, qword ptr [esi + ROW_STRIDE*7 + 8]
movq mm6, mm2;
punpcklwd mm2, mm3; // mm2 = [ 8 12 9 13]
movq mm1, mm0; // mm1 = [ 0 4 1 5]
punpckhwd mm6, mm3; // mm6 = 10 14 11 15]
movq mm3, mm4; // mm3 = [ 2 6 3 7]
punpckldq mm0, mm2; // final result mm0 = row1 [0 4 8 12]
punpckhdq mm1, mm2; // mm1 = final result mm1 = row2 [1 5 9 13]
; // slot
movq qword ptr [ esi + ROW_STRIDE*4 + 8], mm0; // store row 1
punpckldq mm4, mm6; // final result mm4 = row3 [2 6 10 14]
movq mm0, qword ptr [esi + ROW_STRIDE * 4 ]
punpckhdq mm3, mm6; // final result mm3 = row4 [3 7 11 15]
movq qword ptr [ esi +ROW_STRIDE*6 + 8], mm4; // store row 3
movq mm4, mm0; // mm4 = copy of row1[A B C D]
movq qword ptr [ esi +ROW_STRIDE*5 + 8], mm1; // store row 2
; // slot
movq mm1, qword ptr [esi + ROW_STRIDE * 5 ]
; // slot
movq qword ptr [ esi +ROW_STRIDE*7 + 8], mm3; // store row 4
punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
// 3) transpose lower-left quadrant
// this segment stores its output in the scratch qwords (scratch1/3/5/7), not back into the block
// movq mm0, qword ptr [OUT + ROW_STRIDE * 4 ]
// movq mm1, qword ptr [OUT + ROW_STRIDE * 5 ]
// movq mm4, mm0; // mm4 = copy of row1[A B C D]
movq mm2, qword ptr [esi + ROW_STRIDE * 6 ]
// punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
punpckhwd mm4, mm1; // mm4 = [ 2 6 3 7]
movq mm3, qword ptr [esi + ROW_STRIDE * 7 ]
movq mm6, mm2;
punpcklwd mm2, mm3; // mm2 = [ 8 12 9 13]
movq mm1, mm0; // mm1 = [ 0 4 1 5]
punpckhwd mm6, mm3; // mm6 = 10 14 11 15]
movq mm3, mm4; // mm3 = [ 2 6 3 7]
punpckldq mm0, mm2; // final result mm0 = row1 [0 4 8 12]
punpckhdq mm1, mm2; // mm1 = final result mm1 = row2 [1 5 9 13]
;//slot
movq qword ptr [ scratch1 ], mm0; // store row 1
punpckldq mm4, mm6; // final result mm4 = row3 [2 6 10 14]
// begin reading next quadrant (upper-right)
movq mm0, qword ptr [esi+ ROW_STRIDE*0 + 8];
punpckhdq mm3, mm6; // final result mm3 = row4 [3 7 11 15]
movq qword ptr [ scratch5 ], mm4; // store row 3
movq mm4, mm0; // mm4 = copy of row1[A B C D]
movq qword ptr [ scratch3 ], mm1; // store row 2
movq mm1, qword ptr [esi + ROW_STRIDE*1 + 8]
movq qword ptr [ scratch7 ], mm3; // store row 4
punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
// 4) transpose upper-right quadrant (result is stored into the lower-left quadrant)
// movq mm0, qword ptr [OUT + ROW_STRIDE*0 + 8]
// movq mm1, qword ptr [OUT + ROW_STRIDE*1 + 8]
// movq mm4, mm0; // mm4 = copy of row1[A B C D]
movq mm2, qword ptr [esi + ROW_STRIDE*2 + 8]
// punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
punpckhwd mm4, mm1; // mm4 = [ 2 6 3 7]
movq mm3, qword ptr [esi + ROW_STRIDE*3 + 8]
movq mm6, mm2;
punpcklwd mm2, mm3; // mm2 = [ 8 12 9 13]
movq mm1, mm0; // mm1 = [ 0 4 1 5]
punpckhwd mm6, mm3; // mm6 = 10 14 11 15]
movq mm3, mm4; // mm3 = [ 2 6 3 7]
movq mm7, qword ptr [ scratch1 ]; // load row1[A B C D]
punpckldq mm0, mm2; // final result mm0 = row1q [0 4 8 12]
movq mm5, qword ptr [ scratch3 ]; // load row2q [A B C D]
punpckhdq mm1, mm2; // mm1 = final result mm1 = row2 [1 5 9 13]
movq qword ptr [ esi + ROW_STRIDE*4 ], mm0; // store row 1
punpckldq mm4, mm6; // final result mm4 = row3 [2 6 10 14]
movq qword ptr [ esi +ROW_STRIDE*5 ], mm1; // store row 2
punpckhdq mm3, mm6; // final result mm3 = row4 [3 7 11 15]
movq qword ptr [ esi +ROW_STRIDE*6 ], mm4; // store row 3
; // slot
movq mm0, qword ptr [ scratch5 ]; // load row 3q
; // slot
movq qword ptr [ esi +ROW_STRIDE*7 ], mm3; // store row 4
; // slot
movq mm1, qword ptr [ scratch7 ]; // load row 4q
; // slot
movq qword ptr [ esi + ROW_STRIDE*0 + 8], mm7; // store row 1q
movq qword ptr [ esi + ROW_STRIDE*1 + 8], mm5; // store row 1q
movq qword ptr [ esi + ROW_STRIDE*2 + 8], mm0; // store row 1q
movq qword ptr [ esi + ROW_STRIDE*3 + 8], mm1; // store row 1q
// done with transpose operation
// emms
} // end __asm
// transpose
// the above MMX-code is equivalent to the simple C-loop below
/* int i,j;
short temp;
for ( i = 1; i < 8; ++i)
for ( j = 0; j < i; ++j )
{
// swap elements matrix elements #(i,j) and #(j,i)
temp = src_result[ i + (j<<3) ];
src_result[ i + (j<<3) ] = src_result[ j + (i<<3) ];
src_result[ j + (i<<3) ] = temp;
}
*/
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -