📄 idct_ap528mmx.cpp
字号:
pmulhw mm5, qword ptr [ecx+8*3] ; V3
psllw mm6,1 ; t146=t152
; note that V15 computation has a correction step:
; this is a 'magic' constant that rebiases the results to be closer to the expected result
; this magic constant can be refined to reduce the error even more
; by doing the correction step in a later stage when the number is actually multiplied by 16
paddw mm4, qword ptr x0005000200010001
paddsw mm0, mm6 ; V63
psraw mm0, 1 ; t154=t156
psubsw mm3, mm6 ; V60 ; free mm6
movq mm1, mm3 ; duplicate V60
movq mm6, mm5 ; duplicate V3
pmulhw mm1, qword ptr x539f539f539f539f ; V67
psraw mm4, 2 ; t148=t150
paddsw mm5, mm4 ; V61
psubsw mm6, mm4 ; V62 ; free mm4
movq mm4, mm5 ; duplicate V61
paddsw mm5, mm0 ; V65 -> result
psllw mm1, 1 ; t169
psubsw mm4, mm0 ; V64 ; free mm0
pmulhw mm4, qword ptr x5a825a825a825a82 ; V68
psraw mm3, 1 ; t158
psubsw mm3, mm6 ; V66
movq mm2, mm5 ; duplicate V65
pmulhw mm3, qword ptr x61f861f861f861f8 ; V70
psllw mm6, 1 ; t165
pmulhw mm6, qword ptr x4546454645464546 ; V69
psraw mm2, 1 ; t172
;moved from next block
movq mm0, qword ptr [esi+8*5] ; V56
psllw mm4, 1 ; t174
;moved from next block
psraw mm0, 1 ; t177=t188
nop ; slot
psubsw mm6, mm3 ; V72
psubsw mm3, mm1 ; V71 ; free mm1
psubsw mm6, mm2 ; V73 ; free mm2
;moved from next block
psraw mm5, 1 ; t178=t189
psubsw mm4, mm6 ; V74
;moved from next block
movq mm1, mm0 ; duplicate t177=t188
paddsw mm3, mm4 ; V75
;moved from next block
paddsw mm0, mm5 ; tm1
;location
; 5 - V56
; 13 - V57
; 9 - V58
; X - V59, mm7
; X - V65, mm5
; X - V73, mm6
; X - V74, mm4
; X - V75, mm3
; free mm0, mm1 & mm2
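;moved above (note: the live copies above shift with psraw, not the psllw shown in the commented-out lines below)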
;movq mm0, qword ptr [esi+8*5] ; V56
;psllw mm0, 1 ; t177=t188 ! new !!
;psllw mm5, 1 ; t178=t189 ! new !!
;movq mm1, mm0 ; duplicate t177=t188
;paddsw mm0, mm5 ; tm1
movq mm2, qword ptr [esi+8*13] ; V57
psubsw mm1, mm5 ; tm15; free mm5
movq qword ptr [esi+8*1], mm0 ; tm1; free mm0
psraw mm7, 1 ; t182=t184 ! new !!
;save the store as used directly in the transpose
;movq qword ptr [esi+8*15], mm1 ; tm15; free mm1
movq mm5, mm7 ; duplicate t182=t184
psubsw mm7, mm3 ; tm7
paddsw mm5, mm3 ; tm9; free mm3
;slot
movq mm0, qword ptr [esi+8*9] ; V58
movq mm3, mm2 ; duplicate V57
movq qword ptr [esi+8*7], mm7 ; tm7; free mm7
psubsw mm3, mm6 ; tm13
paddsw mm2, mm6 ; tm3 ; free mm6
; moved up from the transpose
movq mm7, mm3
; moved up from the transpose
punpcklwd mm3, mm1
movq mm6, mm0 ; duplicate V58
movq qword ptr [esi+8*3], mm2 ; tm3; free mm2
paddsw mm0, mm4 ; tm5
psubsw mm6, mm4 ; tm11; free mm4
; moved up from the transpose
punpckhwd mm7, mm1
movq qword ptr [esi+8*5], mm0 ; tm5; free mm0
; moved up from the transpose
movq mm2, mm5
; transpose - M4 part
; --------- ---------
; | M1 | M2 | | M1'| M3'|
; --------- --> ---------
; | M3 | M4 | | M2'| M4'|
; --------- ---------
; Two alternatives: use full qword ptr approach so the following code can be
; scheduled before the transpose is done without stores, or use the faster
; half qword ptr stores (when possible)
movd dword ptr [esi+8*9+4], mm3 ; MS part of tmt9
punpcklwd mm5, mm6
movd dword ptr [esi+8*13+4], mm7 ; MS part of tmt13
punpckhwd mm2, mm6
movd dword ptr [esi+8*9], mm5 ; LS part of tmt9
punpckhdq mm5, mm3 ; free mm3
movd dword ptr [esi+8*13], mm2 ; LS part of tmt13
punpckhdq mm2, mm7 ; free mm7
; moved up from the M3 transpose
movq mm0, qword ptr [esi+8*8]
;slot
; moved up from the M3 transpose
movq mm1, qword ptr [esi+8*10]
; moved up from the M3 transpose
movq mm3, mm0
; shuffle the rest of the data, and write it with 2 qword ptr writes
movq qword ptr [esi+8*11], mm5 ; tmt11
; moved up from the M3 transpose
punpcklwd mm0, mm1
movq qword ptr [esi+8*15], mm2 ; tmt15
; moved up from the M3 transpose
punpckhwd mm3, mm1
; transpose - M3 part
; moved up to previous code section
;movq mm0, qword ptr [esi+8*8]
;movq mm1, qword ptr [esi+8*10]
;movq mm3, mm0
;punpcklwd mm0, mm1
;punpckhwd mm3, mm1
movq mm6, qword ptr [esi+8*12]
;slot
movq mm4, qword ptr [esi+8*14]
movq mm2, mm6
; shuffle the data and write out the lower parts of the transposed in 4 dwords
punpcklwd mm6, mm4
movq mm1, mm0
punpckhdq mm1, mm6
movq mm7, mm3
punpckhwd mm2, mm4 ; free mm4
;slot
punpckldq mm0, mm6 ; free mm6
;slot
;moved from next block
movq mm4, qword ptr [esi+8*13] ; tmt13
punpckldq mm3, mm2
punpckhdq mm7, mm2 ; free mm2
;moved from next block
movq mm5, mm3 ; duplicate tmt5
; column 1: even part (after transpose)
;moved above
;movq mm5, mm3 ; duplicate tmt5
;movq mm4, qword ptr [esi+8*13] ; tmt13
psubsw mm3, mm4 ; V134
;slot
pmulhw mm3, qword ptr x5a825a825a825a82 ; 23170 ->V136
;slot
movq mm6, qword ptr [esi+8*9] ; tmt9
paddsw mm5, mm4 ; V135 ; mm4 free
movq mm4, mm0 ; duplicate tmt1
paddsw mm0, mm6 ; V137
psubsw mm4, mm6 ; V138 ; mm6 free
psllw mm3, 2 ; t290
psubsw mm3, mm5 ; V139
movq mm6, mm0 ; duplicate V137
paddsw mm0, mm5 ; V140
movq mm2, mm4 ; duplicate V138
paddsw mm2, mm3 ; V141
psubsw mm4, mm3 ; V142 ; mm3 free
movq qword ptr [esi+8*9], mm0 ; V140
psubsw mm6, mm5 ; V143 ; mm5 free
;moved from next block
movq mm0, qword ptr[esi+8*11] ; tmt11
;slot
movq qword ptr [esi+8*13], mm2 ; V141
;moved from next block
movq mm2, mm0 ; duplicate tmt11
; column 1: odd part (after transpose)
;moved up to the prev block
;movq mm0, qword ptr[esi+8*11] ; tmt11
;movq mm2, mm0 ; duplicate tmt11
movq mm5, qword ptr[esi+8*15] ; tmt15
psubsw mm0, mm7 ; V144
movq mm3, mm0 ; duplicate V144
paddsw mm2, mm7 ; V147 ; free mm7
pmulhw mm0, qword ptr x539f539f539f539f ; 21407-> V151
movq mm7, mm1 ; duplicate tmt3
paddsw mm7, mm5 ; V145
psubsw mm1, mm5 ; V146 ; free mm5
psubsw mm3, mm1 ; V150
movq mm5, mm7 ; duplicate V145
pmulhw mm1, qword ptr x4546454645464546 ; 17734-> V153
psubsw mm5, mm2 ; V148
pmulhw mm3, qword ptr x61f861f861f861f8 ; 25080-> V154
psllw mm0, 2 ; t311
pmulhw mm5, qword ptr x5a825a825a825a82 ; 23170-> V152
paddsw mm7, mm2 ; V149 ; free mm2
psllw mm1, 1 ; t313
nop ; slot
;without the nop above - freeze here for one clock
;the nop cleans the mess a little bit
movq mm2, mm3 ; duplicate V154
psubsw mm3, mm0 ; V155 ; free mm0
psubsw mm1, mm2 ; V156 ; free mm2
;moved from the next block
movq mm2, mm6 ; duplicate V143
;moved from the next block
movq mm0, qword ptr[esi+8*13] ; V141
psllw mm1, 1 ; t315
psubsw mm1, mm7 ; V157 (keep V149)
psllw mm5, 2 ; t317
psubsw mm5, mm1 ; V158
psllw mm3, 1 ; t319
paddsw mm3, mm5 ; V159
;slot
; column 1: output butterfly (after transform)
;moved to the prev block
;movq mm2, mm6 ; duplicate V143
;movq mm0, qword ptr[esi+8*13] ; V141
psubsw mm2, mm3 ; V163
paddsw mm6, mm3 ; V164 ; free mm3
movq mm3, mm4 ; duplicate V142
psubsw mm4, mm5 ; V165 ; free mm5
movq qword ptr scratch7, mm2 ; out7
psraw mm6, 4
psraw mm4, 4
paddsw mm3, mm5 ; V162
movq mm2, qword ptr[esi+8*9] ; V140
movq mm5, mm0 ; duplicate V141
;in order not to percolate this line up, we read [esi+8*9] very near to this location
movq qword ptr [esi+8*9], mm6 ; out9
paddsw mm0, mm1 ; V161
movq qword ptr scratch5, mm3 ; out5
psubsw mm5, mm1 ; V166 ; free mm1
movq qword ptr[esi+8*11], mm4 ; out11
psraw mm5, 4
movq qword ptr scratch3, mm0 ; out3
movq mm4, mm2 ; duplicate V140
movq qword ptr[esi+8*13], mm5 ; out13
paddsw mm2, mm7 ; V160
;moved from the next block
movq mm0, qword ptr [esi+8*1]
psubsw mm4, mm7 ; V167 ; free mm7
;moved from the next block
movq mm7, qword ptr [esi+8*3]
psraw mm4, 4
movq qword ptr scratch1, mm2 ; out1
;moved from the next block
movq mm1, mm0
movq qword ptr[esi+8*15], mm4 ; out15
;moved from the next block
punpcklwd mm0, mm7
; transpose - M2 parts
;moved up to the prev block
;movq mm0, qword ptr [esi+8*1]
;movq mm7, qword ptr [esi+8*3]
;movq mm1, mm0
;punpcklwd mm0, mm7
movq mm5, qword ptr [esi+8*5]
punpckhwd mm1, mm7
movq mm4, qword ptr [esi+8*7]
movq mm3, mm5
; shuffle the data and write out the lower parts of the transposed in 4 dwords
movd dword ptr [esi+8*8], mm0 ; LS part of tmt8
punpcklwd mm5, mm4
movd dword ptr [esi+8*12], mm1 ; LS part of tmt12
punpckhwd mm3, mm4
movd dword ptr [esi+8*8+4], mm5 ; MS part of tmt8
punpckhdq mm0, mm5 ; tmt10
movd dword ptr [esi+8*12+4], mm3 ; MS part of tmt12
punpckhdq mm1, mm3 ; tmt14
; transpose - M1 parts
movq mm7, qword ptr [esi]
;slot
movq mm2, qword ptr [esi+8*2]
movq mm6, mm7
movq mm5, qword ptr [esi+8*4]
punpcklwd mm7, mm2
movq mm4, qword ptr [esi+8*6]
punpckhwd mm6, mm2 ; free mm2
movq mm3, mm5
punpcklwd mm5, mm4
punpckhwd mm3, mm4 ; free mm4
movq mm2, mm7
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -