⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 idct_ap528mmx.cpp

📁 这是一组DCT和iDCT的代码
💻 CPP
📖 第 1 页 / 共 3 页
字号:

pmulhw mm5, qword ptr [ecx+8*3]                ; V3
 psllw mm6,1                                     ; t146=t152


; note that V15 computation has a correction step:
; this is a 'magic' constant that rebiases the results to be closer to the expected result
; this magic constant can be refined to reduce the error even more
; by doing the correction step in a later stage when the number is actually multiplied by 16

paddw mm4, qword ptr x0005000200010001
paddsw mm0, mm6                                 ; V63

psraw mm0, 1                                    ; t154=t156
psubsw mm3, mm6                                 ; V60 ; free mm6

movq mm1, mm3                                   ; duplicate V60
movq mm6, mm5                                   ; duplicate V3

pmulhw mm1, qword ptr x539f539f539f539f        ; V67
 psraw mm4, 2                                    ; t148=t150

paddsw mm5, mm4                                 ; V61
psubsw mm6, mm4                                 ; V62 ; free mm4

movq mm4, mm5                                   ; duplicate V61
paddsw mm5, mm0                                 ; V65 -> result

psllw mm1, 1                                    ; t169
psubsw mm4, mm0                                 ; V64 ; free mm0

pmulhw mm4, qword ptr x5a825a825a825a82        ; V68
psraw mm3, 1                                    ; t158

psubsw mm3, mm6                                 ; V66
movq mm2, mm5                                   ; duplicate V65

pmulhw mm3, qword ptr x61f861f861f861f8        ; V70
psllw mm6, 1                                    ; t165

pmulhw mm6, qword ptr x4546454645464546        ; V69
psraw mm2, 1                                    ; t172

;moved from next block
movq mm0, qword ptr [esi+8*5]                  ; V56
psllw mm4, 1                                    ; t174

;moved from next block
psraw mm0, 1                                    ; t177=t188
nop ; slot

psubsw mm6, mm3                                 ; V72
psubsw mm3, mm1                                 ; V71 ; free mm1

psubsw mm6, mm2                                 ; V73 ; free mm2
;moved from next block
psraw mm5, 1                                    ; t178=t189

psubsw mm4, mm6                                 ; V74
;moved from next block
movq mm1, mm0                                   ; duplicate t177=t188

paddsw mm3, mm4                                 ; V75
;moved from next block
paddsw mm0, mm5                                 ; tm1

;location 
;  5 - V56
; 13 - V57
;  9 - V58
;  X - V59, mm7
;  X - V65, mm5
;  X - V73, mm6
;  X - V74, mm4
;  X - V75, mm3
; free mm0, mm1 & mm2
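;moved above -- NOTE(review): the live copies above shift with psraw, not the psllw shown in this commented-out original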
;movq mm0, qword ptr [esi+8*5]                 ; V56
;psllw mm0, 1                                   ; t177=t188 ! new !!
;psllw mm5, 1                                   ; t178=t189 ! new !!
;movq mm1, mm0                                  ; duplicate t177=t188
;paddsw mm0, mm5                                        ; tm1

movq mm2, qword ptr [esi+8*13]                 ; V57
psubsw mm1, mm5                                 ; tm15; free mm5

movq qword ptr [esi+8*1], mm0                  ; tm1; free mm0
psraw mm7, 1                                    ; t182=t184 ! new !!

;save the store as used directly in the transpose
;movq qword ptr [esi+8*15], mm1                ; tm15; free mm1
movq mm5, mm7                                   ; duplicate t182=t184
psubsw mm7, mm3                                 ; tm7

paddsw mm5, mm3                                 ; tm9; free mm3
;slot

movq mm0, qword ptr [esi+8*9]                  ; V58
movq mm3, mm2                                   ; duplicate V57

movq qword ptr [esi+8*7], mm7                  ; tm7; free mm7
psubsw mm3, mm6                                 ; tm13

paddsw mm2, mm6                                 ; tm3 ; free mm6
; moved up from the transpose 
movq mm7, mm3                  

; moved up from the transpose 
punpcklwd mm3, mm1
movq mm6, mm0                                   ; duplicate V58

movq qword ptr [esi+8*3], mm2                  ; tm3; free mm2
paddsw mm0, mm4                                 ; tm5

psubsw mm6, mm4                                 ; tm11; free mm4
; moved up from the transpose 
punpckhwd mm7, mm1

movq qword ptr [esi+8*5], mm0                  ; tm5; free mm0
; moved up from the transpose 
movq mm2, mm5 

; transpose - M4 part
;  ---------       ---------
; | M1 | M2 |     | M1'| M3'|
;  ---------  -->  ---------
; | M3 | M4 |     | M2'| M4'|
;  ---------       ---------
; Two alternatives: use full qword ptr approach so the following code can be 
; scheduled before the transpose is done without stores, or use the faster
; half qword ptr stores (when possible)

movd dword ptr [esi+8*9+4], mm3        ; MS part of tmt9
punpcklwd mm5, mm6

movd dword ptr [esi+8*13+4], mm7       ; MS part of tmt13 
punpckhwd mm2, mm6

movd dword ptr [esi+8*9], mm5          ; LS part of tmt9 
punpckhdq mm5, mm3                              ; free mm3

movd dword ptr [esi+8*13], mm2         ; LS part of tmt13
punpckhdq mm2, mm7                              ; free mm7

; moved up from the M3 transpose 
movq mm0, qword ptr [esi+8*8] 
;slot

; moved up from the M3 transpose 
movq mm1, qword ptr [esi+8*10] 
; moved up from the M3 transpose 
movq mm3, mm0 

; shuffle the rest of the data, and write it with 2 qword ptr writes
movq qword ptr [esi+8*11], mm5         ; tmt11
; moved up from the M3 transpose 
punpcklwd mm0, mm1

movq qword ptr [esi+8*15], mm2         ; tmt15
; moved up from the M3 transpose 
punpckhwd mm3, mm1

; transpose - M3 part

; moved up to previous code section
;movq mm0, qword ptr [esi+8*8] 
;movq mm1, qword ptr [esi+8*10] 
;movq mm3, mm0 
;punpcklwd mm0, mm1
;punpckhwd mm3, mm1

movq mm6, qword ptr [esi+8*12] 
;slot

movq mm4, qword ptr [esi+8*14] 
movq mm2, mm6 

; shuffle the data -- NOTE(review): the original remark about writing the lower parts of the transposed data out in 4 dwords does not match this section; no stores follow here
punpcklwd mm6, mm4
movq mm1, mm0

punpckhdq mm1, mm6
movq mm7, mm3

punpckhwd mm2, mm4                              ; free mm4
;slot

punpckldq mm0, mm6                              ; free mm6
;slot

;moved from next block
movq mm4, qword ptr [esi+8*13]                 ; tmt13
punpckldq mm3, mm2

punpckhdq mm7, mm2                              ; free mm2
;moved from next block
movq mm5, mm3                                   ; duplicate tmt5

; column 1: even part (after transpose)

;moved above
;movq mm5, mm3                                  ; duplicate tmt5
;movq mm4, qword ptr [esi+8*13]                ; tmt13

psubsw mm3, mm4                                 ; V134
;slot

pmulhw mm3, qword ptr x5a825a825a825a82        ; 23170 ->V136
;slot

movq mm6, qword ptr [esi+8*9]                  ; tmt9
paddsw mm5, mm4                                 ; V135 ; mm4 free

movq mm4, mm0                                   ; duplicate tmt1
paddsw mm0, mm6                                 ; V137

psubsw mm4, mm6                                 ; V138 ; mm6 free
psllw mm3, 2                                    ; t290
psubsw mm3, mm5                                 ; V139
movq mm6, mm0                                   ; duplicate V137

paddsw mm0, mm5                                 ; V140
movq mm2, mm4                                   ; duplicate V138

paddsw mm2, mm3                                 ; V141
psubsw mm4, mm3                                 ; V142 ; mm3 free

movq qword ptr [esi+8*9], mm0                  ; V140
psubsw mm6, mm5                                 ; V143 ; mm5 free

;moved from next block
movq mm0, qword ptr[esi+8*11]                  ; tmt11
;slot

movq qword ptr [esi+8*13], mm2                 ; V141
;moved from next block
movq mm2, mm0                                   ; duplicate tmt11

; column 1: odd part (after transpose)

;moved up to the prev block
;movq mm0, qword ptr[esi+8*11]                 ; tmt11
;movq mm2, mm0                                  ; duplicate tmt11

movq mm5, qword ptr[esi+8*15]                  ; tmt15
psubsw mm0, mm7                                 ; V144
 
movq mm3, mm0                                   ; duplicate V144
paddsw mm2, mm7                                 ; V147 ; free mm7

pmulhw mm0, qword ptr x539f539f539f539f        ; 21407-> V151
movq mm7, mm1                                   ; duplicate tmt3

paddsw mm7, mm5                                 ; V145
psubsw mm1, mm5                                 ; V146 ; free mm5

psubsw mm3, mm1                                 ; V150
movq mm5, mm7                                   ; duplicate V145

pmulhw mm1, qword ptr x4546454645464546        ; 17734-> V153
psubsw mm5, mm2                                 ; V148

pmulhw mm3, qword ptr x61f861f861f861f8        ; 25080-> V154
psllw mm0, 2                                    ; t311

pmulhw mm5, qword ptr x5a825a825a825a82        ; 23170-> V152
paddsw mm7, mm2                                 ; V149 ; free mm2

psllw mm1, 1                                    ; t313
nop ; slot

;without the nop above, execution freezes here for one clock;
;the nop cleans up the schedule a little bit
movq mm2, mm3                                   ; duplicate V154
psubsw mm3, mm0                                 ; V155 ; free mm0

psubsw mm1, mm2                                 ; V156 ; free mm2
;moved from the next block
movq mm2, mm6                                   ; duplicate V143

;moved from the next block
movq mm0, qword ptr[esi+8*13]  ; V141
psllw mm1, 1                                    ; t315

psubsw mm1, mm7                                 ; V157 (keep V149)
psllw mm5, 2                                    ; t317

psubsw mm5, mm1                                 ; V158
psllw mm3, 1                                    ; t319

paddsw mm3, mm5                                 ; V159
;slot

; column 1: output butterfly (after transform)
;moved to the prev block
;movq mm2, mm6                                  ; duplicate V143
;movq mm0, qword ptr[esi+8*13] ; V141

psubsw mm2, mm3                                 ; V163
paddsw mm6, mm3                                 ; V164 ; free mm3

movq mm3, mm4                                   ; duplicate V142
psubsw mm4, mm5                                 ; V165 ; free mm5

movq qword ptr scratch7, mm2                   ; out7
psraw mm6, 4

psraw mm4, 4
paddsw mm3, mm5                                 ; V162

movq mm2, qword ptr[esi+8*9]                   ; V140
movq mm5, mm0                                   ; duplicate V141

;in order not to percolate this line up, we read [esi+8*9] very near to this location
movq qword ptr [esi+8*9], mm6                  ; out9
paddsw mm0, mm1                                 ; V161

movq qword ptr scratch5, mm3                   ; out5
psubsw mm5, mm1                                 ; V166 ; free mm1

movq qword ptr[esi+8*11], mm4                  ; out11
psraw mm5, 4

movq qword ptr scratch3, mm0                   ; out3
movq mm4, mm2                                   ; duplicate V140

movq qword ptr[esi+8*13], mm5                  ; out13
paddsw mm2, mm7                                 ; V160

;moved from the next block
movq mm0, qword ptr [esi+8*1] 
psubsw mm4, mm7                                 ; V167 ; free mm7

;moved from the next block
movq mm7, qword ptr [esi+8*3] 
psraw mm4, 4

movq qword ptr scratch1, mm2                   ; out1
;moved from the next block
movq mm1, mm0 

movq qword ptr[esi+8*15], mm4                  ; out15
;moved from the next block
punpcklwd mm0, mm7

; transpose - M2 parts
;moved up to the prev block
;movq mm0, qword ptr [esi+8*1] 
;movq mm7, qword ptr [esi+8*3] 
;movq mm1, mm0 
;punpcklwd mm0, mm7

movq mm5, qword ptr [esi+8*5] 
punpckhwd mm1, mm7

movq mm4, qword ptr [esi+8*7] 
movq mm3, mm5 

; shuffle the data and write out the lower parts of the transposed data in 4 dwords
movd dword ptr [esi+8*8], mm0          ; LS part of tmt8
punpcklwd mm5, mm4

movd dword ptr [esi+8*12], mm1                 ; LS part of tmt12
punpckhwd mm3, mm4

movd dword ptr [esi+8*8+4], mm5                ; MS part of tmt8
punpckhdq mm0, mm5                              ; tmt10

movd dword ptr [esi+8*12+4], mm3               ; MS part of tmt12
punpckhdq mm1, mm3                              ; tmt14


; transpose - M1 parts
movq mm7, qword ptr [esi] 
;slot

movq mm2, qword ptr [esi+8*2] 
movq mm6, mm7 

movq mm5, qword ptr [esi+8*4] 
punpcklwd mm7, mm2

movq mm4, qword ptr [esi+8*6] 
punpckhwd mm6, mm2 ; free mm2

movq mm3, mm5 
punpcklwd mm5, mm4

punpckhwd mm3, mm4                              ; free mm4
movq mm2, mm7

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -