📄 idctaan.asm
字号:
; | M1 | M2 |     | M1'| M3'|
;  ---------  -->  ---------
; | M3 | M4 |     | M2'| M4'|
;  ---------       ---------
; Two alternatives: use the full mmword approach so the following code can be
; scheduled before the transpose is done without stores, or use the faster
; half mmword stores (when possible).
;
; NOTE(review): the instruction order below is deliberate. Loads and shuffles
; for the next stage are interleaved into the current one (see the "moved up" /
; "moved from next block" remarks), and "; slot" presumably marks an issue slot
; that could not be filled. Do not reorder without re-checking register
; lifetimes (the "free mmN" remarks track them).
; scratch1/3/5/7 and the x....-named multiplier constants are defined outside
; this part of the file.

        movd      dword ptr [esi+8*9+4], mm3        ; MS part of tmt9
        punpcklwd mm5, mm6

        movd      dword ptr [esi+8*13+4], mm7       ; MS part of tmt13
        punpckhwd mm2, mm6

        movd      dword ptr [esi+8*9], mm5          ; LS part of tmt9
        punpckhdq mm5, mm3                          ; free mm3

        movd      dword ptr [esi+8*13], mm2         ; LS part of tmt13
        punpckhdq mm2, mm7                          ; free mm7

        ; moved up from the M3 transpose
        movq      mm0, mmword ptr [esi+8*8]
        ; slot

        ; moved up from the M3 transpose
        movq      mm1, mmword ptr [esi+8*10]
        ; moved up from the M3 transpose
        movq      mm3, mm0

; shuffle the rest of the data, and write it with 2 mmword writes
        movq      mmword ptr [esi+8*11], mm5        ; tmt11
        ; moved up from the M3 transpose
        punpcklwd mm0, mm1

        movq      mmword ptr [esi+8*15], mm2        ; tmt15
        ; moved up from the M3 transpose
        punpckhwd mm3, mm1

; transpose - M3 part
        ; moved up to previous code section
        ; movq      mm0, mmword ptr [esi+8*8]
        ; movq      mm1, mmword ptr [esi+8*10]
        ; movq      mm3, mm0
        ; punpcklwd mm0, mm1
        ; punpckhwd mm3, mm1

        movq      mm6, mmword ptr [esi+8*12]
        ; slot

        movq      mm4, mmword ptr [esi+8*14]
        movq      mm2, mm6

; shuffle the data and write out the lower parts of the transposed in 4 dwords
        punpcklwd mm6, mm4
        movq      mm1, mm0

        punpckhdq mm1, mm6
        movq      mm7, mm3

        punpckhwd mm2, mm4                          ; free mm4
        ; slot

        punpckldq mm0, mm6                          ; free mm6
        ; slot

        ; moved from next block
        movq      mm4, mmword ptr [esi+8*13]        ; tmt13
        punpckldq mm3, mm2

        punpckhdq mm7, mm2                          ; free mm2
        ; moved from next block
        movq      mm5, mm3                          ; duplicate tmt5

; column 1: even part (after transpose)
        ; moved above
        ; movq      mm5, mm3                        ; duplicate tmt5
        ; movq      mm4, mmword ptr [esi+8*13]      ; tmt13

        psubsw    mm3, mm4                          ; V134
        ; slot

        pmulhw    mm3, mmword ptr x5a825a825a825a82 ; 23170 -> V136
        ; slot

        movq      mm6, mmword ptr [esi+8*9]         ; tmt9
        paddsw    mm5, mm4                          ; V135 ; mm4 free

        movq      mm4, mm0                          ; duplicate tmt1
        paddsw    mm0, mm6                          ; V137

        psubsw    mm4, mm6                          ; V138 ; mm6 free
        psllw     mm3, 2                            ; t290

        psubsw    mm3, mm5                          ; V139
        movq      mm6, mm0                          ; duplicate V137

        paddsw    mm0, mm5                          ; V140
        movq      mm2, mm4                          ; duplicate V138

        paddsw    mm2, mm3                          ; V141
        psubsw    mm4, mm3                          ; V142 ; mm3 free

        movq      mmword ptr [esi+8*9], mm0         ; V140
        psubsw    mm6, mm5                          ; V143 ; mm5 free

        ; moved from next block
        movq      mm0, mmword ptr [esi+8*11]        ; tmt11
        ; slot

        movq      mmword ptr [esi+8*13], mm2        ; V141
        ; moved from next block
        movq      mm2, mm0                          ; duplicate tmt11

; column 1: odd part (after transpose)
        ; moved up to the prev block
        ; movq      mm0, mmword ptr [esi+8*11]      ; tmt11
        ; movq      mm2, mm0                        ; duplicate tmt11

        movq      mm5, mmword ptr [esi+8*15]        ; tmt15
        psubsw    mm0, mm7                          ; V144

        movq      mm3, mm0                          ; duplicate V144
        paddsw    mm2, mm7                          ; V147 ; free mm7

        pmulhw    mm0, mmword ptr x539f539f539f539f ; 21407 -> V151
        movq      mm7, mm1                          ; duplicate tmt3

        paddsw    mm7, mm5                          ; V145
        psubsw    mm1, mm5                          ; V146 ; free mm5

        psubsw    mm3, mm1                          ; V150
        movq      mm5, mm7                          ; duplicate V145

        pmulhw    mm1, mmword ptr x4546454645464546 ; 17734 -> V153
        psubsw    mm5, mm2                          ; V148

        pmulhw    mm3, mmword ptr x61f861f861f861f8 ; 25080 -> V154
        psllw     mm0, 2                            ; t311

        pmulhw    mm5, mmword ptr x5a825a825a825a82 ; 23170 -> V152
        paddsw    mm7, mm2                          ; V149 ; free mm2

        psllw     mm1, 1                            ; t313
        nop                                         ; slot
        ; without the nop above - freeze here for one clock
        ; the nop cleans the mess a little bit

        movq      mm2, mm3                          ; duplicate V154
        psubsw    mm3, mm0                          ; V155 ; free mm0

        psubsw    mm1, mm2                          ; V156 ; free mm2
        ; moved from the next block
        movq      mm2, mm6                          ; duplicate V143

        ; moved from the next block
        movq      mm0, mmword ptr [esi+8*13]        ; V141
        psllw     mm1, 1                            ; t315

        psubsw    mm1, mm7                          ; V157 (keep V149)
        psllw     mm5, 2                            ; t317

        psubsw    mm5, mm1                          ; V158
        psllw     mm3, 1                            ; t319

        paddsw    mm3, mm5                          ; V159
        ; slot

; column 1: output butterfly (after transform)
        ; moved to the prev block
        ; movq      mm2, mm6                        ; duplicate V143
        ; movq      mm0, mmword ptr [esi+8*13]      ; V141

        psubsw    mm2, mm3                          ; V163
        paddsw    mm6, mm3                          ; V164 ; free mm3

        movq      mm3, mm4                          ; duplicate V142
        psubsw    mm4, mm5                          ; V165 ; free mm5

        movq      mmword ptr scratch7, mm2          ; out7 (shifted right by 4 later, before the final store)
        psraw     mm6, 4

        psraw     mm4, 4
        paddsw    mm3, mm5                          ; V162

        movq      mm2, mmword ptr [esi+8*9]         ; V140
        movq      mm5, mm0                          ; duplicate V141

        ; in order not to percolate this line up, we read [esi+8*9] very near
        ; to this location
        movq      mmword ptr [esi+8*9], mm6         ; out9
        paddsw    mm0, mm1                          ; V161

        movq      mmword ptr scratch5, mm3          ; out5 (shifted right by 4 later)
        psubsw    mm5, mm1                          ; V166 ; free mm1

        movq      mmword ptr [esi+8*11], mm4        ; out11
        psraw     mm5, 4

        movq      mmword ptr scratch3, mm0          ; out3 (shifted right by 4 later)
        movq      mm4, mm2                          ; duplicate V140

        movq      mmword ptr [esi+8*13], mm5        ; out13
        paddsw    mm2, mm7                          ; V160

        ; moved from the next block
        movq      mm0, mmword ptr [esi+8*1]
        psubsw    mm4, mm7                          ; V167 ; free mm7

        ; moved from the next block
        movq      mm7, mmword ptr [esi+8*3]
        psraw     mm4, 4

        movq      mmword ptr scratch1, mm2          ; out1 (shifted right by 4 later)
        ; moved from the next block
        movq      mm1, mm0

        movq      mmword ptr [esi+8*15], mm4        ; out15
        ; moved from the next block
        punpcklwd mm0, mm7

; transpose - M2 parts
        ; moved up to the prev block
        ; movq      mm0, mmword ptr [esi+8*1]
        ; movq      mm7, mmword ptr [esi+8*3]
        ; movq      mm1, mm0
        ; punpcklwd mm0, mm7

        movq      mm5, mmword ptr [esi+8*5]
        punpckhwd mm1, mm7

        movq      mm4, mmword ptr [esi+8*7]
        movq      mm3, mm5

; shuffle the data and write out the lower parts of the transposed in 4 dwords
        movd      dword ptr [esi+8*8], mm0          ; LS part of tmt8
        punpcklwd mm5, mm4

        movd      dword ptr [esi+8*12], mm1         ; LS part of tmt12
        punpckhwd mm3, mm4

        movd      dword ptr [esi+8*8+4], mm5        ; MS part of tmt8
        punpckhdq mm0, mm5                          ; tmt10

        movd      dword ptr [esi+8*12+4], mm3       ; MS part of tmt12
        punpckhdq mm1, mm3                          ; tmt14

; transpose - M1 parts
        movq      mm7, mmword ptr [esi]
        ; slot

        movq      mm2, mmword ptr [esi+8*2]
        movq      mm6, mm7

        movq      mm5, mmword ptr [esi+8*4]
        punpcklwd mm7, mm2

        movq      mm4, mmword ptr [esi+8*6]
        punpckhwd mm6, mm2                          ; free mm2

        movq      mm3, mm5
        punpcklwd mm5, mm4

        punpckhwd mm3, mm4                          ; free mm4
        movq      mm2, mm7

        movq      mm4, mm6
        punpckldq mm7, mm5                          ; tmt0

        punpckhdq mm2, mm5                          ; tmt2 ; free mm5
        ; slot

; shuffle the rest of the data, and write it with 2 mmword writes
        punpckldq mm6, mm3                          ; tmt4
        ; moved from next block
        movq      mm5, mm2                          ; duplicate tmt2

        punpckhdq mm4, mm3                          ; tmt6 ; free mm3
        ; moved from next block
        movq      mm3, mm0                          ; duplicate tmt10

; column 0: odd part (after transpose)
        ; moved up to prev block
        ; movq      mm3, mm0                        ; duplicate tmt10
        ; movq      mm5, mm2                        ; duplicate tmt2

        psubsw    mm0, mm4                          ; V110
        paddsw    mm3, mm4                          ; V113 ; free mm4

        movq      mm4, mm0                          ; duplicate V110
        paddsw    mm2, mm1                          ; V111

        pmulhw    mm0, mmword ptr x539f539f539f539f ; 21407 -> V117
        psubsw    mm5, mm1                          ; V112 ; free mm1

        psubsw    mm4, mm5                          ; V116
        movq      mm1, mm2                          ; duplicate V111

        pmulhw    mm5, mmword ptr x4546454645464546 ; 17734 -> V119
        psubsw    mm2, mm3                          ; V114

        pmulhw    mm4, mmword ptr x61f861f861f861f8 ; 25080 -> V120
        paddsw    mm1, mm3                          ; V115 ; free mm3

        pmulhw    mm2, mmword ptr x5a825a825a825a82 ; 23170 -> V118
        psllw     mm0, 2                            ; t266

        movq      mmword ptr [esi+8*0], mm1         ; save V115
        psllw     mm5, 1                            ; t268

        psubsw    mm5, mm4                          ; V122
        psubsw    mm4, mm0                          ; V121 ; free mm0

        psllw     mm5, 1                            ; t270
        ; slot

        psubsw    mm5, mm1                          ; V123 ; free mm1
        psllw     mm2, 2                            ; t272

        psubsw    mm2, mm5                          ; V124 (keep V123)
        psllw     mm4, 1                            ; t274

        movq      mmword ptr [esi+8*2], mm5         ; save V123 ; free mm5
        paddsw    mm4, mm2                          ; V125 (keep V124)

; column 0: even part (after transpose)
        movq      mm0, mmword ptr [esi+8*12]        ; tmt12
        movq      mm3, mm6                          ; duplicate tmt4

        psubsw    mm6, mm0                          ; V100
        paddsw    mm3, mm0                          ; V101 ; free mm0

        pmulhw    mm6, mmword ptr x5a825a825a825a82 ; 23170 -> V102
        movq      mm5, mm7                          ; duplicate tmt0

        movq      mm1, mmword ptr [esi+8*8]         ; tmt8
        ; slot

        paddsw    mm7, mm1                          ; V103
        psubsw    mm5, mm1                          ; V104 ; free mm1

        movq      mm0, mm7                          ; duplicate V103
        psllw     mm6, 2                            ; t245

        paddsw    mm7, mm3                          ; V106
        movq      mm1, mm5                          ; duplicate V104

        psubsw    mm6, mm3                          ; V105
        psubsw    mm0, mm3                          ; V109 ; free mm3

        paddsw    mm5, mm6                          ; V107
        psubsw    mm1, mm6                          ; V108 ; free mm6

; column 0: output butterfly (after transform)
        movq      mm3, mm1                          ; duplicate V108
        paddsw    mm1, mm2                          ; out4

        psraw     mm1, 4
        psubsw    mm3, mm2                          ; out10 ; free mm2

        psraw     mm3, 4
        movq      mm6, mm0                          ; duplicate V109

        movq      mmword ptr [esi+8*4], mm1         ; out4 ; free mm1
        psubsw    mm0, mm4                          ; out6

        movq      mmword ptr [esi+8*10], mm3        ; out10 ; free mm3
        psraw     mm0, 4

        paddsw    mm6, mm4                          ; out8 ; free mm4
        movq      mm1, mm7                          ; duplicate V106

        movq      mmword ptr [esi+8*6], mm0         ; out6 ; free mm0
        psraw     mm6, 4

        movq      mm4, mmword ptr [esi+8*0]         ; V115
        ; slot

        movq      mmword ptr [esi+8*8], mm6         ; out8 ; free mm6
        movq      mm2, mm5                          ; duplicate V107

        movq      mm3, mmword ptr [esi+8*2]         ; V123
        paddsw    mm7, mm4                          ; out0

        ; moved up from next block
        movq      mm0, mmword ptr scratch3
        psraw     mm7, 4

        ; moved up from next block
        movq      mm6, mmword ptr scratch5
        psubsw    mm1, mm4                          ; out14 ; free mm4

        paddsw    mm5, mm3                          ; out2
        psraw     mm1, 4

        movq      mmword ptr [esi], mm7             ; out0 ; free mm7
        psraw     mm5, 4

        movq      mmword ptr [esi+8*14], mm1        ; out14 ; free mm1
        psubsw    mm2, mm3                          ; out12 ; free mm3

        movq      mmword ptr [esi+8*2], mm5         ; out2 ; free mm5
        psraw     mm2, 4

        ; moved up to the prev block
        movq      mm4, mmword ptr scratch7
        ; moved up to the prev block
        psraw     mm0, 4

        movq      mmword ptr [esi+8*12], mm2        ; out12 ; free mm2
        ; moved up to the prev block
        psraw     mm6, 4

; move back the data to its correct place
        ; moved up to the prev block
        ; movq      mm0, mmword ptr scratch3
        ; movq      mm6, mmword ptr scratch5
        ; movq      mm4, mmword ptr scratch7
        ; psraw     mm0, 4
        ; psraw     mm6, 4

        movq      mm1, mmword ptr scratch1
        psraw     mm4, 4

        movq      mmword ptr [esi+8*3], mm0         ; out3
        psraw     mm1, 4

        movq      mmword ptr [esi+8*5], mm6         ; out5
        ; slot

        movq      mmword ptr [esi+8*7], mm4         ; out7
        ; slot

        movq      mmword ptr [esi+8*1], mm1         ; out1
        ; slot

        emms                                        ; leave MMX state so x87 code can run after return

        ; restore registers; the matching pushes are in the prologue,
        ; which is outside this part of the file
        pop       esi
        pop       ebp
        ret       0

_idct8x8aan ENDP
_TEXT   ENDS
END
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -