📄 idct_ap922_mmx.asm
字号:
pmaddwd mm7, mm3 ; x7*w31+x3*x29_x7*w27+x3*w25
pmaddwd mm0, [eax] ; x4*w06+x0*w04_x4*w02+x0*w00
pmaddwd mm1, [eax+8] ; x6*w07+x0*w05_x6*w03+x0*w01
pmaddwd mm2, [eax+32] ; x5*w22+x1*w20_x5*w18+x1*w16
pmaddwd mm3, [eax+40] ; x7*w23+x3*w21_x7*w19+x3*w17
paddd mm4, mm5 ; a3_a2
paddd mm6, mm7 ; b3_b2
paddd mm0, mm1 ; a1_a0
paddd mm2, mm3 ; b1_b0
paddd mm4, half11
paddd mm0, half11
movq mm5, mm6
movq mm1, mm2
paddd mm2, mm0 ; a1+b1_a0+b0
paddd mm6, mm4 ; a3+b3_a2+b2
psubd mm0, mm1 ; a1-b1_a0-b0
psubd mm4, mm5 ; a3-b3_a2-b2
psrad mm2, 11 ; y1_y0
psrad mm6, 11 ; y3_y2
psrad mm0, 11 ; y6_y7
psrad mm4, 11 ; y4_y5
packssdw mm2, mm6 ; y3_y2_y1_y0
packssdw mm4, mm0 ; y6_y7_y4_y5
movq [esi+16], mm2 ;
movq mm7, mm4 ;
psrld mm4, 16 ; 00_y6_00_y4
pslld mm7, 16 ; y7_00_y5_00
por mm4, mm7 ; y7_y6_y5_y4
movq [esi+24], mm4 ;
; row 7
; Row transform (IDCT row pass, going by the file name) for the 16 bytes at
; [esi+112]: eight 16-bit inputs x0..x7 are read, combined with the 16-bit
; weights w00..w31 at [eax], and replaced in place by the outputs y0..y7.
; eax is not loaded in this block; it still holds whatever table address the
; code before it set up (NOTE(review): confirm that table is the one for row 7).
; All of mm0-mm7 are overwritten. Lane layouts in the comments read from the
; most significant lane (left) to the least significant lane (right).
movd mm0, [esi+112] ; 00_00_x1_x0
movd mm1, [esi+116] ; 00_00_x3_x2
movd mm2, [esi+120] ; 00_00_x5_x4
movd mm3, [esi+124] ; 00_00_x7_x6
movq mm4, [eax+16] ; w14_w12_w10_w08
movq mm5, [eax+24] ; w15_w13_w11_w09
movq mm6, [eax+48] ; w30_w28_w26_w24
movq mm7, [eax+56] ; w31_w29_w27_w25
punpcklwd mm0, mm2 ; x5_x1_x4_x0
punpcklwd mm1, mm3 ; x7_x3_x6_x2
movq mm2, mm0 ; copy, so the odd pair x5_x1 can be broadcast separately
movq mm3, mm1 ; copy, so the odd pair x7_x3 can be broadcast separately
punpckldq mm0, mm0 ; x4_x0_x4_x0
punpckhdq mm2, mm2 ; x5_x1_x5_x1
punpckldq mm1, mm1 ; x6_x2_x6_x2
punpckhdq mm3, mm3 ; x7_x3_x7_x3
pmaddwd mm4, mm0 ; x4*w14+x0*w12_x4*w10+x0*w08
pmaddwd mm5, mm1 ; x6*w15+x2*w13_x6*w11+x2*w09
pmaddwd mm6, mm2 ; x5*w30+x1*w28_x5*w26+x1*w24
pmaddwd mm7, mm3 ; x7*w31+x3*w29_x7*w27+x3*w25
pmaddwd mm0, [eax] ; x4*w06+x0*w04_x4*w02+x0*w00
pmaddwd mm1, [eax+8] ; x6*w07+x2*w05_x6*w03+x2*w01
pmaddwd mm2, [eax+32] ; x5*w22+x1*w20_x5*w18+x1*w16
pmaddwd mm3, [eax+40] ; x7*w23+x3*w21_x7*w19+x3*w17
paddd mm4, mm5 ; a3_a2
paddd mm6, mm7 ; b3_b2
paddd mm0, mm1 ; a1_a0
paddd mm2, mm3 ; b1_b0
; The bias is added to the a terms only, so it is carried into both a+b and a-b.
paddd mm4, half11 ; a3_a2 + bias (half11 is defined elsewhere; presumably the rounding constant for the >>11 below - confirm)
paddd mm0, half11 ; a1_a0 + bias
movq mm5, mm6 ; keep b3_b2 for the subtraction
movq mm1, mm2 ; keep b1_b0 for the subtraction
paddd mm2, mm0 ; a1+b1_a0+b0
paddd mm6, mm4 ; a3+b3_a2+b2
psubd mm0, mm1 ; a1-b1_a0-b0
psubd mm4, mm5 ; a3-b3_a2-b2
psrad mm2, 11 ; y1_y0
psrad mm6, 11 ; y3_y2
psrad mm0, 11 ; y6_y7
psrad mm4, 11 ; y4_y5
packssdw mm2, mm6 ; y3_y2_y1_y0 (dwords saturated to signed words)
packssdw mm4, mm0 ; y6_y7_y4_y5 (dwords saturated to signed words)
movq [esi+112], mm2 ; store y3_y2_y1_y0 over the first half of the row
; Swap the two words inside each dword so the upper half is in ascending order.
movq mm7, mm4 ; second copy of y6_y7_y4_y5
psrld mm4, 16 ; 00_y6_00_y4
pslld mm7, 16 ; y7_00_y5_00
por mm4, mm7 ; y7_y6_y5_y4
movq [esi+120], mm4 ; store y7_y6_y5_y4 over the second half of the row
; row 2
; Row transform (IDCT row pass, going by the file name) for the 16 bytes at
; [esi+32]: eight 16-bit inputs x0..x7 are read, combined with the 16-bit
; weights w00..w31 of table_26, and replaced in place by the outputs y0..y7.
; All of mm0-mm7 and eax are overwritten. Lane layouts in the comments read
; from the most significant lane (left) to the least significant lane (right).
lea eax, table_26 ; eax -> weight table; the row 6 code that follows uses it without reloading
movd mm0, [esi+32] ; 00_00_x1_x0
movd mm1, [esi+36] ; 00_00_x3_x2
movd mm2, [esi+40] ; 00_00_x5_x4
movd mm3, [esi+44] ; 00_00_x7_x6
movq mm4, [eax+16] ; w14_w12_w10_w08
movq mm5, [eax+24] ; w15_w13_w11_w09
movq mm6, [eax+48] ; w30_w28_w26_w24
movq mm7, [eax+56] ; w31_w29_w27_w25
punpcklwd mm0, mm2 ; x5_x1_x4_x0
punpcklwd mm1, mm3 ; x7_x3_x6_x2
movq mm2, mm0 ; copy, so the odd pair x5_x1 can be broadcast separately
movq mm3, mm1 ; copy, so the odd pair x7_x3 can be broadcast separately
punpckldq mm0, mm0 ; x4_x0_x4_x0
punpckhdq mm2, mm2 ; x5_x1_x5_x1
punpckldq mm1, mm1 ; x6_x2_x6_x2
punpckhdq mm3, mm3 ; x7_x3_x7_x3
pmaddwd mm4, mm0 ; x4*w14+x0*w12_x4*w10+x0*w08
pmaddwd mm5, mm1 ; x6*w15+x2*w13_x6*w11+x2*w09
pmaddwd mm6, mm2 ; x5*w30+x1*w28_x5*w26+x1*w24
pmaddwd mm7, mm3 ; x7*w31+x3*w29_x7*w27+x3*w25
pmaddwd mm0, [eax] ; x4*w06+x0*w04_x4*w02+x0*w00
pmaddwd mm1, [eax+8] ; x6*w07+x2*w05_x6*w03+x2*w01
pmaddwd mm2, [eax+32] ; x5*w22+x1*w20_x5*w18+x1*w16
pmaddwd mm3, [eax+40] ; x7*w23+x3*w21_x7*w19+x3*w17
paddd mm4, mm5 ; a3_a2
paddd mm6, mm7 ; b3_b2
paddd mm0, mm1 ; a1_a0
paddd mm2, mm3 ; b1_b0
; The bias is added to the a terms only, so it is carried into both a+b and a-b.
paddd mm4, half11 ; a3_a2 + bias (half11 is defined elsewhere; presumably the rounding constant for the >>11 below - confirm)
paddd mm0, half11 ; a1_a0 + bias
movq mm5, mm6 ; keep b3_b2 for the subtraction
movq mm1, mm2 ; keep b1_b0 for the subtraction
paddd mm2, mm0 ; a1+b1_a0+b0
paddd mm6, mm4 ; a3+b3_a2+b2
psubd mm0, mm1 ; a1-b1_a0-b0
psubd mm4, mm5 ; a3-b3_a2-b2
psrad mm2, 11 ; y1_y0
psrad mm6, 11 ; y3_y2
psrad mm0, 11 ; y6_y7
psrad mm4, 11 ; y4_y5
packssdw mm2, mm6 ; y3_y2_y1_y0 (dwords saturated to signed words)
packssdw mm4, mm0 ; y6_y7_y4_y5 (dwords saturated to signed words)
movq [esi+32], mm2 ; store y3_y2_y1_y0 over the first half of the row
; Swap the two words inside each dword so the upper half is in ascending order.
movq mm7, mm4 ; second copy of y6_y7_y4_y5
psrld mm4, 16 ; 00_y6_00_y4
pslld mm7, 16 ; y7_00_y5_00
por mm4, mm7 ; y7_y6_y5_y4
movq [esi+40], mm4 ; store y7_y6_y5_y4 over the second half of the row
; row 6
; Row transform (IDCT row pass, going by the file name) for the 16 bytes at
; [esi+96]: eight 16-bit inputs x0..x7 are read, combined with the 16-bit
; weights w00..w31 at [eax], and replaced in place by the outputs y0..y7.
; eax is not reloaded here: it still holds the address of table_26 set by the
; lea in the row 2 code above. All of mm0-mm7 are overwritten. Lane layouts in
; the comments read from the most significant lane (left) to the least
; significant lane (right).
movd mm0, [esi+96] ; 00_00_x1_x0
movd mm1, [esi+100] ; 00_00_x3_x2
movd mm2, [esi+104] ; 00_00_x5_x4
movd mm3, [esi+108] ; 00_00_x7_x6
movq mm4, [eax+16] ; w14_w12_w10_w08
movq mm5, [eax+24] ; w15_w13_w11_w09
movq mm6, [eax+48] ; w30_w28_w26_w24
movq mm7, [eax+56] ; w31_w29_w27_w25
punpcklwd mm0, mm2 ; x5_x1_x4_x0
punpcklwd mm1, mm3 ; x7_x3_x6_x2
movq mm2, mm0 ; copy, so the odd pair x5_x1 can be broadcast separately
movq mm3, mm1 ; copy, so the odd pair x7_x3 can be broadcast separately
punpckldq mm0, mm0 ; x4_x0_x4_x0
punpckhdq mm2, mm2 ; x5_x1_x5_x1
punpckldq mm1, mm1 ; x6_x2_x6_x2
punpckhdq mm3, mm3 ; x7_x3_x7_x3
pmaddwd mm4, mm0 ; x4*w14+x0*w12_x4*w10+x0*w08
pmaddwd mm5, mm1 ; x6*w15+x2*w13_x6*w11+x2*w09
pmaddwd mm6, mm2 ; x5*w30+x1*w28_x5*w26+x1*w24
pmaddwd mm7, mm3 ; x7*w31+x3*w29_x7*w27+x3*w25
pmaddwd mm0, [eax] ; x4*w06+x0*w04_x4*w02+x0*w00
pmaddwd mm1, [eax+8] ; x6*w07+x2*w05_x6*w03+x2*w01
pmaddwd mm2, [eax+32] ; x5*w22+x1*w20_x5*w18+x1*w16
pmaddwd mm3, [eax+40] ; x7*w23+x3*w21_x7*w19+x3*w17
paddd mm4, mm5 ; a3_a2
paddd mm6, mm7 ; b3_b2
paddd mm0, mm1 ; a1_a0
paddd mm2, mm3 ; b1_b0
; The bias is added to the a terms only, so it is carried into both a+b and a-b.
paddd mm4, half11 ; a3_a2 + bias (half11 is defined elsewhere; presumably the rounding constant for the >>11 below - confirm)
paddd mm0, half11 ; a1_a0 + bias
movq mm5, mm6 ; keep b3_b2 for the subtraction
movq mm1, mm2 ; keep b1_b0 for the subtraction
paddd mm2, mm0 ; a1+b1_a0+b0
paddd mm6, mm4 ; a3+b3_a2+b2
psubd mm0, mm1 ; a1-b1_a0-b0
psubd mm4, mm5 ; a3-b3_a2-b2
psrad mm2, 11 ; y1_y0
psrad mm6, 11 ; y3_y2
psrad mm0, 11 ; y6_y7
psrad mm4, 11 ; y4_y5
packssdw mm2, mm6 ; y3_y2_y1_y0 (dwords saturated to signed words)
packssdw mm4, mm0 ; y6_y7_y4_y5 (dwords saturated to signed words)
movq [esi+96], mm2 ; store y3_y2_y1_y0 over the first half of the row
; Swap the two words inside each dword so the upper half is in ascending order.
movq mm7, mm4 ; second copy of y6_y7_y4_y5
psrld mm4, 16 ; 00_y6_00_y4
pslld mm7, 16 ; y7_00_y5_00
por mm4, mm7 ; y7_y6_y5_y4
movq [esi+104], mm4 ; store y7_y6_y5_y4 over the second half of the row
; row 3
; Row transform (IDCT row pass, going by the file name) for the 16 bytes at
; [esi+48]: eight 16-bit inputs x0..x7 are read, combined with the 16-bit
; weights w00..w31 of table_35, and replaced in place by the outputs y0..y7.
; All of mm0-mm7 and eax are overwritten. Lane layouts in the comments read
; from the most significant lane (left) to the least significant lane (right).
lea eax, table_35 ; eax -> weight table; the row 5 code that follows uses it without reloading
movd mm0, [esi+48] ; 00_00_x1_x0
movd mm1, [esi+52] ; 00_00_x3_x2
movd mm2, [esi+56] ; 00_00_x5_x4
movd mm3, [esi+60] ; 00_00_x7_x6
movq mm4, [eax+16] ; w14_w12_w10_w08
movq mm5, [eax+24] ; w15_w13_w11_w09
movq mm6, [eax+48] ; w30_w28_w26_w24
movq mm7, [eax+56] ; w31_w29_w27_w25
punpcklwd mm0, mm2 ; x5_x1_x4_x0
punpcklwd mm1, mm3 ; x7_x3_x6_x2
movq mm2, mm0 ; copy, so the odd pair x5_x1 can be broadcast separately
movq mm3, mm1 ; copy, so the odd pair x7_x3 can be broadcast separately
punpckldq mm0, mm0 ; x4_x0_x4_x0
punpckhdq mm2, mm2 ; x5_x1_x5_x1
punpckldq mm1, mm1 ; x6_x2_x6_x2
punpckhdq mm3, mm3 ; x7_x3_x7_x3
pmaddwd mm4, mm0 ; x4*w14+x0*w12_x4*w10+x0*w08
pmaddwd mm5, mm1 ; x6*w15+x2*w13_x6*w11+x2*w09
pmaddwd mm6, mm2 ; x5*w30+x1*w28_x5*w26+x1*w24
pmaddwd mm7, mm3 ; x7*w31+x3*w29_x7*w27+x3*w25
pmaddwd mm0, [eax] ; x4*w06+x0*w04_x4*w02+x0*w00
pmaddwd mm1, [eax+8] ; x6*w07+x2*w05_x6*w03+x2*w01
pmaddwd mm2, [eax+32] ; x5*w22+x1*w20_x5*w18+x1*w16
pmaddwd mm3, [eax+40] ; x7*w23+x3*w21_x7*w19+x3*w17
paddd mm4, mm5 ; a3_a2
paddd mm6, mm7 ; b3_b2
paddd mm0, mm1 ; a1_a0
paddd mm2, mm3 ; b1_b0
; The bias is added to the a terms only, so it is carried into both a+b and a-b.
paddd mm4, half11 ; a3_a2 + bias (half11 is defined elsewhere; presumably the rounding constant for the >>11 below - confirm)
paddd mm0, half11 ; a1_a0 + bias
movq mm5, mm6 ; keep b3_b2 for the subtraction
movq mm1, mm2 ; keep b1_b0 for the subtraction
paddd mm2, mm0 ; a1+b1_a0+b0
paddd mm6, mm4 ; a3+b3_a2+b2
psubd mm0, mm1 ; a1-b1_a0-b0
psubd mm4, mm5 ; a3-b3_a2-b2
psrad mm2, 11 ; y1_y0
psrad mm6, 11 ; y3_y2
psrad mm0, 11 ; y6_y7
psrad mm4, 11 ; y4_y5
packssdw mm2, mm6 ; y3_y2_y1_y0 (dwords saturated to signed words)
packssdw mm4, mm0 ; y6_y7_y4_y5 (dwords saturated to signed words)
movq [esi+48], mm2 ; store y3_y2_y1_y0 over the first half of the row
; Swap the two words inside each dword so the upper half is in ascending order.
movq mm7, mm4 ; second copy of y6_y7_y4_y5
psrld mm4, 16 ; 00_y6_00_y4
pslld mm7, 16 ; y7_00_y5_00
por mm4, mm7 ; y7_y6_y5_y4
movq [esi+56], mm4 ; store y7_y6_y5_y4 over the second half of the row
; row 5
movd mm0, [esi+80] ; 00_00_x1_x0
movd mm1, [esi+84] ; 00_00_x3_x2
movd mm2, [esi+88] ; 00_00_x5_x4
movd mm3, [esi+92] ; 00_00_x7_x4
movq mm4, [eax+16] ; w14_w12_w10_w08
movq mm5, [eax+24] ; w15_w13_w11_w09
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -