📄 idct_llm_mmx.asm
字号:
; mm4 - o1_o2
; mm6 - o3_o4
;
movq mm1, mm0
movq mm3, mm2
paddd mm0, mm6 ; w[1]_w[0]
paddd mm2, mm4 ; w[3]_w[2]
psubd mm1, mm6 ; w[6]_w[7]
psubd mm3, mm4 ; w[4]_w[5]
movq mm5, mm1 ; w[6]_w[7]
movq mm7, mm3 ; w[4]_w[5]
psllq mm7, 32 ; w[5]_____
psllq mm5, 32 ; w[7]_____
punpckhdq mm3, mm7 ; w[5]_w[4]
punpckhdq mm1, mm5 ; w[7]_w[6]
paddd mm0, half_10bit
paddd mm1, half_10bit
paddd mm2, half_10bit
paddd mm3, half_10bit
psrad mm0, 10
psrad mm1, 10
psrad mm2, 10
psrad mm3, 10
movq [edi ], mm0
movq [edi+ 8], mm2
movq [edi+16], mm3
movq [edi+24], mm1
;add 6, sub 2, psxl 4, psxa 4,
;-------------------------------------------------------------------
; Loop termination check
idct_llm_mmx_raw_last:
lea esi, [esi+16]
lea edi, [edi+32]
test ecx, ecx
jnz idct_llm_mmx_raw_loop
;-------------------------------------------------------------------
; Adjust input/output parameters
mov edi, esi
sub edi, 128
lea esi, [esp+36]
mov ecx, 4
;-------------------------------------------------------------------
; IDCT_COL
idct_llm_mmx_col_loop:
dec ecx
;-------------------------------------------------------------------
; Stage 1: compute the odd terms
;
; w0 = w[7*8];
; w1 = w[5*8];
; w2 = w[3*8];
; w3 = w[1*8];
;
; z1 = w0 + w3;
; z2 = w1 + w2;
; z3 = w0 + w2;
; z4 = w1 + w3;
; z5 = (z3 + z4) * FIX_1_175875602;
;
; w0 *= FIX_0_298631336;
; w1 *= FIX_2_053119869;
; w2 *= FIX_3_072711026;
; w3 *= FIX_1_501321110;
; z1 *= (- FIX_0_899976223);
; z2 *= (- FIX_2_562915447);
; z3 *= (- FIX_1_961570560);
; z4 *= (- FIX_0_390180644);
;
; z3 += z5;
; z4 += z5;
;
; w0 += z1 + z3;
; w1 += z2 + z4;
; w2 += z2 + z3;
; w3 += z1 + z4;
;
movq mm0, [esi+7*32]
movq mm1, [esi+5*32]
movq mm2, [esi+3*32]
movq mm3, [esi+1*32]
movq mm4, mm0
movq mm5, mm1
paddd mm4, mm3 ; z1
paddd mm5, mm2 ; z2
movq mm6, mm0
movq mm7, mm1
paddd mm6, mm2 ; z3
paddd mm7, mm3 ; z4
pmaddwd mm0, p0_298631336
pmaddwd mm1, p2_053119869
pmaddwd mm4, m0_899976223
pmaddwd mm5, m2_562915447
pmaddwd mm2, p3_072711026
pmaddwd mm3, p1_501321110
paddd mm0, mm4
paddd mm1, mm5
paddd mm2, mm5
paddd mm3, mm4
movq mm5, mm6 ; z5
paddd mm5, mm7
pmaddwd mm5, p1_175875602
pmaddwd mm6, m1_961570560
pmaddwd mm7, m0_390180644
paddd mm6, mm5
paddd mm7, mm5
paddd mm0, mm6
paddd mm1, mm7
paddd mm2, mm6
paddd mm3, mm7
movq [esp+ 4], mm0
movq [esp+12], mm1
movq [esp+20], mm2
movq [esp+28], mm3
;-------------------------------------------------------------------
; Stage 2: compute the even terms
;
; z2 = w[2*8];
; z3 = w[6*8];
;
; z1 = (z2+z3) * FIX_0_541196100;
; w2 = z1 + (z3 * (- FIX_1_847759065));
; w3 = z1 + (z2 * FIX_0_765366865);
;
; w0 = (w[0*8] + w[4*8]) << 13;
; w1 = (w[0*8] - w[4*8]) << 13;
;
; w4 = w0 + w3;
; w7 = w0 - w3;
; w5 = w1 + w2;
; w6 = w1 - w2;
;
movq mm0, [esi] ;
movq mm2, [esi+32*2] ;
movq mm4, [esi+32*4] ;
movq mm6, [esi+32*6] ;
movq mm3, mm2 ; z1
movq mm5, mm0 ;
paddd mm0, mm4 ;
psubd mm5, mm4
paddd mm3, mm6
pslld mm0, 13 ; w0
pslld mm5, 13 ; w1
pmaddwd mm2, p0_765366865
pmaddwd mm6, m1_847759065
pmaddwd mm3, p0_541196100
paddd mm2, mm3 ; w3
paddd mm6, mm3 ; w2
movq mm7, mm0 ; w0
movq mm3, mm5 ; w1
paddd mm0, mm2 ; w4
paddd mm3, mm6 ; w5
psubd mm7, mm2 ; w7
psubd mm5, mm6 ; w6
movq mm1, mm0
movq mm2, mm3
movq mm4, mm5
movq mm6, mm7
;-------------------------------------------------------------------
; Stage 3: compute the final output from the odd and even terms and write it back to the original block
;
; (w4+w3) >> 19;
; (w5+w2) >> 19;
; (w6+w1) >> 19;
; (w7+w0) >> 19;
; (w7-w0) >> 19;
; (w6-w1) >> 19;
; (w5-w2) >> 19;
; (w4-w3) >> 19;
;
paddd mm6, [esp+ 4] ; d[3]
paddd mm4, [esp+12] ; d[2]
paddd mm2, [esp+20] ; d[1]
paddd mm0, [esp+28] ; d[0]
psubd mm7, [esp+ 4] ; d[4]
psubd mm5, [esp+12] ; d[5]
psubd mm3, [esp+20] ; d[6]
psubd mm1, [esp+28] ; d[7]
paddd mm6, half_19bit
paddd mm4, half_19bit
paddd mm2, half_19bit
paddd mm0, half_19bit
paddd mm7, half_19bit
paddd mm5, half_19bit
paddd mm3, half_19bit
paddd mm1, half_19bit
psrad mm6, 19
psrad mm4, 19
psrad mm2, 19
psrad mm0, 19
psrad mm7, 19
psrad mm5, 19
psrad mm3, 19
psrad mm1, 19
packssdw mm0, mm7 ; d[4]_d[0]
packssdw mm2, mm5 ; d[5]_d[1]
packssdw mm4, mm3 ; d[6]_d[2]
packssdw mm6, mm1 ; d[7]_d[3]
movd [edi ], mm0
movd [edi+16*1], mm2
movd [edi+16*2], mm4
movd [edi+16*3], mm6
psrlq mm0, 32
psrlq mm2, 32
psrlq mm4, 32
psrlq mm6, 32
movd [edi+16*4], mm0
movd [edi+16*5], mm2
movd [edi+16*6], mm4
movd [edi+16*7], mm6
;-------------------------------------------------------------------
; Loop termination check
lea esi, [esi+8]
lea edi, [edi+4]
test ecx, ecx
jnz idct_llm_mmx_col_loop
;-------------------------------------------------------------------
; Cleanup
add esp, 292
add esp, eax
pop ebx
pop eax
pop ecx
pop edi
pop esi
ret 4
;-------------------------------------------------------------------
_idct_llm_mmx@4 ENDP
;-------------------------------------------------------------------
; End
END
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -