📄 fdct_mmx.asm
字号:
movq mm5, mm2
movq mm4, mm0
movq mm3, mm0
psubusw mm2, mm4
psubusw mm0, mm5
por mm0, mm2
add esi, 16
paddw mm6, mm0 ; var0 stored!
dec ecx
movq mm0, mm3
jnz .lp_var_sav ;// looping ...
movq mm2, [esi]
paddw mm2, mm5
movq mm4, mm0
movq mm5, mm2
psubusw mm2, mm4
psubusw mm0, mm5
por mm0, mm2
;///////////////////////////////////////////////
movq mm3, [mmx_one]
paddw mm6, mm0 ; var0 stored!
pmaddwd mm7, mm3 ; merge sum; mm3 = 1*4
mov ecx, [esp+4+8] ; mask_sav
pmaddwd mm6, mm3 ; merge sum; mm3 = 1*4
movq mm5, mm7
movq mm4, mm6
psrlq mm5, 32
psrlq mm4, 32
paddd mm7, mm5 ; mm7 = sav0
paddd mm6, mm4 ; mm6 = var0
movd esi, mm7 ; esi = sav0
cmp esi, ecx
movd eax, mm6 ; edi = var0
ja .active
mov ecx, [esp+4+12] ; mask_var
cmp eax, ecx
ja .active
mov eax, 1
jmp .done
.active
xor eax, eax
.done
pop esi
ret
;----------------------------------------------------------
;
; _fdct8x8_mmx -- forward DCT of one block of 16-bit words, done in place
; using MMX.
;
; The exported symbol is _fdct8x8_mmx; the prototype below was previously
; written as fdct_mmx. Presumably the C-side name is fdct8x8_mmx (leading
; underscore added by the C name mangling) -- confirm against the caller.
;
;   void fdct8x8_mmx(short *block);
;
; In:    [esp+4] on entry = block (read as [esp+8] after the push of ebx).
; Out:   results are stored through OUT, which is set equal to INP (block),
;        so the input is overwritten.
; Saves: ebx (pushed on entry, popped before ret).
; Uses:  mm0-mm7. No emms is executed here (see the note before ret).
;
; Structure:
;   1. column pass on columns 0-3 (one movq = four 16-bit words per row),
;   2. the same column pass on columns 4-7 (INP advanced by 8 bytes),
;   3. eight expansions of the fdct_one_row macro, one per row.
;
; NOTE(review): INP, OUT, TABLEF, round_frw_row, x0..x7, y0..y7,
; SHIFT_FRW_COL, INP_n/TABLE_n/OUT_n, the tg_*/ocos_4_16/one_corr constants
; and the fdct_one_row macro are all defined outside this section. x0..x7
; are presumably row addresses based on INP and y0..y7 row addresses based
; on OUT -- confirm against the %define block.
;
;----------------------------------------------------------
ALIGN 64
global _fdct8x8_mmx
_fdct8x8_mmx:
push ebx
mov INP, dword [esp + 8] ; INP = block (first stack argument; +4 skips the saved ebx)
mov TABLEF, tg_all_16 ; TABLEF = address of tg_all_16
mov OUT, INP ; output pointer = input pointer (in-place transform)
; ---- column pass, columns 0-3 ---------------------------------------
; The trailing "; n ;" tags on the lines below name the mm register that
; the instruction (re)defines or releases, followed by the value it holds.
mmx32_fdct_col103:
movq mm0, [x1] ; 0 ; x1
movq mm1, [x6] ; 1 ; x6
movq mm2, mm0 ; 2 ; x1
movq mm3, [x2] ; 3 ; x2
paddsw mm0, mm1 ; t1 = x[1] + x[6]
movq mm4, [x5] ; 4 ; x5
psllw mm0, SHIFT_FRW_COL ; t1
movq mm5, [x0] ; 5 ; x0
paddsw mm4, mm3 ; t2 = x[2] + x[5]
paddsw mm5, [x7] ; t0 = x[0] + x[7]
psllw mm4, SHIFT_FRW_COL ; t2
movq mm6, mm0 ; 6 ; t1
psubsw mm2, mm1 ; 1 ; t6 = x[1] - x[6]
movq mm1, qword [tg_2_16] ; 1 ; tg_2_16
psubsw mm0, mm4 ; tm12 = t1 - t2
movq mm7, [x3] ; 7 ; x3
pmulhw mm1, mm0 ; tm12*tg_2_16 (high 16 bits of each signed product)
paddsw mm7, [x4] ; t3 = x[3] + x[4]
psllw mm5, SHIFT_FRW_COL ; t0
paddsw mm6, mm4 ; 4 ; tp12 = t1 + t2
psllw mm7, SHIFT_FRW_COL ; t3
movq mm4, mm5 ; 4 ; t0
psubsw mm5, mm7 ; tm03 = t0 - t3
paddsw mm1, mm5 ; y2 = tm03 + tm12*tg_2_16
paddsw mm4, mm7 ; 7 ; tp03 = t0 + t3
por mm1, qword [one_corr] ; correction y2 +0.5 (OR with the one_corr constant)
psllw mm2, SHIFT_FRW_COL+1 ; t6 (shifted one bit more than t0..t4 and t7)
pmulhw mm5, qword [tg_2_16] ; tm03*tg_2_16
movq mm7, mm4 ; 7 ; tp03
psubsw mm3, [x5] ; t5 = x[2] - x[5]
psubsw mm4, mm6 ; y4 = tp03 - tp12
movq [y2], mm1 ; 1 ; save y2
paddsw mm7, mm6 ; 6 ; y0 = tp03 + tp12
movq mm1, [x3] ; 1 ; x3
psllw mm3, SHIFT_FRW_COL+1 ; t5 (shifted one bit more, like t6)
psubsw mm1, [x4] ; t4 = x[3] - x[4]
movq mm6, mm2 ; 6 ; t6
movq [y4], mm4 ; 4 ; save y4
paddsw mm2, mm3 ; t6 + t5
pmulhw mm2, qword [ocos_4_16] ; tp65 = (t6 + t5)*cos_4_16
psubsw mm6, mm3 ; 3 ; t6 - t5
pmulhw mm6, qword [ocos_4_16] ; tm65 = (t6 - t5)*cos_4_16
psubsw mm5, mm0 ; 0 ; y6 = tm03*tg_2_16 - tm12
por mm5, qword [one_corr] ; correction y6 +0.5
psllw mm1, SHIFT_FRW_COL ; t4
por mm2, qword [one_corr] ; correction tp65 +0.5
movq mm4, mm1 ; 4 ; t4
movq mm3, [x0] ; 3 ; x0
paddsw mm1, mm6 ; tp465 = t4 + tm65
psubsw mm3, [x7] ; t7 = x[0] - x[7]
psubsw mm4, mm6 ; 6 ; tm465 = t4 - tm65
movq mm0, qword [tg_1_16] ; 0 ; tg_1_16
psllw mm3, SHIFT_FRW_COL ; t7
movq mm6, qword [tg_3_16] ; 6 ; tg_3_16
pmulhw mm0, mm1 ; tp465*tg_1_16
movq [y0], mm7 ; 7 ; save y0
pmulhw mm6, mm4 ; tm465*tg_3_16 (high half; tm465 is added back below)
movq [y6], mm5 ; 5 ; save y6
movq mm7, mm3 ; 7 ; t7
movq mm5, qword [tg_3_16] ; 5 ; tg_3_16
psubsw mm7, mm2 ; tm765 = t7 - tp65
paddsw mm3, mm2 ; 2 ; tp765 = t7 + tp65
pmulhw mm5, mm7 ; tm765*tg_3_16 (high half; tm765 is added back below)
paddsw mm0, mm3 ; y1 = tp765 + tp465*tg_1_16
; NOTE(review): the tg_3_16 products get their multiplicand added back
; (next paddsw and the one on mm5); presumably tg_3_16 is stored with a
; bias that this compensates for -- confirm against the constant table.
paddsw mm6, mm4 ; mm6 = hi16(tm465*tg_3_16) + tm465
pmulhw mm3, qword [tg_1_16] ; tp765*tg_1_16
por mm0, qword [one_corr] ; correction y1 +0.5
paddsw mm5, mm7 ; mm5 = hi16(tm765*tg_3_16) + tm765
psubsw mm7, mm6 ; 6 ; y3 = tm765 - tm465*tg_3_16
; Advance the input pointer by 8 bytes (four 16-bit words) to columns 4-7.
; NOTE(review): the y1/y3/y5/y7 stores that follow carry no +8, whereas the
; second pass stores to [yN+8]; that is only consistent if OUT is a
; different register from INP -- confirm against the %defines.
add INP, 0x08 ;// increment pointer
movq [y1], mm0 ; 0 ; save y1
paddsw mm5, mm4 ; 4 ; y5 = tm765*tg_3_16 + tm465
movq [y3], mm7 ; 7 ; save y3
psubsw mm3, mm1 ; 1 ; y7 = tp765*tg_1_16 - tp465
movq [y5], mm5 ; 5 ; save y5
; ---- column pass, columns 4-7 ---------------------------------------
; Same instruction sequence as the first pass; loads go through the
; advanced INP and every store is offset by 8 bytes ([yN+8]).
mmx32_fdct_col47: ; begin processing column 4-7
movq mm0, [x1] ; 0 ; x1
movq [y7], mm3 ; 3 ; save y7 of the first pass (columns 0-3)
movq mm1, [x6] ; 1 ; x6
movq mm2, mm0 ; 2 ; x1
movq mm3, [x2] ; 3 ; x2
paddsw mm0, mm1 ; t1 = x[1] + x[6]
movq mm4, [x5] ; 4 ; x5
psllw mm0, SHIFT_FRW_COL ; t1
movq mm5, [x0] ; 5 ; x0
paddsw mm4, mm3 ; t2 = x[2] + x[5]
paddsw mm5, [x7] ; t0 = x[0] + x[7]
psllw mm4, SHIFT_FRW_COL ; t2
movq mm6, mm0 ; 6 ; t1
psubsw mm2, mm1 ; 1 ; t6 = x[1] - x[6]
movq mm1, qword [tg_2_16] ; 1 ; tg_2_16
psubsw mm0, mm4 ; tm12 = t1 - t2
movq mm7, [x3] ; 7 ; x3
pmulhw mm1, mm0 ; tm12*tg_2_16 (high 16 bits of each signed product)
paddsw mm7, [x4] ; t3 = x[3] + x[4]
psllw mm5, SHIFT_FRW_COL ; t0
paddsw mm6, mm4 ; 4 ; tp12 = t1 + t2
psllw mm7, SHIFT_FRW_COL ; t3
movq mm4, mm5 ; 4 ; t0
psubsw mm5, mm7 ; tm03 = t0 - t3
paddsw mm1, mm5 ; y2 = tm03 + tm12*tg_2_16
paddsw mm4, mm7 ; 7 ; tp03 = t0 + t3
por mm1, qword [one_corr] ; correction y2 +0.5
psllw mm2, SHIFT_FRW_COL+1 ; t6 (shifted one bit more than t0..t4 and t7)
pmulhw mm5, qword [tg_2_16] ; tm03*tg_2_16
movq mm7, mm4 ; 7 ; tp03
psubsw mm3, [x5] ; t5 = x[2] - x[5]
psubsw mm4, mm6 ; y4 = tp03 - tp12
movq [y2+8], mm1 ; 1 ; save y2
paddsw mm7, mm6 ; 6 ; y0 = tp03 + tp12
movq mm1, [x3] ; 1 ; x3
psllw mm3, SHIFT_FRW_COL+1 ; t5 (shifted one bit more, like t6)
psubsw mm1, [x4] ; t4 = x[3] - x[4]
movq mm6, mm2 ; 6 ; t6
movq [y4+8], mm4 ; 4 ; save y4
paddsw mm2, mm3 ; t6 + t5
pmulhw mm2, qword [ocos_4_16] ; tp65 = (t6 + t5)*cos_4_16
psubsw mm6, mm3 ; 3 ; t6 - t5
pmulhw mm6, qword [ocos_4_16] ; tm65 = (t6 - t5)*cos_4_16
psubsw mm5, mm0 ; 0 ; y6 = tm03*tg_2_16 - tm12
por mm5, qword [one_corr] ; correction y6 +0.5
psllw mm1, SHIFT_FRW_COL ; t4
por mm2, qword [one_corr] ; correction tp65 +0.5
movq mm4, mm1 ; 4 ; t4
movq mm3, [x0] ; 3 ; x0
paddsw mm1, mm6 ; tp465 = t4 + tm65
psubsw mm3, [x7] ; t7 = x[0] - x[7]
psubsw mm4, mm6 ; 6 ; tm465 = t4 - tm65
movq mm0, qword [tg_1_16] ; 0 ; tg_1_16
psllw mm3, SHIFT_FRW_COL ; t7
movq mm6, qword [tg_3_16] ; 6 ; tg_3_16
pmulhw mm0, mm1 ; tp465*tg_1_16
movq [y0+8], mm7 ; 7 ; save y0
pmulhw mm6, mm4 ; tm465*tg_3_16 (high half; tm465 is added back below)
movq [y6+8], mm5 ; 5 ; save y6
movq mm7, mm3 ; 7 ; t7
movq mm5, qword [tg_3_16] ; 5 ; tg_3_16
psubsw mm7, mm2 ; tm765 = t7 - tp65
paddsw mm3, mm2 ; 2 ; tp765 = t7 + tp65
pmulhw mm5, mm7 ; tm765*tg_3_16 (high half; tm765 is added back below)
paddsw mm0, mm3 ; y1 = tp765 + tp465*tg_1_16
paddsw mm6, mm4 ; mm6 = hi16(tm465*tg_3_16) + tm465
pmulhw mm3, [tg_1_16] ; tp765*tg_1_16
por mm0, qword [one_corr] ; correction y1 +0.5
paddsw mm5, mm7 ; mm5 = hi16(tm765*tg_3_16) + tm765
psubsw mm7, mm6 ; 6 ; y3 = tm765 - tm465*tg_3_16
movq [y1+8], mm0 ; 0 ; save y1
paddsw mm5, mm4 ; 4 ; y5 = tm765*tg_3_16 + tm465
movq [y3+8], mm7 ; 7 ; save y3
psubsw mm3, mm1 ; 1 ; y7 = tp765*tg_1_16 - tp465
movq [y5+8], mm5 ; 5 ; save y5
movq [y7+8], mm3 ; 3 ; save y7
; ---- row pass ---------------------------------------------------------
; Reload the block pointer from the stack (INP was advanced by 8 above),
; point TABLEF and round_frw_row at the row-pass tables, then expand the
; fdct_one_row macro once per row with that row's INP_n/TABLE_n/OUT_n.
mov INP, [esp + 8] ; row 0: INP = block again
lea TABLEF, [tab_frw_01234567] ; row 0 ;!!!
mov OUT, INP ; row pass also writes back over its input
lea round_frw_row, [r_frw_row] ; !!!
;//lp_mmx_fdct_row1:
fdct_one_row INP_0, TABLE_0, OUT_0
fdct_one_row INP_1, TABLE_1, OUT_1
fdct_one_row INP_2, TABLE_2, OUT_2
fdct_one_row INP_3, TABLE_3, OUT_3
fdct_one_row INP_4, TABLE_4, OUT_4
fdct_one_row INP_5, TABLE_5, OUT_5
fdct_one_row INP_6, TABLE_6, OUT_6
fdct_one_row INP_7, TABLE_7, OUT_7
pop ebx ; restore the ebx saved on entry
; emms is intentionally not executed here: per the original note, all emms
; will be done after the frame processing. The MMX state is therefore still
; active when this routine returns.
;emms
ret
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -