; predict-a.asm
; NOTE(review): scraped page header ("font size:") removed; kept as a comment
; so the file stays valid assembler input.
%endmacro%define PALIGNR PALIGNR_MMXPREDICT_FILTER mmxext%define PALIGNR PALIGNR_SSSE3PREDICT_FILTER ssse3;-----------------------------------------------------------------------------; void predict_8x8_v_mmxext( uint8_t *src, uint8_t *edge );-----------------------------------------------------------------------------cglobal predict_8x8_v_mmxext, 2,2 movq mm0, [r1+16] STORE8x8 mm0, mm0 RET;-----------------------------------------------------------------------------; void predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] );-----------------------------------------------------------------------------INIT_MMXcglobal predict_8x8_h_mmxext, 2,2 movu m3, [r1+7] mova m7, m3 punpckhbw m3, m3 punpcklbw m7, m7 pshufw m0, m3, 0xff pshufw m1, m3, 0xaa pshufw m2, m3, 0x55 pshufw m3, m3, 0x00 pshufw m4, m7, 0xff pshufw m5, m7, 0xaa pshufw m6, m7, 0x55 pshufw m7, m7, 0x00%assign n 0%rep 8 mova [r0+n*FDEC_STRIDE], m %+ n%assign n n+1%endrep RET;-----------------------------------------------------------------------------; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge );;-----------------------------------------------------------------------------cglobal predict_8x8_dc_mmxext, 2,2 pxor mm0, mm0 pxor mm1, mm1 psadbw mm0, [r1+7] psadbw mm1, [r1+16] paddw mm0, [pw_8 GLOBAL] paddw mm0, mm1 psrlw mm0, 4 pshufw mm0, mm0, 0 packuswb mm0, mm0 STORE8x8 mm0, mm0 RET;-----------------------------------------------------------------------------; void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t *edge );;-----------------------------------------------------------------------------%macro PRED8x8_DC 2cglobal %1, 2,2 pxor mm0, mm0 psadbw mm0, [r1+%2] paddw mm0, [pw_4 GLOBAL] psrlw mm0, 3 pshufw mm0, mm0, 0 packuswb mm0, mm0 STORE8x8 mm0, mm0 RET%endmacroPRED8x8_DC predict_8x8_dc_top_mmxext, 16PRED8x8_DC predict_8x8_dc_left_mmxext, 7%ifndef ARCH_X86_64; sse2 is faster even on amd, so there's no sense in spending exe size on these; functions if we know sse2 is 
available.;-----------------------------------------------------------------------------; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge );-----------------------------------------------------------------------------cglobal predict_8x8_ddl_mmxext, 2,2 movq mm5, [r1+16] movq mm2, [r1+17] movq mm3, [r1+23] movq mm4, [r1+25] movq mm1, mm5 psllq mm1, 8 PRED8x8_LOWPASS mm0, mm1, mm2, mm5, mm7 PRED8x8_LOWPASS mm1, mm3, mm4, [r1+24], mm6%assign Y 7%rep 6 movq [r0+Y*FDEC_STRIDE], mm1 movq mm2, mm0 psllq mm1, 8 psrlq mm2, 56 psllq mm0, 8 por mm1, mm2%assign Y (Y-1)%endrep movq [r0+Y*FDEC_STRIDE], mm1 psllq mm1, 8 psrlq mm0, 56 por mm1, mm0%assign Y (Y-1) movq [r0+Y*FDEC_STRIDE], mm1 RET;-----------------------------------------------------------------------------; void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t *edge );-----------------------------------------------------------------------------cglobal predict_8x8_ddr_mmxext, 2,2 movq mm1, [r1+7] movq mm2, [r1+9] movq mm3, [r1+15] movq mm4, [r1+17] PRED8x8_LOWPASS mm0, mm1, mm2, [r1+8], mm7 PRED8x8_LOWPASS mm1, mm3, mm4, [r1+16], mm6%assign Y 7%rep 6 movq [r0+Y*FDEC_STRIDE], mm0 movq mm2, mm1 psrlq mm0, 8 psllq mm2, 56 psrlq mm1, 8 por mm0, mm2%assign Y (Y-1)%endrep movq [r0+Y*FDEC_STRIDE], mm0 psrlq mm0, 8 psllq mm1, 56 por mm0, mm1%assign Y (Y-1) movq [r0+Y*FDEC_STRIDE], mm0 RET;-----------------------------------------------------------------------------; void predict_8x8_hu_mmxext( uint8_t *src, uint8_t *edge );-----------------------------------------------------------------------------%define PALIGNR PALIGNR_MMXcglobal predict_8x8_hu_mmxext, 2,2 movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7 add r0, 4*FDEC_STRIDE pshufw mm0, mm1, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1 psllq mm1, 56 ; l7 .. .. .. .. .. .. .. 
movq mm2, mm0 psllw mm0, 8 psrlw mm2, 8 por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0 movq mm3, mm2 movq mm4, mm2 movq mm5, mm2 psrlq mm2, 8 psrlq mm3, 16 por mm2, mm1 ; l7 l7 l6 l5 l4 l3 l2 l1 punpckhbw mm1, mm1 por mm3, mm1 ; l7 l7 l7 l6 l5 l4 l3 l2 pavgb mm4, mm2 PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6 movq mm5, mm4 punpcklbw mm4, mm1 ; p4 p3 p2 p1 punpckhbw mm5, mm1 ; p8 p7 p6 p5 movq mm6, mm5 movq mm7, mm5 movq mm0, mm5 PALIGNR mm5, mm4, 2, mm1 pshufw mm1, mm6, 11111001b PALIGNR mm6, mm4, 4, mm2 pshufw mm2, mm7, 11111110b PALIGNR mm7, mm4, 6, mm3 pshufw mm3, mm0, 11111111b movq [r0-4*FDEC_STRIDE], mm4 movq [r0-3*FDEC_STRIDE], mm5 movq [r0-2*FDEC_STRIDE], mm6 movq [r0-1*FDEC_STRIDE], mm7 movq [r0+0*FDEC_STRIDE], mm0 movq [r0+1*FDEC_STRIDE], mm1 movq [r0+2*FDEC_STRIDE], mm2 movq [r0+3*FDEC_STRIDE], mm3 RET;-----------------------------------------------------------------------------; void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t *edge );-----------------------------------------------------------------------------; fills only some pixels:; f01234567; 0........; 1,,,,,,,,; 2 .......; 3 ,,,,,,,; 4 ......; 5 ,,,,,,; 6 .....; 7 ,,,,,cglobal predict_8x8_vr_core_mmxext, 2,2 movq mm2, [r1+16] movq mm3, [r1+15] movq mm1, [r1+14] movq mm4, mm3 pavgb mm3, mm2 PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm7%assign Y 0%rep 3 movq [r0+ Y *FDEC_STRIDE], mm3 movq [r0+(Y+1)*FDEC_STRIDE], mm0 psllq mm3, 8 psllq mm0, 8%assign Y (Y+2)%endrep movq [r0+ Y *FDEC_STRIDE], mm3 movq [r0+(Y+1)*FDEC_STRIDE], mm0 RET;-----------------------------------------------------------------------------; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c );-----------------------------------------------------------------------------cglobal predict_8x8c_p_core_mmxext, 1,2 LOAD_PLANE_ARGS movq mm1, mm2 pmullw mm2, [pw_3210 GLOBAL] psllw mm1, 2 paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b} paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b} mov r1d, 8ALIGN 4.loop: movq mm5, mm0 movq 
mm6, mm1 psraw mm5, 5 psraw mm6, 5 packuswb mm5, mm6 movq [r0], mm5 paddsw mm0, mm4 paddsw mm1, mm4 add r0, FDEC_STRIDE dec r1d jg .loop REP_RET;-----------------------------------------------------------------------------; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c );-----------------------------------------------------------------------------cglobal predict_16x16_p_core_mmxext, 1,2 LOAD_PLANE_ARGS movq mm5, mm2 movq mm1, mm2 pmullw mm5, [pw_3210 GLOBAL] psllw mm2, 3 psllw mm1, 2 movq mm3, mm2 paddsw mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b} paddsw mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b} paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b} paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b} mov r1d, 16ALIGN 4.loop: movq mm5, mm0 movq mm6, mm1 psraw mm5, 5 psraw mm6, 5 packuswb mm5, mm6 movq [r0], mm5 movq mm5, mm2 movq mm6, mm3 psraw mm5, 5 psraw mm6, 5 packuswb mm5, mm6 movq [r0+8], mm5 paddsw mm0, mm4 paddsw mm1, mm4 paddsw mm2, mm4 paddsw mm3, mm4 add r0, FDEC_STRIDE dec r1d jg .loop REP_RET%endif ; !ARCH_X86_64;-----------------------------------------------------------------------------; void predict_8x8_ddl_sse2( uint8_t *src, uint8_t *edge );-----------------------------------------------------------------------------cglobal predict_8x8_ddl_sse2, 2,2 movdqa xmm3, [r1+16] movdqu xmm2, [r1+17] movdqa xmm1, xmm3 pslldq xmm1, 1 PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4%assign Y 0%rep 8 psrldq xmm0, 1 movq [r0+Y*FDEC_STRIDE], xmm0%assign Y (Y+1)%endrep RET;-----------------------------------------------------------------------------; void predict_8x8_ddr_sse2( uint8_t *src, uint8_t *edge );-----------------------------------------------------------------------------cglobal predict_8x8_ddr_sse2, 2,2 movdqu xmm3, [r1+8] movdqu xmm1, [r1+7] movdqa xmm2, xmm3 psrldq xmm2, 1 PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4 movdqa xmm1, xmm0 psrldq xmm1, 1%assign Y 7%rep 3 movq [r0+Y*FDEC_STRIDE], xmm0 movq 
[r0+(Y-1)*FDEC_STRIDE], xmm1 psrldq xmm0, 2 psrldq xmm1, 2%assign Y (Y-2)%endrep movq [r0+1*FDEC_STRIDE], xmm0 movq [r0+0*FDEC_STRIDE], xmm1 RET;-----------------------------------------------------------------------------; void predict_8x8_vl_sse2( uint8_t *src, uint8_t *edge );-----------------------------------------------------------------------------cglobal predict_8x8_vl_sse2, 2,2 movdqa xmm4, [r1+16] movdqa xmm2, xmm4 movdqa xmm1, xmm4 movdqa xmm3, xmm4 psrldq xmm2, 1 pslldq xmm1, 1 pavgb xmm3, xmm2 PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm4, xmm5; xmm0: (t0 + 2*t1 + t2 + 2) >> 2; xmm3: (t0 + t1 + 1) >> 1%assign Y 0%rep 3 psrldq xmm0, 1 movq [r0+ Y *FDEC_STRIDE], xmm3 movq [r0+(Y+1)*FDEC_STRIDE], xmm0 psrldq xmm3, 1%assign Y (Y+2)%endrep psrldq xmm0, 1 movq [r0+ Y *FDEC_STRIDE], xmm3 movq [r0+(Y+1)*FDEC_STRIDE], xmm0 RET;-----------------------------------------------------------------------------; void predict_8x8_vr_sse2( uint8_t *src, uint8_t *edge );-----------------------------------------------------------------------------cglobal predict_8x8_vr_sse2, 2,2,7 movdqu xmm0, [r1+8] movdqa xmm6, [pw_ff00 GLOBAL] add r0, 4*FDEC_STRIDE movdqa xmm1, xmm0 movdqa xmm2, xmm0 movdqa xmm3, xmm0 pslldq xmm0, 1 pslldq xmm1, 2 pavgb xmm2, xmm0 PRED8x8_LOWPASS_XMM xmm4, xmm3, xmm1, xmm0, xmm5 pandn xmm6, xmm4 movdqa xmm5, xmm4 psrlw xmm4, 8 packuswb xmm6, xmm4 movhlps xmm4, xmm6 movhps [r0-3*FDEC_STRIDE], xmm5 movhps [r0-4*FDEC_STRIDE], xmm2 psrldq xmm5, 4 movss xmm5, xmm6 psrldq xmm2, 4 movss xmm2, xmm4%assign Y 3%rep 3 psrldq xmm5, 1 psrldq xmm2, 1 movq [r0+Y*FDEC_STRIDE], xmm5 movq [r0+(Y-1)*FDEC_STRIDE], xmm2%assign Y (Y-2)%endrep RET;-----------------------------------------------------------------------------; void predict_8x8_hd_mmxext( uint8_t *src, uint8_t *edge );-----------------------------------------------------------------------------%define PALIGNR PALIGNR_MMXcglobal predict_8x8_hd_mmxext, 2,2 add r0, 4*FDEC_STRIDE movq mm0, [r1] ; l7 .. .. .. .. .. .. 
.. movq mm1, [r1+8] ; lt l0 l1 l2 l3 l4 l5 l6 movq mm2, [r1+16] ; t7 t6 t5 t4 t3 t2 t1 t0 movq mm3, mm1 ; lt l0 l1 l2 l3 l4 l5 l6 movq mm4, mm2 ; t7 t6 t5 t4 t3 t2 t1 t0 PALIGNR mm2, mm1, 7, mm5 ; t6 t5 t4 t3 t2 t1 t0 lt PALIGNR mm1, mm0, 7, mm6 ; l0 l1 l2 l3 l4 l5 l6 l7 PALIGNR mm4, mm3, 1, mm7 ; t0 lt l0 l1 l2 l3 l4 l5 movq mm5, mm3 pavgb mm3, mm1 PRED8x8_LOWPASS mm0, mm4, mm1, mm5, mm7 movq mm4, mm2 movq mm1, mm2 ; t6 t5 t4 t3 t2 t1 t0 lt psrlq mm4, 16 ; .. .. t6 t5 t4 t3 t2 t1 psrlq mm1, 8 ; .. t6 t5 t4 t3 t2 t1 t0 PRED8x8_LOWPASS mm6, mm4, mm2, mm1, mm5
; ---- scraped page footer (code-viewer keyboard-shortcut help, not source) ----
; Copy code: Ctrl+C / Search code: Ctrl+F / Fullscreen: F11
; Toggle theme: Ctrl+Shift+D / Show shortcuts: ? 
; Increase font size: Ctrl+= / Decrease font size: Ctrl+-