📄 predict-a.asm
Font size:
; .. p11 p10 p9
; NOTE(review): this excerpt begins mid-function -- the cglobal entry point and
; the code that loaded mm0/mm3/mm6 are above the visible region.  What follows
; stores 8 rows of a diagonal 8x8 prediction: PALIGNR slides the byte window
; (p1..p8, per the comments below) by one pixel per row.  r0 appears to point
; into the middle of the destination block (rows addressed r0 +/- n*FDEC_STRIDE).
; Relies on x264 x86inc/x86util macros defined elsewhere: cglobal, PALIGNR,
; PRED8x8_LOWPASS*, SPLATB, STORE8x8, STORE16x16*, FDEC_STRIDE, GLOBAL.
    movq        mm7, mm3
    punpcklbw   mm3, mm0                    ; p4 p3 p2 p1
    punpckhbw   mm7, mm0                    ; p8 p7 p6 p5
    movq        mm1, mm7
    movq        mm0, mm7
    movq        mm4, mm7
    movq        [r0+3*FDEC_STRIDE], mm3
    PALIGNR     mm7, mm3, 2, mm5            ; shift window 1 byte per row
    movq        [r0+2*FDEC_STRIDE], mm7
    PALIGNR     mm1, mm3, 4, mm5
    movq        [r0+1*FDEC_STRIDE], mm1
    PALIGNR     mm0, mm3, 6, mm3            ; mm3 dead after this; reused as scratch
    movq        [r0+0*FDEC_STRIDE], mm0
    movq        mm2, mm6
    movq        mm3, mm6
    movq        [r0-1*FDEC_STRIDE], mm4     ; mm4 = unshifted high half saved above
    PALIGNR     mm6, mm4, 2, mm5
    movq        [r0-2*FDEC_STRIDE], mm6
    PALIGNR     mm2, mm4, 4, mm5
    movq        [r0-3*FDEC_STRIDE], mm2
    PALIGNR     mm3, mm4, 6, mm4
    movq        [r0-4*FDEC_STRIDE], mm3
    RET
;-----------------------------------------------------------------------------
; void predict_8x8_hd_ssse3( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
; Horizontal-down 8x8 prediction.  One macro body instantiated for sse2 and
; ssse3; the only difference is which PALIGNR implementation %define selects.
; r0 = src (dest pixels), r1 = edge buffer (32 bytes of filtered neighbors).
%macro PREDICT_8x8_HD 1
cglobal predict_8x8_hd_%1, 2,2
    add         r0, 4*FDEC_STRIDE           ; address rows as r0+(-4..3)*stride
    movdqa      xmm0, [r1]                  ; low 16 edge bytes
    movdqa      xmm1, [r1+16]               ; high 16 edge bytes
    movdqa      xmm2, xmm1
    movdqa      xmm3, xmm1
    ; build the three 1-byte-offset windows feeding the 1-2-1 lowpass filter
    PALIGNR     xmm1, xmm0, 7, xmm4
    PALIGNR     xmm2, xmm0, 9, xmm5
    PALIGNR     xmm3, xmm0, 8, xmm0         ; xmm0 consumed as scratch here
    movdqa      xmm4, xmm1
    pavgb       xmm4, xmm3                  ; 2-tap averages (half-pel samples)
    PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm5 ; 3-tap lowpassed samples
    punpcklbw   xmm4, xmm0                  ; interleave avg/lowpass pairs
    movhlps     xmm0, xmm4                  ; xmm0 = upper 4 rows' source bytes
; emit 8 rows, two at a time, shifting the interleaved vector 2 bytes per row
%assign Y 3
%rep 3
    movq        [r0+(Y)*FDEC_STRIDE], xmm4
    movq        [r0+(Y-4)*FDEC_STRIDE], xmm0
    psrldq      xmm4, 2
    psrldq      xmm0, 2
%assign Y (Y-1)
%endrep
    movq        [r0+(Y)*FDEC_STRIDE], xmm4
    movq        [r0+(Y-4)*FDEC_STRIDE], xmm0
    RET
%endmacro
INIT_XMM
PREDICT_8x8_HD sse2
%define PALIGNR PALIGNR_SSSE3
PREDICT_8x8_HD ssse3
INIT_MMX
%define PALIGNR PALIGNR_MMX
;-----------------------------------------------------------------------------
; void predict_8x8_hu_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
; Horizontal-up 8x8 prediction from the 8 left-neighbor bytes at [r1+7].
; The ssse3 path byte-reverses with pshufb; the sse2 path synthesizes the
; reversal with pshufw + shifts (per the l0..l7 comments below).
%macro PREDICT_8x8_HU 1
cglobal predict_8x8_hu_%1, 2,2
    add         r0, 4*FDEC_STRIDE
%ifidn %1, ssse3
    movq        mm5, [r1+7]
    movq        mm6, [pb_reverse GLOBAL]    ; byte-reverse shuffle mask
    movq        mm1, mm5
    movq        mm2, mm5
    movq        mm3, mm5
    pshufb      mm5, mm6                    ; reversed
    psrlq       mm6, 8                      ; mask shifted: reverse + dup last
    pshufb      mm2, mm6
    psrlq       mm6, 8
    pshufb      mm3, mm6
    movq        mm4, mm5
%else
    movq        mm1, [r1+7]                 ; l0 l1 l2 l3 l4 l5 l6 l7
    pshufw      mm0, mm1, 00011011b         ; l6 l7 l4 l5 l2 l3 l0 l1
    movq        mm2, mm0
    psllw       mm0, 8
    psrlw       mm2, 8
    por         mm2, mm0                    ; l7 l6 l5 l4 l3 l2 l1 l0
    psllq       mm1, 56                     ; l7 .. .. .. .. .. .. ..
    movq        mm3, mm2
    movq        mm4, mm2
    movq        mm5, mm2
    psrlq       mm2, 8
    psrlq       mm3, 16
    por         mm2, mm1                    ; l7 l7 l6 l5 l4 l3 l2 l1
    punpckhbw   mm1, mm1
    por         mm3, mm1                    ; l7 l7 l7 l6 l5 l4 l3 l2
%endif
    pavgb       mm4, mm2                    ; 2-tap averages
    PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6 ; 3-tap lowpass
    movq2dq     xmm0, mm4
    movq2dq     xmm1, mm1
    punpcklbw   xmm0, xmm1                  ; rows -4..-1 interleaved in xmm0
    punpckhbw   mm4, mm1                    ; rows 0..3 source bytes in mm4
%assign Y -4
%rep 3
    movq        [r0+Y*FDEC_STRIDE], xmm0
    psrldq      xmm0, 2                     ; advance window 1 pixel per row
%assign Y (Y+1)
%endrep
    ; bottom rows replicate the last pixels (pshufw duplicates tail bytes)
    pshufw      mm5, mm4, 11111001b
    pshufw      mm6, mm4, 11111110b
    pshufw      mm7, mm4, 11111111b
    movq        [r0+Y*FDEC_STRIDE], xmm0
    movq        [r0+0*FDEC_STRIDE], mm4
    movq        [r0+1*FDEC_STRIDE], mm5
    movq        [r0+2*FDEC_STRIDE], mm6
    movq        [r0+3*FDEC_STRIDE], mm7
    RET
%endmacro
PREDICT_8x8_HU sse2
PREDICT_8x8_HU ssse3
;-----------------------------------------------------------------------------
; void predict_8x8c_v_mmx( uint8_t *src )
;-----------------------------------------------------------------------------
; Vertical 8x8 chroma prediction: replicate the row above into all 8 rows.
cglobal predict_8x8c_v_mmx, 1,1
    movq        mm0, [r0 - FDEC_STRIDE]
    STORE8x8    mm0, mm0
    RET
;-----------------------------------------------------------------------------
; void predict_8x8c_h_mmxext( uint8_t *src )
;-----------------------------------------------------------------------------
; Horizontal 8x8 chroma prediction: splat each row's left neighbor across the
; row.  SPLATB is %defined to the mmxext or ssse3 implementation below; the
; ssse3 variant needs the pb_3 shuffle constant in m1.
%macro PRED_8x8C_H 1
cglobal predict_8x8c_h_%1, 1,1
%ifidn %1, ssse3
    mova        m1, [pb_3 GLOBAL]
%endif
%assign n 0
%rep 8
    SPLATB      m0, r0+FDEC_STRIDE*n-1, m1  ; broadcast pixel at column -1
    mova        [r0+FDEC_STRIDE*n], m0
%assign n n+1
%endrep
    RET
%endmacro
INIT_MMX
%define SPLATB SPLATB_MMX
PRED_8x8C_H mmxext
%define SPLATB SPLATB_SSSE3
PRED_8x8C_H ssse3
;-----------------------------------------------------------------------------
; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 )
;-----------------------------------------------------------------------------
; 8x8 chroma DC prediction.  Top-row sums s0/s1 are computed here with psadbw;
; the caller passes the left-column sums s2/s3.  Produces the four per-quadrant
; DC values dc0..dc3 and stores them with STORE8x8.
cglobal predict_8x8c_dc_core_mmxext, 1,1
    movq        mm0, [r0 - FDEC_STRIDE]
    pxor        mm1, mm1
    pxor        mm2, mm2
    punpckhbw   mm1, mm0
    punpcklbw   mm0, mm2
    psadbw      mm1, mm2                    ; s1
    psadbw      mm0, mm2                    ; s0
%ifdef ARCH_X86_64
    movd        mm4, r1d
    movd        mm5, r2d
    paddw       mm0, mm4
    pshufw      mm2, mm5, 0
%else
    paddw       mm0, r1m                    ; 32-bit: read args from the stack
    pshufw      mm2, r2m, 0
%endif
    psrlw       mm0, 3
    paddw       mm1, [pw_2 GLOBAL]          ; rounding bias
    movq        mm3, mm2
    pshufw      mm1, mm1, 0
    pshufw      mm0, mm0, 0                 ; dc0 (w)
    paddw       mm3, mm1
    psrlw       mm3, 3                      ; dc3 (w)
    psrlw       mm2, 2                      ; dc2 (w)
    psrlw       mm1, 2                      ; dc1 (w)
    packuswb    mm0, mm1                    ; dc0,dc1 (b)
    packuswb    mm2, mm3                    ; dc2,dc3 (b)
    STORE8x8    mm0, mm2
    RET
; 8x8 chroma DC using only the top row (left neighbors unavailable).
cglobal predict_8x8c_dc_top_mmxext, 1,1
    movq        mm0, [r0 - FDEC_STRIDE]
    pxor        mm1, mm1
    pxor        mm2, mm2
    punpckhbw   mm1, mm0
    punpcklbw   mm0, mm2
    psadbw      mm1, mm2                    ; s1
    psadbw      mm0, mm2                    ; s0
    psrlw       mm1, 1
    psrlw       mm0, 1
    pavgw       mm1, mm2                    ; (s+2)>>2 via shift then round-avg
    pavgw       mm0, mm2
    pshufw      mm1, mm1, 0
    pshufw      mm0, mm0, 0                 ; dc0 (w)
    packuswb    mm0, mm1                    ; dc0,dc1 (b)
    STORE8x8    mm0, mm0
    RET
;-----------------------------------------------------------------------------
; void predict_8x8c_p_core_sse2( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
; Planar 8x8 chroma prediction: pixel(x,y) = clip((i00 + b*x + c*y) >> 5).
cglobal predict_8x8c_p_core_sse2, 1,1
    movd        xmm0, r1m                   ; i00
    movd        xmm2, r2m                   ; b
    movd        xmm4, r3m                   ; c
    pshuflw     xmm0, xmm0, 0
    pshuflw     xmm2, xmm2, 0
    pshuflw     xmm4, xmm4, 0
    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm4, xmm4
    pmullw      xmm2, [pw_76543210 GLOBAL]  ; b*x for x=0..7
    paddsw      xmm0, xmm2 ; xmm0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
    movdqa      xmm3, xmm0
    paddsw      xmm3, xmm4                  ; row+1
    paddsw      xmm4, xmm4                  ; step = 2*c (loop emits 2 rows/iter)
    ; trick: 'call .loop' runs the body once (rows 0-3), its RET returns here;
    ; after the add, execution falls into .loop again (rows 4-7) and that RET
    ; returns to the real caller.
    call        .loop
    add         r0, FDEC_STRIDE*4
.loop:
    movdqa      xmm5, xmm0
    movdqa      xmm1, xmm3
    psraw       xmm0, 5
    psraw       xmm3, 5
    packuswb    xmm0, xmm3                  ; clip to [0,255]
    movq        [r0+FDEC_STRIDE*0], xmm0
    movhps      [r0+FDEC_STRIDE*1], xmm0
    paddsw      xmm5, xmm4
    paddsw      xmm1, xmm4
    movdqa      xmm0, xmm5
    movdqa      xmm3, xmm1
    psraw       xmm5, 5
    psraw       xmm1, 5
    packuswb    xmm5, xmm1
    movq        [r0+FDEC_STRIDE*2], xmm5
    movhps      [r0+FDEC_STRIDE*3], xmm5
    paddsw      xmm0, xmm4
    paddsw      xmm3, xmm4
    RET
;-----------------------------------------------------------------------------
; void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
; Planar 16x16 prediction: xmm0/xmm1 hold the 16 per-column bases, stepped by
; 2*c per iteration; 8 iterations x 2 rows = 16 rows.
cglobal predict_16x16_p_core_sse2, 1,2,8
    movd        xmm0, r1m                   ; i00
    movd        xmm1, r2m                   ; b
    movd        xmm2, r3m                   ; c
    pshuflw     xmm0, xmm0, 0
    pshuflw     xmm1, xmm1, 0
    pshuflw     xmm2, xmm2, 0
    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    movdqa      xmm3, xmm1
    pmullw      xmm3, [pw_76543210 GLOBAL]
    psllw       xmm1, 3                     ; 8*b
    paddsw      xmm0, xmm3 ; xmm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
    paddsw      xmm1, xmm0 ; xmm1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
    movdqa      xmm7, xmm2
    paddsw      xmm7, xmm7                  ; row step = 2*c
    mov         r1d, 8
ALIGN 4
.loop:
    movdqa      xmm3, xmm0
    movdqa      xmm4, xmm1
    movdqa      xmm5, xmm0
    movdqa      xmm6, xmm1
    psraw       xmm3, 5
    psraw       xmm4, 5
    paddsw      xmm5, xmm2                  ; next row = +c
    paddsw      xmm6, xmm2
    psraw       xmm5, 5
    psraw       xmm6, 5
    packuswb    xmm3, xmm4                  ; clip row y
    packuswb    xmm5, xmm6                  ; clip row y+1
    movdqa      [r0+FDEC_STRIDE*0], xmm3
    movdqa      [r0+FDEC_STRIDE*1], xmm5
    paddsw      xmm0, xmm7
    paddsw      xmm1, xmm7
    add         r0, FDEC_STRIDE*2
    dec         r1d
    jg          .loop
    REP_RET
;-----------------------------------------------------------------------------
; void predict_16x16_v_mmx( uint8_t *src )
;-----------------------------------------------------------------------------
; Vertical 16x16 prediction: replicate the 16 pixels above into all rows.
cglobal predict_16x16_v_mmx, 1,2
    movq        mm0, [r0 - FDEC_STRIDE]
    movq        mm1, [r0 - FDEC_STRIDE + 8]
    STORE16x16  mm0, mm1
    REP_RET
;-----------------------------------------------------------------------------
; void predict_16x16_v_sse2( uint8_t *src )
;-----------------------------------------------------------------------------
cglobal predict_16x16_v_sse2, 1,1
    movdqa      xmm0, [r0 - FDEC_STRIDE]
    STORE16x16_SSE2 xmm0
    RET
;-----------------------------------------------------------------------------
; void predict_16x16_h_mmxext( uint8_t *src )
;-----------------------------------------------------------------------------
; Horizontal 16x16 prediction: splat each left neighbor across its row.
; r1 walks 12*stride down to 0, four rows unrolled per pass; the mmsize==8
; branch issues a second 8-byte store when built with MMX registers.
%macro PRED_16x16_H 1
cglobal predict_16x16_h_%1, 1,2
    mov         r1, FDEC_STRIDE*12
%ifidn %1, ssse3
    mova        m1, [pb_3 GLOBAL]
%endif
.vloop:
%assign n 0
%rep 4
    SPLATB      m0, r0+r1+FDEC_STRIDE*n-1, m1
    mova        [r0+r1+FDEC_STRIDE*n], m0
%if mmsize==8
    mova        [r0+r1+FDEC_STRIDE*n+8], m0
%endif
%assign n n+1
%endrep
    add         r1, -FDEC_STRIDE*4
    jge         .vloop
    REP_RET
%endmacro
;no SSE2, its slower than MMX on all systems that don't support SSSE3
INIT_MMX
%define SPLATB SPLATB_MMX
PRED_16x16_H mmxext
INIT_XMM
%define SPLATB SPLATB_SSSE3
PRED_16x16_H ssse3
;-----------------------------------------------------------------------------
; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left )
;-----------------------------------------------------------------------------
; Shared DC body: %1 = value added to the top-row sum (left sum or rounding
; constant), %2 = final shift (5 when averaging 32 samples, 4 for 16).
%macro PRED16x16_DC 2
    pxor        mm0, mm0
    pxor        mm1, mm1
    psadbw      mm0, [r0 - FDEC_STRIDE]     ; sum of top pixels 0-7
    psadbw      mm1, [r0 - FDEC_STRIDE + 8] ; sum of top pixels 8-15
    paddusw     mm0, mm1
    paddusw     mm0, %1
    psrlw       mm0, %2                     ; dc
    pshufw      mm0, mm0, 0
    packuswb    mm0, mm0                    ; dc in bytes
    STORE16x16  mm0, mm0
%endmacro
cglobal predict_16x16_dc_core_mmxext, 1,2
%ifdef ARCH_X86_64
    movd        mm2, r1d
    PRED16x16_DC mm2, 5
%else
    PRED16x16_DC r1m, 5
%endif
    REP_RET
cglobal predict_16x16_dc_top_mmxext, 1,2
    PRED16x16_DC [pw_8 GLOBAL], 4
    REP_RET
; Left-only DC: the caller passes the already-computed DC value in arg 1.
cglobal predict_16x16_dc_left_core_mmxext, 1,1
    movd        mm0, r1m
    pshufw      mm0, mm0, 0
    packuswb    mm0, mm0
    STORE16x16  mm0, mm0
    REP_RET
;-----------------------------------------------------------------------------
; void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left )
;-----------------------------------------------------------------------------
; SSE2 variant of PRED16x16_DC: one psadbw gives two partial sums, folded
; together with movhlps+paddw.
%macro PRED16x16_DC_SSE2 2
    pxor        xmm0, xmm0
    psadbw      xmm0, [r0 - FDEC_STRIDE]
    movhlps     xmm1, xmm0
    paddw       xmm0, xmm1
    paddusw     xmm0, %1
    psrlw       xmm0, %2                    ; dc
    pshuflw     xmm0, xmm0, 0
    punpcklqdq  xmm0, xmm0
    packuswb    xmm0, xmm0                  ; dc in bytes
    STORE16x16_SSE2 xmm0
%endmacro
cglobal predict_16x16_dc_core_sse2, 1,1
    movd        xmm2, r1m
    PRED16x16_DC_SSE2 xmm2, 5
    RET
cglobal predict_16x16_dc_top_sse2, 1,1
    PRED16x16_DC_SSE2 [pw_8 GLOBAL], 4
    RET
cglobal predict_16x16_dc_left_core_sse2, 1,1
    movd        xmm0, r1m
    pshuflw     xmm0, xmm0, 0
    punpcklqdq  xmm0, xmm0
    packuswb    xmm0, xmm0
    STORE16x16_SSE2 xmm0
    RET
⌨️ Keyboard shortcuts
Copy code
Ctrl + C
Search code
Ctrl + F
Full-screen mode
F11
Toggle theme
Ctrl + Shift + D
Show shortcuts
?
Increase font size
Ctrl + =
Decrease font size
Ctrl + -