📄 h264dsp_mmx.c.svn-base
字号:
"packuswb %%mm1, %%mm0 \n\t"\ "packuswb %%mm3, %%mm2 \n\t"\ PAVGB" (%0), %%mm0 \n\t"\ PAVGB" (%0,%3), %%mm2 \n\t"\ OP(%%mm0, (%2), %%mm5, q)\ OP(%%mm2, (%2,%4), %%mm5, q)\ ::"a"(src8), "c"(src16), "d"(dst),\ "r"((long)src8Stride), "r"((long)dstStride)\ :"memory");\ src8 += 2L*src8Stride;\ src16 += 48;\ dst += 2L*dstStride;\ }while(h-=2);\}\static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\{\ OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\ OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\}\#ifdef ARCH_X86_64#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ int h=16;\ asm volatile(\ "pxor %%xmm15, %%xmm15 \n\t"\ "movdqa %6, %%xmm14 \n\t"\ "movdqa %7, %%xmm13 \n\t"\ "1: \n\t"\ "lddqu 3(%0), %%xmm1 \n\t"\ "lddqu -5(%0), %%xmm7 \n\t"\ "movdqa %%xmm1, %%xmm0 \n\t"\ "punpckhbw %%xmm15, %%xmm1 \n\t"\ "punpcklbw %%xmm15, %%xmm0 \n\t"\ "punpcklbw %%xmm15, %%xmm7 \n\t"\ "movdqa %%xmm1, %%xmm2 \n\t"\ "movdqa %%xmm0, %%xmm6 \n\t"\ "movdqa %%xmm1, %%xmm3 \n\t"\ "movdqa %%xmm0, %%xmm8 \n\t"\ "movdqa %%xmm1, %%xmm4 \n\t"\ "movdqa %%xmm0, %%xmm9 \n\t"\ "movdqa %%xmm1, %%xmm5 \n\t"\ "movdqa %%xmm0, %%xmm10 \n\t"\ "palignr $6, %%xmm0, %%xmm5 \n\t"\ "palignr $6, %%xmm7, %%xmm10\n\t"\ "palignr $8, %%xmm0, %%xmm4 \n\t"\ "palignr $8, %%xmm7, %%xmm9 \n\t"\ "palignr $10,%%xmm0, %%xmm3 \n\t"\ "palignr $10,%%xmm7, %%xmm8 \n\t"\ "paddw %%xmm1, %%xmm5 \n\t"\ "paddw %%xmm0, %%xmm10 \n\t"\ "palignr $12,%%xmm0, %%xmm2 \n\t"\ "palignr $12,%%xmm7, %%xmm6 \n\t"\ "palignr $14,%%xmm0, %%xmm1 \n\t"\ "palignr $14,%%xmm7, %%xmm0 \n\t"\ "paddw %%xmm3, %%xmm2 \n\t"\ "paddw %%xmm8, %%xmm6 \n\t"\ "paddw %%xmm4, %%xmm1 \n\t"\ "paddw %%xmm9, %%xmm0 \n\t"\ "psllw $2, %%xmm2 \n\t"\ "psllw $2, %%xmm6 \n\t"\ "psubw %%xmm1, %%xmm2 \n\t"\ "psubw %%xmm0, %%xmm6 \n\t"\ "paddw %%xmm13,%%xmm5 \n\t"\ "paddw %%xmm13,%%xmm10 \n\t"\ "pmullw %%xmm14,%%xmm2 \n\t"\ "pmullw %%xmm14,%%xmm6 \n\t"\ "lddqu (%2), %%xmm3 \n\t"\ "paddw %%xmm5, %%xmm2 \n\t"\ "paddw %%xmm10,%%xmm6 \n\t"\ "psraw $5, %%xmm2 \n\t"\ "psraw $5, %%xmm6 \n\t"\ "packuswb %%xmm2,%%xmm6 \n\t"\ "pavgb %%xmm3, %%xmm6 \n\t"\ OP(%%xmm6, (%1), %%xmm4, dqa)\ "add %5, %0 \n\t"\ "add %5, %1 \n\t"\ "add %4, %2 \n\t"\ "decl %3 \n\t"\ "jg 1b \n\t"\ : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\ : "D"((long)src2Stride), "S"((long)dstStride),\ "m"(ff_pw_5), "m"(ff_pw_16)\ : "memory"\ );\}#else // ARCH_X86_64#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ src += 8*dstStride;\ dst += 8*dstStride;\ src2 += 8*src2Stride;\ OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\}#endif // ARCH_X86_64#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ int h=8;\ asm volatile(\ "pxor %%xmm7, %%xmm7 \n\t"\ "movdqa %0, %%xmm6 \n\t"\ :: "m"(ff_pw_5)\ );\ do{\ asm volatile(\ "lddqu -5(%0), %%xmm1 \n\t"\ "movdqa %%xmm1, %%xmm0 \n\t"\ "punpckhbw %%xmm7, %%xmm1 \n\t"\ "punpcklbw %%xmm7, %%xmm0 \n\t"\ "movdqa %%xmm1, %%xmm2 \n\t"\ "movdqa %%xmm1, %%xmm3 \n\t"\ "movdqa %%xmm1, %%xmm4 \n\t"\ "movdqa %%xmm1, %%xmm5 \n\t"\ "palignr $6, %%xmm0, %%xmm5 \n\t"\ "palignr $8, %%xmm0, %%xmm4 \n\t"\ "palignr $10,%%xmm0, %%xmm3 \n\t"\ "paddw %%xmm1, %%xmm5 \n\t"\ "palignr $12,%%xmm0, %%xmm2 \n\t"\ "palignr $14,%%xmm0, %%xmm1 \n\t"\ "paddw %%xmm3, %%xmm2 \n\t"\ "paddw %%xmm4, %%xmm1 \n\t"\ "psllw $2, %%xmm2 \n\t"\ "movq (%2), %%xmm3 \n\t"\ "psubw %%xmm1, %%xmm2 \n\t"\ "paddw %5, %%xmm5 \n\t"\ "pmullw %%xmm6, %%xmm2 \n\t"\ "paddw %%xmm5, %%xmm2 \n\t"\ "psraw $5, %%xmm2 \n\t"\ "packuswb %%xmm2, %%xmm2 \n\t"\ "pavgb %%xmm3, %%xmm2 \n\t"\ OP(%%xmm2, (%1), %%xmm4, q)\ "add %4, %0 \n\t"\ "add %4, %1 \n\t"\ "add %3, %2 \n\t"\ : "+a"(src), "+c"(dst), "+d"(src2)\ : "D"((long)src2Stride), "S"((long)dstStride),\ "m"(ff_pw_16)\ : "memory"\ );\ }while(--h);\}\QPEL_H264_H16_XMM(OPNAME, OP, MMX)\\static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ int h=8;\ asm volatile(\ "pxor %%xmm7, %%xmm7 \n\t"\ "movdqa %5, %%xmm6 \n\t"\ "1: \n\t"\ "lddqu -5(%0), %%xmm1 \n\t"\ "movdqa %%xmm1, %%xmm0 \n\t"\ "punpckhbw %%xmm7, %%xmm1 \n\t"\ "punpcklbw %%xmm7, %%xmm0 \n\t"\ "movdqa %%xmm1, %%xmm2 \n\t"\ "movdqa %%xmm1, %%xmm3 \n\t"\ "movdqa %%xmm1, %%xmm4 \n\t"\ "movdqa %%xmm1, %%xmm5 \n\t"\ "palignr $6, %%xmm0, %%xmm5 \n\t"\ "palignr $8, %%xmm0, %%xmm4 \n\t"\ "palignr $10,%%xmm0, %%xmm3 \n\t"\ "paddw %%xmm1, %%xmm5 \n\t"\ "palignr $12,%%xmm0, %%xmm2 \n\t"\ "palignr $14,%%xmm0, %%xmm1 \n\t"\ "paddw %%xmm3, %%xmm2 \n\t"\ "paddw %%xmm4, %%xmm1 \n\t"\ "psllw $2, %%xmm2 \n\t"\ "psubw %%xmm1, %%xmm2 \n\t"\ "paddw %6, %%xmm5 \n\t"\ "pmullw %%xmm6, %%xmm2 \n\t"\ "paddw %%xmm5, %%xmm2 \n\t"\ "psraw $5, %%xmm2 \n\t"\ "packuswb %%xmm2, %%xmm2 \n\t"\ OP(%%xmm2, (%1), %%xmm4, q)\ "add %3, %0 \n\t"\ "add %4, %1 \n\t"\ "decl %2 \n\t"\ " jnz 1b \n\t"\ : "+a"(src), "+c"(dst), "+g"(h)\ : "D"((long)srcStride), "S"((long)dstStride),\ "m"(ff_pw_5), "m"(ff_pw_16)\ : "memory"\ );\}\static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ src += 8*srcStride;\ dst += 8*dstStride;\ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\}\#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ src -= 2*srcStride;\ \ asm volatile(\ "pxor %%xmm7, %%xmm7 \n\t"\ "movq (%0), %%xmm0 \n\t"\ "add %2, %0 \n\t"\ "movq (%0), %%xmm1 \n\t"\ "add %2, %0 \n\t"\ "movq (%0), %%xmm2 \n\t"\ "add %2, %0 \n\t"\ "movq (%0), %%xmm3 \n\t"\ "add %2, %0 \n\t"\ "movq (%0), %%xmm4 \n\t"\ "add %2, %0 \n\t"\ "punpcklbw %%xmm7, %%xmm0 \n\t"\ "punpcklbw %%xmm7, %%xmm1 \n\t"\ "punpcklbw %%xmm7, %%xmm2 \n\t"\ "punpcklbw %%xmm7, %%xmm3 \n\t"\ "punpcklbw %%xmm7, %%xmm4 \n\t"\ QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ \ : "+a"(src), "+c"(dst)\ : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ : "memory"\ );\ if(h==16){\ asm volatile(\ QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ \ : "+a"(src), "+c"(dst)\ : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ : "memory"\ );\ }\}\static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\}\static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\}static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){ int w = (size+8)>>3; src -= 2*srcStride+2; while(w--){ asm volatile( "pxor %%xmm7, %%xmm7 \n\t" "movq (%0), %%xmm0 \n\t" "add %2, %0 \n\t" "movq (%0), %%xmm1 \n\t" "add %2, %0 \n\t" "movq (%0), %%xmm2 \n\t" "add %2, %0 \n\t" "movq (%0), %%xmm3 \n\t" "add %2, %0 \n\t" "movq (%0), %%xmm4 \n\t" "add %2, %0 \n\t" "punpcklbw %%xmm7, %%xmm0 \n\t" "punpcklbw %%xmm7, %%xmm1 \n\t" "punpcklbw %%xmm7, %%xmm2 \n\t" "punpcklbw %%xmm7, %%xmm3 \n\t" "punpcklbw %%xmm7, %%xmm4 \n\t" QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48) QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48) QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48) QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48) QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48) QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48) QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48) QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48) : "+a"(src) : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5), "m"(ff_pw_16) : "memory" ); if(size==16){ asm volatile( QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48) QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48) QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48) QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48) QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48) QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48) QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48) QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48) : "+a"(src) : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5), "m"(ff_pw_16) : "memory" ); } tmp += 8; src += 8 - (size+5)*srcStride; }}#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ int h = size;\ if(size == 16){\ asm volatile(\ "1: \n\t"\ "movdqa 32(%0), %%xmm4 \n\t"\ "movdqa 16(%0), %%xmm5 \n\t"\ "movdqa (%0), %%xmm7 \n\t"\ "movdqa %%xmm4, %%xmm3 \n\t"\ "movdqa %%xmm4, %%xmm2 \n\t"\ "movdqa %%xmm4, %%xmm1 \n\t"\ "movdqa %%xmm4, %%xmm0 \n\t"\
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -