📄 h264dsp_mmx.c
字号:
"add %4, %1 \n\t"\
"add %3, %2 \n\t"\
: "+a"(src), "+c"(dst), "+d"(src2)\
: "D"((long)src2Stride), "S"((long)dstStride)\
: "memory"\
);\
}while(--h);\
}\
/* Vertical lowpass for a 4-pixel-wide column: preloads five source rows (starting two rows above src) and runs four QPEL_H264V steps. */\
/* dst, src: output and input pixel pointers; dstStride, srcStride: their row pitches in bytes. */\
/* QPEL_H264V and OP are defined outside this excerpt; presumably each QPEL_H264V step loads one further row, filters, and emits one row through OP -- TODO confirm. */\
static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
/* Start reading two rows above the first output row. */\
src -= 2*srcStride;\
asm volatile(\
/* Operands: %0=src, %1=dst, %2=srcStride, %3=dstStride, %4=ff_pw_5, %5=ff_pw_16. */\
/* mm7 = 0, used as the zero source when widening bytes to words. */\
"pxor %%mm7, %%mm7 \n\t"\
/* Load 4 bytes from each of five consecutive rows into mm0..mm4, stepping src by srcStride. */\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
/* Widen the five rows from unsigned bytes to 16-bit words. */\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
/* Four filter steps; the register arguments rotate by one position per step so the row window slides down. */\
QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
\
: "+a"(src), "+c"(dst)\
: "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
}\
/* Two-pass (vertical then horizontal) lowpass producing 4 rows of 4 pixels. */\
/* Pass 1 processes three 4-pixel-wide column strips with QPEL_H264HV, which receives byte offsets 0, 24, 48, 72 (presumably into tmp -- TODO confirm). */\
/* Pass 2 filters each row of tmp horizontally and emits 4 pixels per row to dst through OP. */\
/* dst: output pixels; tmp: int16_t scratch buffer; src: input pixels; dstStride, srcStride: row pitches in bytes. */\
/* NOTE(review): tmpStride is never referenced here; the tmp row pitch is hard-coded as 24 bytes (see "add $24" and the 8*3 offsets). */\
static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
/* h: row counter for pass 2; w: number of column strips in pass 1. */\
int h=4;\
int w=3;\
/* Start two rows above and two pixels left of the block. */\
src -= 2*srcStride+2;\
while(w--){\
asm volatile(\
/* Operands: %0=src, %1=tmp, %2=srcStride, %3=ff_pw_5. */\
/* mm7 = 0 for byte-to-word widening. */\
"pxor %%mm7, %%mm7 \n\t"\
/* Load 4 bytes from each of five consecutive rows into mm0..mm4. */\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
/* Widen to 16-bit words. */\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
/* Four vertical steps with rotating registers; the last argument is the byte offset of the row (24 bytes per row). */\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
\
: "+a"(src)\
: "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
: "memory"\
);\
/* Next strip: 4 int16_t entries to the right in tmp. */\
tmp += 4;\
/* Next strip: 4 pixels right, rewound 9 rows (5 advanced above; the other 4 are presumably advanced inside QPEL_H264HV -- TODO confirm). */\
src += 4 - 9*srcStride;\
}\
/* Undo the three "tmp += 4" steps so pass 2 starts at the first column. */\
tmp -= 3*4;\
asm volatile(\
/* Operands: %0=tmp, %1=dst, %2=h (in memory), %3=dstStride, %4=ff_pw_32. */\
/* mm6 = rounding constant loaded from ff_pw_32. */\
"movq %4, %%mm6 \n\t"\
"1: \n\t"\
/* With t[] the 16-bit words of the current tmp row: mm0 = a = t[x]+t[x+5], mm1 = b = t[x+1]+t[x+4], mm2 = c = t[x+2]+t[x+3]. */\
"movq (%0), %%mm0 \n\t"\
"paddw 10(%0), %%mm0 \n\t"\
"movq 2(%0), %%mm1 \n\t"\
"paddw 8(%0), %%mm1 \n\t"\
"movq 4(%0), %%mm2 \n\t"\
"paddw 6(%0), %%mm2 \n\t"\
"psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\
"psraw $2, %%mm0 \n\t"/*(a-b)/4 */\
"psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\
"paddsw %%mm2, %%mm0 \n\t"\
"psraw $2, %%mm0 \n\t"/*((a-b)/4-b+c)/4 */\
"paddw %%mm6, %%mm2 \n\t"\
"paddw %%mm2, %%mm0 \n\t"/*(a-5*b+20*c)/16 +32 */\
/* Final scale down by 64, then saturate the words to unsigned bytes. */\
"psraw $6, %%mm0 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
/* Emit 4 bytes (size suffix d) to dst through OP, with mm7 passed as its scratch register. */\
OP(%%mm0, (%1),%%mm7, d)\
/* Next tmp row (24 bytes) and next dst row. */\
"add $24, %0 \n\t"\
"add %3, %1 \n\t"\
/* Loop until the in-memory row counter h reaches zero. */\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+m"(h)\
: "S"((long)dstStride), "m"(ff_pw_32)\
: "memory"\
);\
}\
\
/* Horizontal 6-tap lowpass over 8 rows of 8 pixels. */\
/* Per output pixel x the asm computes (20*(s[x]+s[x+1]) - 5*(s[x-1]+s[x+2]) + (s[x-2]+s[x+3]) + r) >> 5, */\
/* where the factor 5 comes from ff_pw_5 and r from ff_pw_16 (both defined elsewhere; presumably packed words of 5 and 16 -- TODO confirm). */\
/* dst, src: output and input pixel pointers; dstStride, srcStride: their row pitches in bytes. */\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
/* Row counter; lives in memory and is decremented by the asm loop. */\
int h=8;\
asm volatile(\
/* Operands: %0=src, %1=dst, %2=h (in memory), %3=srcStride, %4=dstStride, %5=ff_pw_5, %6=ff_pw_16. */\
/* mm7 = 0 for widening, mm6 = multiplier loaded from ff_pw_5. */\
"pxor %%mm7, %%mm7 \n\t"\
"movq %5, %%mm6 \n\t"\
"1: \n\t"\
/* mm0/mm1 = low/high halves of s[x]+s[x+1], widened to words. */\
"movq (%0), %%mm0 \n\t"\
"movq 1(%0), %%mm2 \n\t"\
"movq %%mm0, %%mm1 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpckhbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
/* Multiply by 4. */\
"psllw $2, %%mm0 \n\t"\
"psllw $2, %%mm1 \n\t"\
/* mm2/mm3 = low/high of s[x-1]; mm4/mm5 = low/high of s[x+2]. */\
"movq -1(%0), %%mm2 \n\t"\
"movq 2(%0), %%mm4 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"movq %%mm4, %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm7, %%mm5 \n\t"\
/* mm2 = s[x-1]+s[x+2] for x=0..3, mm5 = the same sum for x=4..7; mm3 and mm4 stay unmodified for reuse below. */\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm3, %%mm5 \n\t"\
/* 4*(s[x]+s[x+1]) - (s[x-1]+s[x+2]), then multiplied by mm6. */\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm5, %%mm1 \n\t"\
"pmullw %%mm6, %%mm0 \n\t"\
"pmullw %%mm6, %%mm1 \n\t"\
/* Outer taps: s[-2..1] is added to mm3 (= s[3..6]) for x=0..3, and s[7..10] to mm4 (= s[2..5]) for x=4..7. */\
"movd -2(%0), %%mm2 \n\t"\
"movd 7(%0), %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm5 \n\t"\
"paddw %%mm3, %%mm2 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
/* Add the rounding constant from ff_pw_16, accumulate, and shift right by 5. */\
"movq %6, %%mm5 \n\t"\
"paddw %%mm5, %%mm2 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm4, %%mm1 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
/* Saturate both halves to 8 unsigned bytes and emit them to dst through OP (size suffix q, mm5 as scratch). */\
"packuswb %%mm1, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm5, q)\
/* Advance src and dst by their own strides and loop while h is non-zero. */\
"add %3, %0 \n\t"\
"add %4, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(src), "+c"(dst), "+m"(h)\
: "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
}\
\
/* Horizontal 6-tap lowpass over 8 rows of 8 pixels, with each filtered row combined with 8 bytes from src2 via PAVGB before OP emits it to dst. */\
/* The filter arithmetic is (20*(s[x]+s[x+1]) - 5*(s[x-1]+s[x+2]) + (s[x-2]+s[x+3]) + r) >> 5, with 5 and r taken from ff_pw_5 and ff_pw_16 */\
/* (defined elsewhere; presumably packed words of 5 and 16 -- TODO confirm). PAVGB is a macro defined outside this excerpt. */\
/* dst: output pixels; src: pixels to filter; src2: second input; dstStride, src2Stride: row pitches in bytes. */\
/* NOTE(review): there is no srcStride parameter; src is advanced by dstStride, so callers presumably pass src with the same pitch as dst -- confirm. */\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
/* Row counter for the C-level do/while loop. */\
int h=8;\
/* One-time setup: mm7 = 0, mm6 = ff_pw_5. The per-row asm below reads both without reloading them, */\
/* so this relies on the MMX registers being left untouched between the separate asm statements. */\
asm volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movq %0, %%mm6 \n\t"\
:: "m"(ff_pw_5)\
);\
do{\
asm volatile(\
/* Operands: %0=src, %1=dst, %2=src2, %3=src2Stride, %4=dstStride, %5=ff_pw_16. */\
/* mm0/mm1 = low/high halves of s[x]+s[x+1], widened to words. */\
"movq (%0), %%mm0 \n\t"\
"movq 1(%0), %%mm2 \n\t"\
"movq %%mm0, %%mm1 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpckhbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
/* Multiply by 4. */\
"psllw $2, %%mm0 \n\t"\
"psllw $2, %%mm1 \n\t"\
/* mm2/mm3 = low/high of s[x-1]; mm4/mm5 = low/high of s[x+2]. */\
"movq -1(%0), %%mm2 \n\t"\
"movq 2(%0), %%mm4 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"movq %%mm4, %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm7, %%mm5 \n\t"\
/* mm2 = s[x-1]+s[x+2] for x=0..3, mm5 = the same sum for x=4..7; mm3 and mm4 stay unmodified for reuse below. */\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm3, %%mm5 \n\t"\
/* 4*(s[x]+s[x+1]) - (s[x-1]+s[x+2]), then multiplied by mm6. */\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm5, %%mm1 \n\t"\
"pmullw %%mm6, %%mm0 \n\t"\
"pmullw %%mm6, %%mm1 \n\t"\
/* Outer taps: s[-2..1] is added to mm3 (= s[3..6]) for x=0..3, and s[7..10] to mm4 (= s[2..5]) for x=4..7. */\
"movd -2(%0), %%mm2 \n\t"\
"movd 7(%0), %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm5 \n\t"\
"paddw %%mm3, %%mm2 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
/* Add the rounding constant from ff_pw_16, accumulate, and shift right by 5. */\
"movq %5, %%mm5 \n\t"\
"paddw %%mm5, %%mm2 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm4, %%mm1 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
/* Fetch 8 bytes of src2, pack the filtered row to bytes, and combine the two with PAVGB. */\
"movq (%2), %%mm4 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
PAVGB" %%mm4, %%mm0 \n\t"\
/* Emit 8 bytes to dst through OP (size suffix q, mm5 as scratch). */\
OP(%%mm0, (%1),%%mm5, q)\
/* Advance src and dst by dstStride, and src2 by src2Stride. */\
"add %4, %0 \n\t"\
"add %4, %1 \n\t"\
"add %3, %2 \n\t"\
: "+a"(src), "+c"(dst), "+d"(src2)\
: "D"((long)src2Stride), "S"((long)dstStride),\
"m"(ff_pw_16)\
: "memory"\
);\
}while(--h);\
}\
\
/* Vertical lowpass over two adjacent 4-pixel-wide column strips (8 pixels wide in total), for h rows. */\
/* Eight QPEL_H264V steps always run; eight more run when h==16. */\
/* dst, src: output and input pixel pointers; dstStride, srcStride: row pitches in bytes; h: row count (the code distinguishes only h==16 from other values). */\
/* QPEL_H264V and OP are defined outside this excerpt; presumably each step loads one further source row and emits one dst row -- TODO confirm. */\
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
/* Number of 4-pixel-wide strips to process. */\
int w= 2;\
/* Start reading two rows above the first output row. */\
src -= 2*srcStride;\
\
while(w--){\
asm volatile(\
/* Operands: %0=src, %1=dst, %2=srcStride, %3=dstStride, %4=ff_pw_5, %5=ff_pw_16. */\
/* mm7 = 0 for byte-to-word widening. */\
"pxor %%mm7, %%mm7 \n\t"\
/* Load 4 bytes from each of five consecutive rows into mm0..mm4. */\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
/* Widen to 16-bit words. */\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
/* First eight steps; the six register arguments rotate by one position per step. */\
QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
\
: "+a"(src), "+c"(dst)\
: "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
if(h==16){\
/* Eight more steps continuing the same register rotation; this relies on the MMX registers */\
/* (including mm7 = 0) keeping their values from the previous asm statement. */\
asm volatile(\
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
\
: "+a"(src), "+c"(dst)\
: "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
}\
/* Move to the next strip: 4 pixels right, rewinding src by h+5 rows and dst by h rows. */\
src += 4-(h+5)*srcStride;\
dst += 4-h*dstStride;\
}\
}\
/* Two-pass (vertical then horizontal) lowpass for a size x size block; the code distinguishes only size==16 from other values. */\
/* Pass 1 processes (size+8)/4 column strips of 4 pixels with QPEL_H264HV, which receives byte offsets in steps of 48 (presumably into tmp -- TODO confirm). */\
/* Pass 2 filters tmp horizontally, 8 output pixels per row, and emits them to dst through OP; it runs once for size 8 and twice for size 16. */\
/* dst: output pixels; tmp: int16_t scratch buffer; src: input pixels; dstStride, srcStride: row pitches in bytes. */\
/* NOTE(review): tmpStride is never referenced here; the tmp row pitch is hard-coded as 48 bytes (24 int16_t). */\
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
/* h: row counter for pass 2 (reset before each pass-2 asm). */\
int h = size;\
/* Strip count for pass 1: 4 strips when size is 8, 6 strips when size is 16. */\
int w = (size+8)>>2;\
/* Start two rows above and two pixels left of the block. */\
src -= 2*srcStride+2;\
while(w--){\
asm volatile(\
/* Operands: %0=src, %1=tmp, %2=srcStride, %3=ff_pw_5. */\
/* mm7 = 0 for byte-to-word widening. */\
"pxor %%mm7, %%mm7 \n\t"\
/* Load 4 bytes from each of five consecutive rows into mm0..mm4. */\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
/* Widen to 16-bit words. */\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
/* First eight vertical steps with rotating registers; the last argument is the row's byte offset (48 bytes per row). */\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
: "+a"(src)\
: "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
: "memory"\
);\
if(size==16){\
/* Rows 8..15, continuing the register rotation; this relies on the MMX registers keeping their values from the previous asm statement. */\
asm volatile(\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\
QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
: "+a"(src)\
: "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
: "memory"\
);\
}\
/* Next strip: 4 int16_t entries right in tmp, 4 pixels right in src, rewinding src by size+5 rows. */\
tmp += 4;\
src += 4 - (size+5)*srcStride;\
}\
/* Undo the per-strip "tmp += 4" steps (size+8 entries in total). */\
tmp -= size+8;\
/* Pass-2 repeat count: the do/while below runs once when size>>4 is 0 and twice when it is 1. */\
w = size>>4;\
do{\
h = size;\
asm volatile(\
/* Operands: %0=tmp, %1=dst, %2=h (in memory), %3=dstStride, %4=ff_pw_32. */\
/* mm6 = rounding constant loaded from ff_pw_32. */\
"movq %4, %%mm6 \n\t"\
"1: \n\t"\
/* With t[] the 16-bit words of the current tmp row, low half x=0..3 uses mm0/mm1/mm2 and high half x=4..7 uses mm3/mm4/mm5: */\
/* mm0, mm3 = a = t[x]+t[x+5]; mm1, mm4 = b = t[x+1]+t[x+4]; mm2, mm5 = c = t[x+2]+t[x+3]. */\
"movq (%0), %%mm0 \n\t"\
"movq 8(%0), %%mm3 \n\t"\
"movq 2(%0), %%mm1 \n\t"\
"movq 10(%0), %%mm4 \n\t"\
"paddw %%mm4, %%mm0 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
"paddw 18(%0), %%mm3 \n\t"\
"paddw 16(%0), %%mm4 \n\t"\
"movq 4(%0), %%mm2 \n\t"\
"movq 12(%0), %%mm5 \n\t"\
"paddw 6(%0), %%mm2 \n\t"\
"paddw 14(%0), %%mm5 \n\t"\
/* a-b */\
"psubw %%mm1, %%mm0 \n\t"\
"psubw %%mm4, %%mm3 \n\t"\
/* (a-b)>>2 */\
"psraw $2, %%mm0 \n\t"\
"psraw $2, %%mm3 \n\t"\
/* ((a-b)>>2)-b */\
"psubw %%mm1, %%mm0 \n\t"\
"psubw %%mm4, %%mm3 \n\t"\
/* Saturating add of c, then >>2. */\
"paddsw %%mm2, %%mm0 \n\t"\
"paddsw %%mm5, %%mm3 \n\t"\
"psraw $2, %%mm0 \n\t"\
"psraw $2, %%mm3 \n\t"\
/* Add c plus the rounding constant, then shift right by 6. */\
"paddw %%mm6, %%mm2 \n\t"\
"paddw %%mm6, %%mm5 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm5, %%mm3 \n\t"\
"psraw $6, %%mm0 \n\t"\
"psraw $6, %%mm3 \n\t"\
/* Saturate both halves to 8 unsigned bytes and emit them to dst through OP (size suffix q, mm7 as scratch). */\
"packuswb %%mm3, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm7, q)\
/* Next tmp row (48 bytes) and next dst row; loop while h is non-zero. */\
"add $48, %0 \n\t"\
"add %3, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+m"(h)\
: "S"((long)dstStride), "m"(ff_pw_32)\
: "memory"\
);\
/* Rewind size rows (24 int16_t per tmp row) and step 8 columns right for the next pass. */\
tmp += 8 - size*24;\
dst += 8 - size*dstStride;\
}while(w--);\
}\
\
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -