📄 h264dsp_mmx.c
字号:
static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0){ //FIXME: could cut some load/stores by merging transpose with filter uint8_t trans[8*4]; transpose4x4(trans, pix-2, 8, stride); transpose4x4(trans+4, pix-2+4*stride, 8, stride); h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0); transpose4x4(pix-2, trans, stride, 8); transpose4x4(pix-2+4*stride, trans+4, stride, 8);}// p0 = (p0 + q1 + 2*p1 + 2) >> 2#define H264_FILTER_CHROMA4(p0, p1, q1, one) \ "movq "#p0", %%mm4 \n\t"\ "pxor "#q1", %%mm4 \n\t"\ "pand "#one", %%mm4 \n\t" /* mm4 = (p0^q1)&1 */\ "pavgb "#q1", "#p0" \n\t"\ "psubusb %%mm4, "#p0" \n\t"\ "pavgb "#p1", "#p0" \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1){ asm volatile( "movq (%0), %%mm0 \n\t" "movq (%0,%2), %%mm1 \n\t" "movq (%1), %%mm2 \n\t" "movq (%1,%2), %%mm3 \n\t" H264_DEBLOCK_MASK(%3, %4) "movq %%mm1, %%mm5 \n\t" "movq %%mm2, %%mm6 \n\t" H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0' H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0' "psubb %%mm5, %%mm1 \n\t" "psubb %%mm6, %%mm2 \n\t" "pand %%mm7, %%mm1 \n\t" "pand %%mm7, %%mm2 \n\t" "paddb %%mm5, %%mm1 \n\t" "paddb %%mm6, %%mm2 \n\t" "movq %%mm1, (%0,%2) \n\t" "movq %%mm2, (%1) \n\t" :: "r"(pix-2*stride), "r"(pix), "r"((long)stride), "m"(alpha1), "m"(beta1), "m"(mm_bone) );}static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta){ h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1);}static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta){ //FIXME: could cut some load/stores by merging transpose with filter uint8_t trans[8*4]; transpose4x4(trans, pix-2, 8, stride); transpose4x4(trans+4, pix-2+4*stride, 8, stride); h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1); transpose4x4(pix-2, trans, stride, 8); transpose4x4(pix-2+4*stride, trans+4, stride, 8);}/***********************************//* motion compensation */#define QPEL_H264V(A,B,C,D,E,F,OP)\ "movd (%0), "#F" \n\t"\ "movq "#C", %%mm6 \n\t"\ "paddw "#D", %%mm6 \n\t"\ "psllw $2, %%mm6 \n\t"\ "psubw "#B", %%mm6 \n\t"\ "psubw "#E", %%mm6 \n\t"\ "pmullw %4, %%mm6 \n\t"\ "add %2, %0 \n\t"\ "punpcklbw %%mm7, "#F" \n\t"\ "paddw %5, "#A" \n\t"\ "paddw "#F", "#A" \n\t"\ "paddw "#A", %%mm6 \n\t"\ "psraw $5, %%mm6 \n\t"\ "packuswb %%mm6, %%mm6 \n\t"\ OP(%%mm6, (%1), A, d)\ "add %3, %1 \n\t" #define QPEL_H264HV(A,B,C,D,E,F,OF)\ "movd (%0), "#F" \n\t"\ "movq "#C", %%mm6 \n\t"\ "paddw "#D", %%mm6 \n\t"\ "psllw $2, %%mm6 \n\t"\ "psubw "#B", %%mm6 \n\t"\ "psubw "#E", %%mm6 \n\t"\ "pmullw %3, %%mm6 \n\t"\ "add %2, %0 \n\t"\ "punpcklbw %%mm7, "#F" \n\t"\ "paddw "#F", "#A" \n\t"\ "paddw "#A", %%mm6 \n\t"\ "movq %%mm6, "#OF"(%1) \n\t" #define QPEL_H264(OPNAME, OP, MMX)\static void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ int h=4;\\ asm volatile(\ "pxor %%mm7, %%mm7 \n\t"\ "movq %5, %%mm4 \n\t"\ "movq %6, %%mm5 \n\t"\ "1: \n\t"\ "movd -1(%0), %%mm1 \n\t"\ "movd (%0), %%mm2 \n\t"\ "movd 1(%0), %%mm3 \n\t"\ "movd 2(%0), %%mm0 \n\t"\ "punpcklbw %%mm7, %%mm1 \n\t"\ "punpcklbw %%mm7, %%mm2 \n\t"\ "punpcklbw %%mm7, %%mm3 \n\t"\ "punpcklbw %%mm7, %%mm0 \n\t"\ "paddw %%mm0, %%mm1 \n\t"\ "paddw %%mm3, %%mm2 \n\t"\ "movd -2(%0), %%mm0 \n\t"\ "movd 3(%0), %%mm3 \n\t"\ "punpcklbw %%mm7, %%mm0 \n\t"\ "punpcklbw %%mm7, %%mm3 \n\t"\ "paddw %%mm3, %%mm0 \n\t"\ "psllw $2, %%mm2 \n\t"\ "psubw %%mm1, %%mm2 \n\t"\ "pmullw %%mm4, %%mm2 \n\t"\ "paddw %%mm5, %%mm0 \n\t"\ "paddw %%mm2, %%mm0 \n\t"\ "psraw $5, %%mm0 \n\t"\ "packuswb %%mm0, %%mm0 \n\t"\ OP(%%mm0, (%1),%%mm6, d)\ "add %3, %0 \n\t"\ "add %4, %1 \n\t"\ "decl %2 \n\t"\ " jnz 1b \n\t"\ : "+a"(src), "+c"(dst), "+m"(h)\ : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ : "memory"\ );\}\static void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ src -= 2*srcStride;\ asm volatile(\ "pxor %%mm7, %%mm7 \n\t"\ "movd (%0), %%mm0 \n\t"\ "add %2, %0 \n\t"\ "movd (%0), %%mm1 \n\t"\ "add %2, %0 \n\t"\ "movd (%0), %%mm2 \n\t"\ "add %2, %0 \n\t"\ "movd (%0), %%mm3 \n\t"\ "add %2, %0 \n\t"\ "movd (%0), %%mm4 \n\t"\ "add %2, %0 \n\t"\ "punpcklbw %%mm7, %%mm0 \n\t"\ "punpcklbw %%mm7, %%mm1 \n\t"\ "punpcklbw %%mm7, %%mm2 \n\t"\ "punpcklbw %%mm7, %%mm3 \n\t"\ "punpcklbw %%mm7, %%mm4 \n\t"\ QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ \ : "+a"(src), "+c"(dst)\ : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ : "memory"\ );\}\static void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ int h=4;\ int w=3;\ src -= 2*srcStride+2;\ while(w--){\ asm volatile(\ "pxor %%mm7, %%mm7 \n\t"\ "movd (%0), %%mm0 \n\t"\ "add %2, %0 \n\t"\ "movd (%0), %%mm1 \n\t"\ "add %2, %0 \n\t"\ "movd (%0), %%mm2 \n\t"\ "add %2, %0 \n\t"\ "movd (%0), %%mm3 \n\t"\ "add %2, %0 \n\t"\ "movd (%0), %%mm4 \n\t"\ "add %2, %0 \n\t"\ "punpcklbw %%mm7, %%mm0 \n\t"\ "punpcklbw %%mm7, %%mm1 \n\t"\ "punpcklbw %%mm7, %%mm2 \n\t"\ "punpcklbw %%mm7, %%mm3 \n\t"\ "punpcklbw %%mm7, %%mm4 \n\t"\ QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\ QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\ QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\ QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\ \ : "+a"(src)\ : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\ : "memory"\ );\ tmp += 4;\ src += 4 - 9*srcStride;\ }\ tmp -= 3*4;\ asm volatile(\ "movq %4, %%mm6 \n\t"\ "1: \n\t"\ "movq (%0), %%mm0 \n\t"\ "paddw 10(%0), %%mm0 \n\t"\ "movq 2(%0), %%mm1 \n\t"\ "paddw 8(%0), %%mm1 \n\t"\ "movq 4(%0), %%mm2 \n\t"\ "paddw 6(%0), %%mm2 \n\t"\ "psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\ "psraw $2, %%mm0 \n\t"/*(a-b)/4 */\ "psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\ "paddsw %%mm2, %%mm0 \n\t"\ "psraw $2, %%mm0 \n\t"/*((a-b)/4-b)/4 */\ "paddw %%mm6, %%mm2 \n\t"\ "paddw %%mm2, %%mm0 \n\t"\ "psraw $6, %%mm0 \n\t"\ "packuswb %%mm0, %%mm0 \n\t"\ OP(%%mm0, (%1),%%mm7, d)\ "add $24, %0 \n\t"\ "add %3, %1 \n\t"\ "decl %2 \n\t"\ " jnz 1b \n\t"\ : "+a"(tmp), "+c"(dst), "+m"(h)\ : "S"((long)dstStride), "m"(ff_pw_32)\ : "memory"\ );\}\\static void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ int h=8;\ asm volatile(\ "pxor %%mm7, %%mm7 \n\t"\ "movq %5, %%mm6 \n\t"\ "1: \n\t"\ "movq (%0), %%mm0 \n\t"\ "movq 1(%0), %%mm2 \n\t"\ "movq %%mm0, %%mm1 \n\t"\ "movq %%mm2, %%mm3 \n\t"\ "punpcklbw %%mm7, %%mm0 \n\t"\ "punpckhbw %%mm7, %%mm1 \n\t"\ "punpcklbw %%mm7, %%mm2 \n\t"\ "punpckhbw %%mm7, %%mm3 \n\t"\ "paddw %%mm2, %%mm0 \n\t"\ "paddw %%mm3, %%mm1 \n\t"\ "psllw $2, %%mm0 \n\t"\ "psllw $2, %%mm1 \n\t"\ "movq -1(%0), %%mm2 \n\t"\ "movq 2(%0), %%mm4 \n\t"\ "movq %%mm2, %%mm3 \n\t"\ "movq %%mm4, %%mm5 \n\t"\ "punpcklbw %%mm7, %%mm2 \n\t"\ "punpckhbw %%mm7, %%mm3 \n\t"\ "punpcklbw %%mm7, %%mm4 \n\t"\ "punpckhbw %%mm7, %%mm5 \n\t"\ "paddw %%mm4, %%mm2 \n\t"\ "paddw %%mm3, %%mm5 \n\t"\ "psubw %%mm2, %%mm0 \n\t"\ "psubw %%mm5, %%mm1 \n\t"\ "pmullw %%mm6, %%mm0 \n\t"\ "pmullw %%mm6, %%mm1 \n\t"\ "movd -2(%0), %%mm2 \n\t"\ "movd 7(%0), %%mm5 \n\t"\ "punpcklbw %%mm7, %%mm2 \n\t"\ "punpcklbw %%mm7, %%mm5 \n\t"\ "paddw %%mm3, %%mm2 \n\t"\ "paddw %%mm5, %%mm4 \n\t"\ "movq %6, %%mm5 \n\t"\ "paddw %%mm5, %%mm2 \n\t"\ "paddw %%mm5, %%mm4 \n\t"\ "paddw %%mm2, %%mm0 \n\t"\ "paddw %%mm4, %%mm1 \n\t"\ "psraw $5, %%mm0 \n\t"\ "psraw $5, %%mm1 \n\t"\ "packuswb %%mm1, %%mm0 \n\t"\ OP(%%mm0, (%1),%%mm5, q)\ "add %3, %0 \n\t"\ "add %4, %1 \n\t"\ "decl %2 \n\t"\ " jnz 1b \n\t"\ : "+a"(src), "+c"(dst), "+m"(h)\ : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ : "memory"\ );\}\\static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ int h= 2;\ src -= 2*srcStride;\ \ while(h--){\ asm volatile(\ "pxor %%mm7, %%mm7 \n\t"\ "movd (%0), %%mm0 \n\t"\ "add %2, %0 \n\t"\ "movd (%0), %%mm1 \n\t"\ "add %2, %0 \n\t"\ "movd (%0), %%mm2 \n\t"\ "add %2, %0 \n\t"\ "movd (%0), %%mm3 \n\t"\ "add %2, %0 \n\t"\
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -