📄 dsputil_mmx.c
字号:
: "%ecx"); return tmp;}#undef SUMstatic int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { int tmp; assert( (((int)pix1) & 7) == 0); assert( (((int)pix2) & 7) == 0); assert((line_size &7) ==0); #define SUM(in0, in1, out0, out1) \ "movq (%0),%%mm2\n"\ "movq (%1)," #out0 "\n"\ "movq 8(%0),%%mm3\n"\ "movq 8(%1)," #out1 "\n"\ "add %3,%0\n"\ "add %3,%1\n"\ "psubb " #out0 ", %%mm2\n"\ "psubb " #out1 ", %%mm3\n"\ "pxor %%mm7, %%mm2\n"\ "pxor %%mm7, %%mm3\n"\ "movq %%mm2, " #out0 "\n"\ "movq %%mm3, " #out1 "\n"\ "psubusb " #in0 ", %%mm2\n"\ "psubusb " #in1 ", %%mm3\n"\ "psubusb " #out0 ", " #in0 "\n"\ "psubusb " #out1 ", " #in1 "\n"\ "por %%mm2, " #in0 "\n"\ "por %%mm3, " #in1 "\n"\ "movq " #in0 ", %%mm2\n"\ "movq " #in1 ", %%mm3\n"\ "punpcklbw %%mm7, " #in0 "\n"\ "punpcklbw %%mm7, " #in1 "\n"\ "punpckhbw %%mm7, %%mm2\n"\ "punpckhbw %%mm7, %%mm3\n"\ "paddw " #in1 ", " #in0 "\n"\ "paddw %%mm3, %%mm2\n"\ "paddw %%mm2, " #in0 "\n"\ "paddw " #in0 ", %%mm6\n" asm volatile ( "movl %4,%%ecx\n" "pxor %%mm6,%%mm6\n" "pcmpeqw %%mm7,%%mm7\n" "psllw $15, %%mm7\n" "packsswb %%mm7, %%mm7\n" "movq (%0),%%mm0\n" "movq (%1),%%mm2\n" "movq 8(%0),%%mm1\n" "movq 8(%1),%%mm3\n" "add %3,%0\n" "add %3,%1\n" "subl $2, %%ecx\n" "psubb %%mm2, %%mm0\n" "psubb %%mm3, %%mm1\n" "pxor %%mm7, %%mm0\n" "pxor %%mm7, %%mm1\n" SUM(%%mm0, %%mm1, %%mm4, %%mm5) "1:\n" SUM(%%mm4, %%mm5, %%mm0, %%mm1) SUM(%%mm0, %%mm1, %%mm4, %%mm5) "subl $2, %%ecx\n" "jnz 1b\n" "movq %%mm6,%%mm0\n" "psrlq $32, %%mm6\n" "paddw %%mm6,%%mm0\n" "movq %%mm0,%%mm6\n" "psrlq $16, %%mm0\n" "paddw %%mm6,%%mm0\n" "movd %%mm0,%2\n" : "+r" (pix1), "+r" (pix2), "=r"(tmp) : "r" ((long)line_size) , "m" (h) : "%ecx"); return tmp & 0x7FFF;}#undef SUMstatic int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { int tmp; assert( (((int)pix1) & 7) == 0); assert( (((int)pix2) & 7) == 0); assert((line_size &7) ==0); #define SUM(in0, in1, out0, out1) \ "movq (%0)," #out0 "\n"\ "movq (%1),%%mm2\n"\ "movq 8(%0)," #out1 "\n"\ "movq 8(%1),%%mm3\n"\ "add %3,%0\n"\ "add %3,%1\n"\ "psubb %%mm2, " #out0 "\n"\ "psubb %%mm3, " #out1 "\n"\ "pxor %%mm7, " #out0 "\n"\ "pxor %%mm7, " #out1 "\n"\ "psadbw " #out0 ", " #in0 "\n"\ "psadbw " #out1 ", " #in1 "\n"\ "paddw " #in1 ", " #in0 "\n"\ "paddw " #in0 ", %%mm6\n" asm volatile ( "movl %4,%%ecx\n" "pxor %%mm6,%%mm6\n" "pcmpeqw %%mm7,%%mm7\n" "psllw $15, %%mm7\n" "packsswb %%mm7, %%mm7\n" "movq (%0),%%mm0\n" "movq (%1),%%mm2\n" "movq 8(%0),%%mm1\n" "movq 8(%1),%%mm3\n" "add %3,%0\n" "add %3,%1\n" "subl $2, %%ecx\n" "psubb %%mm2, %%mm0\n" "psubb %%mm3, %%mm1\n" "pxor %%mm7, %%mm0\n" "pxor %%mm7, %%mm1\n" SUM(%%mm0, %%mm1, %%mm4, %%mm5) "1:\n" SUM(%%mm4, %%mm5, %%mm0, %%mm1) SUM(%%mm0, %%mm1, %%mm4, %%mm5) "subl $2, %%ecx\n" "jnz 1b\n" "movd %%mm6,%2\n" : "+r" (pix1), "+r" (pix2), "=r"(tmp) : "r" ((long)line_size) , "m" (h) : "%ecx"); return tmp;}#undef SUMstatic void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ long i=0; asm volatile( "1: \n\t" "movq (%2, %0), %%mm0 \n\t" "movq (%1, %0), %%mm1 \n\t" "psubb %%mm0, %%mm1 \n\t" "movq %%mm1, (%3, %0) \n\t" "movq 8(%2, %0), %%mm0 \n\t" "movq 8(%1, %0), %%mm1 \n\t" "psubb %%mm0, %%mm1 \n\t" "movq %%mm1, 8(%3, %0) \n\t" "add $16, %0 \n\t" "cmp %4, %0 \n\t" " jb 1b \n\t" : "+r" (i) : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15) ); for(; i<w; i++) dst[i+0] = src1[i+0]-src2[i+0];}static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){ long i=0; uint8_t l, lt; asm volatile( "1: \n\t" "movq -1(%1, %0), %%mm0 \n\t" // LT "movq (%1, %0), %%mm1 \n\t" // T "movq -1(%2, %0), %%mm2 \n\t" // L "movq (%2, %0), %%mm3 \n\t" // X "movq %%mm2, %%mm4 \n\t" // L "psubb %%mm0, %%mm2 \n\t" "paddb %%mm1, %%mm2 \n\t" // L + T - LT "movq %%mm4, %%mm5 \n\t" // L "pmaxub %%mm1, %%mm4 \n\t" // max(T, L) "pminub %%mm5, %%mm1 \n\t" // min(T, L) "pminub %%mm2, %%mm4 \n\t" "pmaxub %%mm1, %%mm4 \n\t" "psubb %%mm4, %%mm3 \n\t" // dst - pred "movq %%mm3, (%3, %0) \n\t" "add $8, %0 \n\t" "cmp %4, %0 \n\t" " jb 1b \n\t" : "+r" (i) : "r"(src1), "r"(src2), "r"(dst), "r"((long)w) ); l= *left; lt= *left_top; dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF); *left_top= src1[w-1]; *left = src2[w-1];}#define LBUTTERFLY2(a1,b1,a2,b2)\ "paddw " #b1 ", " #a1 " \n\t"\ "paddw " #b2 ", " #a2 " \n\t"\ "paddw " #b1 ", " #b1 " \n\t"\ "paddw " #b2 ", " #b2 " \n\t"\ "psubw " #a1 ", " #b1 " \n\t"\ "psubw " #a2 ", " #b2 " \n\t"#define HADAMARD48\ LBUTTERFLY2(%%mm0, %%mm1, %%mm2, %%mm3)\ LBUTTERFLY2(%%mm4, %%mm5, %%mm6, %%mm7)\ LBUTTERFLY2(%%mm0, %%mm2, %%mm1, %%mm3)\ LBUTTERFLY2(%%mm4, %%mm6, %%mm5, %%mm7)\ LBUTTERFLY2(%%mm0, %%mm4, %%mm1, %%mm5)\ LBUTTERFLY2(%%mm2, %%mm6, %%mm3, %%mm7)\#define MMABS(a,z)\ "pxor " #z ", " #z " \n\t"\ "pcmpgtw " #a ", " #z " \n\t"\ "pxor " #z ", " #a " \n\t"\ "psubw " #z ", " #a " \n\t"#define MMABS_SUM(a,z, sum)\ "pxor " #z ", " #z " \n\t"\ "pcmpgtw " #a ", " #z " \n\t"\ "pxor " #z ", " #a " \n\t"\ "psubw " #z ", " #a " \n\t"\ "paddusw " #a ", " #sum " \n\t"#define MMABS_MMX2(a,z)\ "pxor " #z ", " #z " \n\t"\ "psubw " #a ", " #z " \n\t"\ "pmaxsw " #z ", " #a " \n\t"#define MMABS_SUM_MMX2(a,z, sum)\ "pxor " #z ", " #z " \n\t"\ "psubw " #a ", " #z " \n\t"\ "pmaxsw " #z ", " #a " \n\t"\ "paddusw " #a ", " #sum " \n\t" #define SBUTTERFLY(a,b,t,n)\ "movq " #a ", " #t " \n\t" /* abcd */\ "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\ "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\#define TRANSPOSE4(a,b,c,d,t)\ SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\ SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\ SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\ SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */#define LOAD4(o, a, b, c, d)\ "movq "#o"(%1), " #a " \n\t"\ "movq "#o"+16(%1), " #b " \n\t"\ "movq "#o"+32(%1), " #c " \n\t"\ "movq "#o"+48(%1), " #d " \n\t"#define STORE4(o, a, b, c, d)\ "movq "#a", "#o"(%1) \n\t"\ "movq "#b", "#o"+16(%1) \n\t"\ "movq "#c", "#o"+32(%1) \n\t"\ "movq "#d", "#o"+48(%1) \n\t"\static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){ uint64_t temp[16] __align8; int sum=0; assert(h==8); diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride); asm volatile( LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3) LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7) HADAMARD48 "movq %%mm7, 112(%1) \n\t" TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7) STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2) "movq 112(%1), %%mm7 \n\t" TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0) STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6) LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3) LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7) HADAMARD48 "movq %%mm7, 120(%1) \n\t" TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7) STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2) "movq 120(%1), %%mm7 \n\t" TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0) "movq %%mm7, %%mm5 \n\t"//FIXME remove "movq %%mm6, %%mm7 \n\t" "movq %%mm0, %%mm6 \n\t"// STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)// LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7) HADAMARD48 "movq %%mm7, 64(%1) \n\t" MMABS(%%mm0, %%mm7) MMABS_SUM(%%mm1, %%mm7, %%mm0) MMABS_SUM(%%mm2, %%mm7, %%mm0) MMABS_SUM(%%mm3, %%mm7, %%mm0) MMABS_SUM(%%mm4, %%mm7, %%mm0) MMABS_SUM(%%mm5, %%mm7, %%mm0) MMABS_SUM(%%mm6, %%mm7, %%mm0) "movq 64(%1), %%mm1 \n\t" MMABS_SUM(%%mm1, %%mm7, %%mm0) "movq %%mm0, 64(%1) \n\t" LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3) LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7) HADAMARD48 "movq %%mm7, (%1) \n\t" MMABS(%%mm0, %%mm7) MMABS_SUM(%%mm1, %%mm7, %%mm0) MMABS_SUM(%%mm2, %%mm7, %%mm0) MMABS_SUM(%%mm3, %%mm7, %%mm0) MMABS_SUM(%%mm4, %%mm7, %%mm0) MMABS_SUM(%%mm5, %%mm7, %%mm0) MMABS_SUM(%%mm6, %%mm7, %%mm0) "movq (%1), %%mm1 \n\t" MMABS_SUM(%%mm1, %%mm7, %%mm0) "movq 64(%1), %%mm1 \n\t" MMABS_SUM(%%mm1, %%mm7, %%mm0) "movq %%mm0, %%mm1 \n\t" "psrlq $32, %%mm0 \n\t" "paddusw %%mm1, %%mm0 \n\t" "movq %%mm0, %%mm1 \n\t" "psrlq $16, %%mm0 \n\t" "paddusw %%mm1, %%mm0 \n\t" "movd %%mm0, %0 \n\t" : "=r" (sum) : "r"(temp) ); return sum&0xFFFF;}static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){ uint64_t temp[16] __align8; int sum=0; assert(h==8); diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride); asm volatile( LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3) LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7) HADAMARD48 "movq %%mm7, 112(%1) \n\t" TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7) STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2) "movq 112(%1), %%mm7 \n\t" TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0) STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6) LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3) LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7) HADAMARD48 "movq %%mm7, 120(%1) \n\t" TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7) STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2) "movq 120(%1), %%mm7 \n\t" TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0) "movq %%mm7, %%mm5 \n\t"//FIXME remove "movq %%mm6, %%mm7 \n\t" "movq %%mm0, %%mm6 \n\t"// STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)// LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7) HADAMARD48 "movq %%mm7, 64(%1) \n\t" MMABS_MMX2(%%mm0, %%mm7) MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0) MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0) MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0) MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -