📄 dsputil_mmx.c
字号:
"add %%"REG_a", %2 \n\t" "movq (%1), %%mm0 \n\t" "movq (%1, %3), %%mm1 \n\t" "movq %%mm0, (%2) \n\t" "movq %%mm1, (%2, %3) \n\t" "add %%"REG_a", %1 \n\t" "add %%"REG_a", %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" : "+g"(h), "+r" (pixels), "+r" (block) : "r"((long)line_size) : "%"REG_a, "memory" );}static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h){ __asm __volatile( "lea (%3, %3), %%"REG_a" \n\t" ".balign 8 \n\t" "1: \n\t" "movq (%1), %%mm0 \n\t" "movq 8(%1), %%mm4 \n\t" "movq (%1, %3), %%mm1 \n\t" "movq 8(%1, %3), %%mm5 \n\t" "movq %%mm0, (%2) \n\t" "movq %%mm4, 8(%2) \n\t" "movq %%mm1, (%2, %3) \n\t" "movq %%mm5, 8(%2, %3) \n\t" "add %%"REG_a", %1 \n\t" "add %%"REG_a", %2 \n\t" "movq (%1), %%mm0 \n\t" "movq 8(%1), %%mm4 \n\t" "movq (%1, %3), %%mm1 \n\t" "movq 8(%1, %3), %%mm5 \n\t" "movq %%mm0, (%2) \n\t" "movq %%mm4, 8(%2) \n\t" "movq %%mm1, (%2, %3) \n\t" "movq %%mm5, 8(%2, %3) \n\t" "add %%"REG_a", %1 \n\t" "add %%"REG_a", %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" : "+g"(h), "+r" (pixels), "+r" (block) : "r"((long)line_size) : "%"REG_a, "memory" );}static void clear_blocks_mmx(DCTELEM *blocks){ __asm __volatile( "pxor %%mm7, %%mm7 \n\t" "mov $-128*6, %%"REG_a" \n\t" "1: \n\t" "movq %%mm7, (%0, %%"REG_a") \n\t" "movq %%mm7, 8(%0, %%"REG_a") \n\t" "movq %%mm7, 16(%0, %%"REG_a") \n\t" "movq %%mm7, 24(%0, %%"REG_a") \n\t" "add $32, %%"REG_a" \n\t" " js 1b \n\t" : : "r" (((uint8_t *)blocks)+128*6) : "%"REG_a );}#ifdef CONFIG_ENCODERSstatic int pix_sum16_mmx(uint8_t * pix, int line_size){ const int h=16; int sum; long index= -line_size*h; __asm __volatile( "pxor %%mm7, %%mm7 \n\t" "pxor %%mm6, %%mm6 \n\t" "1: \n\t" "movq (%2, %1), %%mm0 \n\t" "movq (%2, %1), %%mm1 \n\t" "movq 8(%2, %1), %%mm2 \n\t" "movq 8(%2, %1), %%mm3 \n\t" "punpcklbw %%mm7, %%mm0 \n\t" "punpckhbw %%mm7, %%mm1 \n\t" "punpcklbw %%mm7, %%mm2 \n\t" "punpckhbw %%mm7, %%mm3 \n\t" "paddw %%mm0, %%mm1 \n\t" "paddw %%mm2, %%mm3 \n\t" "paddw %%mm1, %%mm3 \n\t" "paddw %%mm3, %%mm6 \n\t" "add %3, %1 \n\t" " js 1b \n\t" "movq %%mm6, %%mm5 \n\t" "psrlq $32, %%mm6 \n\t" "paddw %%mm5, %%mm6 \n\t" "movq %%mm6, %%mm5 \n\t" "psrlq $16, %%mm6 \n\t" "paddw %%mm5, %%mm6 \n\t" "movd %%mm6, %0 \n\t" "andl $0xFFFF, %0 \n\t" : "=&r" (sum), "+r" (index) : "r" (pix - index), "r" ((long)line_size) ); return sum;}#endif //CONFIG_ENCODERSstatic void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){ long i=0; asm volatile( "1: \n\t" "movq (%1, %0), %%mm0 \n\t" "movq (%2, %0), %%mm1 \n\t" "paddb %%mm0, %%mm1 \n\t" "movq %%mm1, (%2, %0) \n\t" "movq 8(%1, %0), %%mm0 \n\t" "movq 8(%2, %0), %%mm1 \n\t" "paddb %%mm0, %%mm1 \n\t" "movq %%mm1, 8(%2, %0) \n\t" "add $16, %0 \n\t" "cmp %3, %0 \n\t" " jb 1b \n\t" : "+r" (i) : "r"(src), "r"(dst), "r"((long)w-15) ); for(; i<w; i++) dst[i+0] += src[i+0];}#define H263_LOOP_FILTER \ "pxor %%mm7, %%mm7 \n\t"\ "movq %0, %%mm0 \n\t"\ "movq %0, %%mm1 \n\t"\ "movq %3, %%mm2 \n\t"\ "movq %3, %%mm3 \n\t"\ "punpcklbw %%mm7, %%mm0 \n\t"\ "punpckhbw %%mm7, %%mm1 \n\t"\ "punpcklbw %%mm7, %%mm2 \n\t"\ "punpckhbw %%mm7, %%mm3 \n\t"\ "psubw %%mm2, %%mm0 \n\t"\ "psubw %%mm3, %%mm1 \n\t"\ "movq %1, %%mm2 \n\t"\ "movq %1, %%mm3 \n\t"\ "movq %2, %%mm4 \n\t"\ "movq %2, %%mm5 \n\t"\ "punpcklbw %%mm7, %%mm2 \n\t"\ "punpckhbw %%mm7, %%mm3 \n\t"\ "punpcklbw %%mm7, %%mm4 \n\t"\ "punpckhbw %%mm7, %%mm5 \n\t"\ "psubw %%mm2, %%mm4 \n\t"\ "psubw %%mm3, %%mm5 \n\t"\ "psllw $2, %%mm4 \n\t"\ "psllw $2, %%mm5 \n\t"\ "paddw %%mm0, %%mm4 \n\t"\ "paddw %%mm1, %%mm5 \n\t"\ "pxor %%mm6, %%mm6 \n\t"\ "pcmpgtw %%mm4, %%mm6 \n\t"\ "pcmpgtw %%mm5, %%mm7 \n\t"\ "pxor %%mm6, %%mm4 \n\t"\ "pxor %%mm7, %%mm5 \n\t"\ "psubw %%mm6, %%mm4 \n\t"\ "psubw %%mm7, %%mm5 \n\t"\ "psrlw $3, %%mm4 \n\t"\ "psrlw $3, %%mm5 \n\t"\ "packuswb %%mm5, %%mm4 \n\t"\ "packsswb %%mm7, %%mm6 \n\t"\ "pxor %%mm7, %%mm7 \n\t"\ "movd %4, %%mm2 \n\t"\ "punpcklbw %%mm2, %%mm2 \n\t"\ "punpcklbw %%mm2, %%mm2 \n\t"\ "punpcklbw %%mm2, %%mm2 \n\t"\ "psubusb %%mm4, %%mm2 \n\t"\ "movq %%mm2, %%mm3 \n\t"\ "psubusb %%mm4, %%mm3 \n\t"\ "psubb %%mm3, %%mm2 \n\t"\ "movq %1, %%mm3 \n\t"\ "movq %2, %%mm4 \n\t"\ "pxor %%mm6, %%mm3 \n\t"\ "pxor %%mm6, %%mm4 \n\t"\ "paddusb %%mm2, %%mm3 \n\t"\ "psubusb %%mm2, %%mm4 \n\t"\ "pxor %%mm6, %%mm3 \n\t"\ "pxor %%mm6, %%mm4 \n\t"\ "paddusb %%mm2, %%mm2 \n\t"\ "packsswb %%mm1, %%mm0 \n\t"\ "pcmpgtb %%mm0, %%mm7 \n\t"\ "pxor %%mm7, %%mm0 \n\t"\ "psubb %%mm7, %%mm0 \n\t"\ "movq %%mm0, %%mm1 \n\t"\ "psubusb %%mm2, %%mm0 \n\t"\ "psubb %%mm0, %%mm1 \n\t"\ "pand %5, %%mm1 \n\t"\ "psrlw $2, %%mm1 \n\t"\ "pxor %%mm7, %%mm1 \n\t"\ "psubb %%mm7, %%mm1 \n\t"\ "movq %0, %%mm5 \n\t"\ "movq %3, %%mm6 \n\t"\ "psubb %%mm1, %%mm5 \n\t"\ "paddb %%mm1, %%mm6 \n\t"static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){ const int strength= ff_h263_loop_filter_strength[qscale]; asm volatile( H263_LOOP_FILTER "movq %%mm3, %1 \n\t" "movq %%mm4, %2 \n\t" "movq %%mm5, %0 \n\t" "movq %%mm6, %3 \n\t" : "+m" (*(uint64_t*)(src - 2*stride)), "+m" (*(uint64_t*)(src - 1*stride)), "+m" (*(uint64_t*)(src + 0*stride)), "+m" (*(uint64_t*)(src + 1*stride)) : "g" (2*strength), "m"(ff_pb_FC) );}static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){ asm volatile( //FIXME could save 1 instruction if done as 8x4 ... "movd %4, %%mm0 \n\t" "movd %5, %%mm1 \n\t" "movd %6, %%mm2 \n\t" "movd %7, %%mm3 \n\t" "punpcklbw %%mm1, %%mm0 \n\t" "punpcklbw %%mm3, %%mm2 \n\t" "movq %%mm0, %%mm1 \n\t" "punpcklwd %%mm2, %%mm0 \n\t" "punpckhwd %%mm2, %%mm1 \n\t" "movd %%mm0, %0 \n\t" "punpckhdq %%mm0, %%mm0 \n\t" "movd %%mm0, %1 \n\t" "movd %%mm1, %2 \n\t" "punpckhdq %%mm1, %%mm1 \n\t" "movd %%mm1, %3 \n\t" : "=m" (*(uint32_t*)(dst + 0*dst_stride)), "=m" (*(uint32_t*)(dst + 1*dst_stride)), "=m" (*(uint32_t*)(dst + 2*dst_stride)), "=m" (*(uint32_t*)(dst + 3*dst_stride)) : "m" (*(uint32_t*)(src + 0*src_stride)), "m" (*(uint32_t*)(src + 1*src_stride)), "m" (*(uint32_t*)(src + 2*src_stride)), "m" (*(uint32_t*)(src + 3*src_stride)) );}static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){ const int strength= ff_h263_loop_filter_strength[qscale]; uint64_t temp[4] __attribute__ ((aligned(8))); uint8_t *btemp= (uint8_t*)temp; src -= 2; transpose4x4(btemp , src , 8, stride); transpose4x4(btemp+4, src + 4*stride, 8, stride); asm volatile( H263_LOOP_FILTER // 5 3 4 6 : "+m" (temp[0]), "+m" (temp[1]), "+m" (temp[2]), "+m" (temp[3]) : "g" (2*strength), "m"(ff_pb_FC) ); asm volatile( "movq %%mm5, %%mm1 \n\t" "movq %%mm4, %%mm0 \n\t" "punpcklbw %%mm3, %%mm5 \n\t" "punpcklbw %%mm6, %%mm4 \n\t" "punpckhbw %%mm3, %%mm1 \n\t" "punpckhbw %%mm6, %%mm0 \n\t" "movq %%mm5, %%mm3 \n\t" "movq %%mm1, %%mm6 \n\t" "punpcklwd %%mm4, %%mm5 \n\t" "punpcklwd %%mm0, %%mm1 \n\t" "punpckhwd %%mm4, %%mm3 \n\t" "punpckhwd %%mm0, %%mm6 \n\t" "movd %%mm5, (%0) \n\t" "punpckhdq %%mm5, %%mm5 \n\t" "movd %%mm5, (%0,%2) \n\t" "movd %%mm3, (%0,%2,2) \n\t" "punpckhdq %%mm3, %%mm3 \n\t" "movd %%mm3, (%0,%3) \n\t" "movd %%mm1, (%1) \n\t" "punpckhdq %%mm1, %%mm1 \n\t" "movd %%mm1, (%1,%2) \n\t" "movd %%mm6, (%1,%2,2) \n\t" "punpckhdq %%mm6, %%mm6 \n\t" "movd %%mm6, (%1,%3) \n\t" :: "r" (src), "r" (src + 4*stride), "r" ((long) stride ), "r" ((long)(3*stride)) );}#ifdef CONFIG_ENCODERSstatic int pix_norm1_mmx(uint8_t *pix, int line_size) { int tmp; asm volatile ( "movl $16,%%ecx\n" "pxor %%mm0,%%mm0\n" "pxor %%mm7,%%mm7\n" "1:\n" "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */ "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */ "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */ "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */ "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */ "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */ "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */ "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */ "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */ "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */ "pmaddwd %%mm3,%%mm3\n" "pmaddwd %%mm4,%%mm4\n" "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2, pix2^2+pix3^2+pix6^2+pix7^2) */ "paddd %%mm3,%%mm4\n" "paddd %%mm2,%%mm7\n" "add %2, %0\n" "paddd %%mm4,%%mm7\n" "dec %%ecx\n" "jnz 1b\n" "movq %%mm7,%%mm1\n" "psrlq $32, %%mm7\n" /* shift hi dword to lo */ "paddd %%mm7,%%mm1\n" "movd %%mm1,%1\n" : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" ); return tmp;}static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { int tmp; asm volatile ( "movl %4,%%ecx\n" "pxor %%mm0,%%mm0\n" /* mm0 = 0 */ "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ "1:\n" "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */ "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */ "movq %%mm1,%%mm5\n" "psubusb %%mm2,%%mm1\n" "psubusb %%mm5,%%mm2\n" "por %%mm1,%%mm2\n" "movq %%mm2,%%mm1\n" "punpckhbw %%mm0,%%mm2\n" "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ "pmaddwd %%mm2,%%mm2\n" "pmaddwd %%mm1,%%mm1\n" "add %3,%0\n" "add %3,%1\n" "paddd %%mm2,%%mm1\n" "paddd %%mm1,%%mm7\n" "decl %%ecx\n" "jnz 1b\n" "movq %%mm7,%%mm1\n" "psrlq $32, %%mm7\n" /* shift hi dword to lo */ "paddd %%mm7,%%mm1\n" "movd %%mm1,%2\n" : "+r" (pix1), "+r" (pix2), "=r"(tmp) : "r" ((long)line_size) , "m" (h) : "%ecx"); return tmp;}static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { int tmp; asm volatile ( "movl %4,%%ecx\n" "pxor %%mm0,%%mm0\n" /* mm0 = 0 */ "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ "1:\n" "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */ "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */ "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */ "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */ /* todo: mm1-mm2, mm3-mm4 */ /* algo: substract mm1 from mm2 with saturation and vice versa */ /* OR the results to get absolute difference */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -