📄 dsputil_mmx.c
字号:
MOVQ_ZERO(mm7); i = 4; do { asm volatile( "movq (%2), %%mm0 \n\t" "movq 8(%2), %%mm1 \n\t" "movq 16(%2), %%mm2 \n\t" "movq 24(%2), %%mm3 \n\t" "movq %0, %%mm4 \n\t" "movq %1, %%mm6 \n\t" "movq %%mm4, %%mm5 \n\t" "punpcklbw %%mm7, %%mm4 \n\t" "punpckhbw %%mm7, %%mm5 \n\t" "paddsw %%mm4, %%mm0 \n\t" "paddsw %%mm5, %%mm1 \n\t" "movq %%mm6, %%mm5 \n\t" "punpcklbw %%mm7, %%mm6 \n\t" "punpckhbw %%mm7, %%mm5 \n\t" "paddsw %%mm6, %%mm2 \n\t" "paddsw %%mm5, %%mm3 \n\t" "packuswb %%mm1, %%mm0 \n\t" "packuswb %%mm3, %%mm2 \n\t" "movq %%mm0, %0 \n\t" "movq %%mm2, %1 \n\t" :"+m"(*pix), "+m"(*(pix+line_size)) :"r"(p) :"memory"); pix += line_size*2; p += 16; } while (--i);}static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h){ asm volatile( "lea (%3, %3), %%"REG_a" \n\t" ASMALIGN(3) "1: \n\t" "movd (%1), %%mm0 \n\t" "movd (%1, %3), %%mm1 \n\t" "movd %%mm0, (%2) \n\t" "movd %%mm1, (%2, %3) \n\t" "add %%"REG_a", %1 \n\t" "add %%"REG_a", %2 \n\t" "movd (%1), %%mm0 \n\t" "movd (%1, %3), %%mm1 \n\t" "movd %%mm0, (%2) \n\t" "movd %%mm1, (%2, %3) \n\t" "add %%"REG_a", %1 \n\t" "add %%"REG_a", %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" : "+g"(h), "+r" (pixels), "+r" (block) : "r"((x86_reg)line_size) : "%"REG_a, "memory" );}static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h){ asm volatile( "lea (%3, %3), %%"REG_a" \n\t" ASMALIGN(3) "1: \n\t" "movq (%1), %%mm0 \n\t" "movq (%1, %3), %%mm1 \n\t" "movq %%mm0, (%2) \n\t" "movq %%mm1, (%2, %3) \n\t" "add %%"REG_a", %1 \n\t" "add %%"REG_a", %2 \n\t" "movq (%1), %%mm0 \n\t" "movq (%1, %3), %%mm1 \n\t" "movq %%mm0, (%2) \n\t" "movq %%mm1, (%2, %3) \n\t" "add %%"REG_a", %1 \n\t" "add %%"REG_a", %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" : "+g"(h), "+r" (pixels), "+r" (block) : "r"((x86_reg)line_size) : "%"REG_a, "memory" );}static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h){ asm volatile( "lea (%3, %3), %%"REG_a" \n\t" ASMALIGN(3) "1: \n\t" "movq (%1), %%mm0 \n\t" "movq 8(%1), %%mm4 \n\t" "movq (%1, %3), %%mm1 \n\t" "movq 8(%1, %3), %%mm5 \n\t" "movq %%mm0, (%2) \n\t" "movq %%mm4, 8(%2) \n\t" "movq %%mm1, (%2, %3) \n\t" "movq %%mm5, 8(%2, %3) \n\t" "add %%"REG_a", %1 \n\t" "add %%"REG_a", %2 \n\t" "movq (%1), %%mm0 \n\t" "movq 8(%1), %%mm4 \n\t" "movq (%1, %3), %%mm1 \n\t" "movq 8(%1, %3), %%mm5 \n\t" "movq %%mm0, (%2) \n\t" "movq %%mm4, 8(%2) \n\t" "movq %%mm1, (%2, %3) \n\t" "movq %%mm5, 8(%2, %3) \n\t" "add %%"REG_a", %1 \n\t" "add %%"REG_a", %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" : "+g"(h), "+r" (pixels), "+r" (block) : "r"((x86_reg)line_size) : "%"REG_a, "memory" );}static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h){ asm volatile( "1: \n\t" "movdqu (%1), %%xmm0 \n\t" "movdqu (%1,%3), %%xmm1 \n\t" "movdqu (%1,%3,2), %%xmm2 \n\t" "movdqu (%1,%4), %%xmm3 \n\t" "movdqa %%xmm0, (%2) \n\t" "movdqa %%xmm1, (%2,%3) \n\t" "movdqa %%xmm2, (%2,%3,2) \n\t" "movdqa %%xmm3, (%2,%4) \n\t" "subl $4, %0 \n\t" "lea (%1,%3,4), %1 \n\t" "lea (%2,%3,4), %2 \n\t" "jnz 1b \n\t" : "+g"(h), "+r" (pixels), "+r" (block) : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size) : "memory" );}static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h){ asm volatile( "1: \n\t" "movdqu (%1), %%xmm0 \n\t" "movdqu (%1,%3), %%xmm1 \n\t" "movdqu (%1,%3,2), %%xmm2 \n\t" "movdqu (%1,%4), %%xmm3 \n\t" "pavgb (%2), %%xmm0 \n\t" "pavgb (%2,%3), %%xmm1 \n\t" "pavgb (%2,%3,2), %%xmm2 \n\t" "pavgb (%2,%4), %%xmm3 \n\t" "movdqa %%xmm0, (%2) \n\t" "movdqa %%xmm1, (%2,%3) \n\t" "movdqa %%xmm2, (%2,%3,2) \n\t" "movdqa %%xmm3, (%2,%4) \n\t" "subl $4, %0 \n\t" "lea (%1,%3,4), %1 \n\t" "lea (%2,%3,4), %2 \n\t" "jnz 1b \n\t" : "+g"(h), "+r" (pixels), "+r" (block) : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size) : "memory" );}static void clear_blocks_mmx(DCTELEM *blocks){ asm volatile( "pxor %%mm7, %%mm7 \n\t" "mov $-128*6, %%"REG_a" \n\t" "1: \n\t" "movq %%mm7, (%0, %%"REG_a") \n\t" "movq %%mm7, 8(%0, %%"REG_a") \n\t" "movq %%mm7, 16(%0, %%"REG_a") \n\t" "movq %%mm7, 24(%0, %%"REG_a") \n\t" "add $32, %%"REG_a" \n\t" " js 1b \n\t" : : "r" (((uint8_t *)blocks)+128*6) : "%"REG_a );}static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){ x86_reg i=0; asm volatile( "jmp 2f \n\t" "1: \n\t" "movq (%1, %0), %%mm0 \n\t" "movq (%2, %0), %%mm1 \n\t" "paddb %%mm0, %%mm1 \n\t" "movq %%mm1, (%2, %0) \n\t" "movq 8(%1, %0), %%mm0 \n\t" "movq 8(%2, %0), %%mm1 \n\t" "paddb %%mm0, %%mm1 \n\t" "movq %%mm1, 8(%2, %0) \n\t" "add $16, %0 \n\t" "2: \n\t" "cmp %3, %0 \n\t" " js 1b \n\t" : "+r" (i) : "r"(src), "r"(dst), "r"((x86_reg)w-15) ); for(; i<w; i++) dst[i+0] += src[i+0];}static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ x86_reg i=0; asm volatile( "jmp 2f \n\t" "1: \n\t" "movq (%2, %0), %%mm0 \n\t" "movq 8(%2, %0), %%mm1 \n\t" "paddb (%3, %0), %%mm0 \n\t" "paddb 8(%3, %0), %%mm1 \n\t" "movq %%mm0, (%1, %0) \n\t" "movq %%mm1, 8(%1, %0) \n\t" "add $16, %0 \n\t" "2: \n\t" "cmp %4, %0 \n\t" " js 1b \n\t" : "+r" (i) : "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15) ); for(; i<w; i++) dst[i] = src1[i] + src2[i];}#define H263_LOOP_FILTER \ "pxor %%mm7, %%mm7 \n\t"\ "movq %0, %%mm0 \n\t"\ "movq %0, %%mm1 \n\t"\ "movq %3, %%mm2 \n\t"\ "movq %3, %%mm3 \n\t"\ "punpcklbw %%mm7, %%mm0 \n\t"\ "punpckhbw %%mm7, %%mm1 \n\t"\ "punpcklbw %%mm7, %%mm2 \n\t"\ "punpckhbw %%mm7, %%mm3 \n\t"\ "psubw %%mm2, %%mm0 \n\t"\ "psubw %%mm3, %%mm1 \n\t"\ "movq %1, %%mm2 \n\t"\ "movq %1, %%mm3 \n\t"\ "movq %2, %%mm4 \n\t"\ "movq %2, %%mm5 \n\t"\ "punpcklbw %%mm7, %%mm2 \n\t"\ "punpckhbw %%mm7, %%mm3 \n\t"\ "punpcklbw %%mm7, %%mm4 \n\t"\ "punpckhbw %%mm7, %%mm5 \n\t"\ "psubw %%mm2, %%mm4 \n\t"\ "psubw %%mm3, %%mm5 \n\t"\ "psllw $2, %%mm4 \n\t"\ "psllw $2, %%mm5 \n\t"\ "paddw %%mm0, %%mm4 \n\t"\ "paddw %%mm1, %%mm5 \n\t"\ "pxor %%mm6, %%mm6 \n\t"\ "pcmpgtw %%mm4, %%mm6 \n\t"\ "pcmpgtw %%mm5, %%mm7 \n\t"\ "pxor %%mm6, %%mm4 \n\t"\ "pxor %%mm7, %%mm5 \n\t"\ "psubw %%mm6, %%mm4 \n\t"\ "psubw %%mm7, %%mm5 \n\t"\ "psrlw $3, %%mm4 \n\t"\ "psrlw $3, %%mm5 \n\t"\ "packuswb %%mm5, %%mm4 \n\t"\ "packsswb %%mm7, %%mm6 \n\t"\ "pxor %%mm7, %%mm7 \n\t"\ "movd %4, %%mm2 \n\t"\ "punpcklbw %%mm2, %%mm2 \n\t"\ "punpcklbw %%mm2, %%mm2 \n\t"\ "punpcklbw %%mm2, %%mm2 \n\t"\ "psubusb %%mm4, %%mm2 \n\t"\ "movq %%mm2, %%mm3 \n\t"\ "psubusb %%mm4, %%mm3 \n\t"\ "psubb %%mm3, %%mm2 \n\t"\ "movq %1, %%mm3 \n\t"\ "movq %2, %%mm4 \n\t"\ "pxor %%mm6, %%mm3 \n\t"\ "pxor %%mm6, %%mm4 \n\t"\ "paddusb %%mm2, %%mm3 \n\t"\ "psubusb %%mm2, %%mm4 \n\t"\ "pxor %%mm6, %%mm3 \n\t"\ "pxor %%mm6, %%mm4 \n\t"\ "paddusb %%mm2, %%mm2 \n\t"\ "packsswb %%mm1, %%mm0 \n\t"\ "pcmpgtb %%mm0, %%mm7 \n\t"\ "pxor %%mm7, %%mm0 \n\t"\ "psubb %%mm7, %%mm0 \n\t"\ "movq %%mm0, %%mm1 \n\t"\ "psubusb %%mm2, %%mm0 \n\t"\ "psubb %%mm0, %%mm1 \n\t"\ "pand %5, %%mm1 \n\t"\ "psrlw $2, %%mm1 \n\t"\ "pxor %%mm7, %%mm1 \n\t"\ "psubb %%mm7, %%mm1 \n\t"\ "movq %0, %%mm5 \n\t"\ "movq %3, %%mm6 \n\t"\ "psubb %%mm1, %%mm5 \n\t"\
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -