📄 snowdsp_mmx.c
字号:
"pavgw %%mm3, %%mm6 \n\t" snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6") snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6") snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6") snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6") snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6") snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6") snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6") snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6") snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6") snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6") "2: \n\t" "sub $32, %%"REG_d" \n\t" "jge 1b \n\t" :"+d"(i) :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));}#endif //HAVE_7REGS#define snow_inner_add_yblock_sse2_header \ IDWTELEM * * dst_array = sb->line + src_y;\ long tmp;\ asm volatile(\ "mov %7, %%"REG_c" \n\t"\ "mov %6, %2 \n\t"\ "mov %4, %%"REG_S" \n\t"\ "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\ "pcmpeqd %%xmm3, %%xmm3 \n\t"\ "psllw $15, %%xmm3 \n\t"\ "psrlw $12, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\ "1: \n\t"\ "mov %1, %%"REG_D" \n\t"\ "mov (%%"REG_D"), %%"REG_D" \n\t"\ "add %3, %%"REG_D" \n\t"#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\ "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ "movq (%%"REG_d"), %%"out_reg1" \n\t"\ "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\ "punpcklbw %%xmm7, %%"out_reg1" \n\t"\ "punpcklbw %%xmm7, %%"out_reg2" \n\t"\ "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\ "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\ "punpcklbw %%xmm7, %%xmm0 \n\t"\ "punpcklbw %%xmm7, %%xmm4 \n\t"\ "pmullw %%xmm0, %%"out_reg1" \n\t"\ "pmullw %%xmm4, %%"out_reg2" \n\t"#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\ "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ "movq (%%"REG_d"), %%"out_reg1" \n\t"\ "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\ "punpcklbw %%xmm7, %%"out_reg1" \n\t"\ "punpcklbw %%xmm7, %%"out_reg2" \n\t"\ "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\ "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\ "punpcklbw %%xmm7, %%xmm0 \n\t"\ "punpcklbw %%xmm7, %%xmm4 \n\t"\ "pmullw %%xmm0, %%"out_reg1" \n\t"\ "pmullw %%xmm4, %%"out_reg2" \n\t"#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \ snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\ "paddusw %%xmm2, %%xmm1 \n\t"\ "paddusw %%xmm6, %%xmm5 \n\t"#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \ snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\ "paddusw %%xmm2, %%xmm1 \n\t"\ "paddusw %%xmm6, %%xmm5 \n\t"#define snow_inner_add_yblock_sse2_end_common1\ "add $32, %%"REG_S" \n\t"\ "add %%"REG_c", %0 \n\t"\ "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\ "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\ "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ "add %%"REG_c", (%%"REG_a") \n\t"#define snow_inner_add_yblock_sse2_end_common2\ "jnz 1b \n\t"\ :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\ :\ "rm"((long)(src_x<<1)),"m"(obmc),"a"(block),"m"((long)b_h),"m"((long)src_stride):\ "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");#define snow_inner_add_yblock_sse2_end_8\ "sal $1, %%"REG_c" \n\t"\ "add $"PTR_SIZE"*2, %1 \n\t"\ snow_inner_add_yblock_sse2_end_common1\ "sar $1, %%"REG_c" \n\t"\ "sub $2, %2 \n\t"\ snow_inner_add_yblock_sse2_end_common2#define snow_inner_add_yblock_sse2_end_16\ "add $"PTR_SIZE"*1, %1 \n\t"\ snow_inner_add_yblock_sse2_end_common1\ "dec %2 \n\t"\ snow_inner_add_yblock_sse2_end_common2static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h, int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){snow_inner_add_yblock_sse2_headersnow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")snow_inner_add_yblock_sse2_accum_8("2", "8")snow_inner_add_yblock_sse2_accum_8("1", "128")snow_inner_add_yblock_sse2_accum_8("0", "136") "mov %0, %%"REG_d" \n\t" "movdqa (%%"REG_D"), %%xmm0 \n\t" "movdqa %%xmm1, %%xmm2 \n\t" "punpckhwd %%xmm7, %%xmm1 \n\t" "punpcklwd %%xmm7, %%xmm2 \n\t" "paddd %%xmm2, %%xmm0 \n\t" "movdqa 16(%%"REG_D"), %%xmm2 \n\t" "paddd %%xmm1, %%xmm2 \n\t" "paddd %%xmm3, %%xmm0 \n\t" "paddd %%xmm3, %%xmm2 \n\t" "mov %1, %%"REG_D" \n\t" "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t" "add %3, %%"REG_D" \n\t" "movdqa (%%"REG_D"), %%xmm4 \n\t" "movdqa %%xmm5, %%xmm6 \n\t" "punpckhwd %%xmm7, %%xmm5 \n\t" "punpcklwd %%xmm7, %%xmm6 \n\t" "paddd %%xmm6, %%xmm4 \n\t" "movdqa 16(%%"REG_D"), %%xmm6 \n\t" "paddd %%xmm5, %%xmm6 \n\t" "paddd %%xmm3, %%xmm4 \n\t" "paddd %%xmm3, %%xmm6 \n\t" "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */ "psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */ "packssdw %%xmm2, %%xmm0 \n\t" "packuswb %%xmm7, %%xmm0 \n\t" "movq %%xmm0, (%%"REG_d") \n\t" "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */ "psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */ "packssdw %%xmm6, %%xmm4 \n\t" "packuswb %%xmm7, %%xmm4 \n\t" "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t"snow_inner_add_yblock_sse2_end_8}static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h, int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){snow_inner_add_yblock_sse2_headersnow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")snow_inner_add_yblock_sse2_accum_16("2", "16")snow_inner_add_yblock_sse2_accum_16("1", "512")snow_inner_add_yblock_sse2_accum_16("0", "528") "mov %0, %%"REG_d" \n\t" "psrlw $4, %%xmm1 \n\t" "psrlw $4, %%xmm5 \n\t" "paddw (%%"REG_D"), %%xmm1 \n\t" "paddw 16(%%"REG_D"), %%xmm5 \n\t" "paddw %%xmm3, %%xmm1 \n\t" "paddw %%xmm3, %%xmm5 \n\t" "psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */ "psraw $4, %%xmm5 \n\t" /* FRAC_BITS. */ "packuswb %%xmm5, %%xmm1 \n\t" "movdqu %%xmm1, (%%"REG_d") \n\t"snow_inner_add_yblock_sse2_end_16}#define snow_inner_add_yblock_mmx_header \ IDWTELEM * * dst_array = sb->line + src_y;\ long tmp;\ asm volatile(\ "mov %7, %%"REG_c" \n\t"\ "mov %6, %2 \n\t"\ "mov %4, %%"REG_S" \n\t"\ "pxor %%mm7, %%mm7 \n\t" /* 0 */\ "pcmpeqd %%mm3, %%mm3 \n\t"\ "psllw $15, %%mm3 \n\t"\ "psrlw $12, %%mm3 \n\t" /* FRAC_BITS >> 1 */\ "1: \n\t"\ "mov %1, %%"REG_D" \n\t"\ "mov (%%"REG_D"), %%"REG_D" \n\t"\ "add %3, %%"REG_D" \n\t"#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\ "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\ "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\ "punpcklbw %%mm7, %%"out_reg1" \n\t"\ "punpcklbw %%mm7, %%"out_reg2" \n\t"\ "movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\ "movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\ "punpcklbw %%mm7, %%mm0 \n\t"\ "punpcklbw %%mm7, %%mm4 \n\t"\ "pmullw %%mm0, %%"out_reg1" \n\t"\ "pmullw %%mm4, %%"out_reg2" \n\t"#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \ snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\ "paddusw %%mm2, %%mm1 \n\t"\ "paddusw %%mm6, %%mm5 \n\t"#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\ "mov %0, %%"REG_d" \n\t"\ "psrlw $4, %%mm1 \n\t"\ "psrlw $4, %%mm5 \n\t"\ "paddw "read_offset"(%%"REG_D"), %%mm1 \n\t"\ "paddw "read_offset"+8(%%"REG_D"), %%mm5 \n\t"\ "paddw %%mm3, %%mm1 \n\t"\ "paddw %%mm3, %%mm5 \n\t"\ "psraw $4, %%mm1 \n\t"\ "psraw $4, %%mm5 \n\t"\ "packuswb %%mm5, %%mm1 \n\t"\ "movq %%mm1, "write_offset"(%%"REG_d") \n\t"#define snow_inner_add_yblock_mmx_end(s_step)\ "add $"s_step", %%"REG_S" \n\t"\ "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\ "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\ "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ "add %%"REG_c", (%%"REG_a") \n\t"\ "add $"PTR_SIZE"*1, %1 \n\t"\ "add %%"REG_c", %0 \n\t"\ "dec %2 \n\t"\ "jnz 1b \n\t"\ :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\ :\ "rm"((long)(src_x<<1)),"m"(obmc),"a"(block),"m"((long)b_h),"m"((long)src_stride):\ "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h, int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){snow_inner_add_yblock_mmx_headersnow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")snow_inner_add_yblock_mmx_accum("2", "8", "0")snow_inner_add_yblock_mmx_accum("1", "128", "0")snow_inner_add_yblock_mmx_accum("0", "136", "0")snow_inner_add_yblock_mmx_mix("0", "0")snow_inner_add_yblock_mmx_end("16")}static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h, int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){snow_inner_add_yblock_mmx_headersnow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")snow_inner_add_yblock_mmx_accum("2", "16", "0")snow_inner_add_yblock_mmx_accum("1", "512", "0")snow_inner_add_yblock_mmx_accum("0", "528", "0")snow_inner_add_yblock_mmx_mix("0", "0")snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")snow_inner_add_yblock_mmx_accum("2", "24", "8")snow_inner_add_yblock_mmx_accum("1", "520", "8")snow_inner_add_yblock_mmx_accum("0", "536", "8")snow_inner_add_yblock_mmx_mix("16", "8")snow_inner_add_yblock_mmx_end("32")}void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){ if (b_w == 16) inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); else if (b_w == 8 && obmc_stride == 16) { if (!(b_h & 1)) inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); else inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); } else ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);}void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){ if (b_w == 16) inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); else if (b_w == 8 && obmc_stride == 16) inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); else ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -