📄 mc.c

📁 图象压缩程序
💻 C
📖 第 1 页 / 共 3 页
字号:
        src += i_src_stride;        dst += i_dst_stride;    }}static inline void mc_hv_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){    int y;    src -= 2 * i_src_stride;    asm volatile(        "pxor %%mm7,        %%mm7\n"        "movq x264_w0x10,   %%mm4\n"        : : );    for( y = 0; y < i_height; y++ )    {        asm volatile(            "leal   (%0, %1),   %%eax\n"            "movq       (%0),   %%mm0\n"    /* load pix-2 */            "movq       %%mm0,  %%mm2\n"            "punpcklbw  %%mm7,  %%mm0\n"            "punpckhbw  %%mm7,  %%mm2\n"            "movq       (%%eax),%%mm1\n"    /* load pix-1 */            "movq       %%mm1,  %%mm3\n"            "punpcklbw  %%mm7,  %%mm1\n"            "punpckhbw  %%mm7,  %%mm3\n"            "psubw      %%mm1,  %%mm0\n"            "psllw      $2,     %%mm1\n"            "psubw      %%mm1,  %%mm0\n"            "psubw      %%mm3,  %%mm2\n"            "psllw      $2,     %%mm3\n"            "psubw      %%mm3,  %%mm2\n"            "movq       (%%eax,%1),%%mm1\n"  /* load pix */            "movq       %%mm1,  %%mm3\n"            "punpcklbw  %%mm7,  %%mm1\n"            "punpckhbw  %%mm7,  %%mm3\n"            "psllw      $2,     %%mm1\n"            "paddw      %%mm1,  %%mm0\n"            "psllw      $2,     %%mm1\n"            "paddw      %%mm1,  %%mm0\n"            "psllw      $2,     %%mm3\n"            "paddw      %%mm3,  %%mm2\n"            "psllw      $2,     %%mm3\n"            "paddw      %%mm3,  %%mm2\n"            "movq       (%%eax,%1,2),%%mm1\n"  /* load pix+1 */            "movq       %%mm1,  %%mm3\n"            "punpcklbw  %%mm7,  %%mm1\n"            "punpckhbw  %%mm7,  %%mm3\n"            "psllw      $2,     %%mm1\n"            "paddw      %%mm1,  %%mm0\n"            "psllw      $2,     %%mm1\n"            "paddw      %%mm1,  %%mm0\n"            "psllw      $2,     %%mm3\n"            "paddw      %%mm3,  %%mm2\n"            "psllw      $2,     %%mm3\n"            "paddw      %%mm3,  %%mm2\n"            "movq       (%0,%1,4),%%mm1\n"  /* load pix+2 */            "movq       %%mm1,  %%mm3\n"            "punpcklbw  %%mm7,  %%mm1\n"            "punpckhbw  %%mm7,  %%mm3\n"            "psubw      %%mm1,  %%mm0\n"            "psllw      $2,     %%mm1\n"            "psubw      %%mm1,  %%mm0\n"            "psubw      %%mm3,  %%mm2\n"            "psllw      $2,     %%mm3\n"            "psubw      %%mm3,  %%mm2\n"            "movq       (%%eax,%1,4),%%mm1\n"  /* load pix+3 */            "movq       %%mm1,  %%mm3\n"            "punpcklbw  %%mm7,  %%mm1\n"            "punpckhbw  %%mm7,  %%mm3\n"            "paddw      %%mm1,  %%mm0\n"            "paddw      %%mm3,  %%mm2\n"            "paddw      %%mm4,  %%mm0\n"            "psraw      $5,     %%mm0\n"            "paddw      %%mm4,  %%mm2\n"            "psraw      $5,     %%mm2\n"            "packuswb   %%mm2,  %%mm0\n"            "movq       %%mm0,  (%2)\n"            : : "r"(src), "r"(i_src_stride), "r"(dst) : "%eax" );        src += i_src_stride;        dst += i_dst_stride;    }}static inline void mc_hc_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){    int x, y;    asm volatile( "pxor %%mm7,        %%mm7\n" : : );    for( y = 0; y < i_height; y++ )    {        int16_t tap[5+8];        /* first 8 */        asm volatile(            "leal   (%0, %1),   %%eax\n"            "movq       (%0),   %%mm0\n"    /* load pix-2 */            "movq       %%mm0,  %%mm2\n"            "punpcklbw  %%mm7,  %%mm0\n"            "punpckhbw  %%mm7,  %%mm2\n"            "movq       (%%eax),%%mm1\n"    /* load pix-1 */            "movq       %%mm1,  %%mm3\n"            "punpcklbw  %%mm7,  %%mm1\n"            "punpckhbw  %%mm7,  %%mm3\n"            "psubw      %%mm1,  %%mm0\n"            "psllw      $2,     %%mm1\n"            "psubw      %%mm1,  %%mm0\n"            "psubw      %%mm3,  %%mm2\n"            "psllw      $2,     %%mm3\n"            "psubw      %%mm3,  %%mm2\n"            "movq       (%%eax,%1),%%mm1\n"  /* load pix */            "movq       %%mm1,  %%mm3\n"            "punpcklbw  %%mm7,  %%mm1\n"            "punpckhbw  %%mm7,  %%mm3\n"            "psllw      $2,     %%mm1\n"            "paddw      %%mm1,  %%mm0\n"            "psllw      $2,     %%mm1\n"            "paddw      %%mm1,  %%mm0\n"            "psllw      $2,     %%mm3\n"            "paddw      %%mm3,  %%mm2\n"            "psllw      $2,     %%mm3\n"            "paddw      %%mm3,  %%mm2\n"            "movq       (%%eax,%1,2),%%mm1\n"  /* load pix+1 */            "movq       %%mm1,  %%mm3\n"            "punpcklbw  %%mm7,  %%mm1\n"            "punpckhbw  %%mm7,  %%mm3\n"            "psllw      $2,     %%mm1\n"            "paddw      %%mm1,  %%mm0\n"            "psllw      $2,     %%mm1\n"            "paddw      %%mm1,  %%mm0\n"            "psllw      $2,     %%mm3\n"            "paddw      %%mm3,  %%mm2\n"            "psllw      $2,     %%mm3\n"            "paddw      %%mm3,  %%mm2\n"            "movq       (%0,%1,4),%%mm1\n"  /* load pix+2 */            "movq       %%mm1,  %%mm3\n"            "punpcklbw  %%mm7,  %%mm1\n"            "punpckhbw  %%mm7,  %%mm3\n"            "psubw      %%mm1,  %%mm0\n"            "psllw      $2,     %%mm1\n"            "psubw      %%mm1,  %%mm0\n"            "psubw      %%mm3,  %%mm2\n"            "psllw      $2,     %%mm3\n"            "psubw      %%mm3,  %%mm2\n"            "movq       (%%eax,%1,4),%%mm1\n"  /* load pix+3 */            "movq       %%mm1,  %%mm3\n"            "punpcklbw  %%mm7,  %%mm1\n"            "punpckhbw  %%mm7,  %%mm3\n"            "paddw      %%mm1,  %%mm0\n"            "paddw      %%mm3,  %%mm2\n"            "movq       %%mm0,   (%2)\n"            "movq       %%mm2,  8(%2)\n"            "addl   $8,         %%eax\n"            "addl   $8,         %0\n"            "movd       (%0),   %%mm0\n"    /* load pix-2 */            "punpcklbw  %%mm7,  %%mm0\n"            "movd       (%%eax),%%mm1\n"    /* load pix-1 */            "punpcklbw  %%mm7,  %%mm1\n"            "psubw      %%mm1,  %%mm0\n"            "psllw      $2,     %%mm1\n"            "psubw      %%mm1,  %%mm0\n"            "movd       (%%eax,%1),%%mm1\n"  /* load pix */            "punpcklbw  %%mm7,  %%mm1\n"            "psllw      $2,     %%mm1\n"            "paddw      %%mm1,  %%mm0\n"            "psllw      $2,     %%mm1\n"            "paddw      %%mm1,  %%mm0\n"            "movd       (%%eax,%1,2),%%mm1\n"  /* load pix+1 */            "punpcklbw  %%mm7,  %%mm1\n"            "psllw      $2,     %%mm1\n"            "paddw      %%mm1,  %%mm0\n"            "psllw      $2,     %%mm1\n"            "paddw      %%mm1,  %%mm0\n"            "movd       (%0,%1,4),%%mm1\n"  /* load pix+2 */            "punpcklbw  %%mm7,  %%mm1\n"            "psubw      %%mm1,  %%mm0\n"            "psllw      $2,     %%mm1\n"            "psubw      %%mm1,  %%mm0\n"            "movd       (%%eax,%1,4),%%mm1\n"  /* load pix+3 */            "punpcklbw  %%mm7,  %%mm1\n"            "paddw      %%mm1,  %%mm0\n"            "movq       %%mm0,  16(%2)\n"            : : "r"(src-2*i_src_stride-2), "r"(i_src_stride), "r"(&tap[0]) : "%eax" );        /* last one */        tap[8+4] = x264_tapfilter( &src[-2+8+4], i_src_stride );        for( x = 0; x < 8; x++ )        {            dst[x] = x264_mc_clip1( ( tap[0+x] - 5*tap[1+x] + 20 * tap[2+x] + 20 * tap[3+x] -5*tap[4+x] + tap[5+x] + 512 ) >> 10 );        }        src += i_src_stride;        dst += i_dst_stride;    }}/* mc I+H */static void mc_xy10_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){    uint8_t tmp[8*16];    mc_hh_w8( src, i_src_stride, tmp, 8, i_height );    pixel_avg_w8( dst, i_dst_stride, src, i_src_stride, tmp, 8, i_height );}static void mc_xy30_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){    uint8_t tmp[8*16];    mc_hh_w8( src, i_src_stride, tmp, 8, i_height );    pixel_avg_w8( dst, i_dst_stride, src+1, i_src_stride, tmp, 8, i_height );}/* mc I+V */static void mc_xy01_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){    uint8_t tmp[8*16];    mc_hv_w8( src, i_src_stride, tmp, 8, i_height );    pixel_avg_w8( dst, i_dst_stride, src, i_src_stride, tmp, 8, i_height );}static void mc_xy03_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){    uint8_t tmp[8*16];    mc_hv_w8( src, i_src_stride, tmp, 8, i_height );    pixel_avg_w8( dst, i_dst_stride, src+i_src_stride, i_src_stride, tmp, 8, i_height );}/* H+V */static void mc_xy11_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){    uint8_t tmp1[8*16];    uint8_t tmp2[8*16];    mc_hv_w8( src, i_src_stride, tmp1, 8, i_height );    mc_hh_w8( src, i_src_stride, tmp2, 8, i_height );    pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );}static void mc_xy31_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){    uint8_t tmp1[8*16];    uint8_t tmp2[8*16];    mc_hv_w8( src+1, i_src_stride, tmp1, 8, i_height );    mc_hh_w8( src,   i_src_stride, tmp2, 8, i_height );    pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );}static void mc_xy13_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){    uint8_t tmp1[8*16];    uint8_t tmp2[8*16];    mc_hv_w8( src,              i_src_stride, tmp1, 8, i_height );    mc_hh_w8( src+i_src_stride, i_src_stride, tmp2, 8, i_height );    pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );}static void mc_xy33_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){    uint8_t tmp1[8*16];    uint8_t tmp2[8*16];    mc_hv_w8( src+1,            i_src_stride, tmp1, 8, i_height );    mc_hh_w8( src+i_src_stride, i_src_stride, tmp2, 8, i_height );    pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );}static void mc_xy21_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){    uint8_t tmp1[8*16];    uint8_t tmp2[8*16];    mc_hc_w8( src, i_src_stride, tmp1, 8, i_height );    mc_hh_w8( src, i_src_stride, tmp2, 8, i_height );    pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );}static void mc_xy12_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){    uint8_t tmp1[8*16];    uint8_t tmp2[8*16];    mc_hc_w8( src, i_src_stride, tmp1, 8, i_height );    mc_hv_w8( src, i_src_stride, tmp2, 8, i_height );    pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );}static void mc_xy32_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){    uint8_t tmp1[8*16];    uint8_t tmp2[8*16];    mc_hc_w8( src,   i_src_stride, tmp1, 8, i_height );    mc_hv_w8( src+1, i_src_stride, tmp2, 8, i_height );    pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );}static void mc_xy23_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){    uint8_t tmp1[8*16];    uint8_t tmp2[8*16];    mc_hc_w8( src,              i_src_stride, tmp1, 8, i_height );    mc_hh_w8( src+i_src_stride, i_src_stride, tmp2, 8, i_height );    pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );}/***************************************************************************** * MC with width == 16 (height <= 16) *****************************************************************************/static void mc_copy_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){    int y;    for( y = 0; y < i_height; y++ )    {        memcpy( dst, src, 16 );        src += i_src_stride;        dst += i_dst_stride;    }}static inline void mc_hh_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){    int x, y;    for( y = 0; y < i_height; y++ )    {        for( x = 0; x < 16; x++ )        {            dst[x] = x264_mc_clip1( ( x264_tapfilter1( &src[x] ) + 16 ) >> 5 );        }        src += i_src_stride;        dst += i_dst_stride;    }}static inline void mc_hv_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){    mc_hv_w8( src,     i_src_stride, dst,     i_dst_stride, i_height );    mc_hv_w8( &src[8], i_src_stride, &dst[8], i_dst_stride, i_height );}static inline void mc_hc_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){    int x, y;    asm volatile( "pxor %%mm7,        %%mm7\n" : : );    for( y = 0; y < i_height; y++ )    {        int16_t tap[5+16];        asm volatile(            "leal   (%0, %1),   %%eax\n"            "movq       (%0),   %%mm0\n"    /* load pix-2 */            "movq       %%mm0,  %%mm2\n"            "punpcklbw  %%mm7,  %%mm0\n"            "punpckhbw  %%mm7,  %%mm2\n"            "movq       (%%eax),%%mm1\n"    /* load pix-1 */            "movq       %%mm1,  %%mm3\n"            "punpcklbw  %%mm7,  %%mm1\n"            "punpckhbw  %%mm7,  %%mm3\n"            "psubw      %%mm1,  %%mm0\n"            "psllw      $2,     %%mm1\n"            "psubw      %%mm1,  %%mm0\n"
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -