speedy.c

来自「linux下的MPEG1」· C语言 代码 · 共 2,101 行 · 第 1/5 页

C
2,101
字号
                               | (multiply_alpha( cur_a, input[ 2 ] ) << 16)                               | (multiply_alpha( cur_a, input[ 1 ] ) << 8)                               | cur_a;        output += 4;        input += 4;    }}#if defined(ARCH_X86) || defined(ARCH_X86_64)static void premultiply_packed4444_scanline_mmxext( uint8_t *output, uint8_t *input, int width ){    const mmx_t round  = { 0x0080008000800080ULL };    const mmx_t alpha  = { 0x00000000000000ffULL };    const mmx_t noalp  = { 0xffffffffffff0000ULL };    pxor_r2r( mm7, mm7 );    while( width-- ) {        movd_m2r( *input, mm0 );        punpcklbw_r2r( mm7, mm0 );        movq_r2r( mm0, mm2 );        pshufw_r2r( mm2, mm2, 0 );        movq_r2r( mm2, mm4 );        pand_m2r( alpha, mm4 );        pmullw_r2r( mm2, mm0 );        paddw_m2r( round, mm0 );        movq_r2r( mm0, mm3 );        psrlw_i2r( 8, mm3 );        paddw_r2r( mm3, mm0 );        psrlw_i2r( 8, mm0 );        pand_m2r( noalp, mm0 );        paddw_r2r( mm4, mm0 );        packuswb_r2r( mm0, mm0 );        movd_r2m( mm0, *output );        output += 4;        input += 4;    }    sfence();    emms();}#endifstatic void blend_packed422_scanline_c( uint8_t *output, uint8_t *src1,                                        uint8_t *src2, int width, int pos ){    if( pos == 0 ) {        blit_packed422_scanline( output, src1, width );    } else if( pos == 256 ) {        blit_packed422_scanline( output, src2, width );    } else if( pos == 128 ) {        interpolate_packed422_scanline( output, src1, src2, width );    } else {        width *= 2;        while( width-- ) {            *output++ = ( (*src1++ * ( 256 - pos )) + (*src2++ * pos) + 0x80 ) >> 8;        }    }}#if defined(ARCH_X86) || defined(ARCH_X86_64)static void blend_packed422_scanline_mmxext( uint8_t *output, uint8_t *src1,                                             uint8_t *src2, int width, int pos ){    if( pos <= 0 ) {        blit_packed422_scanline( output, src1, width );    } else if( pos >= 256 ) {        blit_packed422_scanline( output, src2, width );    } else if( pos == 128 ) {        interpolate_packed422_scanline( output, src1, src2, width );    } else {        const mmx_t all256 = { 0x0100010001000100ULL };        const mmx_t round  = { 0x0080008000800080ULL };        movd_m2r( pos, mm0 );        pshufw_r2r( mm0, mm0, 0 );        movq_m2r( all256, mm1 );        psubw_r2r( mm0, mm1 );        pxor_r2r( mm7, mm7 );        for( width /= 2; width; width-- ) {            movd_m2r( *src1, mm3 );            movd_m2r( *src2, mm4 );            punpcklbw_r2r( mm7, mm3 );            punpcklbw_r2r( mm7, mm4 );            pmullw_r2r( mm1, mm3 );            pmullw_r2r( mm0, mm4 );            paddw_r2r( mm4, mm3 );            paddw_m2r( round, mm3 );            psrlw_i2r( 8, mm3 );            packuswb_r2r( mm3, mm3 );            movd_r2m( mm3, *output );            output += 4;            src1 += 4;            src2 += 4;        }        sfence();        emms();    }}#endif#if defined(ARCH_X86) || defined(ARCH_X86_64)static void quarter_blit_vertical_packed422_scanline_mmxext( uint8_t *output, uint8_t *one,                                                             uint8_t *three, int width ){    int i;    for( i = width/16; i; --i ) {        movq_m2r( *one, mm0 );        movq_m2r( *three, mm1 );        movq_m2r( *(one + 8), mm2 );        movq_m2r( *(three + 8), mm3 );        movq_m2r( *(one + 16), mm4 );        movq_m2r( *(three + 16), mm5 );        movq_m2r( *(one + 24), mm6 );        movq_m2r( *(three + 24), mm7 );        pavgb_r2r( mm1, mm0 );        pavgb_r2r( mm1, mm0 );        pavgb_r2r( mm3, mm2 );        pavgb_r2r( mm3, mm2 );        pavgb_r2r( mm5, mm4 );        pavgb_r2r( mm5, mm4 );        pavgb_r2r( mm7, mm6 );        pavgb_r2r( mm7, mm6 );        movntq_r2m( mm0, *output );        movntq_r2m( mm2, *(output + 8) );        movntq_r2m( mm4, *(output + 16) );        movntq_r2m( mm6, *(output + 24) );        output += 32;        one += 32;        three += 32;    }    width = (width & 0xf);    for( i = width/4; i; --i ) {        movq_m2r( *one, mm0 );        movq_m2r( *three, mm1 );        pavgb_r2r( mm1, mm0 );        pavgb_r2r( mm1, mm0 );        movntq_r2m( mm0, *output );        output += 8;        one += 8;        three += 8;    }    width = width & 0x7;    /* Handle last few pixels. */    for( i = width * 2; i; --i ) {        *output++ = (*one + *three + *three + *three + 2) / 4;        one++;        three++;    }    sfence();    emms();}#endifstatic void quarter_blit_vertical_packed422_scanline_c( uint8_t *output, uint8_t *one,                                                        uint8_t *three, int width ){    width *= 2;    while( width-- ) {        *output++ = (*one + *three + *three + *three + 2) / 4;        one++;        three++;    }}static void subpix_blit_vertical_packed422_scanline_c( uint8_t *output, uint8_t *top,                                                       uint8_t *bot, int subpixpos, int width ){    if( subpixpos == 32768 ) {        interpolate_packed422_scanline( output, top, bot, width );    } else if( subpixpos == 16384 ) {        quarter_blit_vertical_packed422_scanline( output, top, bot, width );    } else if( subpixpos == 49152 ) {        quarter_blit_vertical_packed422_scanline( output, bot, top, width );    } else {        int x;        width *= 2;        for( x = 0; x < width; x++ ) {            output[ x ] = ( ( top[ x ] * subpixpos ) + ( bot[ x ] * ( 0xffff - subpixpos ) ) ) >> 16;        }    }}static void a8_subpix_blit_scanline_c( uint8_t *output, uint8_t *input,                                       int lasta, int startpos, int width ){    int pos = 0xffff - (startpos & 0xffff);    int prev = lasta;    int x;    for( x = 0; x < width; x++ ) {        output[ x ] = ( ( prev * pos ) + ( input[ x ] * ( 0xffff - pos ) ) ) >> 16;        prev = input[ x ];    }}/** * These are from lavtools in mjpegtools: * * colorspace.c:  Routines to perform colorspace conversions. * *  Copyright (C) 2001 Matthew J. Marjanovic <maddog@mir.com> * *  This program is free software; you can redistribute it and/or *  modify it under the terms of the GNU General Public License *  as published by the Free Software Foundation; either version 2 *  of the License, or (at your option) any later version. * *  This program is distributed in the hope that it will be useful, *  but WITHOUT ANY WARRANTY; without even the implied warranty of *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *  GNU General Public License for more details. * *  You should have received a copy of the GNU General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA. */#define FP_BITS 18/* precomputed tables */static int Y_R[256];static int Y_G[256];static int Y_B[256];static int Cb_R[256];static int Cb_G[256];static int Cb_B[256];static int Cr_R[256];static int Cr_G[256];static int Cr_B[256];static int conv_RY_inited = 0;static int RGB_Y[256];static int R_Cr[256];static int G_Cb[256];static int G_Cr[256];static int B_Cb[256];static int conv_YR_inited = 0;static int myround(double n){  if (n >= 0)     return (int)(n + 0.5);  else    return (int)(n - 0.5);}static void init_RGB_to_YCbCr_tables(void){  int i;  /*   * Q_Z[i] =   (coefficient * i   *             * (Q-excursion) / (Z-excursion) * fixed-point-factor)   *   * to one of each, add the following:   *             + (fixed-point-factor / 2)         --- for rounding later   *             + (Q-offset * fixed-point-factor)  --- to add the offset   *                */  for (i = 0; i < 256; i++) {    Y_R[i] = myround(0.299 * (double)i * 219.0 / 255.0 * (double)(1<<FP_BITS));    Y_G[i] = myround(0.587 * (double)i * 219.0 / 255.0 * (double)(1<<FP_BITS));    Y_B[i] = myround((0.114 * (double)i * 219.0 / 255.0 * (double)(1<<FP_BITS))                     + (double)(1<<(FP_BITS-1)) + (16.0 * (double)(1<<FP_BITS)));    Cb_R[i] = myround(-0.168736 * (double)i * 224.0 / 255.0 * (double)(1<<FP_BITS));    Cb_G[i] = myround(-0.331264 * (double)i * 224.0 / 255.0 * (double)(1<<FP_BITS));    Cb_B[i] = myround((0.500 * (double)i * 224.0 / 255.0 * (double)(1<<FP_BITS))                       + (double)(1<<(FP_BITS-1)) + (128.0 * (double)(1<<FP_BITS)));    Cr_R[i] = myround(0.500 * (double)i * 224.0 / 255.0 * (double)(1<<FP_BITS));    Cr_G[i] = myround(-0.418688 * (double)i * 224.0 / 255.0 * (double)(1<<FP_BITS));    Cr_B[i] = myround((-0.081312 * (double)i * 224.0 / 255.0 * (double)(1<<FP_BITS))                      + (double)(1<<(FP_BITS-1)) + (128.0 * (double)(1<<FP_BITS)));  }  conv_RY_inited = 1;}static void init_YCbCr_to_RGB_tables(void){  int i;  /*   * Q_Z[i] =   (coefficient * i   *             * (Q-excursion) / (Z-excursion) * fixed-point-factor)   *   * to one of each, add the following:   *             + (fixed-point-factor / 2)         --- for rounding later   *             + (Q-offset * fixed-point-factor)  --- to add the offset   *                */  /* clip Y values under 16 */  for (i = 0; i < 16; i++) {    RGB_Y[i] = myround((1.0 * (double)(16) * 255.0 / 219.0 * (double)(1<<FP_BITS))                       + (double)(1<<(FP_BITS-1)));  }  for (i = 16; i < 236; i++) {    RGB_Y[i] = myround((1.0 * (double)(i - 16) * 255.0 / 219.0 * (double)(1<<FP_BITS))                       + (double)(1<<(FP_BITS-1)));  }  /* clip Y values above 235 */  for (i = 236; i < 256; i++) {    RGB_Y[i] = myround((1.0 * (double)(235) * 255.0 / 219.0 * (double)(1<<FP_BITS))                       + (double)(1<<(FP_BITS-1)));  }      /* clip Cb/Cr values below 16 */  for (i = 0; i < 16; i++) {    R_Cr[i] = myround(1.402 * (double)(-112) * 255.0 / 224.0 * (double)(1<<FP_BITS));    G_Cr[i] = myround(-0.714136 * (double)(-112) * 255.0 / 224.0 * (double)(1<<FP_BITS));    G_Cb[i] = myround(-0.344136 * (double)(-112) * 255.0 / 224.0 * (double)(1<<FP_BITS));    B_Cb[i] = myround(1.772 * (double)(-112) * 255.0 / 224.0 * (double)(1<<FP_BITS));  }  for (i = 16; i < 241; i++) {    R_Cr[i] = myround(1.402 * (double)(i - 128) * 255.0 / 224.0 * (double)(1<<FP_BITS));    G_Cr[i] = myround(-0.714136 * (double)(i - 128) * 255.0 / 224.0 * (double)(1<<FP_BITS));    G_Cb[i] = myround(-0.344136 * (double)(i - 128) * 255.0 / 224.0 * (double)(1<<FP_BITS));    B_Cb[i] = myround(1.772 * (double)(i - 128) * 255.0 / 224.0 * (double)(1<<FP_BITS));  }  /* clip Cb/Cr values above 240 */  for (i = 241; i < 256; i++) {    R_Cr[i] = myround(1.402 * (double)(112) * 255.0 / 224.0 * (double)(1<<FP_BITS));    G_Cr[i] = myround(-0.714136 * (double)(112) * 255.0 / 224.0 * (double)(1<<FP_BITS));    G_Cb[i] = myround(-0.344136 * (double)(i - 128) * 255.0 / 224.0 * (double)(1<<FP_BITS));    B_Cb[i] = myround(1.772 * (double)(112) * 255.0 / 224.0 * (double)(1<<FP_BITS));  }  conv_YR_inited = 1;}static void rgb24_to_packed444_rec601_scanline_c( uint8_t *output, uint8_t *input, int width ){    if( !conv_RY_inited ) init_RGB_to_YCbCr_tables();    while( width-- ) {        int r = input[ 0 ];        int g = input[ 1 ];        int b = input[ 2 ];        output[ 0 ] = (Y_R[ r ] + Y_G[ g ] + Y_B[ b ]) >> FP_BITS;        output[ 1 ] = (Cb_R[ r ] + Cb_G[ g ] + Cb_B[ b ]) >> FP_BITS;        output[ 2 ] = (Cr_R[ r ] + Cr_G[ g ] + Cr_B[ b ]) >> FP_BITS;        output += 3;        input += 3;    }}static void rgba32_to_packed4444_rec601_scanline_c( uint8_t *output, uint8_t *input, int width ){    if( !conv_RY_inited ) init_RGB_to_YCbCr_tables();    while( width-- ) {        int r = input[ 0 ];        int g = input[ 1 ];        int b = input[ 2 ];        int a = input[ 3 ];                output[ 0 ] = a;        output[ 1 ] = (Y_R[ r ] + Y_G[ g ] + Y_B[ b ]) >> FP_BITS;        output[ 2 ] = (Cb_R[ r ] + Cb_G[ g ] + Cb_B[ b ]) >> FP_BITS;        output[ 3 ] = (Cr_R[ r ] + Cr_G[ g ] + Cr_B[ b ]) >> FP_BITS;        output += 4;        input += 4;    }}static void packed444_to_rgb24_rec601_scanline_c( uint8_t *output, uint8_t *input, int width ){    if( !conv_YR_inited ) init_YCbCr_to_RGB_tables();    while( width-- ) {        int luma = input[ 0 ];        int cb = input[ 1 ];        int cr = input[ 2 ];        output[ 0 ] = clip255( (RGB_Y[ luma ] + R_Cr[ cr ]) >> FP_BITS );        output[ 1 ] = clip255( (RGB_Y[ luma ] + G_Cb[ cb ] + G_Cr[cr]) >> FP_BITS );        output[ 2 ] = clip255( (RGB_Y[ luma ] + B_Cb[ cb ]) >> FP_BITS );        output += 3;        input += 3;    }}/** * 601 numbers: * * Y' =  0.299*R' + 0.587*G' + 0.114*B' (in  0.0 to  1.0) * Cb = -0.169*R' - 0.331*G' + 0.500*B' (in -0.5 to +0.5) * Cr =  0.500*R' - 0.419*G' - 0.081*B' (in -0.5 to +0.5) * * Inverse: *      Y         Cb        Cr * R  1.0000   -0.0009    1.4017 * G  1.0000   -0.3437   -0.7142 * B  1.0000    1.7722    0.0010 * * S170M numbers: * Y'   =  0.299*R' + 0.587*G' + 0.114*B' (in  0.0 to 1.0) * B-Y' = -0.299*R' - 0.587*G' 

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?