speedy.c
来自「linux下的MPEG1」· C语言 代码 · 共 2,101 行 · 第 1/5 页
C
2,101 行
{ int r1 = 0; int r2 = 0; data += 2; width -= 1; while( width-- ) { int s1, s2; s1 = *data + r1; r1 = *data; s2 = s1 + r2; r2 = s1; *(data - 2) = s2 >> 2; data += 2; }}static void filter_luma_14641_packed422_inplace_scanline_c( uint8_t *data, int width ){ int r1 = 0; int r2 = 0; int r3 = 0; int r4 = 0; width -= 4; data += 4; while( width-- ) { int s1, s2, s3, s4; s1 = *data + r1; r1 = *data; s2 = s1 + r2; r2 = s1; s3 = s2 + r3; r3 = s2; s4 = s3 + r4; r4 = s3; *(data - 4) = s4 >> 4; data += 2; }}static void interpolate_packed422_scanline_c( uint8_t *output, uint8_t *top, uint8_t *bot, int width ){ int i; for( i = width*2; i; --i ) { *output++ = ((*top++) + (*bot++)) >> 1; }}#if defined(ARCH_X86) || defined(ARCH_X86_64)static void interpolate_packed422_scanline_mmx( uint8_t *output, uint8_t *top, uint8_t *bot, int width ){ const mmx_t shiftmask = { 0xfefffefffefffeffULL }; /* To avoid shifting chroma to luma. */ int i; for( i = width/16; i; --i ) { movq_m2r( *bot, mm0 ); movq_m2r( *top, mm1 ); movq_m2r( *(bot + 8), mm2 ); movq_m2r( *(top + 8), mm3 ); movq_m2r( *(bot + 16), mm4 ); movq_m2r( *(top + 16), mm5 ); movq_m2r( *(bot + 24), mm6 ); movq_m2r( *(top + 24), mm7 ); pand_m2r( shiftmask, mm0 ); pand_m2r( shiftmask, mm1 ); pand_m2r( shiftmask, mm2 ); pand_m2r( shiftmask, mm3 ); pand_m2r( shiftmask, mm4 ); pand_m2r( shiftmask, mm5 ); pand_m2r( shiftmask, mm6 ); pand_m2r( shiftmask, mm7 ); psrlw_i2r( 1, mm0 ); psrlw_i2r( 1, mm1 ); psrlw_i2r( 1, mm2 ); psrlw_i2r( 1, mm3 ); psrlw_i2r( 1, mm4 ); psrlw_i2r( 1, mm5 ); psrlw_i2r( 1, mm6 ); psrlw_i2r( 1, mm7 ); paddb_r2r( mm1, mm0 ); paddb_r2r( mm3, mm2 ); paddb_r2r( mm5, mm4 ); paddb_r2r( mm7, mm6 ); movq_r2m( mm0, *output ); movq_r2m( mm2, *(output + 8) ); movq_r2m( mm4, *(output + 16) ); movq_r2m( mm6, *(output + 24) ); output += 32; top += 32; bot += 32; } width = (width & 0xf); for( i = width/4; i; --i ) { movq_m2r( *bot, mm0 ); movq_m2r( *top, mm1 ); pand_m2r( shiftmask, mm0 ); pand_m2r( shiftmask, mm1 ); psrlw_i2r( 1, mm0 ); psrlw_i2r( 1, mm1 ); paddb_r2r( mm1, mm0 ); movq_r2m( mm0, *output ); output += 8; top += 8; bot += 8; } width = width & 0x7; /* Handle last few pixels. */ for( i = width * 2; i; --i ) { *output++ = ((*top++) + (*bot++)) >> 1; } emms();}#endif#if defined(ARCH_X86) || defined(ARCH_X86_64)static void interpolate_packed422_scanline_mmxext( uint8_t *output, uint8_t *top, uint8_t *bot, int width ){ int i; for( i = width/16; i; --i ) { movq_m2r( *bot, mm0 ); movq_m2r( *top, mm1 ); movq_m2r( *(bot + 8), mm2 ); movq_m2r( *(top + 8), mm3 ); movq_m2r( *(bot + 16), mm4 ); movq_m2r( *(top + 16), mm5 ); movq_m2r( *(bot + 24), mm6 ); movq_m2r( *(top + 24), mm7 ); pavgb_r2r( mm1, mm0 ); pavgb_r2r( mm3, mm2 ); pavgb_r2r( mm5, mm4 ); pavgb_r2r( mm7, mm6 ); movntq_r2m( mm0, *output ); movntq_r2m( mm2, *(output + 8) ); movntq_r2m( mm4, *(output + 16) ); movntq_r2m( mm6, *(output + 24) ); output += 32; top += 32; bot += 32; } width = (width & 0xf); for( i = width/4; i; --i ) { movq_m2r( *bot, mm0 ); movq_m2r( *top, mm1 ); pavgb_r2r( mm1, mm0 ); movntq_r2m( mm0, *output ); output += 8; top += 8; bot += 8; } width = width & 0x7; /* Handle last few pixels. */ for( i = width * 2; i; --i ) { *output++ = ((*top++) + (*bot++)) >> 1; } sfence(); emms();}#endifstatic void blit_colour_packed422_scanline_c( uint8_t *output, int width, int y, int cb, int cr ){ uint32_t colour = cr << 24 | y << 16 | cb << 8 | y; uint32_t *o = (uint32_t *) output; for( width /= 2; width; --width ) { *o++ = colour; }}#if defined(ARCH_X86) || defined(ARCH_X86_64)static void blit_colour_packed422_scanline_mmx( uint8_t *output, int width, int y, int cb, int cr ){ uint32_t colour = cr << 24 | y << 16 | cb << 8 | y; int i; movd_m2r( colour, mm1 ); movd_m2r( colour, mm2 ); psllq_i2r( 32, mm1 ); por_r2r( mm1, mm2 ); for( i = width / 16; i; --i ) { movq_r2m( mm2, *output ); movq_r2m( mm2, *(output + 8) ); movq_r2m( mm2, *(output + 16) ); movq_r2m( mm2, *(output + 24) ); output += 32; } width = (width & 0xf); for( i = width / 4; i; --i ) { movq_r2m( mm2, *output ); output += 8; } width = (width & 0x7); for( i = width / 2; i; --i ) { *((uint32_t *) output) = colour; output += 4; } if( width & 1 ) { *output = y; *(output + 1) = cb; } emms();}#endif#if defined(ARCH_X86) || defined(ARCH_X86_64)static void blit_colour_packed422_scanline_mmxext( uint8_t *output, int width, int y, int cb, int cr ){ uint32_t colour = cr << 24 | y << 16 | cb << 8 | y; int i; movd_m2r( colour, mm1 ); movd_m2r( colour, mm2 ); psllq_i2r( 32, mm1 ); por_r2r( mm1, mm2 ); for( i = width / 16; i; --i ) { movntq_r2m( mm2, *output ); movntq_r2m( mm2, *(output + 8) ); movntq_r2m( mm2, *(output + 16) ); movntq_r2m( mm2, *(output + 24) ); output += 32; } width = (width & 0xf); for( i = width / 4; i; --i ) { movntq_r2m( mm2, *output ); output += 8; } width = (width & 0x7); for( i = width / 2; i; --i ) { *((uint32_t *) output) = colour; output += 4; } if( width & 1 ) { *output = y; *(output + 1) = cb; } sfence(); emms();}#endifstatic void blit_colour_packed4444_scanline_c( uint8_t *output, int width, int alpha, int luma, int cb, int cr ){ int j; for( j = 0; j < width; j++ ) { *output++ = alpha; *output++ = luma; *output++ = cb; *output++ = cr; }}#if defined(ARCH_X86) || defined(ARCH_X86_64)static void blit_colour_packed4444_scanline_mmx( uint8_t *output, int width, int alpha, int luma, int cb, int cr ){ uint32_t colour = (cr << 24) | (cb << 16) | (luma << 8) | alpha; int i; movd_m2r( colour, mm1 ); movd_m2r( colour, mm2 ); psllq_i2r( 32, mm1 ); por_r2r( mm1, mm2 ); for( i = width / 8; i; --i ) { movq_r2m( mm2, *output ); movq_r2m( mm2, *(output + 8) ); movq_r2m( mm2, *(output + 16) ); movq_r2m( mm2, *(output + 24) ); output += 32; } width = (width & 0x7); for( i = width / 2; i; --i ) { movq_r2m( mm2, *output ); output += 8; } width = (width & 0x1); if( width ) { *((uint32_t *) output) = colour; output += 4; } emms();}#endif#if defined(ARCH_X86) || defined(ARCH_X86_64)static void blit_colour_packed4444_scanline_mmxext( uint8_t *output, int width, int alpha, int luma, int cb, int cr ){ uint32_t colour = (cr << 24) | (cb << 16) | (luma << 8) | alpha; int i; movd_m2r( colour, mm1 ); movd_m2r( colour, mm2 ); psllq_i2r( 32, mm1 ); por_r2r( mm1, mm2 ); for( i = width / 8; i; --i ) { movntq_r2m( mm2, *output ); movntq_r2m( mm2, *(output + 8) ); movntq_r2m( mm2, *(output + 16) ); movntq_r2m( mm2, *(output + 24) ); output += 32; } width = (width & 0x7); for( i = width / 2; i; --i ) { movntq_r2m( mm2, *output ); output += 8; } width = (width & 0x1); if( width ) { *((uint32_t *) output) = colour; output += 4; } sfence(); emms();}#endif#define speedy_memcpy_c xine_fast_memcpy#define speedy_memcpy_mmx xine_fast_memcpy#define speedy_memcpy_mmxext xine_fast_memcpystatic void blit_packed422_scanline_c( uint8_t *dest, const uint8_t *src, int width ){ speedy_memcpy_c( dest, src, width*2 );}#if defined(ARCH_X86) || defined(ARCH_X86_64)static void blit_packed422_scanline_mmx( uint8_t *dest, const uint8_t *src, int width ){ speedy_memcpy_mmx( dest, src, width*2 );}#endif#if defined(ARCH_X86) || defined(ARCH_X86_64)static void blit_packed422_scanline_mmxext( uint8_t *dest, const uint8_t *src, int width ){ speedy_memcpy_mmxext( dest, src, width*2 );}#endifstatic void composite_packed4444_alpha_to_packed422_scanline_c( uint8_t *output, uint8_t *input, uint8_t *foreground, int width, int alpha ){ int i; for( i = 0; i < width; i++ ) { int af = foreground[ 0 ]; if( af ) { int a = ((af * alpha) + 0x80) >> 8; if( a == 0xff ) { output[ 0 ] = foreground[ 1 ]; if( ( i & 1 ) == 0 ) { output[ 1 ] = foreground[ 2 ]; output[ 3 ] = foreground[ 3 ]; } } else if( a ) { /** * (1 - alpha)*B + alpha*F * (1 - af*a)*B + af*a*F * B - af*a*B + af*a*F * B + a*(af*F - af*B) */ output[ 0 ] = input[ 0 ] + ((alpha*( foreground[ 1 ] - multiply_alpha( foreground[ 0 ], input[ 0 ] ) ) + 0x80) >> 8); if( ( i & 1 ) == 0 ) { /** * At first I thought I was doing this incorrectly, but * the following math has convinced me otherwise. * * C_r = (1 - alpha)*B + alpha*F * C_r = B - af*a*B + af*a*F * * C_r = 128 + ((1 - af*a)*(B - 128) + a*af*(F - 128)) * C_r = 128 + (B - af*a*B - 128 + af*a*128 + a*af*F - a*af*128) * C_r = B - af*a*B + a*af*F */ output[ 1 ] = input[ 1 ] + ((alpha*( foreground[ 2 ] - multiply_alpha( foreground[ 0 ], input[ 1 ] ) ) + 0x80) >> 8); output[ 3 ] = input[ 3 ] + ((alpha*( foreground[ 3 ] - multiply_alpha( foreground[ 0 ], input[ 3 ] ) ) + 0x80) >> 8); } } }
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?