📄 rgb2rgb_template.c
/*
 * rgb2rgb.c, software RGB to RGB converter,
 *            plus software PAL8 to RGB converter,
 *            software YUV to YUV converter,
 *            software YUV to RGB converter.
 * Written by Nick Kurshev.
 * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
 */
#include <stddef.h>
#include <inttypes.h> /* for __WORDSIZE */

#ifndef __WORDSIZE
// #warning You have a misconfigured system and will probably lose performance!
#define __WORDSIZE MP_WORDSIZE
#endif

#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB

#ifdef HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB     "pavgusb"
#elif defined (HAVE_MMX2)
#define PREFETCH  "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB     "pavgb"
#else
#define PREFETCH  "/nop"
#define PREFETCHW "/nop"
#endif

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms maps directly to emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE "/nop"
#endif
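/*
 * Editor's note (not part of the original file): this template is compiled
 * once per CPU variant. The including file (rgb2rgb.c) defines RENAME() and
 * the HAVE_* feature macros, then #includes this file repeatedly, roughly as
 * sketched below; the exact macro spellings are an assumption inferred from
 * the RENAME() calls in this file.
 */
#if 0 /* illustrative sketch only */
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define RENAME(a) a ## _C      /* plain C versions: rgb24to32_C, ... */
#include "rgb2rgb_template.c"

#undef RENAME
#define HAVE_MMX
#define RENAME(a) a ## _MMX    /* MMX versions: rgb24to32_MMX, ... */
#include "rgb2rgb_template.c"
#endif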
static inline void RENAME(rgb24to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 23;
    __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "punpckldq 3%1, %%mm0\n\t"
            "movd 6%1, %%mm1\n\t"
            "punpckldq 9%1, %%mm1\n\t"
            "movd 12%1, %%mm2\n\t"
            "punpckldq 15%1, %%mm2\n\t"
            "movd 18%1, %%mm3\n\t"
            "punpckldq 21%1, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm1\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm2, 16%0\n\t"
            MOVNTQ" %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 0;
    }
}

static inline void RENAME(rgb32to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 31;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 16%1, %%mm4\n\t"
            "movq 24%1, %%mm5\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"
            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*dest)
            :"m"(*s),"m"(mask24l),
             "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
    }
}

/*
 * Original by Strepto/Astral,
 * ported to gcc & bugfixed: A'rpi.
 * MMX2, 3DNOW optimization by Nick Kurshev.
 * 32-bit C version, and the and&add trick, by Michael Niedermayer.
 */
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s));
    __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while(s<mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm2\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "pand %%mm4, %%mm0\n\t"
            "pand %%mm4, %%mm2\n\t"
            "paddw %%mm1, %%mm0\n\t"
            "paddw %%mm3, %%mm2\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
            );
        d+=16;
        s+=16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    mm_end = end - 3;
    while(s < mm_end)
    {
        register unsigned x= *((uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    if(s < end)
    {
        register unsigned short x= *((uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}

static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s));
    __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while(s<mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm2\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "psrlq $1, %%mm0\n\t"
            "psrlq $1, %%mm2\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm3\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm3, %%mm2\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
            );
        d+=16;
        s+=16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    mm_end = end - 3;
    while(s < mm_end)
    {
        register uint32_t x= *((uint32_t *)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    if(s < end)
    {
        register uint16_t x= *((uint16_t *)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
        s+=2;
        d+=2;
    }
}
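/*
 * Editor's note (not part of the original file): a minimal scalar sketch of
 * the "and&add" trick used by rgb15to16 above. Adding (x & 0x7FE0) to x
 * doubles the red+green fields (bits 5-14), i.e. shifts them one bit left,
 * while blue (bits 0-4) passes through unchanged, so RGB555 becomes RGB565
 * with a zero in the new low bit of green. The helper name is hypothetical.
 */
#if 0 /* illustrative sketch only */
#include <stdint.h>
#include <stdio.h>

static uint16_t rgb15to16_one(uint16_t x)
{
    return (uint16_t)((x & 0x7FFF) + (x & 0x7FE0));
}

int main(void)
{
    /* RGB555 white 0x7FFF -> RGB565 0xFFDF (r=31, g=62, b=31) */
    printf("%04X\n", rgb15to16_one(0x7FFF));
    return 0;
}
#endif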
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    mm_end = end - 15;
#if 1 // faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it is slightly faster)
    asm volatile(
        "movq %3, %%mm5            \n\t"
        "movq %4, %%mm6            \n\t"
        "movq %5, %%mm7            \n\t"
        ".balign 16                \n\t"
        "1:                        \n\t"
        PREFETCH" 32(%1)           \n\t"
        "movd (%1), %%mm0          \n\t"
        "movd 4(%1), %%mm3         \n\t"
        "punpckldq 8(%1), %%mm0    \n\t"
        "punpckldq 12(%1), %%mm3   \n\t"
        "movq %%mm0, %%mm1         \n\t"
        "movq %%mm3, %%mm4         \n\t"
        "pand %%mm6, %%mm0         \n\t"
        "pand %%mm6, %%mm3         \n\t"
        "pmaddwd %%mm7, %%mm0      \n\t"
        "pmaddwd %%mm7, %%mm3      \n\t"
        "pand %%mm5, %%mm1         \n\t"
        "pand %%mm5, %%mm4         \n\t"
        "por %%mm1, %%mm0          \n\t"
        "por %%mm4, %%mm3          \n\t"
        "psrld $5, %%mm0           \n\t"
        "pslld $11, %%mm3          \n\t"
        "por %%mm3, %%mm0          \n\t"
        MOVNTQ" %%mm0, (%0)        \n\t"
        "addl $16, %1              \n\t"
        "addl $8, %0               \n\t"
        "cmpl %2, %1               \n\t"
        " jb 1b                    \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
#else
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        /* read one whole 32-bit pixel; the original read a single byte here,
           which zeroed the green and red fields */
        const int src= *((uint32_t *)s); s += 4;
        *d++ = ((src&0xFF)>>3) + ((src&0xFC00)>>5) + ((src&0xF80000)>>8);
//      *d++ = ((src>>3)&0x1F) + ((src>>5)&0x7E0) + ((src>>8)&0xF800);
    }
}

static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psllq $8, %%mm0\n\t"
            "psllq $8, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $19, %%mm2\n\t"
            "psrlq $19, %%mm5\n\t"
            "pand %2, %%mm2\n\t"
            "pand %2, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        /* read one whole 32-bit pixel (see the note in rgb32to16 above) */
        const int src= *((uint32_t *)s); s += 4;
        *d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19);
    }
}
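/*
 * Editor's note (not part of the original file): a scalar model of the
 * pmaddwd fast path in rgb32to16 above. The per-pixel constants assumed here
 * (mask3216br = 0x00F800F8, mask3216g = 0x0000FC00, mul3216 packing the word
 * multipliers 4 and 0x2000) live in the including file and are an assumption,
 * derived from the shift counts in the asm. One multiply-accumulate shifts
 * blue left by 2 and red left by 13 at once; OR-ing in masked green and
 * shifting right by 5 yields RGB565. The helper name is hypothetical.
 */
#if 0 /* illustrative sketch only */
#include <stdint.h>

static uint16_t rgb32to16_one(uint32_t p)   /* p = 0x00RRGGBB */
{
    uint32_t br  = p & 0x00F800F8;          /* top 5 bits of B (low word) and R (high word) */
    uint32_t g   = p & 0x0000FC00;          /* top 6 bits of G, still at bits 10..15 */
    uint32_t acc = (br & 0xFFFF) * 4        /* pmaddwd: B*4 = B<<2 ...      */
                 + (br >> 16) * 0x2000;     /* ... plus R*0x2000 = R<<13    */
    return (uint16_t)((acc | g) >> 5);      /* -> RRRRRGGGGGGBBBBB          */
}
#endif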
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    mm_end = end - 15;
#if 1 // faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it is slightly faster)
    asm volatile(
        "movq %3, %%mm5            \n\t"
        "movq %4, %%mm6            \n\t"
        "movq %5, %%mm7            \n\t"
        ".balign 16                \n\t"
        "1:                        \n\t"
        PREFETCH" 32(%1)           \n\t"
        "movd (%1), %%mm0          \n\t"
        "movd 4(%1), %%mm3         \n\t"
        "punpckldq 8(%1), %%mm0    \n\t"
        "punpckldq 12(%1), %%mm3   \n\t"
        "movq %%mm0, %%mm1         \n\t"
        "movq %%mm3, %%mm4         \n\t"
        "pand %%mm6, %%mm0         \n\t"
        "pand %%mm6, %%mm3         \n\t"
        "pmaddwd %%mm7, %%mm0      \n\t"
        "pmaddwd %%mm7, %%mm3      \n\t"
        "pand %%mm5, %%mm1         \n\t"
        "pand %%mm5, %%mm4         \n\t"
        "por %%mm1, %%mm0          \n\t"
        "por %%mm4, %%mm3          \n\t"
        "psrld $6, %%mm0           \n\t"
        "pslld $10, %%mm3          \n\t"
        "por %%mm3, %%mm0          \n\t"
        MOVNTQ" %%mm0, (%0)        \n\t"
        "addl $16, %1              \n\t"
        "addl $8, %0               \n\t"
        "cmpl %2, %1               \n\t"
        " jb 1b                    \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
#else
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"