xmmx.c
来自「linux下的MPEG1」· C语言 代码 · 共 406 行
C
406 行
#include "config.h"

#ifdef HAVE_MMX

/* Define STRICT_COMPAT to get exactly the same result as the C version of
 * the zoom filter (slightly slower, but the visual difference is barely
 * noticeable).
 */
/* #define STRICT_COMPAT */

#define BUFFPOINTNB 16
#define BUFFPOINTMASK 0xffff
#define BUFFINCR 0xff

#define sqrtperte 16
/* a % sqrtperte  <=>  a & PERTEMASK */
#define PERTEMASK 0xf
/* a / sqrtperte  <=>  a >> PERTEDEC */
#define PERTEDEC 4

/*#define MMX_TRACE*/
#include "mmx.h"
/*#include "xmmx.h"*/
#include "goom_graphic.h"

/*
 * Report whether the routines in this file may be used on the running CPU.
 *
 * Returns 0 unconditionally when built for ARCH_X86_64 (zoom_filter_xmmx has
 * not been converted to 64-bit index registers), otherwise bit 3 of the
 * value returned by mm_support(), normalised to 0 or 1.
 *
 * NOTE(review): bit 3 is presumably the "extended MMX" capability flag
 * (pmulhuw is used below) -- confirm against mm_support().
 */
int xmmx_supported (void)
{
#ifdef ARCH_X86_64
    return 0; /* Haven't yet converted zoom_filter_xmmx to support 64-bit
               * memory index registers (rsi,rax) */
#else
    return (mm_support() & 0x8) >> 3;
#endif
}

/*
 * Zoom filter: for every pixel of the destination buffer, interpolate a
 * source position between two transformation buffers, fetch the 2x2 block of
 * source pixels at that position and blend them with four precomputed
 * coefficients.
 *
 *   prevX, prevY  buffer width and height in pixels (prevX is the row stride)
 *   expix1        source pixels (read); its four corner pixels are cleared
 *   expix2        destination pixels (one written per iteration)
 *   lbruS, lbruD  source / destination transformation buffers; read as one
 *                 64-bit entry (two 32-bit coordinates) per pixel
 *   buffratio     interpolation factor between lbruS and lbruD, used as a
 *                 16-bit fixed-point fraction
 *   precalCoef    coefficient table indexed by the low 4 bits of each
 *                 coordinate; every entry packs four 8-bit coefficients
 *
 * The body is compiled out on ARCH_X86_64.  The separate asm statements
 * communicate through MMX registers (mm0-mm7), so their relative order must
 * not be changed.
 */
void zoom_filter_xmmx (int prevX, int prevY,
                       Pixel *expix1, Pixel *expix2,
                       int *lbruS, int *lbruD, int buffratio,
                       int precalCoef[16][16])
{
#ifndef ARCH_X86_64
    int bufsize = prevX * prevY;    /* size of the buffer, in pixels */
    volatile int loop;              /* loop counter */

    mmx_t *brutS = (mmx_t*)lbruS;   /* source transformation buffer */
    mmx_t *brutD = (mmx_t*)lbruD;   /* destination transformation buffer */

    volatile mmx_t prevXY;
    volatile mmx_t ratiox;
    /* volatile mmx_t interpix; */

    /* Clear the four corner pixels of the source image: clipped coordinates
     * are forced to 0 below, so out-of-range lookups land on these pixels. */
    expix1[0].val = expix1[prevX-1].val = expix1[prevX*prevY-1].val
                  = expix1[prevX*prevY-prevX].val = 0;

    prevXY.ud[0] = (prevX-1) << PERTEDEC;
    prevXY.ud[1] = (prevY-1) << PERTEDEC;

    ratiox.d[0] = buffratio;
    ratiox.d[1] = buffratio;

    __asm__ __volatile__
        ("\n\t movq %0, %%mm6"
         "\n\t pslld $16, %%mm6" /* mm6 = [rat16=buffratio<<16 | rat16=buffratio<<16] */
         "\n\t pxor %%mm7, %%mm7" /* mm7 = 0 */
         ::"m"(ratiox));

    loop = 0;

    /*
     * NOTE: mm6 and mm7 are not modified inside the loop.
     */
    while (loop < bufsize)
    {
        /* Thread #1
         * pre  : mm6 = [rat16|rat16]
         * post : mm0 = S + ((D-S)*rat16), format [X|Y]
         * modified = mm0,mm1,mm2
         *
         * The operands are 64-bit values consumed by movq, so they must be
         * memory operands ("m"); the more permissive "g" would also allow
         * general registers or immediates, which movq cannot take here.
         */
        __asm__ __volatile__
            ("#1 \n\t movq %0, %%mm0"
             "#1 \n\t movq %1, %%mm1"
             "#1 \n\t psubd %%mm0, %%mm1" /* mm1 = D - S */
             "#1 \n\t movq %%mm1, %%mm2" /* mm2 = D - S */
             "#1 \n\t pslld $16, %%mm1"
             "#1 \n\t pmullw %%mm6, %%mm2"
             "#1 \n\t pmulhuw %%mm6, %%mm1"

             "#1 \n\t pslld $16, %%mm0"
             "#1 \n\t paddd %%mm2, %%mm1" /* mm1 = (D - S) * buffratio >> 16 */

             "#1 \n\t paddd %%mm1, %%mm0" /* mm0 = S + mm1 */
             "#1 \n\t psrld $16, %%mm0"
             :
             : "m"(brutS[loop])
             , "m"(brutD[loop])
            );

        /*
         * pre      : mm0 : position vector on screen
         *            prevXY : coordinate of the lower-right point on screen
         * post     : clipped mm0
         * modified : mm0,mm1,mm2
         */
        __asm__ __volatile__
            ("#1 \n\t movq %0, %%mm1"
             "#1 \n\t pcmpgtd %%mm0, %%mm1"
             /* each 32-bit lane of mm1 now holds:
              *   all ones  if prevXY >  position
              *   all zeros if prevXY <= position
              */
#ifdef STRICT_COMPAT
             /* clear both coordinates as soon as one of them is out of range */
             "#1 \n\t movq %%mm1, %%mm2"
             "#1 \n\t punpckhdq %%mm2, %%mm2"
             "#1 \n\t punpckldq %%mm1, %%mm1"
             "#1 \n\t pand %%mm2, %%mm0"
#endif
             "#1 \n\t pand %%mm1, %%mm0" /* zero the part that is out of range */
             ::"m"(prevXY));

        /* Thread #2
         * pre   : mm0 : clipped position on screen
         *
         * post  : mm3 : coefs for this position, i.e. the 32-bit entry
         *               precalCoef[low dword & 15][high dword & 15]
         *         mm1 : high dword of the position moved to the low dword
         *
         * modif : eax,esi
         *
         * The table is read through a raw pointer, hence the "memory"
         * clobber.
         */
        __asm__ __volatile__ (
            "#2 \n\t movd %%mm0,%%esi"
            "#2 \n\t movq %%mm0,%%mm1"

            "#2 \n\t andl $15,%%esi"
            "#2 \n\t psrlq $32,%%mm1"

            "#2 \n\t shll $6,%%esi"
            "#2 \n\t movd %%mm1,%%eax"

            "#2 \n\t addl %0,%%esi"
            "#2 \n\t andl $15,%%eax"

            "#2 \n\t movd (%%esi,%%eax,4),%%mm3"
            ::"g"(precalCoef):"eax","esi","memory");

        /*
         * Coefficient extraction (Thread #3)
         *
         * pre   : coefs in mm3
         *
         * post  : coefs expanded in mm3 (c1 & c2)
         *         and mm4 (c3 & c4)
         *
         * modif : mm5
         */

        /* (Thread #4)
         * pre   : mm0 : low dword = column coordinate (fixed point)
         *         mm1 : low dword = row coordinate (fixed point)
         *
         * post  : mm0 : the two pixels at expix1[row*prevX + column]
         *         mm2 : the two pixels one row below (index + prevX)
         *
         * modif : eax, edx, esi
         *
         * "mull" writes its 64-bit product to EDX:EAX, so EDX has to be
         * listed as clobbered; otherwise the compiler may keep a live value
         * (or one of the operands) in it.  The multiplier uses "rm" because
         * mul has no immediate form.  expix1 is read through a raw pointer,
         * hence the "memory" clobber.
         */
        __asm__ __volatile__ (
            "#2 \n\t psrld $4, %%mm0"
            "#2 \n\t psrld $4, %%mm1" /* PERTEDEC = $4 */

            "#4 \n\t movd %%mm1,%%eax"
            "#3 \n\t movq %%mm3,%%mm5"

            "#4 \n\t mull %1"
            "#4 \n\t movd %%mm0,%%esi"

            "#3 \n\t punpcklbw %%mm5, %%mm3"
            "#4 \n\t addl %%esi, %%eax"

            "#3 \n\t movq %%mm3, %%mm4"
            "#3 \n\t movq %%mm3, %%mm5"

            "#4 \n\t movl %0, %%esi"
            "#3 \n\t punpcklbw %%mm5, %%mm3"

            "#4 \n\t movq (%%esi,%%eax,4),%%mm0"
            "#3 \n\t punpckhbw %%mm5, %%mm4"

            "#4 \n\t addl %1,%%eax"
            "#4 \n\t movq (%%esi,%%eax,4),%%mm2"

            :
            : "g"(expix1)
            , "rm"(prevX)
            : "eax","edx","esi","memory"
            );

        /*
         * pre : mm0 : expix1[position]
         *       mm2 : expix1[position+width]
         *       mm3 & mm4 : coefs
         */

        /* copy the first two pixels into mm0 and mm1 */
        movq_r2r (mm0, mm1);            /* b1-g1-r1-a1-b2-g2-r2-a2 */

        /* unpack the first pixel */
        punpcklbw_r2r (mm7, mm0);       /* 00-b2-00-g2-00-r2-00-a2 */

        /* coefficient extraction... */
        movq_r2r (mm3, mm5);            /* c2-c2-c2-c2-c1-c1-c1-c1 */

        /* ...while, in parallel, unpacking the second pixel */
        punpckhbw_r2r (mm7, mm1);       /* 00-b1-00-g1-00-r1-00-a1 */

        punpcklbw_r2r (mm7, mm5);       /* 00-c1-00-c1-00-c1-00-c1 */
        punpckhbw_r2r (mm7, mm3);       /* 00-c2-00-c2-00-c2-00-c2 */

        /* multiply the pixels by the coefficients */
        pmullw_r2r (mm5, mm0);          /* c1*b2-c1*g2-c1*r2-c1*a2 */
        pmullw_r2r (mm3, mm1);          /* c2*b1-c2*g1-c2*r1-c2*a1 */
        paddw_r2r (mm1, mm0);

        /* ...extraction of the last two coefficients */
        movq_r2r (mm4, mm5);            /* c4-c4-c4-c4-c3-c3-c3-c3 */
        punpcklbw_r2r (mm7, mm4);       /* 00-c3-00-c3-00-c3-00-c3 */
        punpckhbw_r2r (mm7, mm5);       /* 00-c4-00-c4-00-c4-00-c4 */

        /* fetch the last two pixels */
        movq_r2r (mm2, mm1);

        /* unpack the pixels */
        punpcklbw_r2r (mm7, mm1);
        punpckhbw_r2r (mm7, mm2);

        /* multiply by the coefficients */
        pmullw_r2r (mm4, mm1);
        pmullw_r2r (mm5, mm2);

        /* add the products to the running total */
        paddw_r2r (mm1, mm0);
        paddw_r2r (mm2, mm0);

        /* divide by 256 = 16+16+16+16, then repack the final pixel */
        psrlw_i2r (8, mm0);
        packuswb_r2r (mm7, mm0);

        movd_r2m (mm0, expix2[loop]);

        ++loop;
    }
/*#ifdef HAVE_ATHLON*/
    /* leave MMX state so that x87 floating point can be used again */
    __asm__ __volatile__ ("emms\n");
/*#else
    emms();
#endif*/
#endif /* ARCH_X86_64 */
}

/*
 * Additive plot: load the pixel at _backbuf, add _col to it with the
 * paddusb macro from mmx.h, and store the result to _out.
 */
#define DRAWMETHOD_PLUS_XMMX(_out,_backbuf,_col) \
{ \
    movd_m2r(_backbuf, mm0); \
    paddusb_m2r(_col, mm0); \
    movd_r2m(mm0, _out); \
}

#define DRAWMETHOD DRAWMETHOD_PLUS_XMMX(*p,*p,col)

/*
 * Draw a line from (x1,y1) to (x2,y2) into `data`, a screenx * screeny pixel
 * buffer with a row stride of screenx, adding `col` to every pixel touched
 * (see DRAWMETHOD).
 *
 * The line is not clipped: if either end point lies outside the buffer,
 * nothing is drawn.  Diagonal lines are stepped in 16.16 fixed point along
 * their major axis.
 */
void draw_line_xmmx (Pixel *data, int x1, int y1, int x2, int y2, int col, int screenx, int screeny)
{
    int x, y, dx, dy, yy, xx;
    Pixel *p;

    if ((y1 < 0) || (y2 < 0) || (x1 < 0) || (x2 < 0)
        || (y1 >= screeny) || (y2 >= screeny)
        || (x1 >= screenx) || (x2 >= screenx))
        goto end_of_line;

    dx = x2 - x1;
    dy = y2 - y1;
    /* order the end points so that x1 <= x2 */
    if (x1 >= x2) {
        int tmp;

        tmp = x1;
        x1 = x2;
        x2 = tmp;
        tmp = y1;
        y1 = y2;
        y2 = tmp;
        dx = x2 - x1;
        dy = y2 - y1;
    }

    /* vertical line */
    if (dx == 0) {
        if (y1 < y2) {
            p = &(data[(screenx * y1) + x1]);
            for (y = y1; y <= y2; y++) {
                DRAWMETHOD;
                p += screenx;
            }
        }
        else {
            p = &(data[(screenx * y2) + x1]);
            for (y = y2; y <= y1; y++) {
                DRAWMETHOD;
                p += screenx;
            }
        }
        goto end_of_line;
    }
    /* horizontal line */
    if (dy == 0) {
        if (x1 < x2) {
            p = &(data[(screenx * y1) + x1]);
            for (x = x1; x <= x2; x++) {
                DRAWMETHOD;
                p++;
            }
            goto end_of_line;
        }
        else {
            /* not reachable: x1 == x2 was handled as a vertical line above */
            p = &(data[(screenx * y1) + x2]);
            for (x = x2; x <= x1; x++) {
                DRAWMETHOD;
                p++;
            }
            goto end_of_line;
        }
    }
    /* 1     */
    /*  \    */
    /*   \   */
    /*    2  */
    if (y2 > y1) {
        /* steep */
        if (dy > dx) {
            dx = ((dx << 16) / dy);
            x = x1 << 16;
            for (y = y1; y <= y2; y++) {
                xx = x >> 16;
                p = &(data[(screenx * y) + xx]);
                DRAWMETHOD;
                /* the second plot is disabled, so this adjustment of p has
                 * no effect (p is recomputed on the next iteration) */
                if (xx < (screenx - 1)) {
                    p++;
                    /* DRAWMETHOD; */
                }
                x += dx;
            }
            goto end_of_line;
        }
        /* shallow */
        else {
            dy = ((dy << 16) / dx);
            y = y1 << 16;
            for (x = x1; x <= x2; x++) {
                yy = y >> 16;
                p = &(data[(screenx * yy) + x]);
                DRAWMETHOD;
                /* NOTE(review): no effect while the second plot is disabled;
                 * the step is screeny although rows are screenx pixels wide
                 * -- check before re-enabling it */
                if (yy < (screeny - 1)) {
                    p += screeny;
                    /* DRAWMETHOD; */
                }
                y += dy;
            }
        }
    }
    /*    2  */
    /*   /   */
    /*  /    */
    /* 1     */
    else {
        /* steep */
        if (-dy > dx) {
            dx = ((dx << 16) / -dy);
            x = (x1 + 1) << 16;
            for (y = y1; y >= y2; y--) {
                xx = x >> 16;
                p = &(data[(screenx * y) + xx]);
                DRAWMETHOD;
                /* no effect while the second plot is disabled */
                if (xx < (screenx - 1)) {
                    p--;
                    /* DRAWMETHOD; */
                }
                x += dx;
            }
            goto end_of_line;
        }
        /* shallow */
        else {
            dy = ((dy << 16) / dx);
            y = y1 << 16;
            for (x = x1; x <= x2; x++) {
                yy = y >> 16;
                p = &(data[(screenx * yy) + x]);
                DRAWMETHOD;
                /* NOTE(review): no effect while the second plot is disabled;
                 * the step is screeny although rows are screenx pixels wide
                 * -- check before re-enabling it */
                if (yy < (screeny - 1)) {
                    p += screeny;
                    /* DRAWMETHOD; */
                }
                y += dy;
            }
            goto end_of_line;
        }
    }
end_of_line:
    /* leave MMX state so that x87 floating point can be used again */
    __asm__ __volatile__ ("emms\n");
}

#endif
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?