📄 mmxmotio.c
字号:
/* while (vSize > 0) { out[0] = (in[0] + in[hdim+0] + 1) >> 1; out[1] = (in[1] + in[hdim+1] + 1) >> 1; out[2] = (in[2] + in[hdim+2] + 1) >> 1; out[3] = (in[3] + in[hdim+3] + 1) >> 1; in += hdim; out += hdim; --vSize; } return;*/}static void mc16pels2DInterpolMMX( PIXEL const *inpix, PIXEL *outpix, int hdim, int vSize ){__asm { xor ebx, ebx mov edx, hdim ;//one row ahead mov ecx, vSize mov esi, inpix mov edi, outpix pxor mm0, mm0 ;// mm0 = 0 movq mm7, const_0x0002000200020002mc16pels2DInterpolMMX_loop: movq mm1, [esi + ebx] ;//00 movq mm6, [esi + ebx +8] ;//05 movq mm5, mm1 ;//01 movq mm2, mm1 ;//02 psrlq mm5, 8 ;//shift one byte ;//06 psllq mm6, 56 ;//shift 7 bytes, LSB is in top pos of mm4 ;//07 punpcklbw mm1, mm0 ;//expand lower 4 pix ;//03 pxor mm5, mm6 ; ;//08 punpckhbw mm2, mm0 ;//expand higher 4 pix ;//04 movq mm6, mm5 ;//09 punpcklbw mm5, mm0 ;//expand lower 4 pix ;//10 paddw mm1, mm7 ;//add 2 ;//12 punpckhbw mm6, mm0 ;//expand higher 4 pix ;//11 paddw mm2, mm7 ;//add 2 ;//12 paddw mm1, mm5 ;//13 movq mm3, [esi + edx] ;//15 paddw mm2, mm6 ;//14 movq mm5, mm3 ;//16 movq mm4, mm3 ;//18 punpcklbw mm3, mm0 ;//expand lower 4 pix ;//18 movq mm6, [esi + edx + 8] ;//22 psrlq mm5, 8 ;//shift one byte ;//23 punpckhbw mm4, mm0 ;//expand higher 4 pix ;//19 psllq mm6, 56 ;//shift 7 bytes, LSB is in top pos of mm4 ;//24 paddw mm1, mm3 ;//20 pxor mm5, mm6 ; ;//25 paddw mm2, mm4 ;//21 movq mm6, mm5 ;//26 punpcklbw mm5, mm0 ;//expand lower 4 pix ;//27 punpckhbw mm6, mm0 ;//expand higher 4 pix ;//28 paddw mm1, mm5 ;//29 paddw mm2, mm6 ;//30 psrlw mm1, 2 ;//31 psrlw mm2, 2 ;//32 movq mm6, [esi + ebx + 16] ;//40 packuswb mm1, mm2 ;//33 psllq mm6, 56 ;//shift 7 bytes, LSB is in top pos of mm4 ;//42 movq [edi + ebx], mm1 ;//34 //-----------------//--------------//---------------- movq mm1, [esi + ebx + 8] ;//35 movq mm5, mm1 ;//36 movq mm2, mm1 ;//37 psrlq mm5, 8 ;//shift one byte ;//41 punpcklbw mm1, mm0 ;//expand lower 4 pix ;//38 pxor mm5, mm6 ; ;//43 punpckhbw mm2, mm0 ;//expand higher 4 pix ;//39 movq mm6, mm5 ;//44 punpcklbw mm5, mm0 ;//expand lower 4 pix ;//45 punpckhbw mm6, mm0 ;//expand higher 4 pix ;//46 paddw mm1, mm7 ;//add 2 ;//47 paddw mm2, mm7 ;//add 2 ;//48 paddw mm1, mm5 ;//49 paddw mm2, mm6 ;//50 movq mm3, [esi + edx + 8] ;//51 movq mm6, [esi + edx + 16] ;//58 movq mm5, mm3 ;//52 psllq mm6, 56 ;//shift 7 bytes, LSB is in top pos of mm4 ;//60 movq mm4, mm3 ;//53 psrlq mm5, 8 ;//shift one byte ;//59 punpcklbw mm3, mm0 ;//expand lower 4 pix ;//54 pxor mm5, mm6 ; ;//61 punpckhbw mm4, mm0 ;//expand higher 4 pix ;//55 paddw mm1, mm3 ;//56 paddw mm2, mm4 ;//57 movq mm6, mm5 ;//62 punpcklbw mm5, mm0 ;//expand lower 4 pix ;//63 punpckhbw mm6, mm0 ;//expand higher 4 pix ;//64 paddw mm1, mm5 ;//65 paddw mm2, mm6 ;//66 psrlw mm1, 2 ;//67 psrlw mm2, 2 ;//68 add edx, hdim packuswb mm1, mm2 ;//69 movq [edi + ebx + 8], mm1 ;//70 add ebx, hdim dec ecx jg mc16pels2DInterpolMMX_loop emms }/* while (vSize > 0) { out[0] = (in[0] + in[1] + in[hdim+0] + in[hdim+1] + 2) >> 2; out[1] = (in[1] + in[2] + in[hdim+1] + in[hdim+2] + 2) >> 2; out[2] = (in[2] + in[3] + in[hdim+2] + in[hdim+3] + 2) >> 2; out[3] = (in[3] + in[4] + in[hdim+3] + in[hdim+4] + 2) >> 2; out[4] = (in[4] + in[5] + in[hdim+4] + in[hdim+5] + 2) >> 2; out[5] = (in[5] + in[6] + in[hdim+5] + in[hdim+6] + 2) >> 2; out[6] = (in[6] + in[7] + in[hdim+6] + in[hdim+7] + 2) >> 2; out[7] = (in[7] + in[8] + in[hdim+7] + in[hdim+8] + 2) >> 2; out[8] = (in[8] + in[9] + in[hdim+8] + in[hdim+9] + 2) >> 2; out[9] = (in[9] + in[10] + in[hdim+9] + in[hdim+10] + 2) >> 2; out[10] = (in[10] + in[11] + in[hdim+10] + in[hdim+11] + 2) >> 2; out[11] = (in[11] + in[12] + in[hdim+11] + in[hdim+12] + 2) >> 2; out[12] = (in[12] + in[13] + in[hdim+12] + in[hdim+13] + 2) >> 2; out[13] = (in[13] + in[14] + in[hdim+13] + in[hdim+14] + 2) >> 2; out[14] = (in[14] + in[15] + in[hdim+14] + in[hdim+15] + 2) >> 2; out[15] = (in[15] + in[16] + in[hdim+15] + in[hdim+16] + 2) >> 2; in += hdim; out += hdim; --vSize; } return;*/}static void mc8pels2DInterpolMMX( PIXEL const *inpix, PIXEL *outpix, int hdim, int vSize ){__asm { xor ebx, ebx mov edx, hdim ;//one row ahead mov ecx, vSize mov esi, inpix mov edi, outpix pxor mm0, mm0 ;// mm0 = 0 movq mm7, const_0x0002000200020002mc8pels2DInterpolMMX_loop: movq mm1, [esi + ebx] movq mm6, [esi + ebx +8] movq mm5, mm1 movq mm2, mm1 psrlq mm5, 8 ;//shift one byte punpcklbw mm1, mm0 ;//expand lower 4 pix psllq mm6, 56 ;//shift 7 bytes, LSB is in top pos of mm4 punpckhbw mm2, mm0 ;//expand higher 4 pix pxor mm5, mm6 ; paddw mm1, mm7 ;//add 2 movq mm6, mm5 punpcklbw mm5, mm0 ;//expand lower 4 pix paddw mm2, mm7 ;//add 2 punpckhbw mm6, mm0 ;//expand higher 4 pix paddw mm1, mm5 paddw mm2, mm6 movq mm3, [esi + edx] movq mm6, [esi + edx + 8] movq mm5, mm3 movq mm4, mm3 punpcklbw mm3, mm0 ;//expand lower 4 pix psrlq mm5, 8 ;//shift one byte punpckhbw mm4, mm0 ;//expand higher 4 pix psllq mm6, 56 ;//shift 7 bytes, LSB is in top pos of mm4 paddw mm1, mm3 pxor mm5, mm6 ; paddw mm2, mm4 movq mm6, mm5 punpcklbw mm5, mm0 ;//expand lower 4 pix punpckhbw mm6, mm0 ;//expand higher 4 pix paddw mm1, mm5 paddw mm2, mm6 psrlw mm1, 2 psrlw mm2, 2 add edx, hdim packuswb mm1, mm2 movq [edi + ebx], mm1 add ebx, hdim dec ecx jg mc8pels2DInterpolMMX_loop emms }/* while (vSize > 0) { out[0] = (in[0] + in[1] + in[hdim+0] + in[hdim+1] + 2) >> 2; out[1] = (in[1] + in[2] + in[hdim+1] + in[hdim+2] + 2) >> 2; out[2] = (in[2] + in[3] + in[hdim+2] + in[hdim+3] + 2) >> 2; out[3] = (in[3] + in[4] + in[hdim+3] + in[hdim+4] + 2) >> 2; out[4] = (in[4] + in[5] + in[hdim+4] + in[hdim+5] + 2) >> 2; out[5] = (in[5] + in[6] + in[hdim+5] + in[hdim+6] + 2) >> 2; out[6] = (in[6] + in[7] + in[hdim+6] + in[hdim+7] + 2) >> 2; out[7] = (in[7] + in[8] + in[hdim+7] + in[hdim+8] + 2) >> 2; in += hdim; out += hdim; --vSize; } return;*/}static void mc4pels2DInterpolMMX( PIXEL const *inpix, PIXEL *outpix, int hdim, int vSize ){__asm { xor ebx, ebx mov edx, hdim ;//one row ahead mov ecx, vSize mov esi, inpix mov edi, outpix pxor mm0, mm0 ;// mm0 = 0 movq mm7, const_0x0002000200020002mc4pels2DInterpolMMX_loop: movq mm1, [esi + ebx] movq mm5, mm1 punpcklbw mm1, mm0 ;//expand lower 4 pix psrlq mm5, 8 ;//shift one byte punpcklbw mm5, mm0 ;//expand lower 4 pix paddw mm1, mm7 ;//add 2 paddw mm1, mm5 movq mm3, [esi + edx] movq mm5, mm3 punpcklbw mm3, mm0 ;//expand lower 4 pix paddw mm1, mm3 psrlq mm5, 8 ;//shift one byte punpcklbw mm5, mm0 ;//expand lower 4 pix paddw mm1, mm5 psrlw mm1, 2 packuswb mm1, mm0 movd [edi + ebx], mm1 add ebx, hdim add edx, hdim dec ecx jg mc4pels2DInterpolMMX_loop emms }/* while (vSize > 0) { out[0] = (in[0] + in[1] + in[hdim+0] + in[hdim+1] + 2) >> 2; out[1] = (in[1] + in[2] + in[hdim+1] + in[hdim+2] + 2) >> 2; out[2] = (in[2] + in[3] + in[hdim+2] + in[hdim+3] + 2) >> 2; out[3] = (in[3] + in[4] + in[hdim+3] + in[hdim+4] + 2) >> 2; in += hdim; out += hdim; --vSize; } return;*/}// mc - Perform motion compensation for a hSize x vSize blockvoid mcMMX( int hSize, int vSize, PIXEL *in, PIXEL *out, int hdim, int mvX, int mvY // Motion vector ){ int intX, intY, fracX, fracY; intX = mvX >> 1; // Integer part of motion vector intY = mvY >> 1; fracX = mvX & 0x1; // Fractional part of motion vector fracY = mvY & 0x1; in += intX + intY * hdim; if (hSize != 16 && hSize != 8 && hSize != 4) { H261ErrMsg("mc -- hSize not supported"); exit(0); }//#define MMX_MC_TEST#ifdef MMX_MC_TEST { PIXEL in_org[17*352]; PIXEL out_org[17*352]; void mc( int hSize, int vSize, PIXEL in[], PIXEL out[], int hdim,int mvX, int mvY); int bError = 0; int row, col; for(row=0; row<vSize+1; row++) { for(col=0; col<hSize+1; col++) { in_org[row*hdim + col] = in[row*hdim + col]; //out_org[row*hdim + col] = out[row*hdim + col]; } }#endif if (fracY == 0) { if (fracX == 0) { // No interpolation if (hSize == 8) { mc8pelsNoInterpolMMX( in, out, hdim, vSize ); } else if (hSize == 16) { mc16pelsNoInterpolMMX( in, out, hdim, vSize ); } else { mc4pelsNoInterpolMMX( in, out, hdim, vSize ); } } else { // Horizontal interpolation if (hSize == 8) { mc8pelsHorInterpolMMX( in, out, hdim, vSize ); } else if (hSize == 16) { mc16pelsHorInterpolMMX( in, out, hdim, vSize ); } else { mc4pelsHorInterpolMMX( in, out, hdim, vSize ); } } } else if (fracX == 0) { // Vertical interpolation if (hSize == 8) { mc8pelsVertInterpolMMX( in, out, hdim, vSize ); } else if (hSize == 16) { mc16pelsVertInterpolMMX( in, out, hdim, vSize ); } else { mc4pelsVertInterpolMMX( in, out, hdim, vSize ); } } else { // Bilinear interpolation if (hSize == 8) { mc8pels2DInterpolMMX( in, out, hdim, vSize ); } else if (hSize == 16) { mc16pels2DInterpolMMX( in, out, hdim, vSize ); } else { mc4pels2DInterpolMMX( in, out, hdim, vSize ); } }#ifdef MMX_MC_TEST mc(hSize,vSize, in_org - intX - intY * hdim, out_org, hdim, mvX, mvY ); for(row=0; row<vSize; row++) { for(col=0; col<hSize; col++) { if(in_org[row*hdim + col] != in[row*hdim + col]) { bError = 1; } if(out_org[row*hdim + col] != out[row*hdim + col]) { bError = 1; } } } }#endif return;}// averageBlock - compute average of two hSize*vSize pixel arraysvoid averageBlockMMX( PIXEL forPred[], int hSize, int vSize, int forOffset, PIXEL backPred[], int backOffset ){ int row, col; int nbPerRow = (hSize>>2) ; //do row size multiples of 4 if(nbPerRow==2) { __asm {//8 mov esi, forPred mov edi, backPred mov ecx, vSize pxor mm0, mm0; // mm0 = 0aver_8: movq mm1, [esi] movq mm3, [edi] movq mm2, mm1 movq mm4, mm3 punpcklbw mm1, mm0 ;// expand lower 4 pix punpckhbw mm2, mm0 ;// expand higher 4 pix punpcklbw mm3, mm0 ;// expand lower 4 pix punpckhbw mm4, mm0 ;// expand higher 4 pix paddw mm1, mm3 paddw mm2, mm4 psrlw mm1, 1 psrlw mm2, 1 packuswb mm1, mm2 add edi, backOffset movq [esi], mm1 add esi, forOffset dec ecx jg aver_8 emms } } else if(nbPerRow>=4) { nbPerRow = 4; __asm {//16 mov esi, forPred mov edi, backPred mov ecx, vSize pxor mm0, mm0; // mm0 = 0aver_16: movq mm1, [esi] ;//00 movq mm5, [esi+8] ;//10 movq mm2, mm1 ;//00 punpcklbw mm1, mm0 ;// expand lower 4 pix ;//00 movq mm6, mm5 ;//10 punpckhbw mm2, mm0 ;// expand higher 4 pix ;//00 movq mm3, [edi] ;//00 punpcklbw mm5, mm0 ;// expand lower 4 pix ;//10 movq mm4, mm3 ;//00 punpckhbw mm6, mm0 ;// expand higher 4 pix ;//10 movq mm7, [edi+8] ;//10 punpcklbw mm3, mm0 ;// expand lower 4 pix ;//00 punpckhbw mm4, mm0 ;// expand higher 4 pix ;//00 paddw mm2, mm4 ;//00 paddw mm1, mm3 ;//00 movq mm4, mm7 ;//10 punpcklbw mm7, mm0 ;// expand lower 4 pix ;//10 punpckhbw mm4, mm0 ;// expand higher 4 pix ;//10 paddw mm5, mm7 ;//10 psrlw mm1, 1 ;//00 paddw mm6, mm4 ;//10 psrlw mm2, 1 ;//00 psrlw mm5, 1 ;//10 packuswb mm1, mm2 ;//00 psrlw mm6, 1 ;//10 packuswb mm5, mm6 ;//10 movq [esi], mm1 ;//00 add edi, backOffset movq [esi+8], mm5 ;//10 add esi, forOffset dec ecx jg aver_16 emms } } else if(nbPerRow==1) { __asm {//4 mov esi, forPred mov edi, backPred mov ecx, vSize pxor mm0, mm0; // mm0 = 0aver_4: movq mm1, [esi] movq mm3, [edi] punpcklbw mm1, mm0 ;// expand lower 4 pix punpcklbw mm3, mm0 ;// expand lower 4 pix paddw mm1, mm3 psrlw mm1, 1 packuswb mm1, mm0 add edi, backOffset movd [esi], mm1 add esi, forOffset dec ecx jg aver_4 emms } } else if(nbPerRow==3) { __asm {//12 mov esi, forPred mov edi, backPred mov ecx, vSize pxor mm0, mm0; // mm0 = 0aver_12: movq mm1, [esi] ;//00 movq mm5, [esi+8] ;//10 movq mm2, mm1 ;//00 punpcklbw mm1, mm0 ;// expand lower 4 pix ;//00 punpckhbw mm2, mm0 ;// expand higher 4 pix ;//00 movq mm3, [edi] ;//00 punpcklbw mm5, mm0 ;// expand lower 4 pix ;//10 movq mm4, mm3 ;//00 punpcklbw mm3, mm0 ;// expand lower 4 pix ;//00 punpckhbw mm4, mm0 ;// expand higher 4 pix ;//00 movq mm7, [edi+8] ;//10 paddw mm2, mm4 ;//00 punpcklbw mm7, mm0 ;// expand lower 4 pix ;//10 paddw mm1, mm3 ;//00 psrlw mm2, 1 ;//00 paddw mm5, mm7 ;//10 psrlw mm1, 1 ;//00 psrlw mm5, 1 ;//10 packuswb mm1, mm2 ;//00 packuswb mm5, mm0 ;//10 movq [esi], mm1 ;//00 add edi, backOffset movd [esi+8], mm5 ;//10 add esi, forOffset dec ecx jg aver_12 emms } } for (row = 0; row < vSize; ++row) { for (col = nbPerRow<<2; col < hSize; ++col) {//from smaller multiple of 4 forPred[col] = (forPred[col] + backPred[col]) >> 1; } forPred += forOffset; backPred += backOffset; }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -