mmxmotio.c

来自「symbian 下的helix player源代码」· C语言 代码 · 共 1,081 行 · 第 1/2 页

C
1,081
字号
/*
    while (vSize > 0) {
        out[0] = (in[0] + in[hdim+0] + 1) >> 1;
        out[1] = (in[1] + in[hdim+1] + 1) >> 1;
        out[2] = (in[2] + in[hdim+2] + 1) >> 1;
        out[3] = (in[3] + in[hdim+3] + 1) >> 1;
        in += hdim;
        out += hdim;
        --vSize;
    }
    return;
*/
}


static void mc16pels2DInterpolMMX( PIXEL const *inpix, PIXEL *outpix, int hdim, int vSize )
{
__asm {
	xor		ebx, ebx
	mov		edx, hdim		;//one row ahead
	mov		ecx, vSize
	mov		esi, inpix
	mov		edi, outpix
	pxor	mm0, mm0	;// mm0 = 0
	movq		mm7, const_0x0002000200020002
mc16pels2DInterpolMMX_loop:
		movq			mm1, [esi + ebx]									;//00
		movq			mm6, [esi + ebx +8]									;//05
		movq			mm5, mm1									;//01
		movq			mm2, mm1									;//02
		psrlq			mm5, 8			;//shift one byte									;//06
		psllq			mm6, 56			;//shift 7 bytes, LSB is in top pos of mm4									;//07
		punpcklbw	mm1, mm0	;//expand lower 4 pix									;//03
		pxor			mm5, mm6	;									;//08
		punpckhbw	mm2, mm0	;//expand higher 4 pix									;//04

		movq			mm6, mm5									;//09
		punpcklbw	mm5, mm0	;//expand lower 4 pix									;//10
		paddw		mm1, mm7		;//add 2									;//12
		punpckhbw	mm6, mm0	;//expand higher 4 pix									;//11

		paddw		mm2, mm7		;//add 2									;//12

		paddw		mm1, mm5									;//13
		movq			mm3, [esi + edx]									;//15
		paddw		mm2, mm6									;//14

		movq			mm5, mm3									;//16
		movq			mm4, mm3									;//18
		punpcklbw	mm3, mm0	;//expand lower 4 pix									;//18
		movq			mm6, [esi + edx + 8]									;//22
		psrlq			mm5, 8			;//shift one byte									;//23
		punpckhbw	mm4, mm0	;//expand higher 4 pix									;//19

		psllq			mm6, 56			;//shift 7 bytes, LSB is in top pos of mm4									;//24
		paddw		mm1, mm3									;//20
		pxor			mm5, mm6	;									;//25
		paddw		mm2, mm4									;//21

		movq			mm6, mm5									;//26
		punpcklbw	mm5, mm0	;//expand lower 4 pix									;//27
		punpckhbw	mm6, mm0	;//expand higher 4 pix									;//28

		paddw		mm1, mm5									;//29
		paddw		mm2, mm6									;//30

		psrlw			mm1, 2									;//31
		psrlw			mm2, 2											;//32
		movq			mm6, [esi + ebx + 16]									;//40
		packuswb	mm1, mm2											;//33

		psllq			mm6, 56			;//shift 7 bytes, LSB is in top pos of mm4								;//42
		movq	[edi + ebx], mm1											;//34

		//-----------------//--------------//----------------
		movq			mm1, [esi + ebx + 8]									;//35
		movq			mm5, mm1									;//36
		movq			mm2, mm1									;//37
		psrlq			mm5, 8			;//shift one byte											;//41
		punpcklbw	mm1, mm0	;//expand lower 4 pix									;//38
		pxor			mm5, mm6	;												;//43
		punpckhbw	mm2, mm0	;//expand higher 4 pix									;//39

		movq			mm6, mm5													;//44
		punpcklbw	mm5, mm0	;//expand lower 4 pix							;//45
		punpckhbw	mm6, mm0	;//expand higher 4 pix							;//46

		paddw		mm1, mm7		;//add 2							;//47
		paddw		mm2, mm7		;//add 2							;//48

		paddw		mm1, mm5							;//49
		paddw		mm2, mm6							;//50

		movq			mm3, [esi + edx + 8]							;//51
		movq			mm6, [esi + edx + 16]							;//58
		movq			mm5, mm3							;//52
		psllq			mm6, 56			;//shift 7 bytes, LSB is in top pos of mm4							;//60
		movq			mm4, mm3							;//53
		psrlq			mm5, 8			;//shift one byte							;//59
		punpcklbw	mm3, mm0	;//expand lower 4 pix							;//54
		pxor			mm5, mm6	;							;//61
		punpckhbw	mm4, mm0	;//expand higher 4 pix							;//55

		paddw		mm1, mm3							;//56
		paddw		mm2, mm4							;//57

		movq			mm6, mm5							;//62
		punpcklbw	mm5, mm0	;//expand lower 4 pix							;//63
		punpckhbw	mm6, mm0	;//expand higher 4 pix							;//64

		paddw		mm1, mm5							;//65
		paddw		mm2, mm6							;//66

		psrlw			mm1, 2										;//67
		psrlw			mm2, 2										;//68
		add		edx, hdim
		packuswb	mm1, mm2							;//69

		movq	[edi + ebx + 8], mm1						;//70


		add		ebx, hdim
		dec		ecx
	jg	mc16pels2DInterpolMMX_loop
	emms		
	}
/*
    while (vSize > 0) {
        out[0] = (in[0] + in[1] + in[hdim+0] + in[hdim+1] + 2) >> 2;
        out[1] = (in[1] + in[2] + in[hdim+1] + in[hdim+2] + 2) >> 2;
        out[2] = (in[2] + in[3] + in[hdim+2] + in[hdim+3] + 2) >> 2;
        out[3] = (in[3] + in[4] + in[hdim+3] + in[hdim+4] + 2) >> 2;
        out[4] = (in[4] + in[5] + in[hdim+4] + in[hdim+5] + 2) >> 2;
        out[5] = (in[5] + in[6] + in[hdim+5] + in[hdim+6] + 2) >> 2;
        out[6] = (in[6] + in[7] + in[hdim+6] + in[hdim+7] + 2) >> 2;
        out[7] = (in[7] + in[8] + in[hdim+7] + in[hdim+8] + 2) >> 2;
        out[8] = (in[8] + in[9] + in[hdim+8] + in[hdim+9] + 2) >> 2;
        out[9] = (in[9] + in[10] + in[hdim+9] + in[hdim+10] + 2) >> 2;
        out[10] = (in[10] + in[11] + in[hdim+10] + in[hdim+11] + 2) >> 2;
        out[11] = (in[11] + in[12] + in[hdim+11] + in[hdim+12] + 2) >> 2;
        out[12] = (in[12] + in[13] + in[hdim+12] + in[hdim+13] + 2) >> 2;
        out[13] = (in[13] + in[14] + in[hdim+13] + in[hdim+14] + 2) >> 2;
        out[14] = (in[14] + in[15] + in[hdim+14] + in[hdim+15] + 2) >> 2;
        out[15] = (in[15] + in[16] + in[hdim+15] + in[hdim+16] + 2) >> 2;
        in += hdim;
        out += hdim;
        --vSize;
    }
    return;
*/
}


static void mc8pels2DInterpolMMX( PIXEL const *inpix, PIXEL *outpix, int hdim, int vSize )
{
__asm {
	xor		ebx, ebx
	mov		edx, hdim		;//one row ahead
	mov		ecx, vSize
	mov		esi, inpix
	mov		edi, outpix
	pxor	mm0, mm0	;// mm0 = 0
	movq		mm7, const_0x0002000200020002
mc8pels2DInterpolMMX_loop:
		movq			mm1, [esi + ebx]
		movq			mm6, [esi + ebx +8]
		movq			mm5, mm1
		movq			mm2, mm1
		psrlq			mm5, 8			;//shift one byte
		punpcklbw	mm1, mm0	;//expand lower 4 pix
		psllq			mm6, 56			;//shift 7 bytes, LSB is in top pos of mm4
		punpckhbw	mm2, mm0	;//expand higher 4 pix

		pxor			mm5, mm6	;
		paddw		mm1, mm7		;//add 2
		movq			mm6, mm5
		punpcklbw	mm5, mm0	;//expand lower 4 pix
		paddw		mm2, mm7		;//add 2
		punpckhbw	mm6, mm0	;//expand higher 4 pix

		paddw		mm1, mm5
		paddw		mm2, mm6

		movq			mm3, [esi + edx]
		movq			mm6, [esi + edx + 8]
		movq			mm5, mm3
		movq			mm4, mm3
		punpcklbw	mm3, mm0	;//expand lower 4 pix
		psrlq			mm5, 8			;//shift one byte
		punpckhbw	mm4, mm0	;//expand higher 4 pix
		psllq			mm6, 56			;//shift 7 bytes, LSB is in top pos of mm4

		paddw		mm1, mm3
		pxor			mm5, mm6	;
		paddw		mm2, mm4

		movq			mm6, mm5
		punpcklbw	mm5, mm0	;//expand lower 4 pix
		punpckhbw	mm6, mm0	;//expand higher 4 pix

		paddw		mm1, mm5
		paddw		mm2, mm6

		psrlw			mm1, 2			
		psrlw			mm2, 2			
		add		edx, hdim
		packuswb	mm1, mm2

		movq	[edi + ebx], mm1
		add		ebx, hdim
		dec		ecx
	jg	mc8pels2DInterpolMMX_loop
	emms		
	}

/*
    while (vSize > 0) {
        out[0] = (in[0] + in[1] + in[hdim+0] + in[hdim+1] + 2) >> 2;
        out[1] = (in[1] + in[2] + in[hdim+1] + in[hdim+2] + 2) >> 2;
        out[2] = (in[2] + in[3] + in[hdim+2] + in[hdim+3] + 2) >> 2;
        out[3] = (in[3] + in[4] + in[hdim+3] + in[hdim+4] + 2) >> 2;
        out[4] = (in[4] + in[5] + in[hdim+4] + in[hdim+5] + 2) >> 2;
        out[5] = (in[5] + in[6] + in[hdim+5] + in[hdim+6] + 2) >> 2;
        out[6] = (in[6] + in[7] + in[hdim+6] + in[hdim+7] + 2) >> 2;
        out[7] = (in[7] + in[8] + in[hdim+7] + in[hdim+8] + 2) >> 2;
        in += hdim;
        out += hdim;
        --vSize;
    }
    return;
*/
}


static void mc4pels2DInterpolMMX( PIXEL const *inpix, PIXEL *outpix, int hdim, int vSize )
{
__asm {
	xor		ebx, ebx
	mov		edx, hdim		;//one row ahead
	mov		ecx, vSize
	mov		esi, inpix
	mov		edi, outpix
	pxor	mm0, mm0	;// mm0 = 0
	movq		mm7, const_0x0002000200020002
mc4pels2DInterpolMMX_loop:
		movq			mm1, [esi + ebx]
		movq			mm5, mm1
		punpcklbw	mm1, mm0	;//expand lower 4 pix

		psrlq			mm5, 8			;//shift one byte
		punpcklbw	mm5, mm0	;//expand lower 4 pix

		paddw		mm1, mm7		;//add 2

		paddw		mm1, mm5

		movq			mm3, [esi + edx]
		movq			mm5, mm3
		punpcklbw	mm3, mm0	;//expand lower 4 pix

		paddw		mm1, mm3

		psrlq			mm5, 8			;//shift one byte
		punpcklbw	mm5, mm0	;//expand lower 4 pix

		paddw		mm1, mm5

		psrlw			mm1, 2			
		packuswb	mm1, mm0

		movd	[edi + ebx], mm1
		add		ebx, hdim
		add		edx, hdim
		dec		ecx
	jg	mc4pels2DInterpolMMX_loop
	emms		
	}





/*
    while (vSize > 0) {
        out[0] = (in[0] + in[1] + in[hdim+0] + in[hdim+1] + 2) >> 2;
        out[1] = (in[1] + in[2] + in[hdim+1] + in[hdim+2] + 2) >> 2;
        out[2] = (in[2] + in[3] + in[hdim+2] + in[hdim+3] + 2) >> 2;
        out[3] = (in[3] + in[4] + in[hdim+3] + in[hdim+4] + 2) >> 2;
        in += hdim;
        out += hdim;
        --vSize;
    }
    return;
*/
}




// mc - Perform motion compensation for a hSize x vSize block
void mcMMX( int hSize, int vSize, 
                    PIXEL *in, PIXEL *out, int hdim,
                    int mvX, int mvY    // Motion vector
                    )
{
    int intX, intY, fracX, fracY;    

    intX = mvX >> 1;    // Integer part of motion vector
    intY = mvY >> 1;
    fracX = mvX & 0x1;  // Fractional part of motion vector
    fracY = mvY & 0x1;
    in += intX + intY * hdim;
    if (hSize != 16  &&  hSize != 8  &&  hSize != 4) {
        H261ErrMsg("mc -- hSize not supported");
        exit(0);
    }

//#define MMX_MC_TEST
#ifdef MMX_MC_TEST
	{
	PIXEL in_org[17*352];
	PIXEL out_org[17*352];
	void mc( int hSize, int vSize, PIXEL in[], PIXEL out[], int hdim,int mvX, int mvY);
	int	bError = 0;

	int row, col;
	for(row=0; row<vSize+1; row++) {
		for(col=0; col<hSize+1; col++) {
			in_org[row*hdim + col] = in[row*hdim + col];
			//out_org[row*hdim + col] = out[row*hdim + col];
		}
	}
#endif


    if (fracY == 0) {
        if (fracX == 0) {
            // No interpolation
            if (hSize == 8) {
                mc8pelsNoInterpolMMX( in, out, hdim, vSize );
            } else if (hSize == 16) {
                mc16pelsNoInterpolMMX( in, out, hdim, vSize );
            } else {
                mc4pelsNoInterpolMMX( in, out, hdim, vSize );
            }
        } else {
            // Horizontal interpolation
            if (hSize == 8) {
                mc8pelsHorInterpolMMX( in, out, hdim, vSize );
            } else if (hSize == 16) {
                mc16pelsHorInterpolMMX( in, out, hdim, vSize );
            } else {
                mc4pelsHorInterpolMMX( in, out, hdim, vSize );
            }
        }
    } else if (fracX == 0) {
        // Vertical interpolation
        if (hSize == 8) {
            mc8pelsVertInterpolMMX( in, out, hdim, vSize );
        } else if (hSize == 16) {
            mc16pelsVertInterpolMMX( in, out, hdim, vSize );
        } else {
            mc4pelsVertInterpolMMX( in, out, hdim, vSize );
        }
    } else {    // Bilinear interpolation
        if (hSize == 8) {
            mc8pels2DInterpolMMX( in, out, hdim, vSize );
        } else if (hSize == 16) {
            mc16pels2DInterpolMMX( in, out, hdim, vSize );
        } else {
            mc4pels2DInterpolMMX( in, out, hdim, vSize );
        }
    }

#ifdef MMX_MC_TEST
	mc(hSize,vSize, in_org - intX - intY * hdim, out_org, hdim, mvX, mvY );
	for(row=0; row<vSize; row++) {
		for(col=0; col<hSize; col++) {
			if(in_org[row*hdim + col] != in[row*hdim + col]) {
				bError = 1;
			}
			if(out_org[row*hdim + col] != out[row*hdim + col]) {
				bError = 1;
			}
		}
	}

	}
#endif

    return;
}



// averageBlock - compute average of two hSize*vSize pixel arrays
void averageBlockMMX( PIXEL forPred[], int hSize, int vSize, int forOffset,
                          PIXEL backPred[], int backOffset )
{
    int row, col;
	int	nbPerRow = (hSize>>2) ;

	//do row size multiples of 4
	if(nbPerRow==2) {
		__asm {//8
		mov				esi, forPred
		mov				edi, backPred
		mov				ecx, vSize
		pxor			mm0, mm0;		// mm0 = 0
aver_8:
			movq			mm1, [esi]		
			movq			mm3, [edi]		
			movq			mm2, mm1
			movq			mm4, mm3
			punpcklbw	mm1, mm0	;// expand lower 4 pix
			punpckhbw	mm2, mm0	;// expand higher 4 pix
			punpcklbw	mm3, mm0	;// expand lower 4 pix
			punpckhbw	mm4, mm0	;// expand higher 4 pix
			paddw			mm1, mm3
			paddw			mm2, mm4
			psrlw			mm1, 1
			psrlw			mm2, 1
			packuswb	mm1, mm2
			add				edi, backOffset
			movq			[esi], mm1
			add				esi, forOffset
			dec				ecx
		jg aver_8
		emms
		}
	} else if(nbPerRow>=4) {
		nbPerRow = 4;
		__asm {//16
		mov				esi, forPred
		mov				edi, backPred
		mov				ecx, vSize
		pxor			mm0, mm0;		// mm0 = 0
aver_16:
			movq			mm1, [esi]	;//00
			movq			mm5, [esi+8]			;//10
			movq			mm2, mm1	;//00
			punpcklbw	mm1, mm0	;// expand lower 4 pix	;//00
			movq			mm6, mm5			;//10
			punpckhbw	mm2, mm0	;// expand higher 4 pix	;//00

			movq			mm3, [edi]			;//00
			punpcklbw	mm5, mm0	;// expand lower 4 pix			;//10
			movq			mm4, mm3	;//00
			punpckhbw	mm6, mm0	;// expand higher 4 pix			;//10
			movq			mm7, [edi+8]					;//10
			punpcklbw	mm3, mm0	;// expand lower 4 pix	;//00
			punpckhbw	mm4, mm0	;// expand higher 4 pix	;//00
			paddw			mm2, mm4	;//00
			paddw			mm1, mm3	;//00
			movq			mm4, mm7			;//10
			punpcklbw	mm7, mm0	;// expand lower 4 pix			;//10
			punpckhbw	mm4, mm0	;// expand higher 4 pix			;//10
			paddw			mm5, mm7			;//10
			psrlw			mm1, 1	;//00
			paddw			mm6, mm4			;//10
			psrlw			mm2, 1	;//00
			psrlw			mm5, 1			;//10
			packuswb	mm1, mm2	;//00
			psrlw			mm6, 1			;//10
			packuswb	mm5, mm6			;//10
			movq			[esi], mm1	;//00
			add				edi, backOffset
			movq			[esi+8], mm5			;//10
			add				esi, forOffset
			dec				ecx
		jg aver_16
		emms
		}
	} else if(nbPerRow==1) {
		__asm {//4
		mov				esi, forPred
		mov				edi, backPred
		mov				ecx, vSize
		pxor			mm0, mm0;		// mm0 = 0
aver_4:
			movq			mm1, [esi]		
			movq			mm3, [edi]		
			punpcklbw	mm1, mm0	;// expand lower 4 pix
			punpcklbw	mm3, mm0	;// expand lower 4 pix
			paddw			mm1, mm3
			psrlw			mm1, 1
			packuswb	mm1, mm0
			add				edi, backOffset
			movd			[esi], mm1
			add				esi, forOffset
			dec				ecx
		jg aver_4
		emms
		}
	} else if(nbPerRow==3) {
		__asm {//12
		mov				esi, forPred
		mov				edi, backPred
		mov				ecx, vSize
		pxor			mm0, mm0;		// mm0 = 0
aver_12:
			movq			mm1, [esi]	;//00
			movq			mm5, [esi+8]			;//10
			movq			mm2, mm1	;//00
			punpcklbw	mm1, mm0	;// expand lower 4 pix	;//00
			punpckhbw	mm2, mm0	;// expand higher 4 pix	;//00
			movq			mm3, [edi]			;//00
			punpcklbw	mm5, mm0	;// expand lower 4 pix			;//10
			movq			mm4, mm3	;//00
			punpcklbw	mm3, mm0	;// expand lower 4 pix	;//00
			punpckhbw	mm4, mm0	;// expand higher 4 pix	;//00
			movq			mm7, [edi+8]					;//10
			paddw			mm2, mm4	;//00
			punpcklbw	mm7, mm0	;// expand lower 4 pix			;//10
			paddw			mm1, mm3	;//00
			psrlw			mm2, 1	;//00
			paddw			mm5, mm7			;//10
			psrlw			mm1, 1	;//00
			psrlw			mm5, 1			;//10
			packuswb	mm1, mm2	;//00
			packuswb	mm5, mm0			;//10
			movq			[esi], mm1	;//00
			add				edi, backOffset
			movd			[esi+8], mm5			;//10
			add				esi, forOffset
			dec				ecx
		jg aver_12
		emms
		}
	}
	
    for (row = 0; row < vSize; ++row) {
        for (col = nbPerRow<<2; col < hSize; ++col) {//from smaller multiple of 4
            forPred[col] = (forPred[col] + backPred[col]) >> 1;
        }
        forPred  += forOffset;
        backPred += backOffset;
    }
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?