// File: min_updated.cpp
//
// Project: SSE2-optimized discrete cosine transform, written with Visual C++
// Language: CPP
// Font size:
//***************************************************************************/
//*
//*                  Copyright (c) 1998-99 Intel Corporation.
//*                         All rights reserved.
//*
//*
//***************************************************************************/
// 
// min_updated.cpp

#include <stdio.h>
#include <stdlib.h>
#include <iostream.h>
#include <xmmintrin.h>
#include "timestamp.h"

// Shared stopwatch; every search routine below brackets its work with
// timer.start() / timer.stop().  CTimeStamp comes from "timestamp.h"
// (not visible here) -- presumably a cycle/tick counter; confirm there.
CTimeStamp	timer;

// 16-bit signed element type used for the data, the lane indices and nVals.
typedef short WORD;
// Result of the most recent timer.stop(), truncated to int by the routine that ran.
int dur;

// One 64-bit MMX value viewed either as four 16-bit lanes (w[0] is the lowest
// lane) or as a single __m64 that can be passed to the intrinsics.
union M64 {
	WORD w[4];	// per-lane access from C++
	__m64 m1;	// whole-register access for MMX/SSE intrinsics
};

// Number of elements searched.  Smaller alternatives are kept for experiments:
//	const WORD nVals = 0x100;	// 256
//	const WORD nVals = 0x1000;	// 4096
	const WORD nVals = 0x4000;	// 16384

// The data being searched; filled with rand() values in main().
__declspec(align(32))
WORD a[nVals];

M64 mins;	// receives the minimum value, broadcast to all four lanes
M64 idxs;	// per-lane candidate indices; also the store target of maskmovq
// All lanes 0x7fff: substituted for the index in lanes that do not hold the
// global minimum, so those lanes lose the final min-reduction over indices.
M64  maxes		= {0x7fff, 0x7fff, 0x7fff, 0x7fff};

// Output of every search routine: index into a[] of the minimum found.
WORD indexOfMin;


// Per-lane step (4) subtracted from the lane indices to move one quad down.
const WORD fours[4] =  { (WORD)4, (WORD)4, (WORD)4, (WORD)4 };

void cmin() {
	WORD tmpMin = a[0];

	indexOfMin = 0;

	timer.start();
	for (int i=1; i<nVals; i++) {
		if (a[i] < tmpMin) {
			indexOfMin = i;
			tmpMin = a[i];
		}
	}
	dur = (int) timer.stop();
}

/*	Finds the index of the smallest value in a[] using MMX registers together
	with the SSE integer instructions pminsw, pshufw, maskmovq and prefetcht0.
	On return indexOfMin holds the index, mins holds the minimum broadcast to
	all four lanes, and dur holds the timer reading.

	The timer's own calls take up a large share of the measured clock ticks.
	To reduce that effect the search is run 1000 times inside one timed region;
	divide dur by 1000 to get the cost of a single pass.

	The array is scanned from its end towards its start, four values (one quad)
	per MMX register.  The unrolled loop consumes two quads per trip and exits
	when ecx reaches 0, so it relies on nVals/4 - 2 being a positive even
	number (true for nVals = 0x4000). */
void xmmmin() {
	timer.start();
	idxs.w[3] = nVals-1;
	idxs.w[2] = nVals-2;
	idxs.w[1] = nVals-3;
	idxs.w[0] = nVals-4;


		__asm {
			push	ebx
			mov		ebx, 0x3e8		// repeat the whole search 1000 times

			push	edi
			movq	mm3, idxs		// keep the starting indices in a register: idxs
									// itself is overwritten with the result below
loop:
			movq	mm4, fours
			lea		edi, idxs		// maskmovq stores through edi
			movzx	ecx, WORD PTR nVals
			sar		ecx, 2			// ecx = number of quads
			lea		edx, a			// load the base before the prefetch so the hint
									// never uses an unset edx on the first pass
			prefetcht0	-72[edx+ecx*8]	// bring two iterations ahead
			movq	mm7, -8[edx+ecx*8]	// running minima start as the last quad
//			movq	mm2, [edi]		// indices
//			movq	mm5, [edi]
			movq	mm2, mm3		// indices for the even stream
			movq	mm5, mm3		// indices for the odd stream
			dec		ecx

			// Peel one quad so the unrolled loop below can handle pairs.
			movq		mm0, -8[edx+ecx*8]
			dec			ecx
			psubw		mm2, mm4	// indices of the quad just loaded
			pminsw		mm7, mm0	// get mins
			pcmpeqw		mm0, mm7	// mask: lanes where this quad supplies the min
			maskmovq	mm2, mm0	// store indices of mins into idxs

			paddw	mm4, mm4		// step becomes 8: each stream skips two quads

loop_top:
/*	Two iterations of the loop are done in parallel.  Using extra registers
	and pairing up the instructions gives close to a two-times speed-up by
	keeping waste in the instruction stream fed to the decoders minimal. */

			movq		mm0, -8[edx+ecx*8]  // load next four numbers
			movq		mm1, -16[edx+ecx*8]	// and the four below them
			psubw		mm2, mm4		// subtract 8 from each index to get next indices
			psubw		mm5, mm4		// same for the other stream
			pminsw		mm7, mm0		// get mins
			pminsw		mm7, mm1
			pcmpeqw		mm0, mm7		// create mask
			pcmpeqw		mm1, mm7		// create mask
			prefetcht0	-96[edx+ecx*8]	// bring 6 iterations ahead
			maskmovq	mm5, mm0		// store indices of mins
			maskmovq	mm2, mm1		// lower quad stored last, so it wins on ties
			sub			ecx, 0x2		// decrement counter by 2
			jnz		loop_top

										// clean up

			movq	mm2, [edi]			// get updated indices

			pshufw	mm0, mm7, 0xe		// min upper 2 with lower 2
			pminsw	mm0, mm7

			pshufw	mm1, mm0, 1			// min upper 1 with lower 1
			pminsw	mm1, mm0
			pshufw	mm1, mm1, 0			// broadcast the minimum to every lane

			movq	mins, mm1			// save the min value

			pcmpeqw	mm7, mm1			// mask: lanes that hold the global minimum
			pand	mm2, mm7			// indices of corresponding mins
			pandn	mm7, maxes			// load the rest with "maxint"
			por		mm7, mm2			// merge

			// Same reduction again, now over indices, to pick the lowest one.
			pshufw	mm0, mm7, 0xe		// min upper 2 with lower 2
			pminsw	mm0, mm7

			pshufw	mm1, mm0, 1			// min upper 1 with lower 1
			pminsw	mm1, mm0

			pshufw	mm1, mm1, 0			// broadcast

			movq	[edi], mm1			// idxs of min values
			mov		ax, [edi]
			mov		indexOfMin, ax		// index of min fixup

			dec		ebx
			jnz		loop

			emms
			// Restore in reverse push order (ebx was pushed first, edi second).
			pop		edi
			pop		ebx
		}
	dur = (int) timer.stop();

//		exit(0);
}

/*	Finds the index of the smallest value in a[] using compare/select built from
	pcmpgtw, pxor, pand, pandn and por in the scan loop (pshufw and pminsw are
	used only in the final reduction).  On return indexOfMin holds the index,
	mins holds the minimum broadcast to all four lanes, and dur holds the timer
	reading.

	The timer's own calls take up a large share of the measured clock ticks.
	To reduce that effect the search is run 1000 times inside one timed region;
	divide dur by 1000 to get the cost of a single pass.

	The array is scanned from its end towards its start, four values (one quad)
	per MMX register.  mm7 carries the per-lane minima and mm6 the index that
	belongs to each of them. */
void mmxmin() {
	timer.start();
	idxs.w[3] = nVals-1;
	idxs.w[2] = nVals-2;
	idxs.w[1] = nVals-3;
	idxs.w[0] = nVals-4;

		__asm {
			push	ebx
			mov		ebx, 0x3e8		// repeat the whole search 1000 times

			push	edi
			movq	mm5, idxs		// starting indices; idxs itself is overwritten
									// with the result below
loop:
			movq	mm4, fours
			lea		edi, idxs
			movzx	ecx, WORD PTR nVals
			sar		ecx, 2			// ecx = number of quads
			lea		edx, a
			movq	mm7, -8[edx+ecx*8]	// running minima start as the last quad
//			movq	mm2, [edi]		// indices
			movq	mm2, mm5		// indices
			movq	mm6, mm5		// indices that go with the initial minima; without
									// this, lanes whose minimum sits in the last quad
									// would report whatever mm6 happened to contain
			psubw	mm2, mm4		// adjust them down
			dec		ecx
			movq	mm0, -8[edx+ecx*8]
loop_top:
			dec		ecx				// sets ZF for the jnz below; the MMX
									// instructions in between leave flags alone

			movq	mm3, mm0		// save new values
			pcmpgtw	mm0, mm7		// 1's keep old mins
			pxor	mm7, mm3		// xor data together
			pand	mm7, mm0		// keep old mins
			pxor	mm7, mm3		// merge: old min where mask set, else new value
			pand	mm6, mm0		// keep old indexes
			pandn	mm0, mm2		// mask new indexes
			por		mm6, mm0		// merge
			// NOTE(review): on the final trip ecx is 0, so this look-ahead load
			// reads the 8 bytes just below a[]; the value is never used.
			movq	mm0, -8[edx+ecx*8]
			psubw	mm2, mm4

			jnz		loop_top

									// clean up

//			movq	mm2, [edi]		// get updated indices
			movq	mm2, mm6

			pshufw	mm0, mm7, 0xe	// min upper 2 with lower 2
			pminsw	mm0, mm7

			pshufw	mm1, mm0, 1		// min upper 1 with lower 1
			pminsw	mm1, mm0

			pshufw	mm1, mm1, 0		// broadcast the minimum to every lane

			movq	mins, mm1		// save the min value

			pcmpeqw	mm7, mm1		// mask: lanes that hold the global minimum
			pand	mm2, mm7		// indices of corresponding mins
			pandn	mm7, maxes		// load the rest with "maxint"
			por		mm7, mm2		// merge

			// Same reduction again, now over indices, to pick the lowest one.
			pshufw	mm0, mm7, 0xe	// min upper 2 with lower 2
			pminsw	mm0, mm7

			pshufw	mm1, mm0, 1		// min upper 1 with lower 1
			pminsw	mm1, mm0

			pshufw	mm1, mm1, 0		// broadcast
			movq	[edi], mm1		// idxs of min values
			mov		ax, [edi]
			mov		indexOfMin, ax	// index of min fixup
			dec		ebx
			jnz		loop

			emms
			// Restore in reverse push order (ebx was pushed first, edi second).
			pop		edi
			pop		ebx
		}

		dur = (int) timer.stop();
//		exit(0);
}


// Intrinsics version of the search: scans a[] from its last quad (four WORDs)
// down to its first, keeping per-lane minima in curMin and letting
// _m_maskmovq write the matching lane indices into idxs.  Afterwards the four
// lanes are reduced to one minimum (stored broadcast in mins) and to the lowest
// index that holds it (stored in indexOfMin).  The timer reading goes to dur;
// unlike the inline-asm versions this one runs a single pass.
void intrmin() {
	const int	nQuads		= nVals / 4;
	__m64*		quads		= (__m64*)a;
	__m64		curMin		= quads[nQuads-1];	// minima start as the last quad
	M64			fours_init	= {4,4,4,4};
	M64			indices		= {nVals-4, nVals-3, nVals-2, nVals-1};
	idxs = indices;								// indices matching the initial minima
	__m64		step		= fours_init.m1;
	char* const	edi_addr	= (char*)&idxs;		// store target for _m_maskmovq

	timer.start();
	// Walk every remaining quad down to and including quad 0, so a[0..3]
	// are examined as well.
	for (int q = nQuads-2; q >= 0; q--) {
		indices.m1			= _m_psubw(indices.m1, step);	// indices of quad q
		__m64	nextVals	= quads[q];
				curMin		= _m_pminsw(curMin, nextVals);
		__m64	mask		= _m_pcmpeqw(nextVals, curMin);	// lanes where quad q supplies the min
				_m_maskmovq(indices.m1, mask, edi_addr);
	}

	// Horizontal minimum of the four lanes, then broadcast it.
	__m64	shuf	= _m_pshufw(curMin, 0xe);
			shuf	= _m_pminsw(shuf, curMin);
	__m64	shuf2	= _m_pshufw(shuf, 0x01);
			shuf2	= _m_pminsw(shuf2, shuf);
			mins.m1	= _m_pshufw(shuf2, 0);

	// Keep the index only in lanes that hold the global minimum; the other
	// lanes get 0x7fff from maxes so they lose the reduction below.
	__m64	minMask = _m_pcmpeqw(curMin, mins.m1);
	__m64	minIdx	= _m_por(_m_pand(minMask, idxs.m1), _m_pandn(minMask, maxes.m1));
			shuf	= _m_pshufw(minIdx, 0xe);
			shuf	= _m_pminsw(shuf, minIdx);
			shuf2	= _m_pshufw(shuf, 0x01);
			shuf2	= _m_pminsw(shuf2, shuf);
			idxs.m1	= _m_pshufw(shuf2, 0);
			indexOfMin = idxs.w[0];
			_m_empty();		// leave MMX state before returning to ordinary code

	dur = (int) timer.stop();
}

int main(int argc, char **argv) {

	srand(20);		// fixed seed

	cout << "Program " << argv[0] << " starting\n";
	for (int i=0; i<nVals; i++) {
		a[i] = rand();
	}

	//	cmin();
	xmmmin();
	//	mmxmin();
	//	intrmin();

	cout << indexOfMin << "\t" << a[indexOfMin] << endl;

	FILE	*fPtr;
	fPtr = fopen("App_Test.txt", "a+");
	fprintf (fPtr, "%d\n", dur);
	printf("Don't forget !!! For xmmmin() and mmxmin() divide obtained timings by 1000\n");
	printf("Duration = %d\n", dur); /* For xmmmin() and mmxmin() divide by 1000 */
	fclose(fPtr);

	return 0;
}
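// Keyboard shortcuts
//
// Copy code: Ctrl + C
// Search code: Ctrl + F
// Full-screen mode: F11
// Toggle theme: Ctrl + Shift + D
// Show shortcuts: ?
// Increase font size: Ctrl + =
// Decrease font size: Ctrl + -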