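// File: min_updated.cpp
// (The code-viewer "font size" control label that was captured here is not part of the source.)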
//***************************************************************************/
//*
//* Copyright (c) 1998-99 Intel Corporation.
//* All rights reserved.
//*
//*
//***************************************************************************/
//
// min_updated.cpp
#include <stdio.h>
#include <stdlib.h>
#include <iostream.h>
#include <xmmintrin.h>
#include "timestamp.h"
// Shared timer; every search routine below brackets its work with
// timer.start() / timer.stop().
CTimeStamp timer;
// 16-bit signed element type used for the data, the indices and the constants.
typedef short WORD;
// Result of the most recent timer.stop() call, truncated to int.
// NOTE(review): the unit is whatever CTimeStamp reports (presumably clock ticks) - confirm.
int dur;
// Lets the same 64 bits be accessed either as four 16-bit lanes (w) or as one
// MMX register image (m1).
union M64 {
WORD w[4];
__m64 m1;
};
// Number of elements in a[]. The alternative sizes are kept for experiments.
// const WORD nVals = 0x100; // 256
// const WORD nVals = 0x1000; // 4096
const WORD nVals = 0x4000; // 16384
// Input data; filled with rand() values in main().
__declspec(align(32))
WORD a[nVals];
// Minimum value, broadcast to all four lanes by the SIMD routines.
M64 mins;
// Per-lane index scratch area; the SIMD routines finish by writing the
// broadcast index of the minimum here and copying lane 0 to indexOfMin.
M64 idxs;
// Largest positive 16-bit value in every lane; used to mask out lanes that do
// not hold the minimum before the index reduction.
M64 maxes = {0x7fff, 0x7fff, 0x7fff, 0x7fff};
// Index of the minimum element of a[], as computed by the last search routine.
WORD indexOfMin;
// Per-lane index step: each 64-bit load covers four consecutive elements.
const WORD fours[4] = { (WORD)4, (WORD)4, (WORD)4, (WORD)4 };
void cmin() {
WORD tmpMin = a[0];
indexOfMin = 0;
timer.start();
for (int i=1; i<nVals; i++) {
if (a[i] < tmpMin) {
indexOfMin = i;
tmpMin = a[i];
}
}
dur = (int) timer.stop();
}
// Finds the minimum of a[] and its index using pminsw / maskmovq, handling
// two quads (8 elements) per inner-loop iteration.
//
// Results: mins    = minimum value broadcast to all four lanes
//          idxs    = index of the minimum broadcast to all four lanes
//          indexOfMin = lane 0 of idxs
//          dur     = timer reading for ALL 1000 passes (divide by 1000)
//
// Register roles inside the __asm block:
//   ebx = remaining passes          ecx = quad counter (counts down to 0)
//   edx = &a[0]                     edi = &idxs (implicit maskmovq destination)
//   mm3 = initial indices {n-4..n-1}
//   mm4 = index step (4, then 8)    mm7 = running per-lane minimum
//   mm2 / mm5 = indices for the two quads handled per iteration
//
// Requirement visible from the loop arithmetic: nVals/4 must be even and at
// least 4, because the counter starts at nVals/4 - 2 and is decremented by 2
// until it reaches exactly zero.
void xmmmin() {
/* The timer calls take up a large share of the measured clock ticks. To reduce
   their effect, the search is run 1000 times inside one timed region and the
   caller is expected to divide the reported duration by 1000. */
timer.start();
idxs.w[3] = nVals-1;
idxs.w[2] = nVals-2;
idxs.w[1] = nVals-3;
idxs.w[0] = nVals-4;
__asm {
    push ebx
    mov ebx, 0x3e8                  // run the whole search 1000 times
    push edi
    movq mm3, idxs                  // indices of the last quad
next_pass:
    movq mm4, fours
    lea edi, idxs                   // maskmovq stores through edi
    movzx ecx, WORD PTR nVals
    sar ecx, 2                      // ecx = number of quads
    lea edx, a                      // load the base BEFORE it is used for prefetching
    prefetcht0 -72[edx+ecx*8]       // bring in data two iterations ahead
    movq mm7, -8[edx+ecx*8]         // running minimum starts as the last quad
    movq mm2, mm3                   // indices
    movq mm5, mm3                   // indices
    dec ecx
    movq mm0, -8[edx+ecx*8]         // second-to-last quad
    dec ecx
    psubw mm2, mm4                  // its indices
    pminsw mm7, mm0                 // get mins
    pcmpeqw mm0, mm7                // mask: lanes where this quad supplied the min
    maskmovq mm2, mm0               // store indices of those mins
    paddw mm4, mm4                  // step becomes 8: two quads per iteration
scan_top:
    /* Two quads are processed per iteration using separate registers, so
       that the two dependency chains can overlap. */
    movq mm0, -8[edx+ecx*8]         // load next four numbers
    movq mm1, -16[edx+ecx*8]        // and the four before them
    psubw mm2, mm4                  // subtract 8 from each index to get next indices
    psubw mm5, mm4                  // same as above
    pminsw mm7, mm0                 // get mins
    pminsw mm7, mm1
    pcmpeqw mm0, mm7                // create mask
    pcmpeqw mm1, mm7                // create mask
    prefetcht0 -96[edx+ecx*8]       // prefetch further ahead
    maskmovq mm5, mm0               // store indices of mins (higher quad first,
    maskmovq mm2, mm1               //  so the lower index wins on a tie)
    sub ecx, 0x2                    // decrement counter by 2
    jnz scan_top
    // Horizontal reduction of the four per-lane results.
    movq mm2, [edi]                 // get updated indices
    pshufw mm0, mm7, 0xe            // min upper 2 with lower 2
    pminsw mm0, mm7
    pshufw mm1, mm0, 1              // min upper 1 with lower 1
    pminsw mm1, mm0
    pshufw mm1, mm1, 0              // broadcast
    movq mins, mm1                  // save the min value
    pcmpeqw mm7, mm1                // mask: lanes holding the global min
    pand mm2, mm7                   // indices of corresponding mins
    pandn mm7, maxes                // load the rest with "maxint"
    por mm7, mm2                    // merge
    pshufw mm0, mm7, 0xe            // min upper 2 with lower 2
    pminsw mm0, mm7
    pshufw mm1, mm0, 1              // min upper 1 with lower 1
    pminsw mm1, mm0
    pshufw mm1, mm1, 0              // broadcast
    movq [edi], mm1                 // idxs of min values
    mov ax, [edi]
    mov indexOfMin, ax              // index of min fixup
    dec ebx
    jnz next_pass
    emms
    pop edi                         // restore in reverse (LIFO) order of the pushes
    pop ebx
}
dur = (int) timer.stop();
// exit(0);
}
// Finds the minimum of a[] and its index using compare/and/xor blending
// (pcmpgtw + pand/pandn/por) for both the values and the indices.
//
// Results: mins    = minimum value broadcast to all four lanes
//          idxs    = index of the minimum broadcast to all four lanes
//          indexOfMin = lane 0 of idxs
//          dur     = timer reading for ALL 1000 passes (divide by 1000)
//
// Register roles inside the __asm block:
//   ebx = remaining passes          ecx = quad counter (counts down to 0)
//   edx = &a[0]                     edi = &idxs
//   mm5 = initial indices {n-4..n-1}
//   mm4 = index step (4)            mm2 = indices of the quad being examined
//   mm7 = running per-lane minimum  mm6 = running per-lane index of that minimum
void mmxmin() {
/* The timer calls take up a large share of the measured clock ticks. To reduce
   their effect, the search is run 1000 times inside one timed region and the
   caller is expected to divide the reported duration by 1000. */
timer.start();
idxs.w[3] = nVals-1;
idxs.w[2] = nVals-2;
idxs.w[1] = nVals-3;
idxs.w[0] = nVals-4;
__asm {
    push ebx
    mov ebx, 0x3e8                  // run the whole search 1000 times
    push edi
    movq mm5, idxs                  // indices of the last quad
next_pass:
    movq mm4, fours
    lea edi, idxs
    movzx ecx, WORD PTR nVals
    sar ecx, 2                      // ecx = number of quads
    lea edx, a
    movq mm7, -8[edx+ecx*8]         // running minimum starts as the last quad
    movq mm2, mm5                   // indices
    movq mm6, mm5                   // running indices start as the last quad's indices
                                    //  (previously left uninitialised)
    psubw mm2, mm4                  // adjust them down to the next quad
    dec ecx                         // ecx = quads still to examine
    jz scan_done                    // only one quad in total: nothing left to scan
scan_top:
    movq mm0, -8[edx+ecx*8]         // load quad ecx-1; never reads below a[0]
    movq mm3, mm0                   // save new values
    pcmpgtw mm0, mm7                // 1's where the old min is kept (new > old)
    pxor mm7, mm3                   // xor data together
    pand mm7, mm0                   // keep old mins
    pxor mm7, mm3                   // merge: mm7 = per-lane min
    pand mm6, mm0                   // keep old indexes
    pandn mm0, mm2                  // mask new indexes
    por mm6, mm0                    // merge
    psubw mm2, mm4                  // indices of the next quad
    dec ecx                         // MMX ops above do not touch flags, but keep
    jnz scan_top                    //  the counter update next to its branch
scan_done:
    // Horizontal reduction of the four per-lane results.
    movq mm2, mm6
    pshufw mm0, mm7, 0xe            // min upper 2 with lower 2
    pminsw mm0, mm7
    pshufw mm1, mm0, 1              // min upper 1 with lower 1
    pminsw mm1, mm0
    pshufw mm1, mm1, 0              // broadcast
    movq mins, mm1                  // save the min value
    pcmpeqw mm7, mm1                // mask: lanes holding the global min
    pand mm2, mm7                   // indices of corresponding mins
    pandn mm7, maxes                // load the rest with "maxint"
    por mm7, mm2                    // merge
    pshufw mm0, mm7, 0xe            // min upper 2 with lower 2
    pminsw mm0, mm7
    pshufw mm1, mm0, 1              // min upper 1 with lower 1
    pminsw mm1, mm0
    pshufw mm1, mm1, 0              // broadcast
    movq [edi], mm1                 // idxs of min values
    mov ax, [edi]
    mov indexOfMin, ax              // index of min fixup
    dec ebx
    jnz next_pass
    emms
    pop edi                         // restore in reverse (LIFO) order of the pushes
    pop ebx
}
dur = (int) timer.stop();
// exit(0);
}
// Intrinsics version of the SIMD minimum search.
//
// Walks a[] from the last quad down to quad 0, keeping a per-lane running
// minimum in curMin and writing, through _m_maskmovq, the index of each lane's
// current minimum into idxs. A horizontal reduction then produces:
//   mins       = minimum value broadcast to all four lanes
//   idxs       = index of the minimum broadcast to all four lanes
//   indexOfMin = lane 0 of idxs
//   dur        = timer reading for one pass
void intrmin() {
    __m64 curMin = *(__m64*)&a[nVals-4];        // start from the last quad
    M64 fours_init = {4,4,4,4};
    M64 indices = {nVals-4, nVals-3, nVals-2, nVals-1};
    idxs = indices;
    __m64 fours = fours_init.m1;                // local step; shadows the global array
    char* const edi_addr = (char*)&idxs;        // destination of the masked stores

    timer.start();
    // i is the element index of the quad being examined. The condition is
    // i >= 0 so that quad 0 (a[0..3]) is included; with "i > 0" the first four
    // elements were never compared.
    for (int i = nVals-4; i >= 0; i -= 4) {
        __m64 nextVals = *(__m64*)&a[i];
        curMin = _m_pminsw(curMin, nextVals);
        __m64 mask = _m_pcmpeqw(nextVals, curMin);   // lanes where this quad holds the min
        _m_maskmovq(indices.m1, mask, edi_addr);     // record their indices in idxs
        indices.m1 = _m_psubw(indices.m1, fours);    // indices of the next lower quad
    }

    // Reduce the four per-lane minima to one value, broadcast to every lane.
    __m64 shuf = _m_pshufw(curMin, 0xe);
    shuf = _m_pminsw(shuf, curMin);
    __m64 shuf2 = _m_pshufw(shuf, 0x01);
    shuf2 = _m_pminsw(shuf2, shuf);
    mins.m1 = _m_pshufw(shuf2, 0);

    // Keep indices only for lanes holding the global minimum; other lanes get
    // 0x7fff so they lose the following min-reduction over indices.
    __m64 minMask = _m_pcmpeqw(curMin, mins.m1);
    __m64 minIdx = _m_por(_m_pand(minMask, idxs.m1), _m_pandn(minMask, maxes.m1));
    shuf = _m_pshufw(minIdx, 0xe);
    shuf = _m_pminsw(shuf, minIdx);
    shuf2 = _m_pshufw(shuf, 0x01);
    shuf2 = _m_pminsw(shuf2, shuf);
    idxs.m1 = _m_pshufw(shuf2, 0);
    indexOfMin = idxs.w[0];

    _m_empty();                                 // leave MMX state before returning to x87/C++ code
    dur = (int) timer.stop();
}
// Benchmark driver: fills a[] with pseudo-random values from a fixed seed,
// runs one of the search routines, prints the index and value of the minimum,
// and appends the measured duration to App_Test.txt.
int main(int argc, char **argv) {
    srand(20); // fixed seed so every run searches the same data
    cout << "Program " << argv[0] << " starting\n";
    for (int i=0; i<nVals; i++) {
        a[i] = rand();
    }

    // Enable exactly one of the implementations below.
    // cmin();
    xmmmin();
    // mmxmin();
    // intrmin();

    cout << indexOfMin << "\t" << a[indexOfMin] << endl;

    // Append the timing to the log; a failure to open the log must not crash
    // the program, so the stream is only used when fopen succeeded.
    FILE *fPtr = fopen("App_Test.txt", "a+");
    if (fPtr != NULL) {
        fprintf (fPtr, "%d\n", dur);
        fclose(fPtr);
    } else {
        fprintf(stderr, "Could not open App_Test.txt for appending\n");
    }

    printf("Don't forget !!! For xmmmin() and mmxmin() divide obtained timings by 1000\n");
    printf("Duration = %d\n", dur); /* For xmmmin() and mmxmin() divide by 1000 */
    return 0;
}
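// ---------------------------------------------------------------------------
// The following is residue from the web code viewer this file was copied from
// (keyboard-shortcut help); it is not part of the program:
//   Copy code: Ctrl + C
//   Search code: Ctrl + F
//   Full-screen mode: F11
//   Switch theme: Ctrl + Shift + D
//   Show shortcuts: ?
//   Increase font size: Ctrl + =
//   Decrease font size: Ctrl + -
// ---------------------------------------------------------------------------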