#include "StdAfx.h"
#pragma warning(once:4305 4244)
/*
------------------------------------------------------------------------
idct_ap922sse_opt.c - AP922float iDCT, Intel SSE implementation
Requirements :
Visual C++ 6.0 Pro + Visual Studio Service Pack 4 + Processor Pack Beta
SSE capable CPU (Pentium-III, Celeron-II)
------------------------------------------------------------------------
AP-922 floating-point iDCT (SSE)
--------------------------------
idct_ap922sse_opt.c is an "optimized" version of idct_ap922sse_rawcode.c
Both files implement functionally the same code. Persons wanting to
enhance the performance of AP922sse should work with the raw_code file
and not *this* file : this "optimized" code is far less readable than
the raw_code version.
For algorithm design notes, please see idct_ap922sse_rawcode.c
------------------------------------------
LEGAL MUMBO-JUMBO : Disclaimer of Warranty
------------------------------------------
This software is available to the user without any license fee or
royalty on an "as is" basis. The author disclaims any and all warranties,
whether express, implied, or statutory, including any implied warranties of
merchantability or of fitness for a particular purpose. In no event shall
the copyright-holder be liable for any incidental, punitive, or
consequential damages of any kind whatsoever arising from the use of these
programs.
This disclaimer of warranty extends to the user of these programs and
user's customers, employees, agents, transferees, successors, and assigns.
Revision history
----------------
09/03/2000 initial release of 'AP922float_SSE' iDCT
For algorithm design notes, please see idct_ap922sse_rawcode.c
Despite spending 4 hours hand-scheduling/optimizing SSE assembly,
Visual C++'s profiler does not report any speed increase.
DOOHHHH...but anyway here it is.
IMPORTANT, Visual C++ does not seem to align data to 16-byte
address offset. You will need to manually do this!!!
09/03/2000 liaor@iname.com http://members.tripod.com/~liaor
*/
#define NCOS1_16 (0.980785280403230449126182236134239) // cosine( Pi/16 )
#define NCOS2_16 (0.923879532511286756128183189396788) // cosine( 2Pi/16 )
#define NCOS3_16 (0.831469612302545237078788377617906) // cosine( 3Pi/16 )
#define NCOS4_16 (0.707106781186547524400844362104849) // cosine( 4Pi/16 )
#define NCOS5_16 (0.555570233019602224742830813948533) // cosine( 5Pi/16 )
#define NCOS6_16 (0.382683432365089771728459984030399) // cosine( 6Pi/16 )
#define NCOS7_16 (0.195090322016128267848284868477022) // cosine( 7Pi/16 )
#define TANG1_16 ( NCOS7_16 / NCOS1_16) // tangent( Pi/16) : sin( Pi/16) = cos(7Pi/16)
#define TANG2_16 ( NCOS6_16 / NCOS2_16) // tangent(2Pi/16) : sin(2Pi/16) = cos(6Pi/16)
#define TANG3_16 ( NCOS5_16 / NCOS3_16) // tangent(3Pi/16) : sin(3Pi/16) = cos(5Pi/16)
// Scaled floating-point coefficient table
// Original (MMX) iDCT SSE-iDCT
// table-ordering -> table-ordering
// ------------------ ------------------
// w00, w02, w04, w06, -> w00, w01, w02, w03,
// w01, w03, w05, w07, -> w04, w05, w06, w07,
// w08, w10, w12, w14, -> w08, w09, w10, w11,
// w09, w11, w13, w15, -> w12, w13, w14, w15,
// w16, w18, w20, w22, -> w16, w17, w18, w19,
// w17, w19, w21, w23, -> w20, w21, w22, w23,
// w24, w26, w28, w30, -> w24, w25, w26, w27,
// w25, w27, w29, w31 -> w28, w29, w30, w31
//
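// For reference, a sketch (illustrative only -- this file already ships the
// table pre-reordered) of how one 32-coefficient MMX-ordered row block maps to
// the sequential SSE ordering shown above: within each group of 8 coefficients,
// the even-numbered w's occupy the first quad and the odd-numbered w's the second.
static void reorder_mmx_row_block_to_sse(const float mmx[32], float sse[32])
{
    for (int g = 0; g < 4; ++g)            // four groups of 8 coefficients per row block
        for (int k = 0; k < 8; ++k)        // even w's from the first quad, odd w's from the second
            sse[8*g + k] = mmx[8*g + ((k & 1) ? 4 + k/2 : k/2)];
}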
// Compared to the integer-based table, the float table is downscaled by a
// multiplication factor of 0.5
#define OVERALL_DOWNSCALE (0.25/(1<<9))
// every table entry is multiplied by OVERALL_DOWNSCALE: the (0.5*0.5)=0.25 float
// downscale combined with an additional division by (1<<9)
#define RS0 (NCOS4_16*OVERALL_DOWNSCALE) // iDCT row#0 scalar
#define RS1 (NCOS1_16*OVERALL_DOWNSCALE) // iDCT row#1 scalar
#define RS2 (NCOS2_16*OVERALL_DOWNSCALE) // iDCT row#2 scalar
#define RS3 (NCOS3_16*OVERALL_DOWNSCALE) // iDCT row#3 scalar
#define RS4 (NCOS4_16*OVERALL_DOWNSCALE) // iDCT row#4 scalar
#define RS5 (NCOS3_16*OVERALL_DOWNSCALE) // iDCT row#5 scalar
#define RS6 (NCOS2_16*OVERALL_DOWNSCALE) // iDCT row#6 scalar
#define RS7 (NCOS1_16*OVERALL_DOWNSCALE) // iDCT row#7 scalar
#define DESCALE_SHIFT2 7
//#define DESCALE_SHIFT1 (16+6-DESCALE_SHIFT2)
//#define DESCALE_SHIFT1 (16-9-DESCALE_SHIFT2)
//#define RND_INV_ROWF (1 << (16+6-1) )
#define RND_INV_ROWF (1 << (16-9-1) ) // = 1<<6 = 64
#define RND_FLOAT_OFFSET ((RND_INV_ROWF) - 0.5) // = 64 - 0.5 = 63.5
//const static int r_inv_row_sse[2] = {RND_INV_ROWF-1, RND_INV_ROWF-1};
const static int r_inv_row_sse[2] = {0, 0};
// must be 16-byte aligned: the mulps instructions below use this table as an aligned SSE memory operand
__declspec(align(16)) const float tab_i_01234567ssef[] =
{
// Row #0
NCOS4_16*RS0, NCOS4_16*RS0, NCOS4_16*RS0, NCOS4_16*RS0,
NCOS2_16*RS0, NCOS6_16*RS0, -NCOS6_16*RS0, -NCOS2_16*RS0,
NCOS4_16*RS0, -NCOS4_16*RS0, -NCOS4_16*RS0, NCOS4_16*RS0,
NCOS6_16*RS0, -NCOS2_16*RS0, NCOS2_16*RS0, -NCOS6_16*RS0,
NCOS1_16*RS0, NCOS3_16*RS0, NCOS5_16*RS0, NCOS7_16*RS0,
NCOS3_16*RS0, -NCOS7_16*RS0, -NCOS1_16*RS0, -NCOS5_16*RS0,
NCOS5_16*RS0, -NCOS1_16*RS0, NCOS7_16*RS0, NCOS3_16*RS0,
NCOS7_16*RS0, -NCOS5_16*RS0, NCOS3_16*RS0, -NCOS1_16*RS0,
// Row #1
NCOS4_16*RS1, NCOS4_16*RS1, NCOS4_16*RS1, NCOS4_16*RS1,
NCOS2_16*RS1, NCOS6_16*RS1, -NCOS6_16*RS1, -NCOS2_16*RS1,
NCOS4_16*RS1, -NCOS4_16*RS1, -NCOS4_16*RS1, NCOS4_16*RS1,
NCOS6_16*RS1, -NCOS2_16*RS1, NCOS2_16*RS1, -NCOS6_16*RS1,
NCOS1_16*RS1, NCOS3_16*RS1, NCOS5_16*RS1, NCOS7_16*RS1,
NCOS3_16*RS1, -NCOS7_16*RS1, -NCOS1_16*RS1, -NCOS5_16*RS1,
NCOS5_16*RS1, -NCOS1_16*RS1, NCOS7_16*RS1, NCOS3_16*RS1,
NCOS7_16*RS1, -NCOS5_16*RS1, NCOS3_16*RS1, -NCOS1_16*RS1,
// Row #2
NCOS4_16*RS2, NCOS4_16*RS2, NCOS4_16*RS2, NCOS4_16*RS2,
NCOS2_16*RS2, NCOS6_16*RS2, -NCOS6_16*RS2, -NCOS2_16*RS2,
NCOS4_16*RS2, -NCOS4_16*RS2, -NCOS4_16*RS2, NCOS4_16*RS2,
NCOS6_16*RS2, -NCOS2_16*RS2, NCOS2_16*RS2, -NCOS6_16*RS2,
NCOS1_16*RS2, NCOS3_16*RS2, NCOS5_16*RS2, NCOS7_16*RS2,
NCOS3_16*RS2, -NCOS7_16*RS2, -NCOS1_16*RS2, -NCOS5_16*RS2,
NCOS5_16*RS2, -NCOS1_16*RS2, NCOS7_16*RS2, NCOS3_16*RS2,
NCOS7_16*RS2, -NCOS5_16*RS2, NCOS3_16*RS2, -NCOS1_16*RS2,
// Row #3
NCOS4_16*RS3, NCOS4_16*RS3, NCOS4_16*RS3, NCOS4_16*RS3,
NCOS2_16*RS3, NCOS6_16*RS3, -NCOS6_16*RS3, -NCOS2_16*RS3,
NCOS4_16*RS3, -NCOS4_16*RS3, -NCOS4_16*RS3, NCOS4_16*RS3,
NCOS6_16*RS3, -NCOS2_16*RS3, NCOS2_16*RS3, -NCOS6_16*RS3,
NCOS1_16*RS3, NCOS3_16*RS3, NCOS5_16*RS3, NCOS7_16*RS3,
NCOS3_16*RS3, -NCOS7_16*RS3, -NCOS1_16*RS3, -NCOS5_16*RS3,
NCOS5_16*RS3, -NCOS1_16*RS3, NCOS7_16*RS3, NCOS3_16*RS3,
NCOS7_16*RS3, -NCOS5_16*RS3, NCOS3_16*RS3, -NCOS1_16*RS3,
// Row #4
NCOS4_16*RS4, NCOS4_16*RS4, NCOS4_16*RS4, NCOS4_16*RS4,
NCOS2_16*RS4, NCOS6_16*RS4, -NCOS6_16*RS4, -NCOS2_16*RS4,
NCOS4_16*RS4, -NCOS4_16*RS4, -NCOS4_16*RS4, NCOS4_16*RS4,
NCOS6_16*RS4, -NCOS2_16*RS4, NCOS2_16*RS4, -NCOS6_16*RS4,
NCOS1_16*RS4, NCOS3_16*RS4, NCOS5_16*RS4, NCOS7_16*RS4,
NCOS3_16*RS4, -NCOS7_16*RS4, -NCOS1_16*RS4, -NCOS5_16*RS4,
NCOS5_16*RS4, -NCOS1_16*RS4, NCOS7_16*RS4, NCOS3_16*RS4,
NCOS7_16*RS4, -NCOS5_16*RS4, NCOS3_16*RS4, -NCOS1_16*RS4,
// Row #5
NCOS4_16*RS5, NCOS4_16*RS5, NCOS4_16*RS5, NCOS4_16*RS5,
NCOS2_16*RS5, NCOS6_16*RS5, -NCOS6_16*RS5, -NCOS2_16*RS5,
NCOS4_16*RS5, -NCOS4_16*RS5, -NCOS4_16*RS5, NCOS4_16*RS5,
NCOS6_16*RS5, -NCOS2_16*RS5, NCOS2_16*RS5, -NCOS6_16*RS5,
NCOS1_16*RS5, NCOS3_16*RS5, NCOS5_16*RS5, NCOS7_16*RS5,
NCOS3_16*RS5, -NCOS7_16*RS5, -NCOS1_16*RS5, -NCOS5_16*RS5,
NCOS5_16*RS5, -NCOS1_16*RS5, NCOS7_16*RS5, NCOS3_16*RS5,
NCOS7_16*RS5, -NCOS5_16*RS5, NCOS3_16*RS5, -NCOS1_16*RS5,
// Row #6
NCOS4_16*RS6, NCOS4_16*RS6, NCOS4_16*RS6, NCOS4_16*RS6,
NCOS2_16*RS6, NCOS6_16*RS6, -NCOS6_16*RS6, -NCOS2_16*RS6,
NCOS4_16*RS6, -NCOS4_16*RS6, -NCOS4_16*RS6, NCOS4_16*RS6,
NCOS6_16*RS6, -NCOS2_16*RS6, NCOS2_16*RS6, -NCOS6_16*RS6,
NCOS1_16*RS6, NCOS3_16*RS6, NCOS5_16*RS6, NCOS7_16*RS6,
NCOS3_16*RS6, -NCOS7_16*RS6, -NCOS1_16*RS6, -NCOS5_16*RS6,
NCOS5_16*RS6, -NCOS1_16*RS6, NCOS7_16*RS6, NCOS3_16*RS6,
NCOS7_16*RS6, -NCOS5_16*RS6, NCOS3_16*RS6, -NCOS1_16*RS6,
// Row #7
NCOS4_16*RS7, NCOS4_16*RS7, NCOS4_16*RS7, NCOS4_16*RS7,
NCOS2_16*RS7, NCOS6_16*RS7, -NCOS6_16*RS7, -NCOS2_16*RS7,
NCOS4_16*RS7, -NCOS4_16*RS7, -NCOS4_16*RS7, NCOS4_16*RS7,
NCOS6_16*RS7, -NCOS2_16*RS7, NCOS2_16*RS7, -NCOS6_16*RS7,
NCOS1_16*RS7, NCOS3_16*RS7, NCOS5_16*RS7, NCOS7_16*RS7,
NCOS3_16*RS7, -NCOS7_16*RS7, -NCOS1_16*RS7, -NCOS5_16*RS7,
NCOS5_16*RS7, -NCOS1_16*RS7, NCOS7_16*RS7, NCOS3_16*RS7,
NCOS7_16*RS7, -NCOS5_16*RS7, NCOS3_16*RS7, -NCOS1_16*RS7,
// iDCT column coefficients
NCOS4_16, NCOS4_16, NCOS4_16, NCOS4_16,
TANG1_16, TANG1_16, TANG1_16, TANG1_16,
TANG2_16, TANG2_16, TANG2_16, TANG2_16,
TANG3_16, TANG3_16, TANG3_16, TANG3_16,
RND_FLOAT_OFFSET, RND_FLOAT_OFFSET, RND_FLOAT_OFFSET, RND_FLOAT_OFFSET,
0 , 0 , 0 , 0, // not used, padding
0 , 0 , 0 , 0, // not used, padding
0 , 0 , 0 , 0 // not used, padding
};
// 64 floats of temporary row storage; originally externally allocated (see the
// commented-out extern below), now defined here as a 16-byte-aligned static buffer
//extern void *fTempArray;
__declspec(align(16)) static float temparray[64];
void *fTempArray = temparray;
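// The revision notes above warn that VC6 may not 16-byte-align globals on its
// own; __declspec(align(16)) (Processor Pack) handles it here. If that keyword
// were unavailable, a manual fallback is to over-allocate and round the pointer
// up by hand. Minimal sketch (disabled; names are illustrative and it would
// replace the two lines above):
#if 0
static float temparray_raw[64 + 4];     // 64 floats plus 16 bytes of slack
void *fTempArray = (void *)(((unsigned long)temparray_raw + 15) & ~15UL); // round up to a 16-byte boundary
#endif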
void idct_ap922float_sse(short *data)
{
// static float scratch[8];
static const __int64 mmAndmask = 0xFFFF0000FFFF0000; // [ XX __ XX __ ]
// AP-922 inverse_DCT row transform (floating point version)
//
// Row IDCT operator :A_T*M_T*P_T
// Let Y=[output column data, 8 elements] 32-bit IEEE-754 float
// X=[input column data, 8 elements] 16-bit short integer
//
// Y= [ A_T*M_T*P_T ] * X
//
// (Y and X are both column vectors)
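// Reference only -- a scalar sketch of what one row pass computes with the
// SSE table layout above (w = tab_i_01234567ssef + 32*row, x = data + 8*row).
// It is illustrative and deliberately ignores the fixed-point details of the
// real code below (the pand/mm7 trick scales each 16-bit input up by 65536
// before cvtpi2ps) and any rounding offsets.
#if 0
	{
		const float *w = tab_i_01234567ssef;    // row 0 coefficients
		const short *x = data;                  // row 0 input
		float y[8];
		for (int j = 0; j < 4; ++j)
		{
			float a = w[ 0+j]*x[0] + w[ 4+j]*x[2] + w[ 8+j]*x[4] + w[12+j]*x[6];
			float b = w[16+j]*x[1] + w[20+j]*x[3] + w[24+j]*x[5] + w[28+j]*x[7];
			y[j]   = a + b;                     // y0..y3
			y[7-j] = a - b;                     // y7..y4 (stored in reversed order)
		}
	}
#endif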
#define INPR eax
#define TABLE ecx
#define TABLEP1 esi // starting address of (TABLE +32 elements)
#define OUTR edx
__asm {
// stmxcsr [_mxcsr_backup]; // store MXCSR to mem location _mxcsr_backup
mov OUTR, dword ptr [fTempArray]; // load the temp-row buffer pointer; if fTempArray is NULL (or not 16-byte aligned) the movaps stores below fault with an access violation
mov edi, 0x00; // i = 0
// mov ebx, [_mxcsr_backup];
sub OUTR, 32; // precompensate OUT
mov INPR, dword ptr [data]; ;// row 0 (pointer to the 8x8 block of shorts)
// lea INPR, dword ptr [data]
// and ebx, 0xFFFF9FFF; // mask out rounding-mode (bits14:13) control
lea TABLEP1, dword ptr [tab_i_01234567ssef]; // row 0
// or ebx, 0x00002000; // set rounding mode to round-down 14-13 (01b)
//or ebx, 0x00000000; // set rounding mode to nearest 14-13 (01b)
movq mm7, [mmAndmask]; // mm7 <= [ FFFF 0000 FFFF 0000]
mov TABLE, TABLEP1;
add TABLEP1, 128; // initialize
// mov [_mxcsr_fdct], ebx; // store new mxcsr pattern
// ldmxcsr [ _mxcsr_fdct ];
// for ( x = 0; x < 8; ++x ) // transform one row per iteration
//ALIGN 16 ;// align jump address to 16 byte offset
// acc_idct_colloop1:
//
// begin (0) - Convert short -> float data
//
movq mm2, [INPR] ; // (0) mm2 <= x3 x2 x1 x0
;//
movq mm4, [INPR+8] ; // (0) mm4 <= x7 x6 x5 x4
pshufw mm0, mm2, 00000000b ;// (0) mm0 <= x0 x0 x0 x0
pshufw mm1, mm2, 10101010b ;// (0) mm1 <= x2 x2 x2 x2
pand mm0, mm7; ;// (0) mm0 = [ x0 x0 ]*SCALEUP (dword)
cvtpi2ps xmm0, mm0; ;// (0) xmm0 <= [__ __ x0 x0] float
// mm0 free
pand mm1, mm7; ;// (0) mm1 = [ x2 x2 ]*SCALEUP (dword)
prefetcht0 [TABLEP1 + 0*32] ;// prefetch coeff +1row, [w7..0]
pshufw mm0, mm4, 00000000b ;// (0) mm0 <= x4 x4 x4 x4
cvtpi2ps xmm1, mm1; ;// (0) xmm1 <= [__ __ x2 x2] float
// mm1 free
pand mm0, mm7; ;// (0) mm0 <= [x4 x4]*SCALEUP (dword)
prefetcht0 [TABLEP1 + 1*32] ;// prefetch coeff +1row [w15..8]
movlhps xmm0, xmm0; ;// (0) xmm0 <= [x0 x0 x0 x0] float
cvtpi2ps xmm2, mm0; ;// (0) xmm2 <= [__ __ x4 x4] float
// mm0 free
// 1) Apply M_T*P_T operator (P_T is implicit with table/element order)
// (table order has been changed for optimum SSE execution)
mulps xmm0, [TABLE + 0*16] ;// (1a) xmm0 <= a_v0 = [w3..w0] * x0
pshufw mm1, mm4, 10101010b; // (0) mm1 <= [x6 x6 x6 x6]
pand mm1, mm7; ;// (0) mm1 <= [x6 x6]*SCALEUP (dword)
movlhps xmm1, xmm1; ;// (0) xmm1 <=[ x2 x2 x2 x2] float
mulps xmm1, [TABLE + 1*16] ;// (1a) xmm1 <= a_v1 = [w7..w4] * x2
cvtpi2ps xmm3, mm1; ;// (0) xmm3 <=[ __ __ x6 x6] float
// mm1 free
movlhps xmm2, xmm2; ;// (0) xmm2 <= [x4 x4 x4 x4] float
pshufw mm0, mm2, 01010101b; // (0) mm0 <=[x1 x1 x1 x1]
mulps xmm2, [TABLE + 2*16]; // (1a) xmm2 <= a_v2 = [w11..w8] * x4
pshufw mm1, mm2, 11111111b; // (0) mm1 <=[x3 x3 x3 x3]
// mm2 free
pand mm0, mm7; // (0) mm0 <=[x1 x1]*SCALEUP (dword)
movlhps xmm3, xmm3; ;// (0) xmm3 <= [x6 x6 x6 x6]
mulps xmm3, [TABLE + 3*16] ; // (1a) xmm3 <= a_v3 = [w15..w12] * x6
cvtpi2ps xmm4, mm0; // (0) xmm4 <=[__ __ x1 x1] float
// mm0 free
/////////////////////////////////////////////
//
// loop point
// for ( i = 0; i < 8; i=i+1 )
// {
// ....// idct_row(#i)
ALIGN 16 ;// align jump address to 16 byte offset
acc_idct_rowloop1:
addps xmm1, xmm0; ;// (1b) xmm1 <= a_v0 + a_v1
// xmm0 free
pand mm1, mm7; // (0) mm1 <=[x3 x3]*SCALEUP (dword)
pshufw mm0, mm4, 01010101b; // (0) mm0 <= [x5 x5 x5 x5]
cvtpi2ps xmm5, mm1; // (0) xmm5 <=[__ __ x3 x3] float
pand mm0, mm7; // (0) mm0 <= [x5 x5]*SCALEUP (dword)
// mm1 free
cvtpi2ps xmm6, mm0; // (0) xmm6 <=[__ __ x5 x5] float
// mm0 free
movlhps xmm4, xmm4; // (0) xmm4 <= [x1 x1 x1 x1]
mulps xmm4, [TABLE + 4*16] ;// (1a) xmm4 <= b_v0
pshufw mm1, mm4, 11111111b; // (0) mm1 <= [x7 x7 x7 x7]
// mm4 free
addps xmm2, xmm1; ;// (1b) xmm2 + a_v2 <= (a_v0+a_v1) + a_v2
// xmm1 free
movlhps xmm5, xmm5; ;// (0) xmm5 <=[x3 x3 x3 x3] float
mulps xmm5, [TABLE + 5*16] ;// (1a) xmm5 <= b_v1
pand mm1, mm7; ;// (0) mm1 <= [x7 x7]*SCALEUP (dword)
cvtpi2ps xmm7, mm1; ;// (0) xmm7 <= [__ __ x7 x7]
// mm1 free
movlhps xmm6, xmm6; // (0) xmm6 <= [x5 x5 x5 x5] float
mulps xmm6, [TABLE + 6*16] ;// (1a) xmm6 <= b_v2
add OUTR, 32; ;// OUT <= OUT + 32 (increment row)
addps xmm3, xmm2; ;// (1b) xmm3 + a_v3 <=(a_v0+a_v1+a_v2) + a_v3
// xmm3 = complete a-vector ( a_v[3..0] )
// xmm2 free
add INPR, 16; // increment INP(psrc) + 1row
prefetcht0 [TABLEP1 + 2*32] ;// prefetch coeff +1row [w23..16]
movlhps xmm7, xmm7;
mulps xmm7, [TABLE + 7*16] ;// (1a) xmm7 <= b_v3
addps xmm5, xmm4; ;// (1b) xmm5 <= b_v0 + b_v1
// xmm4 free
///////////////////////////////////////////////////////////////
//
// begin processing next row ("+1")
//
// the following code intermixes processing from row (x) and (x+1)
// This is the result of software pipelining to reduce wasted
// cycles.
movq mm2, [INPR] ; // x+1 (0) mm2 <= x3 x2 x1 x0
add TABLE, 128; // move TABLE to w[32]
prefetcht0 [TABLEP1 + 3*32] ;// prefetch coeff +1row [w31..24]
add TABLEP1, 128; // move TABLEP1 to w[32]
// xmm4 used
movaps xmm4, xmm3; // (1b) xmm4 <= a_v[3..0]
pshufw mm0, mm2, 00000000b ;// x+1 (0) mm0 <= x0 x0 x0 x0
movq mm4, [INPR+8] ; // x+1 (0) mm4 <= x7 x6 x5 x4
addps xmm6, xmm5; // (1b) xmm6 + b_v2 <= (b_v0+b_v1) + b_v2
// xmm5 free
pshufw mm1, mm2, 10101010b ;// x+1 (0) mm1 <= x2 x2 x2 x2
pand mm0, mm7; ;// x+1 (0) mm0 = [ x0 x0 ]*SCALEUP (dword)
cvtpi2ps xmm0, mm0; ;// x+1 (0) xmm0 <= [__ __ x0 x0] float
// mm0 free
pand mm1, mm7; ;// x+1 (0) mm1 = [ x2 x2 ]*SCALEUP (dword)
// xmm5 used
movaps xmm5, xmm3; ;// xmm5 <= a_v[3..0]
// xmm3 free
add edi, 1; // i = i + 1 (loop index)
prefetcht0 [TABLEP1 + 0*32] ;// x+1 prefetch coeff +1row, [w7..0]
pshufw mm0, mm4, 00000000b ;// x+1 (0) mm0 <= x4 x4 x4 x4
addps xmm7, xmm6; // (1b) xmm7 + b_v3 <=(b_v0+b_v1+b_v2) + b_v3
// xmm6 free
// xmm7 = complete b-vector ( b_v[3..0] )
pand mm0, mm7; ;// x+1 (0) mm0 <= [x4 x4]*SCALEUP (dword)
prefetcht0 [TABLEP1 + 1*32] ;// x+1 prefetch coeff +1row [w15..8]
movlhps xmm0, xmm0; ;// x+1 (0) xmm0 <= [x0 x0 x0 x0] float
// 1) Apply M_T*P_T operator (P_T is implicit with table/element order)
// (table order has been changed for optimum SSE execution)
mulps xmm0, [TABLE + 0*16] ;// x+1 (1a) xmm0 <= a_v0 = [w3..w0] * x0
cvtpi2ps xmm1, mm1; ;// x+1 (0) xmm1 <= [__ __ x2 x2] float
// mm1 free
cvtpi2ps xmm2, mm0; ;// x+1 (0) xmm2 <= [__ __ x4 x4] float
// mm0 free
;//
// output y[4..7] = a[3..0] - b[3..0], output order must be reversed!
subps xmm5, xmm7; ;// (2a) xmm5 <= a_v[3..0]-b_v[3..0] (y4 y5 y6 y7)
;//
movlhps xmm1, xmm1; ;// x+1 (0) xmm1 <=[ x2 x2 x2 x2] float
pshufw mm1, mm4, 10101010b ;// x+1 (0) mm1 <= [x6 x6 x6 x6]
mulps xmm1, [TABLE + 1*16] ;// x+1 (1a) xmm1 <= a_v1 = [w7..w4] * x2
// output y[3..0] = a[3..0] + b[3..0]
addps xmm4, xmm7; ;// (2a) xmm4 <= a_v[3..0]+b_v[3..0] (y3 y2 y1 y0)
// xmm7 free
movlhps xmm2, xmm2; ;// x+1 (0) xmm2 <= [x4 x4 x4 x4] float
pand mm1, mm7; ;// x+1 (0) mm1 <= [x6 x6]*SCALEUP (dword)
mulps xmm2, [TABLE + 2*16]; // x+1 (1a) xmm2 <= a_v2 = [w11..w8] * x4
// fix output order, y[4..7] -> y[7..4]
shufps xmm5, xmm5, 00011011b; // (2a) xmm5 [y7 y6 y5 y4] <- [y4 y5 y6 y7]
cvtpi2ps xmm3, mm1; // x+1 (0) xmm3 <=[ __ __ x6 x6] float
// mm1 free
;//
// !*@&#@ Visual C++ refuses to align data to 16-byte offset, SIGH
pshufw mm0, mm2, 01010101b; // x+1 (0) mm0 <=[x1 x1 x1 x1]
movaps [OUTR + 0*16], xmm4; // (2b) store [y3 y2 y1 y0]
// movups [OUTR + 0*16], xmm4; // (2b) store [y3 y2 y1 y0]
// xmm4 free
pshufw mm1, mm2, 11111111b; // x+1 (0) mm1 <=[x3 x3 x3 x3]
// mm2 free
pand mm0, mm7; // x+1 (0) mm0 <=[x1 x1]*SCALEUP (dword)
movlhps xmm3, xmm3; ;// x+1 (0) xmm3 <= [x6 x6 x6 x6]
cvtpi2ps xmm4, mm0; // x+1 (0) xmm4 <=[__ __ x1 x1] float
// mm0 free
mulps xmm3, [TABLE + 3*16] ;// x+1 (1a) xmm3 <= a_v3 = [w15..w12] * x6
movaps [OUTR + 1*16], xmm5; ;// (2b) store [y7 y6 y5 y4]