#include "StdAfx.h"
#pragma warning(once:4305 4244)
/*
 ------------------------------------------------------------------------
 idct_ap922sse_opt.c - AP922float iDCT, Intel SSE implementation

 Requirements : 
  Visual C++ 6.0 Pro + Visual Studio Service Pack 4 + Processor Pack Beta
  SSE capable CPU (Pentium-III, Celeron-II)
 ------------------------------------------------------------------------

 AP-922 floating-point iDCT (SSE)
 --------------------------------

  idct_ap922sse_opt.c is an "optimized" version of idct_ap922sse_rawcode.c
  Both files implement functionally the same code.  Persons wanting to
  enhance the performance of AP922sse should work with the raw_code file
  and not *this* file : this "optimized" code is far less readable than 
  the raw_code version.

  For algorithm design notes, please see idct_ap922sse_rawcode.c

 ------------------------------------------
 LEGAL MUMBO-JUMBO : Disclaimer of Warranty
 ------------------------------------------
  This software is available to the user without any license fee or
  royalty on an "as is" basis.  The author disclaims any and all warranties,
  whether express, implied, or statutory, including any implied warranties of
  merchantability or of fitness for a particular purpose.  In no event shall 
  the copyright-holder be liable for any incidental, punitive, or 
  consequential damages of any kind whatsoever arising from the use of these
  programs.

    This disclaimer of warranty extends to the user of these programs and 
    user's customers, employees, agents, transferees, successors, and assigns.

 Revision history
 ----------------
 09/03/2000 initial release of 'AP922float_SSE' iDCT
            For algorithm design notes, please see idct_ap922sse_rawcode.c

            Despite spending 4 hours hand-scheduling/optimizing SSE assembly,
            Visual C++'s profiler does not report any speed increase.
            DOOHHHH...but anyway here it is.


            IMPORTANT: Visual C++ does not seem to align data to a 16-byte
            address offset.  You will need to do this manually (see the
            alignment sketch after this comment block).

 09/03/2000 liaor@iname.com   http://members.tripod.com/~liaor
*/
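
/*
  A minimal sketch of the manual 16-byte alignment mentioned above
  (illustrative only, not part of the original file): either let the
  compiler align a static buffer, as is done for temparray further down,
  or over-allocate on the heap and round the pointer up by hand.

    // compiler-aligned static buffer
    __declspec(align(16)) static float buf[64];

    // hand-aligned heap buffer (needs <stdlib.h>; keep 'raw' for free())
    unsigned char *raw     = (unsigned char *)malloc(64 * sizeof(float) + 15);
    float         *aligned = (float *)(((size_t)raw + 15) & ~(size_t)15);
*/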

#define NCOS1_16 (0.980785280403230449126182236134239) // cosine(  Pi/16 )
#define NCOS2_16 (0.923879532511286756128183189396788) // cosine( 2Pi/16 )
#define NCOS3_16 (0.831469612302545237078788377617906) // cosine( 3Pi/16 )
#define NCOS4_16 (0.707106781186547524400844362104849) // cosine( 4Pi/16 )
#define NCOS5_16 (0.555570233019602224742830813948533) // cosine( 5Pi/16 )
#define NCOS6_16 (0.382683432365089771728459984030399) // cosine( 6Pi/16 )
#define NCOS7_16 (0.195090322016128267848284868477022) // cosine( 7Pi/16 )

#define TANG1_16 ( NCOS7_16 / NCOS1_16) // tangent( Pi/16)
#define TANG2_16 ( NCOS6_16 / NCOS2_16) // tangent(2Pi/16)
#define TANG3_16 ( NCOS5_16 / NCOS3_16) // tangent(3Pi/16)
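
/*
  Illustrative only: the NCOS values above are cos(k*Pi/16) for k = 1..7, and
  each TANG value is cos((8-k)*Pi/16)/cos(k*Pi/16) = tan(k*Pi/16).  A small
  generator that could reproduce the table (not part of the original file):

    #include <math.h>
    #include <stdio.h>

    static void print_idct_constants(void)
    {
        const double pi = 3.14159265358979323846;
        for (int k = 1; k <= 7; ++k)
            printf("#define NCOS%d_16 (%.33f) // cosine( %dPi/16 )\n",
                   k, cos(k * pi / 16.0), k);
    }
*/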

// Scaled floating-point coefficient table
//    Original (MMX) iDCT         SSE-iDCT 
//     table-ordering      ->    table-ordering
//    ------------------       ------------------
//    w00, w02, w04, w06,  ->  w00, w01, w02, w03,
//    w01, w03, w05, w07,  ->  w04, w05, w06, w07,
//    w08, w10, w12, w14,  ->  w08, w09, w10, w11,
//    w09, w11, w13, w15,  ->  w12, w13, w14, w15,
//    w16, w18, w20, w22,  ->  w16, w17, w18, w19,
//    w17, w19, w21, w23,  ->  w20, w21, w22, w23,
//    w24, w26, w28, w30,  ->  w24, w25, w26, w27,
//    w25, w27, w29, w31   ->  w28, w29, w30, w31
//
// Compared to the integer-based table, the float table is downscaled by a
// multiplication factor of 0.5
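//
// Illustrative sketch of the reordering described above (not part of the
// original file): converting one 32-entry coefficient block from the original
// MMX table order (even indices of a row pair first, then odd) into the
// natural order used by tab_i_01234567ssef below.
//
//   static void mmx_to_sse_order(const float mmx[32], float sse[32])
//   {
//       for (int base = 0; base < 32; base += 8)      // one MMX row pair
//           for (int k = 0; k < 8; ++k)               // natural index in pair
//               sse[base + k] = mmx[base + (k & 1) * 4 + (k >> 1)];
//   }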
 
#define OVERALL_DOWNSCALE (0.25/(1<<9))
 // all table entries are scaled by the (0.5*0.5)=0.25 float-table downscale,
 // divided further by (1<<9)

#define RS0 (NCOS4_16*OVERALL_DOWNSCALE)  // iDCT row#0 scalar
#define RS1 (NCOS1_16*OVERALL_DOWNSCALE)  // iDCT row#1 scalar
#define RS2 (NCOS2_16*OVERALL_DOWNSCALE)  // iDCT row#2 scalar
#define RS3 (NCOS3_16*OVERALL_DOWNSCALE)  // iDCT row#3 scalar
#define RS4 (NCOS4_16*OVERALL_DOWNSCALE)  // iDCT row#4 scalar
#define RS5 (NCOS3_16*OVERALL_DOWNSCALE)  // iDCT row#5 scalar
#define RS6 (NCOS2_16*OVERALL_DOWNSCALE)  // iDCT row#6 scalar
#define RS7 (NCOS1_16*OVERALL_DOWNSCALE)  // iDCT row#7 scalar

#define DESCALE_SHIFT2 7
//#define DESCALE_SHIFT1 (16+6-DESCALE_SHIFT2)
//#define DESCALE_SHIFT1 (16-9-DESCALE_SHIFT2)

//#define RND_INV_ROWF (1 << (16+6-1) )
#define RND_INV_ROWF (1 << (16-9-1) )
#define RND_FLOAT_OFFSET ((RND_INV_ROWF) - 0.5)

//const static int r_inv_row_sse[2] = {RND_INV_ROWF-1, RND_INV_ROWF-1};
const static int r_inv_row_sse[2] = {0, 0};

const float tab_i_01234567ssef[] = 
{ 
 // Row #0
  NCOS4_16*RS0,  NCOS4_16*RS0,  NCOS4_16*RS0,  NCOS4_16*RS0, 
  NCOS2_16*RS0,  NCOS6_16*RS0, -NCOS6_16*RS0, -NCOS2_16*RS0, 
  NCOS4_16*RS0, -NCOS4_16*RS0, -NCOS4_16*RS0,  NCOS4_16*RS0, 
  NCOS6_16*RS0, -NCOS2_16*RS0,  NCOS2_16*RS0, -NCOS6_16*RS0, 
  NCOS1_16*RS0,  NCOS3_16*RS0,  NCOS5_16*RS0,  NCOS7_16*RS0, 
  NCOS3_16*RS0, -NCOS7_16*RS0, -NCOS1_16*RS0, -NCOS5_16*RS0, 
  NCOS5_16*RS0, -NCOS1_16*RS0,  NCOS7_16*RS0,  NCOS3_16*RS0, 
  NCOS7_16*RS0, -NCOS5_16*RS0,  NCOS3_16*RS0, -NCOS1_16*RS0, 

 // Row #1
  NCOS4_16*RS1,  NCOS4_16*RS1,  NCOS4_16*RS1,  NCOS4_16*RS1, 
  NCOS2_16*RS1,  NCOS6_16*RS1, -NCOS6_16*RS1, -NCOS2_16*RS1, 
  NCOS4_16*RS1, -NCOS4_16*RS1, -NCOS4_16*RS1,  NCOS4_16*RS1, 
  NCOS6_16*RS1, -NCOS2_16*RS1,  NCOS2_16*RS1, -NCOS6_16*RS1, 
  NCOS1_16*RS1,  NCOS3_16*RS1,  NCOS5_16*RS1,  NCOS7_16*RS1, 
  NCOS3_16*RS1, -NCOS7_16*RS1, -NCOS1_16*RS1, -NCOS5_16*RS1, 
  NCOS5_16*RS1, -NCOS1_16*RS1,  NCOS7_16*RS1,  NCOS3_16*RS1, 
  NCOS7_16*RS1, -NCOS5_16*RS1,  NCOS3_16*RS1, -NCOS1_16*RS1, 

 // Row #2
  NCOS4_16*RS2,  NCOS4_16*RS2,  NCOS4_16*RS2,  NCOS4_16*RS2, 
  NCOS2_16*RS2,  NCOS6_16*RS2, -NCOS6_16*RS2, -NCOS2_16*RS2, 
  NCOS4_16*RS2, -NCOS4_16*RS2, -NCOS4_16*RS2,  NCOS4_16*RS2, 
  NCOS6_16*RS2, -NCOS2_16*RS2,  NCOS2_16*RS2, -NCOS6_16*RS2, 
  NCOS1_16*RS2,  NCOS3_16*RS2,  NCOS5_16*RS2,  NCOS7_16*RS2, 
  NCOS3_16*RS2, -NCOS7_16*RS2, -NCOS1_16*RS2, -NCOS5_16*RS2, 
  NCOS5_16*RS2, -NCOS1_16*RS2,  NCOS7_16*RS2,  NCOS3_16*RS2, 
  NCOS7_16*RS2, -NCOS5_16*RS2,  NCOS3_16*RS2, -NCOS1_16*RS2, 

 // Row #3
  NCOS4_16*RS3,  NCOS4_16*RS3,  NCOS4_16*RS3,  NCOS4_16*RS3, 
  NCOS2_16*RS3,  NCOS6_16*RS3, -NCOS6_16*RS3, -NCOS2_16*RS3, 
  NCOS4_16*RS3, -NCOS4_16*RS3, -NCOS4_16*RS3,  NCOS4_16*RS3, 
  NCOS6_16*RS3, -NCOS2_16*RS3,  NCOS2_16*RS3, -NCOS6_16*RS3, 
  NCOS1_16*RS3,  NCOS3_16*RS3,  NCOS5_16*RS3,  NCOS7_16*RS3, 
  NCOS3_16*RS3, -NCOS7_16*RS3, -NCOS1_16*RS3, -NCOS5_16*RS3, 
  NCOS5_16*RS3, -NCOS1_16*RS3,  NCOS7_16*RS3,  NCOS3_16*RS3, 
  NCOS7_16*RS3, -NCOS5_16*RS3,  NCOS3_16*RS3, -NCOS1_16*RS3, 

 // Row #4
  NCOS4_16*RS4,  NCOS4_16*RS4,  NCOS4_16*RS4,  NCOS4_16*RS4, 
  NCOS2_16*RS4,  NCOS6_16*RS4, -NCOS6_16*RS4, -NCOS2_16*RS4, 
  NCOS4_16*RS4, -NCOS4_16*RS4, -NCOS4_16*RS4,  NCOS4_16*RS4, 
  NCOS6_16*RS4, -NCOS2_16*RS4,  NCOS2_16*RS4, -NCOS6_16*RS4, 
  NCOS1_16*RS4,  NCOS3_16*RS4,  NCOS5_16*RS4,  NCOS7_16*RS4, 
  NCOS3_16*RS4, -NCOS7_16*RS4, -NCOS1_16*RS4, -NCOS5_16*RS4, 
  NCOS5_16*RS4, -NCOS1_16*RS4,  NCOS7_16*RS4,  NCOS3_16*RS4, 
  NCOS7_16*RS4, -NCOS5_16*RS4,  NCOS3_16*RS4, -NCOS1_16*RS4, 

 // Row #5
  NCOS4_16*RS5,  NCOS4_16*RS5,  NCOS4_16*RS5,  NCOS4_16*RS5, 
  NCOS2_16*RS5,  NCOS6_16*RS5, -NCOS6_16*RS5, -NCOS2_16*RS5, 
  NCOS4_16*RS5, -NCOS4_16*RS5, -NCOS4_16*RS5,  NCOS4_16*RS5, 
  NCOS6_16*RS5, -NCOS2_16*RS5,  NCOS2_16*RS5, -NCOS6_16*RS5, 
  NCOS1_16*RS5,  NCOS3_16*RS5,  NCOS5_16*RS5,  NCOS7_16*RS5, 
  NCOS3_16*RS5, -NCOS7_16*RS5, -NCOS1_16*RS5, -NCOS5_16*RS5, 
  NCOS5_16*RS5, -NCOS1_16*RS5,  NCOS7_16*RS5,  NCOS3_16*RS5, 
  NCOS7_16*RS5, -NCOS5_16*RS5,  NCOS3_16*RS5, -NCOS1_16*RS5, 

 // Row #6
  NCOS4_16*RS6,  NCOS4_16*RS6,  NCOS4_16*RS6,  NCOS4_16*RS6, 
  NCOS2_16*RS6,  NCOS6_16*RS6, -NCOS6_16*RS6, -NCOS2_16*RS6, 
  NCOS4_16*RS6, -NCOS4_16*RS6, -NCOS4_16*RS6,  NCOS4_16*RS6, 
  NCOS6_16*RS6, -NCOS2_16*RS6,  NCOS2_16*RS6, -NCOS6_16*RS6, 
  NCOS1_16*RS6,  NCOS3_16*RS6,  NCOS5_16*RS6,  NCOS7_16*RS6, 
  NCOS3_16*RS6, -NCOS7_16*RS6, -NCOS1_16*RS6, -NCOS5_16*RS6, 
  NCOS5_16*RS6, -NCOS1_16*RS6,  NCOS7_16*RS6,  NCOS3_16*RS6, 
  NCOS7_16*RS6, -NCOS5_16*RS6,  NCOS3_16*RS6, -NCOS1_16*RS6, 

 // Row #7
  NCOS4_16*RS7,  NCOS4_16*RS7,  NCOS4_16*RS7,  NCOS4_16*RS7, 
  NCOS2_16*RS7,  NCOS6_16*RS7, -NCOS6_16*RS7, -NCOS2_16*RS7, 
  NCOS4_16*RS7, -NCOS4_16*RS7, -NCOS4_16*RS7,  NCOS4_16*RS7, 
  NCOS6_16*RS7, -NCOS2_16*RS7,  NCOS2_16*RS7, -NCOS6_16*RS7, 
  NCOS1_16*RS7,  NCOS3_16*RS7,  NCOS5_16*RS7,  NCOS7_16*RS7, 
  NCOS3_16*RS7, -NCOS7_16*RS7, -NCOS1_16*RS7, -NCOS5_16*RS7, 
  NCOS5_16*RS7, -NCOS1_16*RS7,  NCOS7_16*RS7,  NCOS3_16*RS7, 
  NCOS7_16*RS7, -NCOS5_16*RS7,  NCOS3_16*RS7, -NCOS1_16*RS7, 

 // iDCT column coefficients
  NCOS4_16, NCOS4_16, NCOS4_16, NCOS4_16,
  TANG1_16, TANG1_16, TANG1_16, TANG1_16,
  TANG2_16, TANG2_16, TANG2_16, TANG2_16,
  TANG3_16, TANG3_16, TANG3_16, TANG3_16,
  RND_FLOAT_OFFSET, RND_FLOAT_OFFSET, RND_FLOAT_OFFSET, RND_FLOAT_OFFSET,
  0       , 0       , 0       , 0,        // not used, padding
  0       , 0       , 0       , 0,        // not used, padding
  0       , 0       , 0       , 0         // not used, padding
};


// 64-float temporary storage; originally an externally allocated array,
// now a 16-byte-aligned static buffer
//extern void *fTempArray;
__declspec(align(16)) static float temparray[64];
void *fTempArray = temparray;

void idct_ap922float_sse(short *data)
{
//  static float scratch[8];
  static const __int64 mmAndmask = 0xFFFF0000FFFF0000; // [ XX __ XX __ ]

  // AP-922 inverse_DCT row transform (floating point version)
  //
  // Row IDCT operator: A_T*M_T*P_T
  // Let Y=[output column data, 8 elements] 32-bit IEEE-754 float
  //     X=[input column data, 8 elements] 16-bit short integer
  //
  //     Y= [ A_T*M_T*P_T ] * X
  //
  //   (Y and X are both column vectors)
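  //
  /*
    For reference, a scalar sketch of what one iteration of the row loop
    below computes (illustrative only; names are not part of this file, and
    the 2^16 input scale-up applied by the MMX masking is ignored).  x[0..7]
    is one input row and w[0..31] is its SSE-ordered coefficient block from
    tab_i_01234567ssef:

      static void idct_row_reference(const short x[8], const float w[32],
                                     float y[8])
      {
          float a[4], b[4];
          for (int j = 0; j < 4; ++j) {
              a[j] = w[ 0+j]*x[0] + w[ 4+j]*x[2] + w[ 8+j]*x[4] + w[12+j]*x[6];
              b[j] = w[16+j]*x[1] + w[20+j]*x[3] + w[24+j]*x[5] + w[28+j]*x[7];
          }
          for (int j = 0; j < 4; ++j) {
              y[j]     = a[j] + b[j];   // y0..y3
              y[7 - j] = a[j] - b[j];   // y7..y4 (stored in reversed order)
          }
      }
  */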

#define INPR eax
#define TABLE ecx
#define TABLEP1 esi  // starting address of (TABLE +32 elements)
#define OUTR edx

  __asm {
//    stmxcsr [_mxcsr_backup];  // store MXCSR to mem location _mxcsr_backup

    mov OUTR, dword ptr [fTempArray];  // pointer to the temp buffer; it must be non-null
                                       // and 16-byte aligned or the movaps stores below fault
     mov edi, 0x00; // i = 0

//    mov ebx, [_mxcsr_backup];
     sub OUTR, 32;    // precompensate OUT

    mov INPR, dword ptr [data];   ;// row 0
//	  lea INPR, dword ptr [data]
//     and ebx, 0xFFFF9FFF;    // mask out rounding-mode (bits14:13) control

    lea TABLEP1, dword ptr [tab_i_01234567ssef]; // row 0
//     or  ebx, 0x00002000;    // set rounding mode to round-down 14-13 (01b)
    //or  ebx, 0x00000000;    // set rounding mode to nearest 14-13 (01b)

    movq mm7, [mmAndmask];      // mm7 <= [ FFFF 0000 FFFF 0000]
     mov TABLE, TABLEP1;

    add TABLEP1, 128;   // initialize TABLEP1 to the next row's coefficients (+32 floats)
//     mov [_mxcsr_fdct], ebx;  // store new mxcsr pattern

//    ldmxcsr [ _mxcsr_fdct ];

  // for ( x = 0; x < 8; ++x )  // transform one row per iteration
//ALIGN 16 ;// align jump address to 16 byte offset
//  acc_idct_colloop1:

//
// begin (0) - Convert short -> float data
//
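//
// Sketch of the conversion trick used below (illustrative only): pshufw
// replicates one 16-bit input into all four words of an MMX register, and the
// pand with mmAndmask keeps that value only in the HIGH word of each 32-bit
// lane, so each dword holds x*65536 ("SCALEUP") with the sign preserved.
// cvtpi2ps then converts two such dwords to floats in one instruction, giving
// float(x)*2^16 without a separate sign-extension step; the extra 2^16 factor
// is compensated later by the downscaled coefficient table and the descaling
// constants.  Roughly, per input value:
//
//   int   scaled = (int)x * 65536;   // what pshufw+pand produce per lane
//   float f      = (float)scaled;    // what cvtpi2ps produces
//   // f == (float)x * 65536.0f
//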
    movq mm2, [INPR] ;           // (0) mm2 <= x3 x2 x1 x0
     ;//

    movq mm4, [INPR+8] ;         // (0) mm4 <= x7 x6 x5 x4
     pshufw mm0, mm2, 00000000b ;// (0) mm0 <= x0 x0 x0 x0

    pshufw mm1, mm2, 10101010b  ;// (0) mm1 <= x2 x2 x2 x2
     pand mm0, mm7;             ;// (0) mm0 = [ x0 x0 ]*SCALEUP (dword)

    cvtpi2ps xmm0, mm0;         ;// (0) xmm0 <= [__ __ x0 x0] float
// mm0 free
     pand mm1, mm7;             ;// (0) mm1 = [ x2 x2 ]*SCALEUP (dword)

    prefetcht0 [TABLEP1 + 0*32] ;// prefetch coeff +1row, [w7..0]
     pshufw mm0, mm4, 00000000b ;// (0)  mm0 <= x4 x4 x4 x4

    cvtpi2ps xmm1, mm1;         ;// (0) xmm1 <= [__ __ x2 x2] float
// mm1 free
     pand mm0, mm7;             ;// (0) mm0 <= [x4 x4]*SCALEUP (dword)

    prefetcht0 [TABLEP1 + 1*32] ;// prefetch coeff +1row [w15..8]
    movlhps xmm0, xmm0;         ;// (0) xmm0 <= [x0 x0 x0 x0] float
     cvtpi2ps xmm2, mm0;        ;// (0) xmm2 <= [__ __ x4 x4] float
// mm0 free

// 1) Apply M_T*P_T operator (P_T is implicit with table/element order)
//    (table order has been changed for optimum SSE execution)

    mulps xmm0, [TABLE + 0*16]  ;// (1a) xmm0 <= a_v0 = [w3..w0] * x0
     pshufw mm1, mm4, 10101010b; // (0) mm1 <= [x6 x6 x6 x6]

    pand mm1, mm7;              ;// (0) mm1 <= [x6 x6]*SCALEUP (dword)
     movlhps xmm1, xmm1;        ;// (0) xmm1 <=[ x2 x2 x2 x2] float

    mulps xmm1, [TABLE + 1*16]  ;// (1a) xmm1 <= a_v1 = [w7..w4] * x2
     cvtpi2ps xmm3, mm1;        ;// (0) xmm3 <=[ __ __ x6 x6] float
// mm1 free

    movlhps xmm2, xmm2;         ;// (0) xmm2 <= [x4 x4 x4 x4] float
     pshufw mm0, mm2, 01010101b; // (0) mm0 <=[x1 x1 x1 x1]

    mulps xmm2, [TABLE + 2*16];  // (1a) xmm2 <= a_v2 = [w11..w8] * x4
     pshufw mm1, mm2, 11111111b; // (0) mm1 <=[x3 x3 x3 x3]
// mm2 free

    pand mm0, mm7;               // (0) mm0 <=[x1 x1]*SCALEUP (dword)
     movlhps xmm3, xmm3;        ;// (0) xmm3 <= [x6 x6 x6 x6]

    mulps xmm3, [TABLE + 3*16] ; // (1a) xmm3 <= a_v3 = [w15..w12] * x6
     cvtpi2ps xmm4, mm0;         // (0) xmm4 <=[__ __ x1 x1] float
// mm0 free

    /////////////////////////////////////////////
    //
    //  loop point 
    //  for ( i = 0; i < 8; i=i+1 )  
    //  {
    //     ....// idct_row(#i)

ALIGN 16 ;// align jump address to 16 byte offset
  acc_idct_rowloop1:

    addps xmm1, xmm0;           ;// (1b) xmm1 <= a_v0 + a_v1
// xmm0 free
     pand mm1, mm7;              // (0) mm1 <=[x3 x3]*SCALEUP (dword)

     pshufw mm0, mm4, 01010101b; // (0) mm0 <= [x5 x5 x5 x5]

    cvtpi2ps xmm5, mm1;          // (0) xmm5 <=[__ __ x3 x3] float
     pand mm0, mm7;              // (0) mm0 <= [x5 x5]*SCALEUP (dword)
// mm1 free

    cvtpi2ps xmm6, mm0;          // (0) xmm6 <=[__ __ x5 x5] float
 // mm0 free
     movlhps xmm4, xmm4;         // (0) xmm4 <= [x1 x1 x1 x1]

    mulps xmm4, [TABLE + 4*16]  ;// (1a) xmm4 <= b_v0
     pshufw mm1, mm4, 11111111b; // (0) mm1 <= [x7 x7 x7 x7]
// mm4 free

    addps xmm2, xmm1;           ;// (1b) xmm2 + a_v2 <= (a_v0+a_v1) + a_v2
// xmm1 free
     movlhps xmm5, xmm5;        ;// (0) xmm5 <=[x3 x3 x3 x3] float

    mulps xmm5, [TABLE + 5*16]  ;// (1a) xmm5 <= b_v1
     pand mm1, mm7;             ;// (0) mm1 <= [x7 x7]*SCALEUP (dword)

    cvtpi2ps xmm7, mm1;         ;// (0) xmm7 <= [__ __ x7 x7]
// mm1 free
     movlhps xmm6, xmm6;         // (0) xmm6 <= [x5 x5 x5 x5] float

    mulps xmm6, [TABLE + 6*16]  ;// (1a) xmm6 <= b_v2
     add OUTR, 32;              ;// OUT <= OUT + 32 (increment row) 

    addps xmm3, xmm2;           ;// (1b) xmm3 + a_v3 <=(a_v0+a_v1+a_v2) + a_v3
// xmm3 = complete a-vector ( a_v[3..0] )
// xmm2 free
     add INPR, 16;               // increment INP(psrc) + 1row 

    prefetcht0 [TABLEP1 + 2*32] ;// prefetch coeff +1row [w23..16]
     movlhps xmm7, xmm7;

    mulps xmm7, [TABLE + 7*16]  ;// (1a) xmm7 <= b_v3
     addps xmm5, xmm4;          ;// (1b) xmm5 <= b_v0 + b_v1
// xmm4 free

    ///////////////////////////////////////////////////////////////
    //
    // begin processing next row ("+1")
    //
    // the following code intermixes processing from row (x) and (x+1)
    // This is the result of software pipelining to reduce wasted
    // cycles.
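    //
    // Illustrative shape of the software-pipelined loop (pseudocode only,
    // not literal code from this file):
    //
    //   load + convert row 0, start its a-vector multiplies   // prologue above
    //   for (i = 0; i < 8; ++i) {
    //       finish row i arithmetic (b-vector, butterflies)    // SSE xmm work
    //       load + convert row i+1, start its multiplies       // MMX mm work
    //       store row i to fTempArray                          // movaps below
    //   }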

    movq mm2, [INPR] ;           // x+1 (0) mm2 <= x3 x2 x1 x0
     add TABLE, 128;              // move TABLE to w[32]

    prefetcht0 [TABLEP1 + 3*32] ;// prefetch coeff +1row [w31..24]
     add TABLEP1, 128;           // move TABLEP1 to w[32]

// xmm4 used
    movaps xmm4, xmm3;           // (1b) xmm4 <= a_v[3..0] 
     pshufw mm0, mm2, 00000000b ;// x+1 (0) mm0 <= x0 x0 x0 x0

    movq mm4, [INPR+8] ;         // x+1 (0) mm4 <= x7 x6 x5 x4
     addps xmm6, xmm5;           // (1b) xmm6 + b_v2 <= (b_v0+b_v1) + b_v2
// xmm5 free

    pshufw mm1, mm2, 10101010b  ;// x+1 (0) mm1 <= x2 x2 x2 x2
     pand mm0, mm7;             ;// x+1 (0) mm0 = [ x0 x0 ]*SCALEUP (dword)

    cvtpi2ps xmm0, mm0;         ;// x+1 (0) xmm0 <= [__ __ x0 x0] float
// mm0 free
     pand mm1, mm7;             ;// x+1 (0) mm1 = [ x2 x2 ]*SCALEUP (dword)

// xmm5 used
     movaps xmm5, xmm3;         ;// xmm5 <= a_v[3..0]
// xmm3 free
     add edi, 1;                 // i = i + 1 (loop index)

    prefetcht0 [TABLEP1 + 0*32] ;// x+1 prefetch coeff +1row, [w7..0]
     pshufw mm0, mm4, 00000000b ;// x+1 (0)  mm0 <= x4 x4 x4 x4

    addps xmm7, xmm6;            // (1b) xmm7 + b_v3 <=(b_v0+b_v1+b_v2) + b_v3
// xmm6 free
// xmm7 = complete b-vector ( b_v[3..0] )
     pand mm0, mm7;             ;// x+1 (0) mm0 <= [x4 x4]*SCALEUP (dword)

    prefetcht0 [TABLEP1 + 1*32] ;// x+1 prefetch coeff +1row [w15..8]
     movlhps xmm0, xmm0;        ;// x+1 (0) xmm0 <= [x0 x0 x0 x0] float

// 1) Apply M_T*P_T operator (P_T is implicit with table/element order)
//    (table order has been changed for optimum SSE execution)

    mulps xmm0, [TABLE + 0*16]  ;// x+1 (1a) xmm0 <= a_v0 = [w3..w0] * x0
     cvtpi2ps xmm1, mm1;        ;// x+1 (0) xmm1 <= [__ __ x2 x2] float
// mm1 free

    cvtpi2ps xmm2, mm0;         ;// x+1 (0) xmm2 <= [__ __ x4 x4] float
// mm0 free
     ;//

// output y[4..7] = a[3..0] - b[3..0]; element order must then be reversed!
    subps xmm5, xmm7;       ;// (2a) xmm5 <= a_v[3..0]-b_v[3..0] (y4 y5 y6 y7)
     ;//

    movlhps xmm1, xmm1;         ;// x+1 (0) xmm1 <=[ x2 x2 x2 x2] float
     pshufw mm1, mm4, 10101010b ;// x+1 (0) mm1 <= [x6 x6 x6 x6]
    
    mulps xmm1, [TABLE + 1*16]  ;// x+1 (1a) xmm1 <= a_v1 = [w7..w4] * x2
// output y[3..0] = a[3..0] + b[3..0]
     addps xmm4, xmm7;      ;// (2a) xmm4 <= a_v[3..0]+b_v[3..0] (y3 y2 y1 y0)
// xmm7 free

    movlhps xmm2, xmm2;         ;// x+1 (0) xmm2 <= [x4 x4 x4 x4] float
     pand mm1, mm7;              ;// x+1 (0) mm1 <= [x6 x6]*SCALEUP (dword)

    mulps xmm2, [TABLE + 2*16];  // x+1 (1a) xmm2 <= a_v2 = [w11..w8] * x4
// fix output order, y[4..7] -> y[7..4]
     shufps xmm5, xmm5, 00011011b; // (2a) xmm5 [y7 y6 y5 y4] <- [y4 y5 y6 y7]

    cvtpi2ps xmm3, mm1;           // x+1 (0) xmm3 <=[ __ __ x6 x6] float
// mm1 free
     ;//

// !*@&#@ Visual C++ refuses to align data to 16-byte offset, SIGH

    pshufw mm0, mm2, 01010101b;   // x+1 (0) mm0 <=[x1 x1 x1 x1]
    movaps [OUTR + 0*16], xmm4;   // (2b) store [y3 y2 y1 y0]
//     movups [OUTR + 0*16], xmm4;   // (2b) store [y3 y2 y1 y0]
// xmm4 free

    pshufw mm1, mm2, 11111111b;   // x+1 (0) mm1 <=[x3 x3 x3 x3]
// mm2 free
     pand mm0, mm7;               // x+1 (0) mm0 <=[x1 x1]*SCALEUP (dword)

    movlhps xmm3, xmm3;          ;// x+1 (0) xmm3 <= [x6 x6 x6 x6]
     cvtpi2ps xmm4, mm0;          // x+1 (0) xmm4 <=[__ __ x1 x1] float
// mm0 free

    mulps xmm3, [TABLE + 3*16]   ;// x+1 (1a) xmm3 <= a_v3 = [w15..w12] * x6
    movaps [OUTR + 1*16], xmm5;  ;// (2b) store [y7 y6 y5 y4]
