📄 amf_biquadcascade64_s_render.asm

📁 ADI SHARC DSP 音频算法标准模块库
💻 ASM
字号:

// Copyright(c) 2005 Analog Devices, Inc. All Rights Reserved. ADI Confidential.

// File    : $Id: //depot/development/visualaudio/modules/2.5.0/SHARC/Source/AMF_BiquadCascade64_S_Render.asm#4 $ 
// Part of : VisualAudio V2.5.0 
// Updated : $Date: 2006/10/12 $ by $Author: Fernando $

//    Module Name     : AMF_BiquadCascade64_S_Render.asm 
//    DSP Processor   : Any SIMD SHARC (21161 or newer)
//    Original Author : Tim Stilson    
//    Date               : 1/10/05
//====================================================================================
// Processor resources used:
//  DOUBLE_UP_N1D1_MACS:
//      134 words pmem INTERNAL 
//      cycles = 92 + 5*TickSize + numSections*(16 + TickSize*25)
//  DOUBLE_ACC:
//      137 words pmem INTERNAL 
//      cycles = 92 + 5*TickSize + numSections*(16 + TickSize*28)
//  (SIMD used, stereo dimension)
//====================================================================================

// I'm guessing that the limit for optimizability is 21 cycles in the inner loop in the
// DOUBLE_UP_N1D1_MACS case (5*3 for the macs, +4 for the extra n1/d1 macs,
// and +2 for the moves out of the acc, which are ALU ops), and that it would take a good
// optimizer a bit of work to get it to that level.

#if 1

        /**************************************************************
                
        Date Modified:
                        1/09/05        Tim Stilson, pseudocode
                        1/11/05        Tim Stilson, register allocation, medium-effort optimization, cleanup
                        1/17/05        Tim Stilson, finished verifying 64-bit behavior, cleanup debugging cases

        ****************************************************************************/

#include "processor.h"
#include "AMF_BiquadCascade64_S.h"
#include <asm_sprt.h>

// Build-time algorithm selection: CHOOSE only one of these (set exactly one to 1).
// NOTE that they are not directly compatible with each other, as doubling up the n1/d1 macs
//      implies a different coefficient scaling (only n1, d1 halved instead of all coefs).
//   DOUBLE_UP_N1D1_MACS : the n1 and d1 multiply-accumulates are issued twice in the inner loop
//   DOUBLE_ACC          : the accumulator is doubled once after all the multiply-accumulates
#define DOUBLE_UP_N1D1_MACS 1
#define DOUBLE_ACC 0

// Enforce the "only one" rule at build time: with both options enabled the inner loop
// would assemble both doublings, which matches neither coefficient scaling.
#if DOUBLE_UP_N1D1_MACS && DOUBLE_ACC
#error "DOUBLE_UP_N1D1_MACS and DOUBLE_ACC are mutually exclusive - enable only one"
#endif

// global routines
.global    _AMF_BiquadCascade64_S_Render;

.segment /pm SEG_MOD_FAST_CODE;

//------------------------------------------------------------------------------------
// _AMF_BiquadCascade64_S_Render
//
// Runs a cascade of biquad sections over one block of samples, processing a left/right
// pair per access (PEYEN is set for the duration of the routine).
//
// Inputs (as read by the code below):
//   R4  = pointer to the module structure; the Coefs, State1, State2 and NumSections
//         fields are read from it
//   R8  = pointer to the buffers array: [0] input, [1] output, [2] scratch
//   R12 = tick size (iteration count of the input, filtering and output loops)
//
// Saved on entry and restored on exit:
//   mode1, R9, R10, R14, I0-I3, I9, M0, B0, B3, B9, MR2F/MR1F/MR0F
// Written without being saved:
//   R0, R1, R2, R4, R8, I4, B4, M4; L9, L3 and L4 are set and then zeroed on exit
//
// NOTE(review): there is no guard on R12 or NumSections before they are loaded into
//   lcntr - confirm that callers never pass zero.
// NOTE(review): each filtering iteration leaves I3 and I4 advanced by a net 4 words inside
//   their 8-word circular state buffers, and the state pointers are re-read from the
//   structure on every call; this looks like it assumes an even tick size so that the
//   newest/oldest state slots line up between calls - confirm.
//------------------------------------------------------------------------------------
_AMF_BiquadCascade64_S_Render:

    // Save everything that is restored in the epilogue (16 words in total).
    // I0-I3, M0, B0, B3 and the MRF parts are copied through R0 before being pushed;
    // I9 and B9 are pushed directly.
    puts = mode1;
    puts = R9;
    puts = R10;
    puts = R14;  // CAN DELETE if not using DOUBLE_ACC case (the matching gets(13) must go too)
    R0 = I0; puts = R0;
    R0 = I1; puts = R0;
    R0 = I2; puts = R0;
    R0 = I3; puts = R0;
    puts = I9;
    R0 = M0; puts = R0;
    R0 = B0; puts = R0;
    R0 = B3; puts = R0;
    puts = B9;
    R0 = MR2F; puts = R0;
    R0 = MR1F; puts = R0;
    R0 = MR0F; puts = R0;

#if 0
regs used so far
I9, B9, L9    coef ptrs
I0, B0        scratch ptr
I2            other scratch ptr
I1            output ptr
I3            input ptr (input state ptr after first loop)
B3            input states base pointer
L3            input states circular buffer length
I4            setup ptr (output states ptr after initial setup)
B4            output states base pointer
L4            output states circular buffer length
M4            2
M0            -2 (4 in last loop)
R0            temp
R1            temp (low state)
R2            temp (high state)
R4            1 (used to shift right in the multiplication)
R8            coefficients
R9            rsch (high input), numSections before outer loop
R10            rscl (low input)
R12            tick size
R14            0x7fffffff  (MAY BE UNUSED, depending on final solution to acc *= 2 problem)
MRF            accumulator
#endif

////////////////////
//
// Overview:
//
// The 32-bit float input array is converted to 64-bit fixed-point in the first loop; the result is written to the scratch array.
// The next loop (quads) is the outer loop, which steps through each biquad section.
// The inner loop (filtering) implements a single biquad: 64-bit input from the scratch array, 64-bit output back to the scratch array.
//     Each biquad has four 64-bit states (two input, two output) and five 32-bit coefficients (n0, n1, n2, d1, d2, read in that order).
// The last loop converts the scratch array back to 32-bit float and writes it to the output array.
//
////////////////////

// scratch layout: l/r simd pairs, these grouped into double-precision pairs, low-order first


    I4 = R4;                /* Read structure pointer */
    
    //R12 = R12;              /* Number of points is already in R12 */

    R0 = DM(AMF_BiquadCascade64_S_Coefs,I4);    // pointer to biquad coefficient array
    B9 = R0;
    L9 = 5;                                     // five coefficients per section
    
    R0 = DM(AMF_BiquadCascade64_S_State1,I4);    // pointer to biquad input states arrays
    B3 = R0;

    R9 = DM(AMF_BiquadCascade64_S_NumSections,I4);   // outer loop count; R9 is reused for the high-order input later
    
    R0 = DM(AMF_BiquadCascade64_S_State2,I4);    // pointer to biquad output states arrays
    B4 = R0;

    I4=R8;                    // I4->*buffers
    
 // initialize input and output samples pointers
    I3=DM(0,I4);            // I3->buffers[0], input
    I1=DM(1,I4);            // I1->buffers[1], output
    B0=DM(2,I4);            // B0->buffers[2], scratch
                            // NOTE(review): the input loop below uses I0 without an explicit load,
                            // so this relies on the B0 write also initializing I0 - confirm


    
    bit set MODE1 PEYEN | ALUSAT;         
    M4 = 2;                 // step of one l/r pair

    ////////// convert input to 64bit
    // simplified algorithm: convert into just the top 32 bits
    // FIXME: extend bits down into low-order 32 bits if desired

    R2 = 31;                                 // scale used by the fix conversion
    R1 = R1 xor R1;                          // clear r1
    lcntr = R12, do inputLoop until lce;
        F0 = DM(I3,M4);                      // read input
        R0 = fix F0 by R2, DM(I0,M4) = R1;   // convert to 32-bit, write low-order zeros
inputLoop:    
        DM(I0,M4) = R0;                      // write high-order


    // reset i/o pointers for filter

    bit set mode1 CBUFEN | BDCST9;                /* Enable circular buffering and broadcast loads */

    I3 = B3;   // restore states pointers
    I4 = B4;
    L3 = 8;    // setup states circular buffers
    L4 = 8;
    M0 = -2;   // step back one l/r pair
    R4 = 1;    // multiplier used for the "shift right 31 bits" MACs
    R14 = 0x7fffffff;   // only read in the DOUBLE_ACC case

    // don't clobber R4 and R14 in these loops

    lcntr=R9, do quads until lce;

        // reset i/o pointers back to front of scratch
        I0=B0;     // read pointer
        I2=B0;     // write pointer (the section's output overwrites its input in place)
                                                                       // load scr, n0[j]
                                    R10=DM(I0,M4);                     // low-order input
                                     R9=DM(I0,M4), R8=PM(I9, M14);     // high-order input, n0 coef load (broadcast)

        lcntr=R12, do filtering until lce;
            // feedforward macs
                                                                       // acc  = mult6432(scrh[i], scrl[i], n0[j]);                         
            R0 = R8*R10 (SUF),       R1=DM(I3,M4);                     // low-order multiply, load low-order state
            MRF = R0*R4 (SSF);                                         // "shift right 31 bits" while MPY'ing into MRF
            MRF = MRF + R8*R9 (SSF), R2=DM(I3,M4), R8=PM(I9,M14);      // high-order multiply, load high-order state and n1 coef (broadcast)

                                                                       // mac6432(acc, instates0h[j], instates0l[j], n1[j]);              
            R0 = R8*R1 (SUF),        R1=DM(I3,M4);                     // low-order multiply, load low-order state
#if DOUBLE_UP_N1D1_MACS
            MRF = MRF + R0*R4 (SSF);                                   // extra n1 MAC, low-order part ("shift right 31 bits")
            MRF = MRF + R8*R2 (SSF);                                   // extra n1 MAC, high-order part (no loads on this pass)
#endif
            MRF = MRF + R0*R4 (SSF);                                   // "shift right 31 bits" while MAC'ing into MRF
            MRF = MRF + R8*R2 (SSF), R2=DM(I3,M0), R8=PM(I9,M14);      // high-order multiply, load high-order state and n2 coef (broadcast)

                                                                       // mac6432(acc, instates1h[j], instates1l[j], n2[j]);
            R0 = R8*R1 (SUF),        R1=DM(I4,M4);                     // low-order multiply, load low-order output state
            MRF = MRF + R0*R4 (SSF), DM(I3,M4)=R10;                   // "shift right 31 bits" while MAC'ing into MRF, instates0l = scrl[i];
            MRF = MRF + R8*R2 (SSF), R2=DM(I4,M4), R8=PM(I9, M14);     // high-order multiply, load high-order output state and d1 coef (broadcast)

            // feedback macs
                                                                       // mac6432(acc, outstates0h[j], outstates0l[j], d1[j]);
            R0 = R8*R1 (SUF),        R1=DM(I4,M4);                     // low-order multiply, load low-order state
#if DOUBLE_UP_N1D1_MACS
            MRF = MRF + R0*R4 (SSF);                                   // extra d1 MAC, low-order part ("shift right 31 bits")
            MRF = MRF + R8*R2 (SSF);                                   // extra d1 MAC, high-order part (no loads on this pass)
#endif
            MRF = MRF + R0*R4 (SSF), DM(I3,M0)=R9;                     // "shift right 31 bits" while MAC'ing into MRF, instates0h=scrh[i];
            MRF = MRF + R8*R2 (SSF), R2=DM(I4,M0), R8=PM(I9, M14);     // high-order multiply, load high-order state and d2 coef (broadcast)

                                                                       // mac6432(acc, outstates1h[j], outstates1l[j], d2[j]);              
            // The two input loads below are unconditional, so the final iteration also reads
            // the pair just past the last scratch entry; that value is never used.
            R0 = R8*R1 (SUF),       R10=DM(I0,M4);                     // low-order multiply, load next low-order input
            MRF = MRF + R0*R4 (SSF), R9=DM(I0,M4);                     // "shift right 31 bits" while MAC'ing into MRF, load next high-order input
            MRF = MRF + R8*R2 (SSF), R8=PM(I9,M14);                    // high-order multiply, load n0 coef for next iter

            // Don't clobber R8, R9, R10 from here to end of filtering loop

            MRF = SAT MRF (SF);                                        // Saturate Acc

#if DOUBLE_ACC
            // Acc *= 2, because of halved coefs, then copy to output and states

            R1 = MR1F;
            R0 = MR0F;
            
            
            // hack to multiply acc by two, argh...
            // "MRF += MRF" => MRF += MRF*1.0 => MRF += MR0F*0x1 (SSF) + MR1F*(1.0-eps) (SSF) + MR1F*eps (SSF)
            R2 = R14*R0 (SUF);        // low-order multiply
            MRF = MRF + R2*R4 (SSF);        
            MRF = MRF + R1*R14 (SSF);
            MRF = MRF + R1*R4 (SSF);
            MRF = SAT MRF (SF);                                        // Saturate again
#endif

            // OPTIMIZE: with a bit of work, some of these moves may be able to be put in parallel w/ stuff at the top of the loop
            // Remember that the moves between the regs and the accs are ALU operations, not memory/move operations            

            // outstates0h[j] = outh[i] = acch;
            // outstates0l[j] = outl[i] = accl;
            R0 = MR0F;
            R1 = MR1F,        DM(I2,M4) = R0; // Output (Scratch), low first
            DM(I2,M4) = R1;                   // Output (Scratch), high
            DM(I4,M4) = R0; // State Variable, low first
filtering:
            DM(I4,M0) = R1; // State Variable, high; the M0 step leaves I4 on the other state pair

        // point to next sets of coefs
        R0 = B9;      // *coefsbase += 5
        R1 = 5;
        R0 = R0 + R1;
        B9 = R0;
        // point to next sets of states
        R0 = B3;      // *inputstatesbase += 8
        R1 = 8;
        R0 = R0 + R1;
        B3 = R0;
        R0 = B4;      // *outputstatesbase += 8 (R1 is still 8)
        R0 = R0 + R1;
quads:
        B4 = R0;


    ////////// convert output from 64bit
    // simplified algorithm: convert just the top 32 bits
    // FIXME: use low-order 32 bits if desired

    I0 = B0; // back to start of scratch
    M0 = 4;  // step of one whole 64-bit l/r sample (low pair + high pair)
    R2 = -31;

        R0 = DM(I0,M4); // dummy move to point to high-order 
        R0 = DM(I0,M0); // first high-order word, read ahead of the loop
    lcntr = R12, do outputLoop until lce;
        F1 = float R0 by R2, R0 = DM(I0,M0);   // convert current word, read the next high-order word
outputLoop:    
        DM(I1,M4) = F1;

    bit clr MODE1 PEYEN;         

    // clear the circular buffer lengths set up above
    L9 = 0;
    L3 = 0;
    L4 = 0;

    // Restore in reverse push order (gets(1) is the last value pushed), then drop the
    // 16 saved words. Each B register is restored before the I register of the same number.
    R0=gets(1); MR0F = R0;
    R0=gets(2); MR1F = R0;
    R0=gets(3); MR2F = R0;
    B9=gets(4);
    B3=gets(5);
    B0=gets(6); 
    M0=gets(7);
    I9=gets(8);
    I3=gets(9); 
    I2=gets(10);
    I1=gets(11); 
    I0=gets(12);
    R14=gets(13);  // MAYBE DELETE if not using DOUBLE_ACC case (together with the matching puts)
    R10=gets(14);
    R9=gets(15);
    mode1=gets(16);
    alter(16);

//------------------------------------------------------------------------------------
_AMF_BiquadCascade64_S_Render.END:
    leaf_exit; // C-rth requires this instead of rts
//------------------------------------------------------------------------------------
    
.endseg;
#endif
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -