📄 amf_biquadcascade64_s_render.asm
字号:
// Copyright(c) 2005 Analog Devices, Inc. All Rights Reserved. ADI Confidential.
// File : $Id: //depot/development/visualaudio/modules/2.5.0/SHARC/Source/AMF_BiquadCascade64_S_Render.asm#4 $
// Part of : VisualAudio V2.5.0
// Updated : $Date: 2006/10/12 $ by $Author: Fernando $
// Module Name : AMF_BiquadCascade64_S_Render.asm
// DSP Processor : Any SIMD SHARC (21161 or newer)
// Original Author : Tim Stilson
// Date : 1/10/05
//====================================================================================
// Processor resources used:
// DOUBLE_UP_N1D1_MACS:
// 134 words pmem INTERNAL
// cycles = 92 + 5*TickSize + numSections*(16 + TickSize*25)
// DOUBLE_ACC:
// 137 words pmem INTERNAL
// cycles = 92 + 5*TickSize + numSections*(16 + TickSize*28)
// (SIMD used, stereo dimension)
//====================================================================================
// I'm guessing that the limit for optimizability is 21 cycles in the inner loop in the
// DOUBLE_UP_N1D1_MACS case (5*3 for the macs, +4 for the extra n1/d1 macs,
// and +2 for the moves out of the acc, which are ALU ops), and that it would take a good
// optimizer a bit of work to get it to that level.
#if 1
/**************************************************************
Date Modified:
1/09/05 Tim Stilson, pseudocode
1/11/05 Tim Stilson, register allocation, medium-effort optimization, cleanup
1/17/05 Tim Stilson, finished verifying 64-bit behavior, cleanup debugging cases
****************************************************************************/
#include "processor.h"
#include "AMF_BiquadCascade64_S.h"
#include <asm_sprt.h>
// CHOOSE only one of these
// NOTE that they are not directly compatible with each other, as doubling up the n1/d1 macs
// implies a different coefficient scaling (only n1, d1 halved instead of all coefs).
#define DOUBLE_UP_N1D1_MACS 1
#define DOUBLE_ACC 0
// global routines
.global _AMF_BiquadCascade64_S_Render; ;
.segment /pm SEG_MOD_FAST_CODE;
//------------------------------------------------------------------------------------
// _AMF_BiquadCascade64_S_Render
//
// Purpose:
//   Runs a cascade of biquad sections over one block of samples. The signal and the
//   filter states are kept as 64-bit fixed-point values (stored as a low-order word
//   followed by a high-order word); the coefficients are 32-bit. PEYEN is set for the
//   processing, so every access below covers an l/r pair (see the scratch layout note).
//
// Inputs, as read by the code below:
//   R4  - pointer to the module structure; the Coefs, State1, NumSections and State2
//         fields are read from it
//   R8  - pointer to the buffers array: [0] = input, [1] = output, [2] = scratch
//   R12 - number of points; used as the iteration count of the three sample loops
//
// Registers saved on entry and restored on exit:
//   MODE1, R9, R10, R14, I0, I1, I2, I3, I9, M0, B0, B3, B9, MR2F, MR1F, MR0F
// Registers modified but not saved here:
//   R0, R1, R2, R4, R8, R12(read only), I4, B4, M4; L3, L4 and L9 are left at 0.
//   NOTE(review): presumably these are scratch under the calling convention - confirm.
//   NOTE(review): with PEYEN set the PEy-side counterparts of the data registers and
//   of the accumulator are presumably modified as well and are not saved here - confirm
//   against the runtime model.
//
// NOTE(review): neither R12 nor NumSections is checked for zero before being loaded
//   into lcntr - confirm that callers never pass 0 and check the hardware reference
//   for the behaviour of a zero loop count.
// NOTE(review): puts/gets/alter/leaf_exit are presumably the stack macros provided by
//   asm_sprt.h - confirm.
//------------------------------------------------------------------------------------
_AMF_BiquadCascade64_S_Render:
// ---- save state: 16 stack slots in total; the restore at the end mirrors this order ----
puts = mode1;
puts = R9;
puts = R10;
puts = R14; // CAN DELETE if not using DOUBLE_ACC case
// I0-I3, M0, B0 and B3 are copied through R0 before being pushed
R0 = I0; puts = R0;
R0 = I1; puts = R0;
R0 = I2; puts = R0;
R0 = I3; puts = R0;
puts = I9;
R0 = M0; puts = R0;
R0 = B0; puts = R0;
R0 = B3; puts = R0;
puts = B9;
// the three accumulator parts are copied through R0 as well
R0 = MR2F; puts = R0;
R0 = MR1F; puts = R0;
R0 = MR0F; puts = R0;
// The #if 0 section below is the author's register allocation map; it is excluded
// from assembly by the preprocessor.
#if 0
regs used so far
I9, B9, L9 coef ptrs
I0, B0 scratch ptr
I2 other scratch ptr
I1 output ptr
I3 input ptr (input state ptr after first loop)
B3 input states base pointer
L3 input states circular buffer length
I4 setup ptr (output states ptr after initial setup)
B4 output states base pointer
L4 output states circular buffer length
M4 2
M0 -2 (4 in last loop)
R0 temp
R1 temp (low state)
R2 temp (high state)
R4 1 (used to shift right in the multiplication)
R8 coefficients
R9 rsch (high input), numSections before outer loop
R10 rscl (low input)
R12 tick size
R14 0x7fffffff (MAY BE UNUSED, depending on final solution to acc *= 2 problem)
MRF accumulator
#endif
////////////////////
//
// Overview:
//
// The 32-bit float input array is converted to 64-bit fixed-point in the first loop, the output is into the scratch array
// The next loop (quads) is the outer loop, which loops through each biquad
// The inner loop (filtering) implements a single biquad, 64-bit input from the scratch array, 64-bit output back to the scratch array
// The biquads each have four 64-bit states, and five 32-bit coefficients.
// The last loop converts from the scratch array back to 32-bit float, into the output array
//
////////////////////
// scratch layout: l/r simd pairs, these grouped into double-precision pairs, low-order first
// ---- read the module structure (pointer arrives in R4) ----
I4 = R4; /* Read structure pointer */
//R12 = R12; /* Read number of points */
R0 = DM(AMF_BiquadCascade64_S_Coefs,I4); // pointer to biquad coefficient array
// NOTE(review): I9 is never loaded explicitly in this routine, so the code relies on
// the B9 write also setting I9 (likewise B0/I0, B3/I3 and B4/I4 below) - confirm in
// the DAG register documentation.
B9 = R0;
L9 = 5; // five coefficients per section, addressed as a circular buffer
R0 = DM(AMF_BiquadCascade64_S_State1,I4); // pointer to biquad input states arrays
B3 = R0;
R9 = DM(AMF_BiquadCascade64_S_NumSections,I4); // section count; R9 is reused for data once the outer loop has started
R0 = DM(AMF_BiquadCascade64_S_State2,I4); // pointer to biquad output states arrays
B4 = R0;
// the structure pointer is no longer needed; I4 now walks the buffers array passed in R8
I4=R8; // I4->*buffers
// initialize input and output samples pointers
I3=DM(0,I4); // I3->buffers[0], input
I1=DM(1,I4); // I1->buffers[1], output
B0=DM(2,I4); // B0->buffers[2], scratch (I0 is used as the scratch pointer below)
bit set MODE1 PEYEN | ALUSAT; // set the PEYEN and ALUSAT mode bits for the processing below
M4 = 2; // pointer step used for every l/r pair access
////////// convert input to 64bit
// simplified algorithm: convert into just the top 32 bits
// FIXME: extend bits down into low-order 32 bits if desired
R2 = 31; // scaling operand for the fix conversion in the loop
R1 = R1 xor R1; // clear r1
// one pass per point: read a float pair, write a zero low-order word, then the converted high-order word
lcntr = R12, do inputLoop until lce;
F0 = DM(I3,M4); // read input
R0 = fix F0 by R2, DM(I0,M4) = R1; // convert to 32-bit, write low-order zeros
inputLoop:
DM(I0,M4) = R0; // write high-order
// reset i/o pointers for filter
bit set mode1 CBUFEN | BDCST9; /* Enable circular buffering and broadcast loads */
// I3 and I4 change role here: from the input/setup pointers to the state pointers of the first section
I3 = B3; // restore states pointers
I4 = B4;
L3 = 8; // setup states circular buffers
L4 = 8;
M0 = -2; // step-back modifier used inside the filtering loop
R4 = 1; // multiplier used for the "shift right 31 bits" MACs
R14 = 0x7fffffff; // only read in the DOUBLE_ACC variant
// don't clobber R4 and R14 in these loops
// ---- outer loop: one pass per biquad section ----
lcntr=R9, do quads until lce;
// reset i/o pointers back to front of scratch
// I0 reads the section input and I2 writes the section output into the same scratch
// buffer; the read of point i+1 is issued before the write of point i.
I0=B0;
I2=B0;
// load scr, n0[j]
R10=DM(I0,M4); // low-order input
// NOTE(review): M14 is not set anywhere in this routine; presumably it is a runtime
// constant of 1 so that I9 steps through the five coefficients - confirm.
R9=DM(I0,M4), R8=PM(I9, M14); // high-order input, n0 coef load (broadcast)
// ---- inner loop: one pass per point for the current section ----
// On entry to each pass: R10/R9 = low/high input words, R8 = n0.
// The I3 modifiers in one pass sum to +4 (+2,+2,+2,-2,+2,-2), as do the I4 modifiers;
// with L3 = L4 = 8 the two state slots therefore swap roles from one point to the
// next (assuming standard circular addressing), so the new sample overwrites the
// oldest state without any copying.
lcntr=R12, do filtering until lce;
// feedforward macs
// acc = mult6432(scrh[i], scrl[i], n0[j]);
R0 = R8*R10 (SUF), R1=DM(I3,M4); // low-order multiply, load low-order state
MRF = R0*R4 (SSF); // "shift right 31 bits" while MPY'ing into MRF
MRF = MRF + R8*R9 (SSF), R2=DM(I3,M4), R8=PM(I9,M14); // high-order multiply, load high-order state and n1 coef (broadcast)
// mac6432(acc, instates0h[j], instates0l[j], n1[j]);
R0 = R8*R1 (SUF), R1=DM(I3,M4); // low-order multiply, load low-order state
#if DOUBLE_UP_N1D1_MACS
// extra copy of the n1 MAC pair (n1 is stored halved in this variant, see the note at the #define)
MRF = MRF + R0*R4 (SSF); // "shift right 31 bits" while MAC'ing into MRF
MRF = MRF + R8*R2 (SSF); // high-order multiply (no loads on this line)
#endif
MRF = MRF + R0*R4 (SSF); // "shift right 31 bits" while MAC'ing into MRF
MRF = MRF + R8*R2 (SSF), R2=DM(I3,M0), R8=PM(I9,M14); // high-order multiply, load high-order state and n2 coef (broadcast)
// mac6432(acc, instates1h[j], instates1l[j], n2[j]);
R0 = R8*R1 (SUF), R1=DM(I4,M4); // low-order multiply, load low-order output state
MRF = MRF + R0*R4 (SSF), DM(I3,M4)=R10; // "shift right 31 bits" while MAC'ing into MRF, instates0l = scrl[i];
MRF = MRF + R8*R2 (SSF), R2=DM(I4,M4), R8=PM(I9, M14); // high-order multiply, load high-order output state and d1 coef (broadcast)
// feedback macs
// mac6432(acc, outstates0h[j], outstates0l[j], d1[j]);
R0 = R8*R1 (SUF), R1=DM(I4,M4); // low-order multiply, load low-order state
#if DOUBLE_UP_N1D1_MACS
// extra copy of the d1 MAC pair (d1 is stored halved in this variant, see the note at the #define)
MRF = MRF + R0*R4 (SSF); // "shift right 31 bits" while MAC'ing into MRF
MRF = MRF + R8*R2 (SSF); // high-order multiply (no loads on this line)
#endif
MRF = MRF + R0*R4 (SSF), DM(I3,M0)=R9; // "shift right 31 bits" while MAC'ing into MRF, instates0h=scrh[i];
MRF = MRF + R8*R2 (SSF), R2=DM(I4,M0), R8=PM(I9, M14); // high-order multiply, load high-order state and d2 coef (broadcast)
// mac6432(acc, outstates1h[j], outstates1l[j], d2[j]);
// The two loads below fetch the input words of the next point; in the last pass they
// read the pair that follows the final point and the loaded values are not used.
R0 = R8*R1 (SUF), R10=DM(I0,M4); // low-order multiply, load next low-order input
MRF = MRF + R0*R4 (SSF), R9=DM(I0,M4); // "shift right 31 bits" while MAC'ing into MRF, load next high-order input
MRF = MRF + R8*R2 (SSF), R8=PM(I9,M14); // high-order multiply, load n0 coef for next iter
// Don't clobber R8, R9, R10 from here to end of filtering loop
MRF = SAT MRF (SF); // Saturate Acc
#if DOUBLE_ACC
// Acc *= 2, because of halved coefs, then copy to output and states
R1 = MR1F;
R0 = MR0F;
// hack to multiply acc by two, argh...
// "MRF += MRF" => MRF += MRF*1.0 => MRF += MR0F*0x1 (SSF) + MR1F*(1.0-eps) (SSF) + MR1F*eps (SSF)
R2 = R14*R0 (SUF); // low-order multiply
MRF = MRF + R2*R4 (SSF);
MRF = MRF + R1*R14 (SSF);
MRF = MRF + R1*R4 (SSF);
MRF = SAT MRF (SF); // Saturate again
#endif
// OPTIMIZE: with a bit of work, some of these moves may be able to be put in parallel w/ stuff at the top of the loop
// Remember that the moves between the regs and the accs are ALU operations, not memory/move operations
// outstates0h[j] = outh[i] = acch;
// outstates0l[j] = outl[i] = accl;
R0 = MR0F;
R1 = MR1F, DM(I2,M4) = R0; // Output (Scratch), low first
DM(I2,M4) = R1; // Output (Scratch), high-order word
DM(I4,M4) = R0; // State Variable, low first
filtering:
DM(I4,M0) = R1; // State Variable, high-order word; M0 steps I4 back for the next point
// ---- advance the base pointers to the next section ----
// point to next sets of coefs
R0 = B9; // *coefsbase += 5
R1 = 5;
R0 = R0 + R1;
B9 = R0;
// point to next sets of states
R0 = B3; // *inputstatesbase += 8
R1 = 8;
R0 = R0 + R1;
B3 = R0;
R0 = B4; // *outputstatesbase += 8
R0 = R0 + R1; // R1 still holds 8
quads:
B4 = R0;
////////// convert output from 64bit
// simplified algorithm: convert just the top 32 bits
// FIXME: use low-order 32 bits if desired
I0 = B0; // back to start of scratch
M0 = 4; // step from one high-order word to the next (skips the low-order word)
R2 = -31; // scaling operand for the float conversion in the loop
R0 = DM(I0,M4); // dummy move to point to high-order
R0 = DM(I0,M0); // high-order word of the first point
// Each pass converts the word already in R0 while loading the next high-order word;
// the load in the last pass reads beyond the final point and its value is not used.
lcntr = R12, do outputLoop until lce;
F1 = float R0 by R2, R0 = DM(I0,M0);
outputLoop:
DM(I1,M4) = F1; // write the converted pair to the output buffer
// ---- restore state ----
bit clr MODE1 PEYEN;
// put the circular buffer lengths set by this routine back to zero
L9 = 0;
L3 = 0;
L4 = 0;
// gets(n) indices run in the reverse of the push order: 1 = MR0F (pushed last) ... 16 = mode1 (pushed first)
R0=gets(1); MR0F = R0;
R0=gets(2); MR1F = R0;
R0=gets(3); MR2F = R0;
// the B registers are restored before the matching I registers
B9=gets(4);
B3=gets(5);
B0=gets(6);
M0=gets(7);
I9=gets(8);
I3=gets(9);
I2=gets(10);
I1=gets(11);
I0=gets(12);
R14=gets(13); // MAYBE DELETE if not using DOUBLE_ACC case
R10=gets(14);
R9=gets(15);
mode1=gets(16);
alter(16); // release the 16 stack slots used by the saves at the top
//------------------------------------------------------------------------------------
_AMF_BiquadCascade64_S_Render.END:
leaf_exit; // C-rth requires this instead of rts
//------------------------------------------------------------------------------------
.endseg;
#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -