📄 fdct_aan_kc.sc
字号:
//////////////////////////////////////////////////////////////////////////////////////////////////////// Title: fdct_aan_kc.sc (KernelC code for 8x8 FDCT and Quantization)//// Notice: COPYRIGHT (C) STREAM PROCESSORS, INC. 2005-2007// THIS PROGRAM IS PROVIDED UNDER THE TERMS OF THE SPI// END-USER LICENSE AGREEMENT (EULA). THE PROGRAM MAY ONLY// BE USED IN A MANNER EXPLICITLY SPECIFIED IN THE EULA,// WHICH INCLUDES LIMITATIONS ON COPYING, MODIFYING,// REDISTRIBUTION AND WARANTIES. UNAUTHORIZED USE OF THIS// PROGRAM IS STRICTLY PROHIBITED. YOU MAY OBTAIN A COPY OF// THE EULA FROM WWW.STREAMPROCESSORS.COM. // ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #includes ////////////////////////////////////////////////////////////////////////////////////////////////////#include "fdct_aan_kc.h"#include "spi_common.h"////////////////////////////////////////////////////////////////////////////////// Constants////////////////////////////////////////////////////////////////////////////////#define UNPACK_MASK 0xb931a820#define SUBTRACT_128x2 0x01000100#define EXTRACT_MSB 0x05040100#define FIX14_0_382683433 0x187e187e //((int) 6270) // FIX(0.382683433) Q2.14#define FIX14_0_541196100 0x22a322a3 //((int) 8867) // FIX(0.541196100) Q2.14#define FIX14_0_707106781 0x2D412D41 //((int) 11585) // FIX(0.707106781) Q2.14#define FIX14_1_306562965 0x539f539f //((int) 21407) // FIX(1.306562965) Q2.14#define FIX15_0_382683433 0x30FC30FC //((int) 12540) // FIX(0.382683433) Q1.15#define FIX15_0_541196100 0x45464546 //((int) 17734) // FIX(0.541196100) Q1.15#define FIX15_0_707106781 0x5A825A82 //((int) 23170) // FIX(0.707106781) Q1.15#define DIV_CONST_BITS 15#define DIV_CONST_HALF ((1 << DIV_CONST_BITS) >> 1)///////////////////////////////////////////////////////////////////kernel void fdct_and_quantize_aan_kc( // in 16.0 format. The input is organized as a sequence of 8 rows, // where each row is composed of 4 int16x2s. After reading 32 // elements, a whole 8x8 block is in each lane. stream uint8x4 block_in(array_in), // in 16.0 format, similar to the input except that the data IS // TRANSPOSED---i.e., a sequence of 8 columns. stream int16x2 block_out(seq_out), stream int16x2 divisor(seq_in), int32x1 strip_size(in))//// Description: From Pennebaker/Mitchell, pg. 50-52. See also Arai, Agui,// Nakajima. This algorithm is based on the 16-pt DFT. Basically,// the 8-pt DCT can be calculated by scaling the real parts of the// output of the 16-pt DFT.//// Returns: Nothing.//////////////////////////////////////////////////////////////////{ ////////////////////////////////////////////////// // VARIABLE DECLARATIONS ////////////////////////////////////////////////// vec unsigned int8x4 a0, a1, a2, a3, a4, a5, a6, a7; vec unsigned int8x4 a8, a9, a10, a11, a12, a13, a14, a15; vec int16x2 x0, x1, x2, x3, x4, x5, x6, x7; vec int16x2 x8, x9, x10, x11, x12, x13, x14, x15; vec int16x2 x16, x17, x18, x19, x20, x21, x22, x23; vec int16x2 x24, x25, x26, x27, x28, x29, x30, x31; vec int32x1 dummy; vec int16x2 c0, c1, c2, c3, c4, c5, c6, c7; vec int16x2 c8, c9, c10, c11, c12, c13, c14, c15; vec int16x2 c16, c17, c18, c19, c20, c21, c22, c23; vec int16x2 c24, c25, c26, c27, c28, c29, c30, c31; vec int16x2 scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7; vec int16x2 scale8, scale9, scale10, scale11, scale12, scale13, scale14, scale15; vec int16x2 scale16, scale17, scale18, scale19, scale20, scale21, scale22, scale23; vec int16x2 scale24, scale25, scale26, scale27, scale28, scale29, scale30, scale31; int32x1 num_blk, index1, index2, strip_size_words; int32x1 strip_size_wordsx2, strip_size_wordsx3, strip_size_wordsx4; int32x1 strip_size_wordsx5, strip_size_wordsx6, strip_size_wordsx7; spi_read (divisor, scale0); // Read the 64 quantization divisor values into 16x2 vec variables for further use spi_read (divisor, scale1); spi_read (divisor, scale2); spi_read (divisor, scale3); spi_read (divisor, scale4); spi_read (divisor, scale5); spi_read (divisor, scale6); spi_read (divisor, scale7); spi_read (divisor, scale8); spi_read (divisor, scale9); spi_read (divisor, scale10); spi_read (divisor, scale11); spi_read (divisor, scale12); spi_read (divisor, scale13); spi_read (divisor, scale14); spi_read (divisor, scale15); spi_read (divisor, scale16); spi_read (divisor, scale17); spi_read (divisor, scale18); spi_read (divisor, scale19); spi_read (divisor, scale20); spi_read (divisor, scale21); spi_read (divisor, scale22); spi_read (divisor, scale23); spi_read (divisor, scale24); spi_read (divisor, scale25); spi_read (divisor, scale26); spi_read (divisor, scale27); spi_read (divisor, scale28); spi_read (divisor, scale29); spi_read (divisor, scale30); spi_read (divisor, scale31); dummy = 0; num_blk = 0; index1 = 0; index2 = 1; strip_size_words = strip_size + strip_size; // Pre calculated index values to pick the right data from the input. strip_size_wordsx2 = strip_size_words + strip_size_words; // Index values are multiples of strip_size i.e. the index values strip_size_wordsx3 = strip_size_wordsx2 + strip_size_words; // would be 8, 16, 24, 32 etc, if strip_size were 4. This is done strip_size_wordsx4 = strip_size_wordsx3 + strip_size_words; // because data is loaded in strip_size row fashion. Hence, the strip_size_wordsx5 = strip_size_wordsx4 + strip_size_words; // Word0 & Word1(1st two words) are from the first 8x8 block, Word2 strip_size_wordsx6 = strip_size_wordsx5 + strip_size_words; // & Word3 (second two words) are from the second 8x8 block etc. The strip_size_wordsx7 = strip_size_wordsx6 + strip_size_words; // next data for the first 8x8 block would appear after strip_size*2 words. // One loop iteration handles NUM_LANES 8x8 blocks while (num_blk < strip_size) {#if defined (SWP)#pragma pipeline #endif //////////////////////////////////////////////////////////////////////////////////////////////////// // READ : load an 8x8 block from input stream //////////////////////////////////////////////////////////////////////////////////////////////////// spi_array_read (block_in, a0, index1); spi_array_read (block_in, a1, index2); spi_array_read (block_in, a2, (index1 + strip_size_words)); spi_array_read (block_in, a3, (index2 + strip_size_words)); spi_array_read (block_in, a4, (index1 + strip_size_wordsx2)); spi_array_read (block_in, a5, (index2 + strip_size_wordsx2)); spi_array_read (block_in, a6, (index1 + strip_size_wordsx3)); spi_array_read (block_in, a7, (index2 + strip_size_wordsx3)); spi_array_read (block_in, a8, (index1 + strip_size_wordsx4)); spi_array_read (block_in, a9, (index2 + strip_size_wordsx4)); spi_array_read (block_in, a10, (index1 + strip_size_wordsx5)); spi_array_read (block_in, a11, (index2 + strip_size_wordsx5)); spi_array_read (block_in, a12, (index1 + strip_size_wordsx6)); spi_array_read (block_in, a13, (index2 + strip_size_wordsx6)); spi_array_read (block_in, a14, (index1 + strip_size_wordsx7)); spi_array_read (block_in, a15, (index2 + strip_size_wordsx7)); //////////////////////////////////////////////////////////////////////////////////////////////////// // UNPACKING : unpack bytes into half words //////////////////////////////////////////////////////////////////////////////////////////////////// x0 = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a0, dummy); x1 = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a0, dummy); x2 = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a1, dummy); x3 = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a1, dummy); x4 = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a2, dummy); x5 = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a2, dummy); x6 = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a3, dummy); x7 = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a3, dummy); x8 = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a4, dummy); x9 = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a4, dummy); x10 = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a5, dummy); x11 = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a5, dummy); x12 = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a6, dummy); x13 = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a6, dummy); x14 = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a7, dummy); x15 = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a7, dummy); x16 = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a8, dummy); x17 = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a8, dummy); x18 = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a9, dummy); x19 = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a9, dummy); x20 = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a10, dummy); x21 = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a10, dummy); x22 = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a11, dummy); x23 = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a11, dummy); x24 = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a12, dummy); x25 = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a12, dummy); x26 = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a13, dummy); x27 = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a13, dummy); x28 = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a14, dummy); x29 = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a14, dummy); x30 = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a15, dummy); x31 = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a15, dummy); //////////////////////////////////////////////////////////////////////////////////////////////////// // INLINE KERNEL : call fdct8x8_kc() to perform one 8x8 block FDCT //////////////////////////////////////////////////////////////////////////////////////////////////// fdct8x8_and_quantize_aan_kc ( scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7, scale8, scale9, scale10, scale11, scale12, scale13, scale14, scale15, scale16, scale17, scale18, scale19, scale20, scale21, scale22, scale23, scale24, scale25, scale26, scale27, scale28, scale29, scale30, scale31, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, c16, c17, c18, c19, c20, c21, c22, c23, c24, c25, c26, c27, c28, c29, c30, c31 ); //////////////////////////////////////////////////////////////////////////////////////////////////// // WRITE : store the 8x8 block FDCT co-efficients into ouput stream //////////////////////////////////////////////////////////////////////////////////////////////////// spi_write (block_out, c0); spi_write (block_out, c1); spi_write (block_out, c2); spi_write (block_out, c3); spi_write (block_out, c4); spi_write (block_out, c5); spi_write (block_out, c6); spi_write (block_out, c7); spi_write (block_out, c8); spi_write (block_out, c9); spi_write (block_out, c10); spi_write (block_out, c11); spi_write (block_out, c12); spi_write (block_out, c13); spi_write (block_out, c14); spi_write (block_out, c15); spi_write (block_out, c16); spi_write (block_out, c17); spi_write (block_out, c18); spi_write (block_out, c19); spi_write (block_out, c20); spi_write (block_out, c21); spi_write (block_out, c22); spi_write (block_out, c23); spi_write (block_out, c24); spi_write (block_out, c25); spi_write (block_out, c26); spi_write (block_out, c27); spi_write (block_out, c28); spi_write (block_out, c29); spi_write (block_out, c30); spi_write (block_out, c31); index1 = (index1 + 2); index2 = (index2 + 2); num_blk = num_blk + 1; } // end while}///////////////////////////////////////////////////////////////////inline void kernel dct8_2( // Input pixel values (0--255) as int16x2s in 16.0 vec int16x2 in0(in), vec int16x2 in1(in), vec int16x2 in2(in), vec int16x2 in3(in), vec int16x2 in4(in), vec int16x2 in5(in), vec int16x2 in6(in), vec int16x2 in7(in), // Output DCT coefficients in 16.0 vec int16x2 out0(out), vec int16x2 out1(out), vec int16x2 out2(out), vec int16x2 out3(out), vec int16x2 out4(out), vec int16x2 out5(out), vec int16x2 out6(out), vec int16x2 out7(out))// Description:// This function does two 8pt DCTs on each lane, on each half of// the int16x2s. That is, the upper half of the 8 input int16x2s// represent one 8-element array, while the lower halves represent// another 8-element array. This algorithm is from// Pennebaker/Mitchell, pg. 50-52. See also Arai, Agui, Nakajima.// The algorithm is based on the 16-pt DFT. Basically, the 8-pt// DCT can be calculated by scaling the real parts of the output of// the 16-pt DFT.//// Returns: Nothing.//////////////////////////////////////////////////////////////////{ vec int16x2 a0, a1, a2, a3, a4, a5, a6, a7; vec int16x2 b0, b1, b2, b3; vec int16x2 z1, z2, z3, z4, z5, z11, z13; vec int16x2 tmp1; //phase 1 a0 = in0 + in7; a7 = in0 - in7; a1 = in1 + in6; a6 = in1 - in6; a2 = in2 + in5; a5 = in2 - in5; a3 = in3 + in4; a4 = in3 - in4; // even part
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -