📄 fdct_aan_kc.sc

📁 motion Jpeg 在SPI DSP平台优化好的代码
💻 SC
📖 第 1 页 / 共 2 页
字号:
12 下一页
////////////////////////////////////////////////////////////////////////////////////////////////////////      Title:          fdct_aan_kc.sc  (KernelC code for 8x8 FDCT and Quantization)////      Notice:         COPYRIGHT (C) STREAM PROCESSORS, INC. 2005-2007//                      THIS PROGRAM IS PROVIDED UNDER THE TERMS OF THE SPI//                      END-USER LICENSE AGREEMENT (EULA). THE PROGRAM MAY ONLY//                      BE USED IN A MANNER EXPLICITLY SPECIFIED IN THE EULA,//                      WHICH INCLUDES LIMITATIONS ON COPYING, MODIFYING,//                      REDISTRIBUTION AND WARANTIES. UNAUTHORIZED USE OF THIS//                      PROGRAM IS STRICTLY PROHIBITED. YOU MAY OBTAIN A COPY OF//                      THE EULA FROM WWW.STREAMPROCESSORS.COM. //    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////      #includes ////////////////////////////////////////////////////////////////////////////////////////////////////#include "fdct_aan_kc.h"#include "spi_common.h"//////////////////////////////////////////////////////////////////////////////////                                  Constants////////////////////////////////////////////////////////////////////////////////#define UNPACK_MASK         0xb931a820#define SUBTRACT_128x2		0x01000100#define EXTRACT_MSB         0x05040100#define FIX14_0_382683433     0x187e187e       //((int)  6270)    // FIX(0.382683433) Q2.14#define FIX14_0_541196100     0x22a322a3       //((int)  8867)    // FIX(0.541196100) Q2.14#define FIX14_0_707106781     0x2D412D41       //((int)  11585)   // FIX(0.707106781) Q2.14#define FIX14_1_306562965     0x539f539f       //((int)  21407)   // FIX(1.306562965) Q2.14#define FIX15_0_382683433     0x30FC30FC       //((int)  12540)   // FIX(0.382683433) Q1.15#define FIX15_0_541196100     0x45464546       //((int)  17734)   // FIX(0.541196100) Q1.15#define FIX15_0_707106781     0x5A825A82       //((int)  23170)   // FIX(0.707106781) Q1.15#define DIV_CONST_BITS 15#define DIV_CONST_HALF ((1 << DIV_CONST_BITS) >> 1)///////////////////////////////////////////////////////////////////kernel void fdct_and_quantize_aan_kc(    // in 16.0 format.  The input is organized as a sequence of 8 rows,    // where each row is composed of 4 int16x2s.  After reading 32    // elements, a whole 8x8 block is in each lane.    stream uint8x4 block_in(array_in),    // in 16.0 format, similar to the input except that the data IS    // TRANSPOSED---i.e., a sequence of 8 columns.    stream int16x2 block_out(seq_out),    stream int16x2 divisor(seq_in),	int32x1		   strip_size(in))//// Description: From Pennebaker/Mitchell, pg. 50-52.  See also Arai, Agui,//              Nakajima.  This algorithm is based on the 16-pt DFT.  Basically,//              the 8-pt DCT can be calculated by scaling the real parts of the//              output of the 16-pt DFT.////	Returns:		Nothing.//////////////////////////////////////////////////////////////////{	//////////////////////////////////////////////////	//	VARIABLE DECLARATIONS	//////////////////////////////////////////////////    vec unsigned int8x4 a0, a1, a2, a3, a4, a5, a6, a7;    vec unsigned int8x4 a8, a9, a10, a11, a12, a13, a14, a15;    vec int16x2 x0, x1, x2, x3, x4, x5, x6, x7;    vec int16x2 x8, x9, x10, x11, x12, x13, x14, x15;    vec int16x2 x16, x17, x18, x19, x20, x21, x22, x23;    vec int16x2 x24, x25, x26, x27, x28, x29, x30, x31;        vec int32x1 dummy;    vec int16x2 c0, c1, c2, c3, c4, c5, c6, c7;    vec int16x2 c8, c9, c10, c11, c12, c13, c14, c15;    vec int16x2 c16, c17, c18, c19, c20, c21, c22, c23;    vec int16x2 c24, c25, c26, c27, c28, c29, c30, c31;    vec int16x2 scale0,  scale1,  scale2,  scale3,  scale4,  scale5,  scale6,  scale7;    vec int16x2 scale8,  scale9,  scale10, scale11, scale12, scale13, scale14, scale15;    vec int16x2 scale16, scale17, scale18, scale19, scale20, scale21, scale22, scale23;    vec int16x2 scale24, scale25, scale26, scale27, scale28, scale29, scale30, scale31;	int32x1 num_blk, index1, index2, strip_size_words;	int32x1 strip_size_wordsx2, strip_size_wordsx3, strip_size_wordsx4;	int32x1 strip_size_wordsx5, strip_size_wordsx6, strip_size_wordsx7;    spi_read (divisor, scale0);         // Read the 64 quantization divisor values into 16x2 vec variables for further use    spi_read (divisor, scale1);    spi_read (divisor, scale2);    spi_read (divisor, scale3);    spi_read (divisor, scale4);    spi_read (divisor, scale5);    spi_read (divisor, scale6);    spi_read (divisor, scale7);    spi_read (divisor, scale8);    spi_read (divisor, scale9);    spi_read (divisor, scale10);    spi_read (divisor, scale11);    spi_read (divisor, scale12);    spi_read (divisor, scale13);    spi_read (divisor, scale14);    spi_read (divisor, scale15);    spi_read (divisor, scale16);    spi_read (divisor, scale17);    spi_read (divisor, scale18);    spi_read (divisor, scale19);    spi_read (divisor, scale20);    spi_read (divisor, scale21);    spi_read (divisor, scale22);    spi_read (divisor, scale23);    spi_read (divisor, scale24);    spi_read (divisor, scale25);    spi_read (divisor, scale26);    spi_read (divisor, scale27);    spi_read (divisor, scale28);    spi_read (divisor, scale29);    spi_read (divisor, scale30);    spi_read (divisor, scale31);    dummy            = 0;	num_blk		     = 0;	index1		     = 0;	index2		     = 1;	strip_size_words = strip_size + strip_size;                     // Pre calculated index values to pick the right data from the input.	strip_size_wordsx2 = strip_size_words + strip_size_words;       // Index values are multiples of strip_size i.e. the index values  	strip_size_wordsx3 = strip_size_wordsx2 + strip_size_words;     // would be 8, 16, 24, 32 etc, if strip_size were 4.  This is done 	strip_size_wordsx4 = strip_size_wordsx3 + strip_size_words;     // because data is loaded in strip_size row fashion.  Hence, the 	strip_size_wordsx5 = strip_size_wordsx4 + strip_size_words;     // Word0 & Word1(1st two words) are from the first 8x8 block, Word2	strip_size_wordsx6 = strip_size_wordsx5 + strip_size_words;     // & Word3 (second two words) are from the second 8x8 block etc.  The 	strip_size_wordsx7 = strip_size_wordsx6 + strip_size_words;     // next data for the first 8x8 block would appear after strip_size*2 words.    // One loop iteration handles NUM_LANES 8x8 blocks    while (num_blk < strip_size)    {#if defined (SWP)#pragma pipeline #endif	    ////////////////////////////////////////////////////////////////////////////////////////////////////        //      READ : load an 8x8 block from input stream        ////////////////////////////////////////////////////////////////////////////////////////////////////        spi_array_read (block_in, a0, index1);        spi_array_read (block_in, a1, index2);        spi_array_read (block_in, a2, (index1 + strip_size_words));        spi_array_read (block_in, a3, (index2 + strip_size_words));        spi_array_read (block_in, a4, (index1 + strip_size_wordsx2));        spi_array_read (block_in, a5, (index2 + strip_size_wordsx2));        spi_array_read (block_in, a6, (index1 + strip_size_wordsx3));        spi_array_read (block_in, a7, (index2 + strip_size_wordsx3));        spi_array_read (block_in, a8, (index1 + strip_size_wordsx4));        spi_array_read (block_in, a9, (index2 + strip_size_wordsx4));        spi_array_read (block_in, a10, (index1 + strip_size_wordsx5));        spi_array_read (block_in, a11, (index2 + strip_size_wordsx5));        spi_array_read (block_in, a12, (index1 + strip_size_wordsx6));        spi_array_read (block_in, a13, (index2 + strip_size_wordsx6));        spi_array_read (block_in, a14, (index1 + strip_size_wordsx7));        spi_array_read (block_in, a15, (index2 + strip_size_wordsx7));        ////////////////////////////////////////////////////////////////////////////////////////////////////        //      UNPACKING : unpack bytes into half words        ////////////////////////////////////////////////////////////////////////////////////////////////////        x0  = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a0, dummy);        x1  = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a0, dummy);         x2  = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a1, dummy);        x3  = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a1, dummy);        x4  = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a2, dummy);        x5  = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a2, dummy);        x6  = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a3, dummy);        x7  = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a3, dummy);        x8  = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a4, dummy);        x9  = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a4, dummy);        x10 = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a5, dummy);        x11 = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a5, dummy);        x12 = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a6, dummy);        x13 = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a6, dummy);        x14 = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a7, dummy);        x15 = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a7, dummy);        x16 = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a8, dummy);        x17 = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a8, dummy);        x18 = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a9, dummy);        x19 = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a9, dummy);        x20 = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a10, dummy);        x21 = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a10, dummy);        x22 = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a11, dummy);        x23 = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a11, dummy);        x24 = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a12, dummy);        x25 = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a12, dummy);        x26 = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a13, dummy);        x27 = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a13, dummy);        x28 = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a14, dummy);        x29 = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a14, dummy);        x30 = (vec int16x2)spi_vshuffledu_lo (UNPACK_MASK, (vec int32x1)a15, dummy);        x31 = (vec int16x2)spi_vshuffledu_hi (UNPACK_MASK, (vec int32x1)a15, dummy);        ////////////////////////////////////////////////////////////////////////////////////////////////////        //      INLINE KERNEL : call fdct8x8_kc() to perform one 8x8 block FDCT        ////////////////////////////////////////////////////////////////////////////////////////////////////        fdct8x8_and_quantize_aan_kc (            scale0,  scale1,  scale2,  scale3,  scale4,  scale5,  scale6,  scale7,            scale8,  scale9,  scale10, scale11, scale12, scale13, scale14, scale15,            scale16, scale17, scale18, scale19, scale20, scale21, scale22, scale23,            scale24, scale25, scale26, scale27, scale28, scale29, scale30, scale31,            x0, x1, x2, x3, x4, x5, x6, x7,            x8, x9, x10, x11, x12, x13, x14, x15,            x16, x17, x18, x19, x20, x21, x22, x23,            x24, x25, x26, x27, x28, x29, x30, x31,            c0, c1, c2, c3, c4, c5, c6, c7,            c8, c9, c10, c11, c12, c13, c14, c15,            c16, c17, c18, c19, c20, c21, c22, c23,            c24, c25, c26, c27, c28, c29, c30, c31            );        ////////////////////////////////////////////////////////////////////////////////////////////////////        //      WRITE : store the 8x8 block FDCT co-efficients into ouput stream        ////////////////////////////////////////////////////////////////////////////////////////////////////        spi_write (block_out, c0);        spi_write (block_out, c1);        spi_write (block_out, c2);        spi_write (block_out, c3);        spi_write (block_out, c4);        spi_write (block_out, c5);        spi_write (block_out, c6);        spi_write (block_out, c7);        spi_write (block_out, c8);        spi_write (block_out, c9);        spi_write (block_out, c10);        spi_write (block_out, c11);        spi_write (block_out, c12);        spi_write (block_out, c13);        spi_write (block_out, c14);        spi_write (block_out, c15);        spi_write (block_out, c16);        spi_write (block_out, c17);        spi_write (block_out, c18);        spi_write (block_out, c19);        spi_write (block_out, c20);        spi_write (block_out, c21);        spi_write (block_out, c22);        spi_write (block_out, c23);        spi_write (block_out, c24);        spi_write (block_out, c25);        spi_write (block_out, c26);        spi_write (block_out, c27);        spi_write (block_out, c28);        spi_write (block_out, c29);        spi_write (block_out, c30);        spi_write (block_out, c31);		index1 = (index1 + 2);        index2 = (index2 + 2);		num_blk = num_blk + 1;    }  // end while}///////////////////////////////////////////////////////////////////inline void kernel dct8_2(    // Input pixel values (0--255) as int16x2s in 16.0    vec int16x2  in0(in),     vec int16x2  in1(in),    vec int16x2  in2(in),    vec int16x2  in3(in),    vec int16x2  in4(in),     vec int16x2  in5(in),    vec int16x2  in6(in),    vec int16x2  in7(in),    // Output DCT coefficients in 16.0    vec int16x2 out0(out),     vec int16x2 out1(out),    vec int16x2 out2(out),    vec int16x2 out3(out),    vec int16x2 out4(out),     vec int16x2 out5(out),    vec int16x2 out6(out),    vec int16x2 out7(out))// Description://    This function does two 8pt DCTs on each lane, on each half of//    the int16x2s.  That is, the upper half of the 8 input int16x2s//    represent one 8-element array, while the lower halves represent//    another 8-element array. This algorithm is from//    Pennebaker/Mitchell, pg. 50-52.  See also Arai, Agui, Nakajima.//    The algorithm is based on the 16-pt DFT.  Basically, the 8-pt//    DCT can be calculated by scaling the real parts of the output of//    the 16-pt DFT.////	Returns:    Nothing.//////////////////////////////////////////////////////////////////{    vec int16x2 a0, a1, a2, a3, a4, a5, a6, a7;    vec int16x2 b0, b1, b2, b3;    vec int16x2 z1, z2, z3, z4, z5, z11, z13;    vec int16x2 tmp1;        //phase 1    a0 = in0 + in7;    a7 = in0 - in7;    a1 = in1 + in6;    a6 = in1 - in6;    a2 = in2 + in5;    a5 = in2 - in5;    a3 = in3 + in4;    a4 = in3 - in4;    // even part
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -