📄 dct1c.i

📁 H.264完整的C语言代码和DCT的代码
💻 I
字号:
// dct.i
// Ujval Kapasi
// 1/22/97
// 3/28/97
//
// 8x8 DCT (for JPEG and MPEG)
// 
// Test out a fast 1-d dct algorithm for the imagine chip implementation
//   From Pennebaker/Mitchell, pg. 50-52.  See also Arai, Agui, Nakajima.
// This algorithm is based on the 16-pt DFT.  Basically, the 8-pt DCT can
//   be calculated by scaling the real parts of the output of the 16-pt DFT.

// This kernel processes two blocks at the same time, one in each half
//   of the half2 registers

// STUFF TO DO ONLY ONCE -- I.E., OUTSIDE OF LOOP

// DEBUG : ISTREAM 0 : constants stored in VRF until ability to load constants
// -----                onto imagine is implemented in simulator

// Unnecessary : only exist because constants as of yet are not handled

// dct : two-pass 8x8 DCT kernel.
//
//   consts : supplies 16 half2 words, read once before the block loop, in
//            this order: COS_2, COS_3, COS_1_plus_COS_3, COS_1_minus_COS_3,
//            K0..K7, then 4 padding words that are discarded.
//   datain : input samples; every iteration of the first pass reads 8 half2
//            words (a0..a7) from it.
//   out    : receives 8 words per iteration of the second pass.
//
// Pass 1 transforms each group of 8 input words and stores the result in
// buf at offsets index+0 .. index+7.  Pass 2 reads buf with a stride of 8
// (buf[index2], buf[index2+8], ...), applies the same arithmetic, and
// streams the results to out.
// Both passes use loop_count with a counter initialised to 8 (presumably 8
// iterations each, which matches the 64-entry buf -- confirm loop_count
// semantics).
// NOTE(review): out is declared ostream<int> while the values written are
//   hi(...) of half2 products -- confirm the intended element type.
kernel dct(istream<half2> consts,
           istream<half2> datain,
           ostream<int>   out)
{

  // Small integer values are built up by addition instead of being written
  // as literals (presumably because literal constants were not yet
  // supported, as the note above the kernel suggests -- confirm).
  int two = 1 + 1;
  int four = two + two;
  int eight = four + four;
  // NOTE(review): minus_eight is not referenced anywhere else in this
  //   kernel; index below is initialised with "0 - eight" instead.
  int minus_eight = 0 - eight;

  half2 COS_2, COS_3, COS_1_plus_COS_3, COS_1_minus_COS_3;

  // Stored in 2.14 format
  // (reference values for what the consts stream is expected to carry)
  //COS_2             = 0x2d412d41;    // cos(2*pi/8) || cos(2*pi/8);
  //COS_3             = 0x187e187e;    // cos(3*pi/8) || cos(3*pi/8);
  //COS_1_plus_COS_3  = 0x539f539f;    // cos(pi/8) + cos(3*pi/8) || same
  //COS_1_minus_COS_3 = 0x22a322a3;    // cos(pi/8) - cos(3*pi/8) || same

  // First 4 words of consts: the cosine multipliers.
  consts >> COS_2 >> COS_3 >> COS_1_plus_COS_3 >> COS_1_minus_COS_3;

  half2 K0, K1, K2, K3, K4, K5, K6, K7;

  // Stored in 2.14 format
  // (reference values for the per-coefficient output scale factors)
  //K0 = 0x16a116a1           // 0.25 * sqrt(2)       || 0.25 * sqrt(2);
  //K1 = 0x10501050           // 0.25 * sec(pi/16)    || 0.25 * sec(pi/16);
  //K2 = 0x11511151           // 0.25 * sec(2*pi/16)  || 0.25 * sec(2*pi/16);
  //K3 = 0x133e133e           // 0.25 * sec(3*pi/16)  || 0.25 * sec(3*pi/16);
  //K4 = 0x16a116a1           // 0.25 * sec(4*pi/16)  || 0.25 * sec(4*pi/16);
  //K5 = 0x1ccd1ccd           // 0.25 * sec(5*pi/16)  || 0.25 * sec(5*pi/16);
  //K6 = 0x29cf29cf           // 0.25 * sec(6*pi/16)  || 0.25 * sec(6*pi/16);
  //K7 = 0x52035203           // 0.25 * sec(7*pi/16)  || 0.25 * sec(7*pi/16);

  // Next 8 words of consts: the output scale factors.
  consts >> K0 >> K1 >> K2 >> K3 >> K4 >> K5 >> K6 >> K7;

  // have to consume a multiple of 8 words from stream
  // (4 + 8 words were read above; these 4 discarded reads bring it to 16)
  half2 junk;
  consts >> junk >> junk >> junk >> junk;

  array<half2> buf(64);       // intermediate dct output.  ie, do rows then
                              //   store here.  Then index into this
                              //   differently to get the columns

  loop_stream(datain) {       // loop over blocks
    // index is advanced by eight before its first use in pass 1, so it
    // starts one step below zero.  index2 starts at zero and is advanced
    // by one after the reads in pass 2.
    int index = 0 - eight;
    int index2 = 0 + 0;

    // Separate counters for the two passes, both initialised to 8.
    uc<int> i = 8;
    uc<int> i2 = 8;
    // ---- Pass 1: transform 8 input words at a time into buf ----
    loop_count(i) pipeline(1) {

      half2 a0, a1, a2, a3, a4, a5, a6, a7;
  
      datain >> a0 >> a1 >> a2 >> a3 >> a4 >> a5 >> a6 >> a7;
  
      half2 s16, s07, s25, s34, s1625, s0734;
  
      // Sums of mirrored input pairs, then sums of those sums.
      s07 = a0 + a7;
      s16 = a1 + a6;
      s25 = a2 + a5;
      s34 = a3 + a4;
      s1625 = s16 + s25;
      s0734 = s07 + s34;
      // 12 OPS (count double because we are using half2's)
  
      half2 d16, d07, d25, d34, d1625, d0734;
  
      // Differences of mirrored input pairs; d1625 and d0734 are
      // differences of the pair sums computed above.
      d07 = a0 - a7;
      d16 = a1 - a6;
      d25 = a2 - a5;
      d34 = a3 - a4;
      d1625 = s16 - s25;
      d0734 = s07 - s34;
      // 12 OPS
  
      half2 sd16d07, sd25d34;
  
      sd16d07 = d07 + d16;
      sd25d34 = d25 + d34;
      // 4 OPS
  
      half2 m1_over_2, m2, m5, m6, m7, m8, m9;

      // All results in 16.0
      // Each product pre-shifts its operand by `two` and keeps hi() of the
      // multiply with a 2.14 constant (presumably a left shift by 2 so the
      // upper half of the product is back in 16.0 -- confirm the shift/hi
      // semantics).
      m1_over_2 = s0734 + s1625;
      m2 = s0734 - s1625;
      m5 = hi(COS_2 * shift(d1625 + d0734, two));
      m6 = hi(COS_2 * shift(d25 + d16, two));
      m7 = hi(COS_3 * shift(sd16d07 - sd25d34, two));
      m8 = hi((COS_1_plus_COS_3) * shift(sd16d07, two));
      m9 = hi((COS_1_minus_COS_3) * shift(sd25d34, two));
      // 30 OPS
  
      half2 s5, s6, s7, s8;
  
      s5 = d07 + m6;
      s6 = d07 - m6;
      s7 = m8 - m7;
      s8 = m9 - m7;
      // 8 OPS
  
      // All results in 16.0
      // Advance to this iteration's 8-entry slot in buf, then store the
      // eight results, each scaled by its K factor.
      index = index + eight;
      buf[0+index] = hi(K0 * shift(m1_over_2, two));
      buf[1+index] = hi(K1 * shift(s5 + s7, two));
      buf[2+index] = hi(K2 * shift(d0734 + m5, two));
      buf[3+index] = hi(K3 * shift(s6 - s8, two));
      buf[4+index] = hi(K4 * shift(m2, two));
      buf[5+index] = hi(K5 * shift(s6 + s8, two));
      buf[6+index] = hi(K6 * shift(d0734 - m5, two));
      buf[7+index] = hi(K7 * shift(s5 - s7, two));
      // 44 OPS
  
      // TOTAL : 110 per loop iter (same for next loop also)
    }
  
    // do the columns now
    // ---- Pass 2: same arithmetic as pass 1, reading buf with stride 8 ----
    // NOTE(review): a0..a7 and every temporary used below are declared
    //   inside the braces of the first loop above and are reused here
    //   without being redeclared.  Under C/C++ block scoping this would not
    //   compile -- confirm that the kernel compiler keeps them visible.
    loop_count(i2) pipeline(1) {

      // Gather entry index2 from each of the 8 slots written by pass 1.
      a0 = buf[0+index2];
      a1 = buf[8+index2];
      a2 = buf[16+index2];
      a3 = buf[24+index2];
      a4 = buf[32+index2];
      a5 = buf[40+index2];
      a6 = buf[48+index2];
      a7 = buf[56+index2];
      index2 = index2 + 1;
  
      s07 = a0 + a7;
      s16 = a1 + a6;
      s25 = a2 + a5;
      s34 = a3 + a4;
      s1625 = s16 + s25;
      s0734 = s07 + s34;
  
      d07 = a0 - a7;
      d16 = a1 - a6;
      d25 = a2 - a5;
      d34 = a3 - a4;
      d1625 = s16 - s25;
      d0734 = s07 - s34;
  
      sd16d07 = d07 + d16;
      sd25d34 = d25 + d34;

      m1_over_2 = s0734 + s1625;
      m2 = s0734 - s1625;
      m5 = hi(COS_2 * shift(d1625 + d0734, two));
      m6 = hi(COS_2 * shift(d25 + d16, two));
      m7 = hi(COS_3 * shift(sd16d07 - sd25d34, two));
      m8 = hi((COS_1_plus_COS_3) * shift(sd16d07, two));
      m9 = hi((COS_1_minus_COS_3) * shift(sd25d34, two));
  
      s5 = d07 + m6;
      s6 = d07 - m6;
      s7 = m8 - m7;
      s8 = m9 - m7;

      // Emit the eight scaled results for this iteration, in the same
      // K0..K7 order that pass 1 used when storing to buf.
      out << hi(K0 * shift(m1_over_2, two));
      out << hi(K1 * shift(s5 + s7, two));
      out << hi(K2 * shift(d0734 + m5, two));
      out << hi(K3 * shift(s6 - s8, two));
      out << hi(K4 * shift(m2, two));
      out << hi(K5 * shift(s6 + s8, two));
      out << hi(K6 * shift(d0734 - m5, two));
      out << hi(K7 * shift(s5 - s7, two));
    }
  }
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -