📄 jpege_vlc_kc.sc
字号:
abs_coef = spi_vabd32i (diff_dc_coef, 0); code_mag = spi_vselect32(spi_vle32i((vec int32x1)0, diff_dc_coef), diff_dc_coef, (diff_dc_coef - 1)); // code_mag = (diff_dc_coef >= 0) ? diff_dc_coef : (diff_dc_coef - 1); // Find number of bits needed for magnitude of the coefficient. num_bits = 32 - spi_vffone32 (abs_coef); // Write huffman coded categorie spi_array_read(dc_huffman_table,huffman_entry,num_bits); // huffman_entry = dc_huffman_table[num_bits]; code_length = spi_vshuffledi_hi (half_word_hi_lo, huffman_entry, tmp); // top 16 bits code_word = spi_vshuffledi_lo (half_word_hi_lo, huffman_entry, tmp); // bottom 16 bits //cur_word_position = start_index | (no_prev_blk_bits << 16) ; // This line implemented as shuffle for optimization cur_word_position = spi_vshuffleu(0x05040100, start_index, no_prev_blk_bits); cur_word = prev_blk_bits ; // Merge code_word of length code_length & code_mag of length num_bits into one word to be written to the bitstream code_word = (code_word << num_bits) | (code_mag & ((vec uint32x1)0xFFFFFFFF >> ((vec uint32x1)32 - num_bits))); code_length = code_length + num_bits; write_bits (bitstream, cur_word_position, cur_word_position, cur_word, cur_word, code_word, code_length, false);}inline void kernel huffman_encode ( vec int32x1 zz_0(in), vec int32x1 zz_63(in), stream int32x1 run_level(array_io), vec uint32x1 run_level_size(in), stream uint32x1 ac_huffman_table(array_in), stream uint32x1 bitstream(array_io), stream int32x1 prev_block_data_strm(array_io), vec uint32x1 escape_code_length(in), vec uint32x1 escape_code_word(in), vec uint32x1 eob_code_length(in), vec uint32x1 eob_code_word(in), vec uint32x1 cur_word_in(in), vec uint32x1 cur_word_position_in(in), vec uint32x1 last_word(out), vec uint32x1 last_word_pos(out) ) // Description: // Each lane operates on one 8x8 block. // There are no more than 64 run-level pairs per block. // All the run -level pairs are Huffman coded & then written to the bitstream. // Each word to be written to the bistream is first chacked for "FF" sequence & // stuffed with "00" as an emulation prevention measure // ////////////////////////////////////////////////////////////////{ vec uint32x1 code_mag; // code of the magnitute vec uint32x1 num_bits; vec uint32x1 huffman_entry; vec uint32x1 code_length; vec uint32x1 code_word; vec uint32x1 run_level_i; vec int32x1 run, level, updated_run; vec uint32x1 abs_level; vec uint32x1 table_offset; vec uint32x1 cur_word_position; vec uint32x1 cur_word; vec uint32x1 utmpv, utmpv1; vec int32x1 tmp; vec int32x1 i; vec uint32x1 half_word_hi_lo; half_word_hi_lo = 0xb9b93120; cur_word = cur_word_in; cur_word_position = cur_word_position_in; // ---- Handle ac_coefficients ----- // Loop over run_level until coding for all the lanes is done i = 0; tmp = 0; utmpv1 = 0; spi_array_read (run_level, run_level_i, i); //run_level_i = run_level[0]; utmpv = spi_vselect32(spi_vle32u(run_level_size, i), 0, 1); //utmpv = (run_level_size == 0) ? 0 : 1; while ( spi_vrorl(utmpv) != 0 ) {#if defined (SWP)#pragma pipeline #endif // Lanes for which run-level encoding has finished, initialse the run & level to 0 run_level_i = spi_vselect32(spi_vle32u(run_level_size, i), 0, run_level_i); run = spi_vshuffledi_hi (half_word_hi_lo, run_level_i, tmp); // top 16 bits level = spi_vshuffledi_lo (half_word_hi_lo, run_level_i, tmp); // bottom 16 bits // If run > 16 in previous iteration, update the run with recalculated one run = spi_vselect32(utmpv1, updated_run, run); // check if run > 15 utmpv1 = spi_vlt32u(15, run); // store the modified run to use it in next iteration updated_run = run - 16; // if run > 15, we'll be inserting an escape code, hence change the run to 0 so that it points to a valid location in huffman table run = spi_vselect32(utmpv1, 0, run); utmpv = spi_veq32(level, 0); //level == 0 abs_level = spi_vabd32i (level, 0); code_mag = spi_vselect32(spi_vle32i(0, level), level, level - 1); // code_mag = (level >= 0) ? level : level - 1; num_bits = spi_vselect32(utmpv, 0, (32 - spi_vffone32 (abs_level))); // num_bits = utmpv ? 0 : (32 - spi_vffone32 (abs_level)); table_offset = run << 4; table_offset = table_offset + num_bits; spi_array_read(ac_huffman_table, huffman_entry, table_offset); code_length = spi_vselect32(utmpv, 0, spi_vshuffledi_hi (half_word_hi_lo, huffman_entry, tmp)); //code_length = (level == 0) ? 0 : huffman_entry >> 16; code_word = spi_vshuffledi_lo (half_word_hi_lo, huffman_entry, tmp); // bottom 16 bits //code_word = huffman_entry & MASK_16BIT // Merge code_word of length code_length & code_mag of length num_bits into one word to be written to the bitstream code_word = (code_word << num_bits) | (code_mag & ((vec uint32x1)0xFFFFFFFF >> ((vec uint32x1)32 - num_bits))); code_length = code_length + num_bits; // If run > 15 we beed to write the escape code word code_length = spi_vselect32(utmpv1, escape_code_length, code_length); code_word = spi_vselect32(utmpv1, escape_code_word, code_word); write_bits (bitstream, cur_word_position, cur_word_position, cur_word, cur_word, code_word, code_length, false); // Don't increment the pointer if run > 15 i = spi_vselect32(utmpv1, i, i + 1); spi_array_read (run_level, run_level_i, i); utmpv = spi_vselect32(spi_vle32u(run_level_size, i), 0, 1); //utmpv = (run_level_size == i) ? 0 : 1; } // If the last coef is zero, emit an end-of-block code utmpv = zz_63; code_length = spi_vselect32(spi_veq32(utmpv, 0), eob_code_length, 0); write_bits (bitstream, cur_word_position, cur_word_position, cur_word, cur_word, eob_code_word, code_length, false); // Save the dc_coef of the prev block to be used as the init_dc_coef for thr next block //cur_dc_coef = zz_0; spi_array_write (prev_block_data_strm, zz_0, 0); // Store the incomplete word & no of bits in that to be patched up with the next block spi_array_write (prev_block_data_strm, cur_word, 1); spi_array_write (prev_block_data_strm, (cur_word_position >> 16), 2); last_word = cur_word; last_word_pos = cur_word_position; }inline void kernel calc_run_level( stream uint32x1 run_level(array_io), vec int32x1 level(in), vec int32x1 run_in(in), vec int32x1 run_out(out), vec uint32x1 num_non_zero_in(in), vec uint32x1 num_non_zero_out(out), vec uint32x1 pack(in) ) // Description: // If level = 0, run is inceremented. // If level != 0, run-level pair is written to the stream "run_level" // & the index into the stream num_non_zero_in is incremented. // ////////////////////////////////////////////////////////////////{ vec uint32x1 tmp0; // Pack run and level into one 32 bit word where run is top 16 bit, level to bottom 16bits tmp0 = spi_vshufflei (pack, run_in, level); spi_array_write (run_level, tmp0, num_non_zero_in); tmp0 = spi_veq32(level, (vec uint32x1)0); run_out = spi_vselect32(tmp0, (run_in + (vec int32x1)1), (vec uint32x1)0); num_non_zero_out = spi_vselect32(tmp0, num_non_zero_in, (num_non_zero_in + (vec uint32x1)1));}kernel void jpege_vlc_kc ( stream int16x2 coefs (seq_in), // Transformed & quantized co-efficients as input stream uint32x1 dc_huffman_table(array_in), // Each huffman table entry is a 32 bit word. // The high 16 bit is the code length, the low 16 bit is the code word // dc_huffman_table has a total of 12 entries. stream uint32x1 ac_huffman_table(array_in), // ac_huffman table has a total of 160 entries stream uint32x1 bitstream (array_io), // Ouput bitstream stream int32x1 prev_block_data_strm(array_io), // This stream consists of prev dc coeff, prev incomplete word, // & no. of bits in prev incomplete word stream uint32x1 prev_bitstream_offset_strm(array_io), // This bitstream points to the last word position of the previous block stream uint32x1 next_bitstream_offset_strm(array_io), // This bitstream is updated to pint to the last word position of current block stream uint32x1 run_level(array_io), // Used as temporary storage for runs & levels of each block // The high 16 bit is the run, the low 16 bit is the level uint32x1 last_iter_in_row(in) // This flag is true if we are encoding the last strip is a row ) // Description: Each lane operates on STRIP_SIZE number of 8x8 blocks. // // 1. Transformed & quantized input coefficients are first stored in zig-zag scan order // 2. runs & levels are calculated // 3. This is followed by huffamn encoding of differential DC coeffient & all the runs & levels in the block // 4. Each word to be written to the bitstream is searched for "FF" bit pattern & stuffed with "00" : Emulation prevention procedure // ////////////////////////////////////////////////////////////////{ vec int16x2 c0c8, c16c24, c32c40, c48c56; vec int16x2 c1c9, c17c25, c33c41, c49c57; vec int16x2 c2c10, c18c26, c34c42, c50c58; vec int16x2 c3c11, c19c27, c35c43, c51c59; vec int16x2 c4c12, c20c28, c36c44, c52c60; vec int16x2 c5c13, c21c29, c37c45, c53c61; vec int16x2 c6c14, c22c30, c38c46, c54c62; vec int16x2 c7c15, c23c31, c39c47, c55c63; vec int32x1 zz_0, zz_1, zz_2, zz_3, zz_4, zz_5, zz_6, zz_7, zz_8, zz_9; vec int32x1 zz_10, zz_11, zz_12, zz_13, zz_14, zz_15, zz_16, zz_17, zz_18, zz_19; vec int32x1 zz_20, zz_21, zz_22, zz_23, zz_24, zz_25, zz_26, zz_27, zz_28, zz_29; vec int32x1 zz_30, zz_31, zz_32, zz_33, zz_34, zz_35, zz_36, zz_37, zz_38, zz_39; vec int32x1 zz_40, zz_41, zz_42, zz_43, zz_44, zz_45, zz_46, zz_47, zz_48, zz_49; vec int32x1 zz_50, zz_51, zz_52, zz_53, zz_54, zz_55, zz_56, zz_57, zz_58, zz_59; vec int32x1 zz_60, zz_61, zz_62, zz_63; vec uint32x1 hi_lo_bytes; vec uint32x1 half_word_hi_lo; vec int32x1 tmp0; vec uint32x1 utmpv; vec uint32x1 num_non_zero; vec int32x1 run; vec uint32x1 pack; vec uint32x1 write_index, store_index; vec uint32x1 last_word, last_word_pos, last_word_bits; vec uint32x1 cur_word, cur_word_position; vec uint32x1 data, num_bits; vec uint32x1 reset_marker; vec uint32x1 escape_code_word; vec uint32x1 escape_code_length; vec uint32x1 eob_code_word; vec uint32x1 eob_code_length; hi_lo_bytes =0x9B9B1302; half_word_hi_lo = 0xb9b93120; write_index = 0; tmp0 = 0; // Read the escape code word upfront since it is required for coding all the blocks spi_array_read(ac_huffman_table, utmpv, 0xf0); escape_code_length = spi_vshuffledi_hi (half_word_hi_lo, utmpv, tmp0); // top 16 bits escape_code_word = spi_vshuffledi_lo (half_word_hi_lo, utmpv, tmp0); // bottom 16 bits // Read the end of block code word upfront since it is required for coding all the blocks spi_array_read(ac_huffman_table, utmpv, 0); eob_code_length = spi_vshuffledi_hi (half_word_hi_lo, utmpv, tmp0); // top 16 bits eob_code_word = spi_vshuffledi_lo (half_word_hi_lo, utmpv, tmp0); // bottom 16 bits while (!spi_eos(coefs)) { // each loop handles STRIP_SIZE number of blocks per lane. // read coefficients of one block. These coefficients are in transposed order of the original block after dct. spi_read (coefs, c0c8); spi_read (coefs, c16c24); spi_read (coefs, c32c40); spi_read (coefs, c48c56); spi_read (coefs, c1c9); spi_read (coefs, c17c25); spi_read (coefs, c33c41); spi_read (coefs, c49c57); spi_read (coefs, c2c10); spi_read (coefs, c18c26); spi_read (coefs, c34c42); spi_read (coefs, c50c58); spi_read (coefs, c3c11); spi_read (coefs, c19c27); spi_read (coefs, c35c43); spi_read (coefs, c51c59); spi_read (coefs, c4c12); spi_read (coefs, c20c28); spi_read (coefs, c36c44); spi_read (coefs, c52c60); spi_read (coefs, c5c13); spi_read (coefs, c21c29); spi_read (coefs, c37c45); spi_read (coefs, c53c61); spi_read (coefs, c6c14); spi_read (coefs, c22c30); spi_read (coefs, c38c46); spi_read (coefs, c54c62); spi_read (coefs, c7c15); spi_read (coefs, c23c31); spi_read (coefs, c39c47); spi_read (coefs, c55c63);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -