📄 encode_frame_sc.sc
字号:
spi_printf ("Error: Malloc of prev_blk_bits failed. \n"); SPI_ASSERT(1); } if ((kernel_output_bitcount = (unsigned int *) spi_malloc (iterations_per_frame * SPI_LANES * sizeof (unsigned int))) == NULL) { spi_printf ("Error: Malloc of kernel_output_bitcount failed. \n"); SPI_ASSERT(1); } for (i = 0; i < SPI_LANES; i++) { bitstream_offset[i] = i * p_comp->width_in_blocks * BLOCK_BIT_BUFFER_SIZE; } init[0] = init[1] = init[2] = 0; //////////////////////////////////////////////////////////////////////////////////////////////////// // Indice Generation //////////////////////////////////////////////////////////////////////////////////////////////////// // // Data in the form of 8x8 blocks is to be loaded into 'block_strm' from the input image / input bitstream. The // input image is visualized as rows of 8x8 blocks and each lane independently processes one such row of 8x8 blocks. // Indices are calculated such that every row in the input image would have an index and STRIP_SIZE * BLOCK_WIDTH // number of pixels are loaded from each index. The indices are ordered such that an entire STRIP_SIZE row of // 8x8 blocks get loaded into each lane. Number of blocks processed in one kernel call = STRIP_SIZE * SPI_LANES, // where each lane processes one row of 8x8 blocks in the image. // Hence (BLOCK_HEIGHT * SPI_LANES) indices are calculated for one kernel call. Since the location of the indices // is constant w.r.t. location of the base pointer, only the offset inside the image is changed for each kernel call. // Since the height of the image may not always be a multiple of SPI_LANES # 8x8 blocks, another set of indices is // needed for the last iteration, when there are no more rows_of_8x8_blocks remaining in the input image that can be // loaded into different lanes. In this case the indices into the unused lanes are set to 0. // When the input image height is not a multiple of BLOCK_HEIGHT, to avoid re-allocation & padding of the input image, // previous row indice is reused, which serves the purpose of padding in the last iteration indices. count = 0; for (i = 0; i < BLOCK_HEIGHT; i++) { index = i * p_comp->scaled_width; for (k =0; k < SPI_LANES; k++) { p_input_index[count] = index + k * BLOCK_HEIGHT * p_comp->scaled_width; if (p_input_index[count] >= p_comp->scaled_width * p_comp->scaled_height) // In case the image is small { p_input_index[count] = 0; } cur_height_loc = (((iterations_per_frame - 1) * SPI_LANES) * BLOCK_HEIGHT); cur_height_loc += ((k * BLOCK_HEIGHT) + i); // Calculates the last iteration location in terms of rows in input image. if (cur_height_loc >= p_comp->actual_height) // Set index offsets for the last iteration { if (cur_height_loc < p_comp->scaled_height) // Special condition when input image height is not a multiple of 8, { // reuse previous row index (padding). p_input_index[count + INDICES_PER_KERNEL] = p_input_index[count + INDICES_PER_KERNEL - SPI_LANES]; } else // Location is outside image boundary, set to zero. { p_input_index[count + INDICES_PER_KERNEL] = 0; } } else // Location is within image boundary, reuse calculated index { p_input_index[count + INDICES_PER_KERNEL] = p_input_index[count]; } count++; } } //////////////////////////////////////////////////////////////////////////////////////////////////// // AC & DC Huffman Table Generation //////////////////////////////////////////////////////////////////////////////////////////////////// // // DC huffman table (12 words) and AC huffman table (256 words) are organized such that the higher // 16 bits have the code length and the lower 16 bits have the corresponding code word. for (i = 0; i < DERIVED_DC_TABLE_LENGTH; i++) { dc_huffman_table_k[i] = ((unsigned int) p_comp->d_dc_huff_tbl.code_length[i] << 16) | ((unsigned int)p_comp->d_dc_huff_tbl.code_word[i]); } for (i = 0; i < DERIVED_AC_TABLE_LENGTH; i++) { ac_huffman_table_k[i] = ((unsigned int) p_comp->d_ac_huff_tbl.code_length[i] << 16) | ((unsigned int)p_comp->d_ac_huff_tbl.code_word[i]); } spi_flush_entire_data_cache(); //////////////////////////////////////////////////////////////////////////////////////////////////// // LOAD : Data Load //////////////////////////////////////////////////////////////////////////////////////////////////// // Data loads that need to take place, only once per input frame spi_load (dc_huff_table_strm, &dc_huffman_table_k[0], 0, DERIVED_DC_TABLE_LENGTH, 1, 1, 0); // Load DC huffman table into each lane (transpose=0) spi_load (ac_huff_table_strm, &ac_huffman_table_k[0], 0, DERIVED_AC_TABLE_LENGTH, 1, 1, 0); // Load AC huffman table into each lane (transpose=0) spi_load (divisor_strm, p_quant_divisor, 0, 32, 1, 1, 0); // Load Quantization Divisors into each lane (transpose=0) spi_load (bitstream_offset_strm, &bitstream_offset, 0, SPI_LANES, 1, 1, 1); // Initialse the pointers to bitstream. spi_load (index_strm, p_input_index, 0, INDICES_PER_KERNEL, 1, 1, 1); // Load the first set of indices generated. p_src = p_input; cur_strip_size = (p_comp->width_in_blocks > STRIP_SIZE) ? STRIP_SIZE : p_comp->width_in_blocks; last_strip_size = p_comp->width_in_blocks - ((iterations_per_row - 1) * STRIP_SIZE); // Remaining valid number of 8x8 blocks in the current row of 8x8 blocks for (i = 0; i < iterations_per_frame; i++) { p_bitstream = (unsigned int *) p_comp->p_mem_buffer + i * p_comp->width_in_blocks * SPI_LANES * BLOCK_BIT_BUFFER_SIZE_W; strip_size = cur_strip_size; last_iter_in_row = 0; // Data loads & stores that need to take place prior to kernel launch. spi_load (prev_block_data, &init, 0, 3, 1, 1, 0); // Initialise all the three substreams to 0 if (i == (iterations_per_frame - 1)) // Check if current iteration is the on the last few rows of 8x8 blocks of the input image { spi_load (index_strm, p_input_index, INDICES_PER_KERNEL, INDICES_PER_KERNEL, 1, 1, 1); // If so, then load the second set of indices generated. } if (i != 0) { // Store the number of bits created by each lane // This is used to convert the output bitstream from the kernel into a bit-buffer structure spi_store ( next_bitstream_offset_strm, // output stream kernel_output_bitcount, // output buffer ((i-1) * SPI_LANES), // offset SPI_LANES, // count 1, // group 1, // stride 1 // transpose ); } spi_load (next_bitstream_offset_strm, &bitstream_offset, 0, SPI_LANES, 1, 1, 1); // Reset the values in the bitstream for (j = 0; j < iterations_per_row; j++) { if (j == (iterations_per_row - 1)) // Check if the current iteration is last iteration on the row of 8x8 blocks. { last_iter_in_row = 0xFFFFFFF; // 0xFFFFFFF is easier for DPU to understand than "0x1" strip_size = last_strip_size; // Update strip_size to the vaild number of 8x8 blocks only } // Update 'p_src_img_offset' to point the location in the input image where the next valid data needs to be picked. // 'p_src_img_offset' is made to point to the start of the window from which the next valid STRIP_SIZE number of // 8x8 blocks are to fetched from. p_src_img_offset = p_src + (((i * SPI_LANES * BLOCK_HEIGHT * p_comp->scaled_width) + (j * STRIP_SIZE *BLOCK_WIDTH))); // Input image needs to be loaded into the each lanes
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -