📄 rle_kc.cpp
字号:
// all values are now ac values, so don't worry about dc stuff
j = 7;
loop_count(j) {
// get values and check if they == 0
ac_val = half2(zigzag[src_idx]);
ac_zero = itocc(int(ac_val == 0));
// encode run-level pair for the two blocks and store them
run_level = (int(ac_val) & lo_mask) | shift(int(run), 16);
rle_out[lo_idx] = run_level;
run_level = shift(int(ac_val), -16) | (int(run) & hi_mask);
rle_out[hi_idx] = run_level;
// update the run variable, and reset to 0 if level != 0
run = select(ac_zero, (run + h2_one), 0);
// update source data index
src_idx = src_idx + 1;
// update the result data indices if level != 0
lo_idx = select(ac_zero, lo_idx, lo_idx + 1);
ac_zero = itocc(shift(cctoi(ac_zero), -16));
hi_idx = select(ac_zero, hi_idx, hi_idx + 1);
}
// save final value of the run variable for each block
runs[0+run_idx] = int(run) & lo_mask;
runs[1+run_idx] = shift(int(run), -16);
run_idx = run_idx + 2;
// if not all the values were non-zero, store a zero to indicate the end
// of actual run-level pairs for the first block.
all_non_zero = itocc(lo_idx == (data_idx + 8));
rle_out[lo_idx] = select( all_non_zero, int(rle_out[lo_idx]), 0);
// update the dest base data index
data_idx = data_idx + 16;
// if not all the values were non-zero, store a zero to indicate the end
// of actual run-level pairs for the second block.
all_non_zero = itocc(hi_idx == data_idx);
rle_out[hi_idx] = select(all_non_zero, int(rle_out[hi_idx]), 0);
}
// Now, patch up the run-level encoding. This is done by patching up
// the cluster 0/cluster 1 data first, and then the cluster 1/cluster 2
// data, etc. However, it is pipelined so that the patching for all 6
// blocks is done with as much parallelism as possible. Thus, while
// the cluster 1/cluster 2 data is being patched up for the first block,
// the cluster 0/cluster 1 data will be patched up for the second block,
// and so on. The invalid cc tells the cluster when it has a valid run
// from the previous cluster. Thus cluster 7 will be invalid the first
// 6 iterations, and cluster 1 will be invalid the last 6. Cluster 0 is
// always invalid because it doesn't need to have any of its data patched
// up (i.e., there is no previous cluster from which to get a run). The
// invalid cc is passed from cluster to cluster via communication. It
// originates in cluster 1, and cluster 1 sets it to false until it has
// finished all its blocks. It starts as true in every other cluster.
// Eventually the invalid flag will be false in every cluster, and then
// true again as the value cluster 1 sets it to percolates to the rest
// of the clusters.
cc cluster_one; // cluster 1 ??
cc invalid; // false when a cluster has a valid run to work with
cc send_run; // true if clust needs to send its run to next cluster
cc all_zeros; // true if the cluster had all zeros for this block
int true_cc; // = 0xFFFF; used for setting cc's to TRUE
int done; // non-zero when cluster 1 has patched up all blocks
int blocks_left; // how many blocks cluster 1 has left to patch up
int run_token; // run of zeros, passed from cluster to cluster
int orig_run_level; // the run-level calculated above
int new_run_level; // the run-level adjusted with the final run from the
// previous cluster
int block_idx; // index to the blocks, 0,8,16,24,32,40
uc<int> perm_token = 0x65432100; // used to comm. invalid and run_token
// sends value to next cluster
cluster_one = itocc(cid() == 1);
true_cc = -1;
blocks_left = 6; // 8 - (1 + 1);
invalid = itocc(select(cluster_one, 0, true_cc));
send_run = clust_zero;
run_idx = 0; // index into final run values, 1 per block
block_idx = 0;
run_token = runs[0];
i = 12; // 7 patches per block; 6 blocks -> 7 + (6-1) = 12
loop_count(i) {
// get the run-level pair calculated above and see if it equals zero
orig_run_level = rle_out[block_idx];
all_zeros = itocc(select(Iframe, 1, orig_run_level) == 0);
// get the final run of zeros of the previous cluster
run_token = commucperm(perm_token, run_token);
// adjust first run-level pair of this cluster
new_run_level = orig_run_level + shift(run_token, 16);
// and store it if the previous cluster's run was valid; if it wasn't,
// store the original value back again
rle_out[block_idx] = select(invalid, orig_run_level,
select(all_zeros, 0, new_run_level));
// any blocks left?
blocks_left = blocks_left - 1;
done = blocks_left == 0;
// update the idx to the run values and to the block data
run_idx = select(send_run, run_idx + 1, run_idx);
block_idx = select(invalid, block_idx, block_idx + 8);
block_idx = select(clust_zero, block_idx + 8, block_idx);
// if this cluster was all zeros, update run and pass it along, else
// pass the final run of zeros for this cluster to the next
int next_run = runs[run_idx];
run_token = select(all_zeros, select(clust_zero, next_run, run_token + 8), next_run);
// pass the invalid "token" on to the next cluster, and if this is
// cluster 1, mark it true if no more blocks left
invalid = itocc(select(cluster_one, done, commucperm(perm_token, cctoi(invalid))));
// pass the send_run token, except mark it false if cluster 0 and done
send_run = itocc(select(clust_zero, ~done, commucperm(perm_token, cctoi(send_run))));
}
// calculate coded block pattern (cbp)
int cbp, zerorow, allzero;
cc az;
uc<int> perm_combine_1 = 0x77553311;
uc<int> perm_combine_2 = 0x66662222;
uc<int> perm_combine_3 = 0x44444444;
zerorow = (rle_out[0] == 0);
allzero = zerorow & commucperm(perm_combine_1, zerorow);
allzero = allzero & commucperm(perm_combine_2, allzero);
allzero = allzero & commucperm(perm_combine_3, allzero);
az = itocc(allzero);
cbp = select(az, 0, 32);
zerorow = (rle_out[8] == 0);
allzero = zerorow & commucperm(perm_combine_1, zerorow);
allzero = allzero & commucperm(perm_combine_2, allzero);
allzero = allzero & commucperm(perm_combine_3, allzero);
az = itocc(allzero);
cbp = cbp | select(az, 0, 16);
zerorow = (rle_out[16] == 0);
allzero = zerorow & commucperm(perm_combine_1, zerorow);
allzero = allzero & commucperm(perm_combine_2, allzero);
allzero = allzero & commucperm(perm_combine_3, allzero);
az = itocc(allzero);
cbp = cbp | select(az, 0, 8);
zerorow = (rle_out[24] == 0);
allzero = zerorow & commucperm(perm_combine_1, zerorow);
allzero = allzero & commucperm(perm_combine_2, allzero);
allzero = allzero & commucperm(perm_combine_3, allzero);
az = itocc(allzero);
cbp = cbp | select(az, 0, 4);
zerorow = (rle_out[32] == 0);
allzero = zerorow & commucperm(perm_combine_1, zerorow);
allzero = allzero & commucperm(perm_combine_2, allzero);
allzero = allzero & commucperm(perm_combine_3, allzero);
az = itocc(allzero);
cbp = cbp | select(az, 0, 2);
zerorow = (rle_out[40] == 0);
allzero = zerorow & commucperm(perm_combine_1, zerorow);
allzero = allzero & commucperm(perm_combine_2, allzero);
allzero = allzero & commucperm(perm_combine_3, allzero);
az = itocc(allzero);
cbp = cbp | select(az, 0, 1);
// Transpose blocks containing run-level pairs
// calculate cluster dependent send and store indices
int idx0 = cid();
int idx1 = (idx0 - 1) & 7;
int idx2 = (idx1 - 1) & 7;
int idx3 = (idx2 - 1) & 7;
int idx4 = (idx3 - 1) & 7;
int idx5 = (idx4 - 1) & 7;
int idx6 = (idx5 - 1) & 7;
int idx7 = (idx6 - 1) & 7;
array<int> output_array(48);
output_array[0+idx0] = rle_out[0+idx0];
output_array[0+idx7] = commucperm(perm_a, rle_out[0+idx1]);
output_array[0+idx6] = commucperm(perm_b, rle_out[0+idx2]);
output_array[0+idx5] = commucperm(perm_c, rle_out[0+idx3]);
output_array[0+idx4] = commucperm(perm_d, rle_out[0+idx4]);
output_array[0+idx3] = commucperm(perm_e, rle_out[0+idx5]);
output_array[0+idx2] = commucperm(perm_f, rle_out[0+idx6]);
output_array[0+idx1] = commucperm(perm_g, rle_out[0+idx7]);
output_array[8+idx0] = rle_out[8+idx0];
output_array[8+idx7] = commucperm(perm_a, rle_out[8+idx1]);
output_array[8+idx6] = commucperm(perm_b, rle_out[8+idx2]);
output_array[8+idx5] = commucperm(perm_c, rle_out[8+idx3]);
output_array[8+idx4] = commucperm(perm_d, rle_out[8+idx4]);
output_array[8+idx3] = commucperm(perm_e, rle_out[8+idx5]);
output_array[8+idx2] = commucperm(perm_f, rle_out[8+idx6]);
output_array[8+idx1] = commucperm(perm_g, rle_out[8+idx7]);
output_array[16+idx0] = rle_out[16+idx0];
output_array[16+idx7] = commucperm(perm_a, rle_out[16+idx1]);
output_array[16+idx6] = commucperm(perm_b, rle_out[16+idx2]);
output_array[16+idx5] = commucperm(perm_c, rle_out[16+idx3]);
output_array[16+idx4] = commucperm(perm_d, rle_out[16+idx4]);
output_array[16+idx3] = commucperm(perm_e, rle_out[16+idx5]);
output_array[16+idx2] = commucperm(perm_f, rle_out[16+idx6]);
output_array[16+idx1] = commucperm(perm_g, rle_out[16+idx7]);
output_array[24+idx0] = rle_out[24+idx0];
output_array[24+idx7] = commucperm(perm_a, rle_out[24+idx1]);
output_array[24+idx6] = commucperm(perm_b, rle_out[24+idx2]);
output_array[24+idx5] = commucperm(perm_c, rle_out[24+idx3]);
output_array[24+idx4] = commucperm(perm_d, rle_out[24+idx4]);
output_array[24+idx3] = commucperm(perm_e, rle_out[24+idx5]);
output_array[24+idx2] = commucperm(perm_f, rle_out[24+idx6]);
output_array[24+idx1] = commucperm(perm_g, rle_out[24+idx7]);
output_array[32+idx0] = rle_out[32+idx0];
output_array[32+idx7] = commucperm(perm_a, rle_out[32+idx1]);
output_array[32+idx6] = commucperm(perm_b, rle_out[32+idx2]);
output_array[32+idx5] = commucperm(perm_c, rle_out[32+idx3]);
output_array[32+idx4] = commucperm(perm_d, rle_out[32+idx4]);
output_array[32+idx3] = commucperm(perm_e, rle_out[32+idx5]);
output_array[32+idx2] = commucperm(perm_f, rle_out[32+idx6]);
output_array[32+idx1] = commucperm(perm_g, rle_out[32+idx7]);
output_array[40+idx0] = rle_out[40+idx0];
output_array[40+idx7] = commucperm(perm_a, rle_out[40+idx1]);
output_array[40+idx6] = commucperm(perm_b, rle_out[40+idx2]);
output_array[40+idx5] = commucperm(perm_c, rle_out[40+idx3]);
output_array[40+idx4] = commucperm(perm_d, rle_out[40+idx4]);
output_array[40+idx3] = commucperm(perm_e, rle_out[40+idx5]);
output_array[40+idx2] = commucperm(perm_f, rle_out[40+idx6]);
output_array[40+idx1] = commucperm(perm_g, rle_out[40+idx7]);
// output quant_scale
uint qs = commclperm(ucid(), 0, quant_scale);
out(clust_zero) << half2(qs);
// output cbp
out(clust_zero) << half2(cbp);
// output motion vector
int mv, bestval;
motion(Pframe, ccend) >> mv >> bestval; // don't need 'bestval'
out(Pframe) << mv;
// output run-level pairs
i = 6;
int idx = 0;
int tmpout;
cc valid;
loop_count(i) {
tmpout = output_array[idx];
valid = itocc(cctoi(Iframe) | (tmpout > 0));
out(valid) << tmpout;
tmpout = output_array[1+idx];
valid = itocc(tmpout > 0);
out(valid) << tmpout;
tmpout = output_array[2+idx];
valid = itocc(tmpout > 0);
out(valid) << tmpout;
tmpout = output_array[3+idx];
valid = itocc(tmpout > 0);
out(valid) << tmpout;
tmpout = output_array[4+idx];
valid = itocc(tmpout > 0);
out(valid) << tmpout;
tmpout = output_array[5+idx];
valid = itocc(tmpout > 0);
out(valid) << tmpout;
tmpout = output_array[6+idx];
valid = itocc(tmpout > 0);
out(valid) << tmpout;
tmpout = output_array[7+idx];
valid = itocc(tmpout > 0);
out(valid) << tmpout;
idx = idx + 8;
out(clust_zero) << -1;
}
}
flush(out, -2);
// read remaining values in motion stream, if any (should be NULL elements)
int drain;
cc cctrue = itocc(1 == 1);
motion(cctrue, ccend) >> drain;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -