📄 mpeg_kc.cpp
字号:
rle_out[0+store_idx] = int(CrCbY4Y3Y2Y1_b[0+send_idx]);
rle_out[16+store_idx] = int(CrCbY4Y3Y2Y1_b[8+send_idx]);
rle_out[32+store_idx] = int(CrCbY4Y3Y2Y1_b[16+send_idx]);
store_idx = store_idx_arr[1];
send_idx = send_idx_arr[1];
rle_out[0+store_idx] = int(commucperm(perm_1, CrCbY4Y3Y2Y1_b[0+send_idx]));
rle_out[16+store_idx] = int(commucperm(perm_1,CrCbY4Y3Y2Y1_b[8+send_idx]));
rle_out[32+store_idx] = int(commucperm(perm_1,CrCbY4Y3Y2Y1_b[16+send_idx]));
store_idx = store_idx_arr[2];
send_idx = send_idx_arr[2];
rle_out[0+store_idx] = int(commucperm(perm_2, CrCbY4Y3Y2Y1_b[0+send_idx]));
rle_out[16+store_idx] = int(commucperm(perm_2,CrCbY4Y3Y2Y1_b[8+send_idx]));
rle_out[32+store_idx] = int(commucperm(perm_2,CrCbY4Y3Y2Y1_b[16+send_idx]));
store_idx = store_idx_arr[3];
send_idx = send_idx_arr[3];
rle_out[0+store_idx] = int(commucperm(perm_3, CrCbY4Y3Y2Y1_b[0+send_idx]));
rle_out[16+store_idx] = int(commucperm(perm_3,CrCbY4Y3Y2Y1_b[8+send_idx]));
rle_out[32+store_idx] = int(commucperm(perm_3,CrCbY4Y3Y2Y1_b[16+send_idx]));
// temporary, to get around rf allocation problems
i = 0;
loop_count(i) {
int g = 0;
}
store_idx = store_idx_arr[4];
send_idx = send_idx_arr[4];
rle_out[0+store_idx] = int(commucperm(perm_4, CrCbY4Y3Y2Y1_b[0+send_idx]));
rle_out[16+store_idx] = int(commucperm(perm_4,CrCbY4Y3Y2Y1_b[8+send_idx]));
rle_out[32+store_idx] = int(commucperm(perm_4,CrCbY4Y3Y2Y1_b[16+send_idx]));
store_idx = store_idx_arr[5];
send_idx = send_idx_arr[5];
rle_out[0+store_idx] = int(commucperm(perm_5, CrCbY4Y3Y2Y1_b[0+send_idx]));
rle_out[16+store_idx] = int(commucperm(perm_5,CrCbY4Y3Y2Y1_b[8+send_idx]));
rle_out[32+store_idx] = int(commucperm(perm_5,CrCbY4Y3Y2Y1_b[16+send_idx]));
store_idx = store_idx_arr[6];
send_idx = send_idx_arr[6];
rle_out[0+store_idx] = int(commucperm(perm_6, CrCbY4Y3Y2Y1_b[0+send_idx]));
rle_out[16+store_idx] = int(commucperm(perm_6,CrCbY4Y3Y2Y1_b[8+send_idx]));
rle_out[32+store_idx] = int(commucperm(perm_6,CrCbY4Y3Y2Y1_b[16+send_idx]));
store_idx = store_idx_arr[7];
send_idx = send_idx_arr[7];
rle_out[0+store_idx] = int(commucperm(perm_7, CrCbY4Y3Y2Y1_b[0+send_idx]));
rle_out[16+store_idx] = int(commucperm(perm_7,CrCbY4Y3Y2Y1_b[8+send_idx]));
rle_out[32+store_idx] = int(commucperm(perm_7,CrCbY4Y3Y2Y1_b[16+send_idx]));
// First, each cluster independently does the run-level encoding for
// the eight values of each block that that cluster has. Later, the
// "mini-blocks" of eight will be patched up. To do the run level encoding,
// a variable called run is used to keep track of the current run. The
// final value of this variable needs to be saved and will be used to
// "patch up" the encoding for the whole block. Also, only the run-level
// pairs are recorded. All zero elements are removed. Thus, not all eight
// locations within a block hold useful data at the end of this operation.
// A zero is used to mark the first non-useful data (if there are fewer
// than eight actual run-level pairs). Storing only the actual run-level
// pairs speeds up the condition output stream loop later because it
// doesn't necessarily have to check all eight values, and thus the loop
// can break early if possible. Cluster zero needs to deal with
// the dc value, which is the first element in its array. The dc value
// is stored as a difference between the current value and the prediction,
// which is the previous dc value of a similar block. Thus there are three
// different dc predictions, Y, Cb, and Cr. After using the prediction,
// the current value is saved and becomes the prediction for next time.
// The first four blocks use the Y prediction, and the next two use the
// Cb and Cr predictions respectively. Finally, the data is initially
// stored in half2 format with each 16 bit half containing an element
// from a different block. Thus, each loop iteration, each cluster run
// level encodes eight values from two different blocks at the same time.
// The first block's results overwrite the original 8 elements, and the
// second block's results will occupy the eight locations that start
// right after the source elements. Thus, for example, Y4Y3 originally
// is in locations 16-23, and the results for Y3 will be in 16-23, and
// those for Y4 will be in 24-31. (Actually, there may be fewer than eight
// results for each block, but the layout remains the same...)
uc<int> j, k; // loop vars
cc clust_zero = itocc(cid() == 0); // cluster 0 ?
cc ac_zero; // original half2 elements == 0 ??
cc luminance; // are we dealing with Y blocks? True for 1st 2 iter.
cc all_non_zero; // Were all 8 values in this cluster != 0 for this block
int data_idx = 0; // source data index
int lo_idx; // result data index for first block
int hi_idx; // result data index for second block
int hi_mask, lo_mask; // 16 bit masks
int run_level; // high 16 bits = run, low 16 bits = level
int pred; // dc prediction for this block type
int pred_idx; // 0 for Y4,Y3,Y2,Y1, 1 for Cb, 2 for Cr
half2 run; // current run of zeros
half2 ac_val; // original levels (zero and non zero)
int lo_ac_val; // ac value for first block
int hi_ac_val; // ac value for second block
int dc_val; // current dc value - predicted dc value
int run_idx = 0; // 0-5 for Y1,Y2,Y3,Y4,Cb,CR respectively
array<int> runs(6); // final vals of the run variable; used for "patching up"
minus_one = 0 - 1;
minus_sixteen = 0 - sixteen;
lo_mask = shift(minus_one, minus_sixteen);
hi_mask = shift(minus_one, sixteen);
i = 3;
loop_count(i) {
lo_idx = data_idx;
hi_idx = data_idx + eight;
// get values and check if they == 0
ac_val = half2(rle_out[data_idx]);
ac_zero = itocc(int(ac_val == 0));
// first run is 0, so only set level - will be fixed by "patch up" later
lo_ac_val = int(ac_val) & lo_mask;
run_level = lo_ac_val;
// in case this is a dc value, do dc prediction
luminance = itocc(data_idx <= sixteen);
pred_idx = select(luminance, 0, 1);
pred = dc_pred[pred_idx];
dc_val = lo_ac_val - pred;
dc_pred[pred_idx] = select(clust_zero, lo_ac_val, pred);
// save dc prediction if cluster 0, else store the ac level
run_level = select(clust_zero, dc_val, run_level);
rle_out[lo_idx] = run_level;
// do the same for second block
hi_ac_val = shift(int(ac_val), minus_sixteen);
run_level = hi_ac_val;
// second block dc value prediction
pred_idx = select(luminance, 0, two);
pred = dc_pred[pred_idx];
dc_val = hi_ac_val - pred;
dc_pred[pred_idx] = select(clust_zero, hi_ac_val, pred);
// store second block dc value if cluster 0, else ac level
run_level = select(clust_zero, dc_val, run_level);
rle_out[hi_idx] = run_level;
// update source data index
data_idx = data_idx + 1;
// result pointers update unconditionally if it was a dc value; else
// if it was an ac value, they only increment if the level != 0
lo_idx = lo_idx + select(clust_zero, 1, select(ac_zero, 0, 1));
hi_idx = hi_idx + select(clust_zero, 1, shift(int(select(ac_zero, 0, h2_one)), minus_sixteen));
// update the current run of zeros, and reset to 0 if level was non_zero
run = half2(select(clust_zero, 0, int(select(ac_zero, h2_one, 0))));
// all values are now ac values, so don't worry about dc stuff
j = 7;
loop_count(j) {
// get values and check if they == 0
ac_val = half2(rle_out[data_idx]);
ac_zero = itocc(int(ac_val == 0));
// encode run-level pair for the two blocks and store them
run_level = (int(ac_val) & lo_mask) | shift(int(run), sixteen);
rle_out[lo_idx] = run_level;
run_level = shift(int(ac_val),minus_sixteen) | (int(run) & hi_mask);
rle_out[hi_idx] = run_level;
// update the run variable, and reset to 0 if level != 0
run = select(ac_zero, (run + h2_one), 0);
// update source data index
data_idx = data_idx + 1;
// update the result data indices if level != 0
lo_idx = select(ac_zero, lo_idx, lo_idx + 1);
ac_zero = itocc(shift(cctoi(ac_zero), minus_sixteen));
hi_idx = select(ac_zero, hi_idx, hi_idx + 1);
}
// save final value of the run variable for each block
runs[0+run_idx] = int(run) & lo_mask;
runs[1+run_idx] = shift(int(run), minus_sixteen);
run_idx = run_idx + two;
// if not all the values were non-zero, store a zero to indicate the end
// of actual run-level pairs for the first block.
all_non_zero = itocc(lo_idx == data_idx);
rle_out[lo_idx] = select( all_non_zero, int(rle_out[lo_idx]), 0);
// update the source data index
data_idx = data_idx + eight;
// if not all the values were non-zero, store a zero to indicate the end
// of actual run-level pairs for the second block.
all_non_zero = itocc(hi_idx == data_idx);
rle_out[hi_idx] = select(all_non_zero, int(rle_out[hi_idx]), 0);
}
// Now, patch up the run-level encoding. This is done by patching up the
// cluster 0/cluster 1 data first, and then the cluster 1/cluster 2
// data, etc. However it is pipelined so that the patching for all 6
// blocks are done with as much parallelism as possible. Thus, while
// the cluster 1/cluster 2 data is being patched up for the first block,
// the cluster 0/cluster 1 data will be patched up for the second block,
// and so on. The invalid cc tells the cluster when it has a valid run
// from the previous cluster. Thus cluster 7 will be invalid the first
// 6 iterations, and cluster 1 will be invalid the last 6. Cluster 0 is
// always invalid because it doesn't need to have any of its data patched
// up (ie, there is no previous cluster from which to get a run). The
// invalid cc is passed from cluster to cluster via communication. It
// originates in cluster 1, and cluster 1 sets it to false until it has
// finished all its blocks. It starts as true in every other cluster.
// Eventually the invalid flag will be false in every cluster, and then
// true again as the value cluster 1 sets it to percolates to the rest
// of the clusters.
cc cluster_one; // cluster 1 ??
cc invalid; // false when a cluster has a valid run to work with
cc send_run; // true if cluster needs to send its run to next cluster
cc all_zeros; // true if the cluster had all zeros for this block
int true1; // = 0xFFFF; used for setting cc's to TRUE
int done; // non-zero when cluster 1 has patched up all the blocks
int blocks_left; // how many blocks cluster 1 has left to patch up
int run_token; // run of zeros, passed from cluster to cluster
int orig_run_level; // the run-level calculated above
int new_run_level; // the run-level adjusted with the final run from the
// previous cluster
int block_idx; // index to the blocks, 0,8,16,24,32,40
uc<int> perm_token = 0x65432100; // used to comm. invalid and run_token
// sends value to next cluster
cluster_one = itocc(cid() == 1);
true1 = minus_one;
blocks_left = eight - (1 + 1);
invalid = itocc(select(cluster_one, 0, true1));
send_run = clust_zero;
run_idx = 0; // index into final run values, 1 per block
block_idx = 0;
run_token = runs[0];
i = 12; // 7 patches per block; 6 blocks -> 7 + (6-1) = 12
loop_count(i) {
// get the run-level pair calculated above and see if it equals zero
orig_run_level = rle_out[block_idx];
all_zeros = itocc(orig_run_level == 0);
// get the final run of zeros of the previous cluster
run_token = commucperm(perm_token, run_token);
// adjust first run-level pair of this cluster
new_run_level = orig_run_level + shift(run_token, sixteen);
// and store it if the previous cluster's run was valid; if it wasn't,
// store the original value back again
rle_out[block_idx] = select(invalid, orig_run_level, select(all_zeros, 0, new_run_level));
// any blocks left?
blocks_left = blocks_left - 1;
done = blocks_left == 0;
// update the idx to the run values and to the block data
run_idx = select(send_run, run_idx + 1, run_idx);
block_idx = select(invalid, block_idx, block_idx + eight);
// if this cluster was all zeros, update run and pass it along, else
// pass the final run of zeros for this cluster to the next
run_token = select(all_zeros, run_token + eight, runs[run_idx]);
// pass the invalid "token" on to the next cluster, and if this is
// cluster 1, mark it true if no more blocks left
invalid = itocc(select(cluster_one, done, commucperm(perm_token, cctoi(invalid))));
// pass the send_run token, except mark it false if cluster 0 and done
send_run = itocc(select(clust_zero, ~done, commucperm(perm_token, cctoi(send_run))));
}
// Output the results using a conditional output stream. This is an
// inherently serial operation. The outermost loop goes through the
// six blocks of each macroblock. This loop first makes cluster 0
// output the dc value. Then the remaining (up to 63) values are output.
// A while_any loop is used. Each cluster outputs all the data it has
// and then passes the "token" (my_turn) on to the next cluster. Thus
// there is only one loop iteration per run-level pair in the block,
// and this may be less than 64 iterations. The loop is software pipelined
// such that all the bookkeeping is done on the first "iteration." Every
// cycle this "iteration" of the loop keeps track of which cluster should
// be outputting, reads the data to output by a later "iteration" of the
// software pipelined loop, and updates the scratchpad index. The second
// "iteration" of the loop calculates the conditional output stream
// condition. The final "iteration" sets the loop condition and does the
// conditional stream output operations. Since the loop condition is set
// only in the final "iteration" of the software pipeline, and the only
// changing of any state also occurs in the last "iteration", software
// pipelining the loop will work without adjusting how the loop condition
// is calculated. Thus, there will have to be 2 extra real iterations of
// the loop for every block.
int rl; // the run_level value; if == 0 don't send it
int out_idx, out_idx_tmp; // index into the data
int my_turn_save; // save the value of my_turn before it is updated (scratch)
int my_turn; // -1 if it IS OR HAS BEEN my turn for this block
int cluster_zero; // -1 for cluster 0, zero for the rest
int next_turn; // -1 if out of data, next cluster's turn for this block
int next_turn_inv; // -1 if out of data, next cluster's turn for this block
cc send; // I have valid data to output (it IS my turn to output)
cc my_turn_cc; // holds my_turn for SELECTs and costreams
block_idx = 0;
i = 6;
loop_count(i) {
cluster_zero = (cid() == 0);
my_turn = cluster_zero;
my_turn_cc = itocc(my_turn);
// must send dc coefficient
rl = rle_out[block_idx];
out(my_turn_cc) << rl;
out_idx = select(my_turn_cc, block_idx + 1, block_idx);
block_idx = block_idx + eight;
next_turn = 0;
cc loopcc = itocc(minus_one);
loop_while_any(loopcc) { // pipeline(1) {
stage(1);
// get next value
rl = rle_out[out_idx];
// is this cluster done yet?
next_turn = my_turn & ((rl == 0) | ((block_idx == out_idx) | next_turn));
// update index if it is your turn to output
my_turn_cc = itocc(my_turn);
out_idx = select(my_turn_cc, (out_idx + 1), out_idx);
stage(2);
// send if it is my_turn but not the next cluster's turn yet
next_turn_inv = ~next_turn;
send = itocc(my_turn & next_turn_inv);
// communicate to next clusters if it is their turn to output now
my_turn = cluster_zero | commucperm(perm_token, next_turn);
// loop until all clusters have signalled that they are done
loopcc = itocc(next_turn_inv);
stage(3);
out(send) << rl;
}
// out(cluster_one) << minus_one; // End of Block ????
}
// flush(out, minus_one + 1); // NULL value ????
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -