/* File: decoder.c  (code-viewer residue: a "font size:" label followed this banner) */
#include "decoder.h"
#include "wavelet.h"
#include "transform.h"
#include "quantize.h"
#include "coder.h"
#include "mathutil.h"
#include "dpcm.h"
#include <crblib/tsc.h>
#include "yuv.h"
/***************
TODO: try an all-integer version?
It would save us a lot of the time currently spent in ftoi().
(Remember to use the CDF22xHaar transform when timing it.)
------------------
<> BTW the rung ladder is 16k - rather a large part of our cache usage
we should only touch half of it (the "zero is MPS" half) - about 8k
------------------
(these times are "decode" only, not counting the YUV->RGB & blit at the end)
on a 256x256 ; trans = CDF22xHaar
0.039886 secs = 182.6 cycles / pel = 1643066 pels /sec
(on K7 and P2 takes about 133 cycles/pel ; = 62 fps at 500 Mhz!!)
on the K7-500 :
DecodePlane : 0.016255 secs = 136.4 cycles / pel = 4031628 pels /sec
on a 512x512 we have exactly the same cycles/pel = excellent!
the old decoder :
256: 0.070820 secs = 324.2 cycles / pel = 925386 pels /sec
512: 0.330963 secs = 378.8 cycles / pel = 792064 pels /sec
is about twice as slow, with bad scaling
*******************/
/*
 * DecodeIt : decode a compressed wavelet image, one plane at a time, and
 * emit the final picture either as BGR bytes or through a driver window.
 *
 * Parameters:
 *  w         - wavelet state. Its quantizers are set up from Q, its compPtr
 *              is pointed at the compressed data and its arithmetic decoder
 *              (w->ari) is initialised from that pointer. Its TransBufL,
 *              TransBufH and CodeBuffer members are used as work buffers.
 *  compPtr   - start of the compressed stream.
 *  BGRplane  - output byte buffer, written only when Driver is NULL; the
 *              closing assert expects exactly imwidth*imheight*3 bytes to
 *              have been produced.
 *  fPlanes   - three float work planes. Planes 0 and 1 are decoded at half
 *              width / half height with one level fewer; plane 2 is decoded
 *              at full size. Plane 0 is read back through Vptr and plane 1
 *              through Uptr during the final colour conversion.
 *  imwidth,
 *  imheight  - full image dimensions in pels.
 *  Q         - quantizer setting handed to Wavelet_SetupQuantizers.
 *  Driver    - when non-NULL, finished row pairs go to YUV_122toDriver
 *              instead of being written to BGRplane.
 *
 * For each plane the lowest LL band is decoded with dpcmCode and dequantized,
 * then each level decodes three detail bands a row at a time, dequantizes
 * them, runs the inverse horizontal transform into two row buffers and the
 * inverse vertical row transform into the target. Only on the last level of
 * the last plane is the target the small row buffer tp.RB, from which row
 * pairs are colour-converted as soon as they are available.
 *
 * NOTE(review): hTransform and vTransform are not declared in this function
 * (presumably file/global scope); they are dereferenced before the assert
 * that checks the function pointers, so a NULL transform object is not caught.
 */
void DecodeIt(Wavelet *w,ubyte *compPtr,ubyte * BGRplane,float *fPlanes[3],int imwidth,int imheight,float Q,
Driver_Window * Driver)
{
int l,p;
float *fPlaneEnd,*LLptr;
SubBand *sb;
RowBuffer ** DCBs;
RowBuffer *tbL,*tbH;
RowAbstract tp;
transformFunc hTranser;
transformRowsFunc vRowTranser;
float *Vptr,*Uptr;
ubyte * BGRptr;
int untransRowBase,blittedRows=0;
//------------------------------------
// set up the quantizers and attach the arithmetic decoder to the compressed stream
Wavelet_SetupQuantizers(w,Q);
w->compPtr = compPtr;
arithDecodeInit(w->ari,w->compPtr);
//--------------------------------------------
// prepare for the inverse transform & the transform/code row buffers
hTranser = hTransform->inverse;
vRowTranser = vTransform->inverseRows;
assert( hTranser && vRowTranser );
tbL = w->TransBufL;
tbH = w->TransBufH;
DCBs= w->CodeBuffer;
assert( tbL && tbH && DCBs );
//------------------------------------
// tp is the vertical-transform target; its row buffer is initialised here
// (imwidth wide, second argument 4) and released at the end of the function
tp.IsPlaneBuffer = false;
RowBuffer_Init(&(tp.RB),imwidth,4);
//------------------------------------
// read/write cursors for the final colour conversion:
// V comes from plane 0, U from plane 1, output bytes go to BGRplane
Vptr = fPlanes[0];
Uptr = fPlanes[1];
BGRptr = BGRplane;
//------------------------------------
for(p=0;p<3;p++)
{
SubBandTree * sbt;
int sbtwidth,sbtheight,levels;
sbt = w->trees + p;
//------------------------------------
// prep the LL and decode the lowest LL
sbtwidth = imwidth;
sbtheight= imheight;
levels = w->levels;
// planes 0 and 1 are half size in each dimension and have one level fewer
if ( p != 2 )
{
sbtwidth >>= 1;
sbtheight>>= 1;
levels --;
}
// the LL band lives at the very end of the plane; each level below expands
// it in place into a target that also ends at fPlaneEnd
fPlaneEnd = fPlanes[p] + (sbtwidth * sbtheight);
sb = &(sbt->LL);
sb->band = fPlaneEnd - (sb->width * sb->height);
sb->stride = sb->width;
dpcmCode(sb,levels,w->dpcmLoss,w->ari,false);
SubBand_Quantize(sb,false);
//------------------------------------
// start doing 3 bands at a time : decode,dequant & make a new LL
for(l=0;l<levels;l++)
{
SubBand *bands[3];
int y,width,height,i;
LapInfo li[3]; // the coder states
//-----------------------
// prepare for coding
bands[0] = &(sbt->bands[0][l]);
bands[1] = &(sbt->bands[1][l]);
bands[2] = &(sbt->bands[2][l]);
// dimensions are taken from band 0 and used for all three bands
height = bands[0]->height;
width = bands[0]->width;
// start up the codebuffer:
for(i=0;i<3;i++)
{
bands[i]->band = NULL; // make sure no one touches this
RowBuffer_Clear(DCBs[i],width);
LapInfo_Reset(&li[i]);
}
//--------------------------
// set up the target for the transform :
tp.width = width<<1;
tp.halfh = height;
// NOTE(review): the base row is captured BEFORE rowCenter is reset to numRows
// in the last-level branch below; the asserts only hold if rowCenter already
// equals numRows at this point (presumably set by RowBuffer_Init) - confirm.
untransRowBase = tp.RB.rowCenter;
tp.PB.halfh = height;
tp.PB.halfw = width; // the target is twice as big as our bands
tp.PB.height = tp.PB.halfh<<1;
tp.PB.width = tp.PB.halfw <<1;
tp.PB.plane = fPlaneEnd - (tp.PB.width * tp.PB.height);
tp.PB.stride = tp.PB.width;
tp.PB.untransptr = tp.PB.plane;
// l can only reach w->levels-1 when p == 2, because the other planes run
// with levels decremented by one
if ( l == (w->levels - 1) )
{
// it's a RB
tp.IsPlaneBuffer = false;
tp.RB.rowCenter = tp.RB.numRows;
}
else
{
tp.IsPlaneBuffer = true;
}
//-------------------
// do the rows of the LH,HL,HH
// LLptr walks the current LL band, which sits at the tail of the plane
LLptr = fPlaneEnd - (width * height);
for(y=0;y<height;y++)
{
uint * dPtr[3];
float * fPtr[3];
/****************
*
* This row-flow method uses a minimum amount of "hot" memory -
* only 6k cache is used for a 256-wide image
*
// <>
On larger cache machines, we may do better by adding some rows
to the DCB and TCB so that we can do :
1. decode several rows to the DCB
2. transform to several rows in the TCB
3. do vtransform several times on the TCB's
this gives us better register coherence & lets us stay
in one function longer (big deal on K7)
---------------
we do :
on 3 bands :
decode row
dequant row
(touches 3 DCB rows = 3*width)
de-htransform up & down
(touches the LL row = width)
(touched 2 target TCB rows = 4*width)
de-vtransform
(touches 2 target rows = 2*width)
total = 10*subb_width floats
= 5*next_width*4 bytes
<= 5k for a 256x256 image
total touched *= subb_height
= 10 * next_width * next_height bytes
= 10 bytes per image pel
at all times :
the 3 DCB rows should be in cache (3*4/2 * next_width)
the 2*2 TCB rows should be in cache (4*4 * next_width)
= 5.5 K (for 256 width)
the only hot memory is :
one LL row in (4/2 * next_width)
two LL rows out (4*2 * next_width)
= 2.5 K (for 256 width)
or 1/4 of the image bytes in additional overhead, per level
adding all levels :
* (1/4) + (1/16) + .. = 4/3 of image bytes
* = 87k total overhead touched (on 256x256)
*
******************/
for(i=0;i<3;i++)
{
// decode a row from this band
dPtr[i] = decodeRow(bands[i],w->ari,&li[i]);
// advance the circular buffer
DCBs[i]->rowCenter ++;
// get a temp work space; this is the row that will be filled by the next decode
fPtr[i] = RowBuffer_Row(DCBs[i],0);
// dequantize to the temp workspace
dequantizerow( dPtr[i], fPtr[i], width, bands[i]->quantizer);
}
// old LL + new LH -> low vert circular buffer
hTranser( RowBuffer_Row(tbL,0) , LLptr, fPtr[0], width);
LLptr += width;
// bands 1 and 2 -> high vert circular buffer
// NOTE(review): the original comment said "LH + HL", but band 0 was already
// consumed as LH above; presumably these are HL and HH - confirm band order.
hTranser( RowBuffer_Row(tbH,0) , fPtr[1], fPtr[2], width);
// now vTrans to the new LL from the two circ DCBs
vRowTranser( &tp, tbL, tbH, y);
if ( l == (w->levels - 1) ) // only happens in the last plane
{
//-- this is it; we're untransforming into a little rowbuffer
// and doing a YUV -> final image from that rowbuffer
assert( (tp.RB.rowCenter - untransRowBase) <= 4 );
// flush finished rows two at a time; each pair consumes one half-width
// row of U and of V
while( tp.RB.rowCenter >= (untransRowBase + 2) )
{
if ( Driver )
{
YUV_122toDriver( RowBuffer_RawRow(&(tp.RB),untransRowBase),
RowBuffer_RawRow(&(tp.RB),untransRowBase+1),
Uptr,Vptr,Driver,imwidth);
}
else
{
YUV_122toBGRbytes( RowBuffer_RawRow(&(tp.RB),untransRowBase),
RowBuffer_RawRow(&(tp.RB),untransRowBase+1),
Uptr,Vptr,BGRptr,imwidth);
// two output rows of imwidth pels at 3 bytes each
BGRptr += imwidth*6;
}
Uptr += (imwidth>>1);
Vptr += (imwidth>>1);
untransRowBase += 2;
}
}
// advance both transform row buffers for the next input row
tbL->rowCenter ++;
tbH->rowCenter ++;
}
// on the final level every row written to the row buffer must have been flushed
assert( l != (w->levels - 1) || tp.RB.rowCenter == untransRowBase );
}
}
// sanity checks: the whole image was emitted and all of U and V was consumed
assert( Driver || (BGRptr == (BGRplane + imwidth*imheight*3)) );
assert( Uptr == (fPlanes[1] + (imwidth*imheight>>2)) );
assert( Vptr == (fPlanes[0] + (imwidth*imheight>>2)) );
//------------------------------------
// finish up
RowBuffer_DeInit(&(tp.RB));
arithDecodeDone(w->ari);
}
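/*
 * Code-viewer keyboard shortcuts (page residue, not part of decoder.c):
 *   Copy code            : Ctrl + C
 *   Search code          : Ctrl + F
 *   Full-screen mode     : F11
 *   Toggle theme         : Ctrl + Shift + D
 *   Show shortcuts       : ?
 *   Increase font size   : Ctrl + =
 *   Decrease font size   : Ctrl + -
 */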