📄 pa_x86_plain_converters.c
字号:
#include "pa_x86_plain_converters.h"

#include "pa_converters.h"
#include "pa_dither.h"

/*
    plain intel assembly versions of standard pa converter functions.

    the main reason these versions are faster than the equivalent C versions
    is that float -> int casting is expensive in C on x86 because the rounding
    mode needs to be changed for every cast. these versions only set
    the rounding mode once outside the loop.

    small additional speed gains are made by the way that clamping is
    implemented.

TODO:
    o- inline dither code
    o- implement Dither only (no-clip) versions
    o- implement int8 and uint8 versions
    o- test thoroughly

    o- the packed 24 bit functions could benefit from unrolling and avoiding
        byte and word sized register access.
*/

/* -------------------------------------------------------------------------- */

/*
#define PA_CLIP_( val, min, max )\
    { val = ((val) < (min)) ? (min) : (((val) > (max)) ? (max) : (val)); }
*/

/*
    the following notes were used to determine whether a floating point
    value should be saturated (ie >1 or <-1) by loading it into an integer
    register. these should be rewritten so that they make sense.

    an ieee floating point value

    1.xxxxxxxxxxxxxxxxxxxx?

    is less than or equal to 1 and greater than or equal to -1 either:

        if the mantissa is 0 and the unbiased exponent is 0

        OR

        if the unbiased exponent < 0

    this translates to:

        if the mantissa is 0 and the biased exponent is 7F

        or

        if the biased exponent is less than 7F

    therefore the value is greater than 1 or less than -1 if

        the mantissa is not 0 and the biased exponent is 7F

        or

        if the biased exponent is greater than 7F

    in other words, if we mask out the sign bit, the value is
    greater than 1 or less than -1 if its integer representation is greater than:

    0 01111111 0000 0000 0000 0000 0000 000

    0011 1111 1000 0000 0000 0000 0000 0000 => 0x3F800000
*/

/* -------------------------------------------------------------------------- */

static const short fpuControlWord_ = 0x033F; /*round to nearest, 64 bit precision, all exceptions masked*/
static const double int32Scaler_ = 0x7FFFFFFF;
static const double ditheredInt32Scaler_ = 0x7FFFFFFE;
static const double int24Scaler_ = 0x7FFFFF;
static const double ditheredInt24Scaler_ = 0x7FFFFE;
static const double int16Scaler_ = 0x7FFF;
static const double ditheredInt16Scaler_ = 0x7FFE;

#define PA_DITHER_BITS_   (15)
/* Multiply by PA_FLOAT_DITHER_SCALE_ to get a float between -2.0 and +1.99999 */
#define PA_FLOAT_DITHER_SCALE_  (1.0 / ((1<<PA_DITHER_BITS_)-1))
static const float const_float_dither_scale_ = PA_FLOAT_DITHER_SCALE_;
#define PA_DITHER_SHIFT_  ((32 - PA_DITHER_BITS_) + 1)

/* -------------------------------------------------------------------------- */

/*
    Float32_To_Int32: convert count float32 samples to 32 bit integers
    without clipping or dither.

    destinationBuffer, destinationStride: output pointer and the distance
        between consecutive outputs, in 4 byte elements (the stride is
        multiplied by 4 to get a byte step).
    sourceBuffer, sourceStride: input pointer and the distance between
        consecutive inputs, in 4 byte elements.
    count: number of samples to convert. zero is handled by returning
        immediately.
    ditherGenerator: not used by this converter.

    each sample is multiplied by int32Scaler_ and stored with fistp while
    fpuControlWord_ is loaded; the caller's FPU control word is saved on
    entry and reloaded on exit.
*/
static void Float32_To_Int32(
    void *destinationBuffer, signed int destinationStride,
    void *sourceBuffer, signed int sourceStride,
    unsigned int count, PaUtilTriangularDitherGenerator *ditherGenerator )
{
/*
    float *src = (float*)sourceBuffer;
    signed long *dest = (signed long*)destinationBuffer;
    (void)ditherGenerator; // unused parameter

    while( count-- )
    {
        // REVIEW
        double scaled = *src * 0x7FFFFFFF;
        *dest = (signed long) scaled;

        src += sourceStride;
        dest += destinationStride;
    }
*/

    short savedFpuControlWord;

    (void) ditherGenerator; /* unused parameter */

    /* the loop below only tests for the end pointer after a sample has been
       processed, and tests for equality, so with no samples it would step
       past the end pointer and keep going. */
    if( count == 0 )
    {
        return;
    }

    __asm{
        // esi -> source ptr
        // eax -> source byte stride
        // edi -> destination ptr
        // ebx -> destination byte stride
        // ecx -> source end ptr
        // edx -> temp

        mov     esi, sourceBuffer

        mov     edx, 4                  // sizeof float32 and int32
        mov     eax, sourceStride
        imul    eax, edx

        mov     ecx, count
        imul    ecx, eax
        add     ecx, esi

        mov     edi, destinationBuffer

        mov     ebx, destinationStride
        imul    ebx, edx

        fwait
        fstcw   savedFpuControlWord
        fldcw   fpuControlWord_

        fld     int32Scaler_            // stack:  (int)0x7FFFFFFF

    Float32_To_Int32_loop:

        // load unscaled value into st(0)
        fld     dword ptr [esi]         // stack:  value, (int)0x7FFFFFFF
        add     esi, eax                // increment source ptr
        //lea     esi, [esi+eax]
        fmul    st(0), st(1)            // st(0) *= st(1), stack:  value*0x7FFFFFFF, (int)0x7FFFFFFF
        /*
            note: we could store to a temporary qword here which would cause
            wraparound distortion instead of int indefinite 0x10. that would
            be more work, and given that not enabling clipping is only advisable
            when you know that your signal isn't going to clip it isn't worth it.
        */
        fistp   dword ptr [edi]         // pop st(0) into dest, stack:  (int)0x7FFFFFFF

        add     edi, ebx                // increment destination ptr
        //lea     edi, [edi+ebx]

        cmp     esi, ecx                // has src ptr reached end?
        jne     Float32_To_Int32_loop

        ffree   st(0)
        fincstp

        fwait
        fnclex
        fldcw   savedFpuControlWord
    }
}

/* -------------------------------------------------------------------------- */

/*
    Float32_To_Int32_Clip: convert count float32 samples to 32 bit integers,
    saturating out of range input.

    parameters are as for Float32_To_Int32; ditherGenerator is not used.
    a zero count is handled by returning immediately.

    each sample's bit pattern is examined in an integer register: with the
    sign bit masked off, a pattern greater than 0x3F800000 (the encoding of
    1.0f) takes the clamp path, which stores 0x7FFFFFFF when the sign bit is
    clear and 0x80000000 when it is set. all other samples are multiplied by
    int32Scaler_ and stored with fistp.
*/
static void Float32_To_Int32_Clip(
    void *destinationBuffer, signed int destinationStride,
    void *sourceBuffer, signed int sourceStride,
    unsigned int count, PaUtilTriangularDitherGenerator *ditherGenerator )
{
/*
    float *src = (float*)sourceBuffer;
    signed long *dest = (signed long*)destinationBuffer;
    (void) ditherGenerator; // unused parameter

    while( count-- )
    {
        // REVIEW
        double scaled = *src * 0x7FFFFFFF;
        PA_CLIP_( scaled, -2147483648., 2147483647. );
        *dest = (signed long) scaled;

        src += sourceStride;
        dest += destinationStride;
    }
*/

    short savedFpuControlWord;

    (void) ditherGenerator; /* unused parameter */

    /* the loop below only tests for the end pointer after a sample has been
       processed, and tests for equality, so with no samples it would step
       past the end pointer and keep going. */
    if( count == 0 )
    {
        return;
    }

    __asm{
        // esi -> source ptr
        // eax -> source byte stride
        // edi -> destination ptr
        // ebx -> destination byte stride
        // ecx -> source end ptr
        // edx -> temp

        mov     esi, sourceBuffer

        mov     edx, 4                  // sizeof float32 and int32
        mov     eax, sourceStride
        imul    eax, edx

        mov     ecx, count
        imul    ecx, eax
        add     ecx, esi

        mov     edi, destinationBuffer

        mov     ebx, destinationStride
        imul    ebx, edx

        fwait
        fstcw   savedFpuControlWord
        fldcw   fpuControlWord_

        fld     int32Scaler_            // stack:  (int)0x7FFFFFFF

    Float32_To_Int32_Clip_loop:

        mov     edx, dword ptr [esi]    // load floating point value into integer register

        and     edx, 0x7FFFFFFF         // mask off sign
        cmp     edx, 0x3F800000         // greater than 1.0 or less than -1.0

        jg      Float32_To_Int32_Clip_clamp

        // load unscaled value into st(0)
        fld     dword ptr [esi]         // stack:  value, (int)0x7FFFFFFF
        add     esi, eax                // increment source ptr
        //lea     esi, [esi+eax]
        fmul    st(0), st(1)            // st(0) *= st(1), stack:  value*0x7FFFFFFF, (int)0x7FFFFFFF
        fistp   dword ptr [edi]         // pop st(0) into dest, stack:  (int)0x7FFFFFFF
        jmp     Float32_To_Int32_Clip_stored

    Float32_To_Int32_Clip_clamp:
        mov     edx, dword ptr [esi]    // load floating point value into integer register
        shr     edx, 31                 // move sign bit into bit 0
        add     esi, eax                // increment source ptr
        //lea     esi, [esi+eax]
        add     edx, 0x7FFFFFFF         // convert to maximum range integers
        mov     dword ptr [edi], edx

    Float32_To_Int32_Clip_stored:

        //add     edi, ebx                // increment destination ptr
        lea     edi, [edi+ebx]

        cmp     esi, ecx                // has src ptr reached end?
        jne     Float32_To_Int32_Clip_loop

        ffree   st(0)
        fincstp

        fwait
        fnclex
        fldcw   savedFpuControlWord
    }
}

/* -------------------------------------------------------------------------- */

/*
    Float32_To_Int32_DitherClip: float32 to 32 bit integer conversion using
    ditheredInt32Scaler_, with the same bit pattern range test as
    Float32_To_Int32_Clip and dither generated inline.

    the visible part copies previous, randSeed1 and randSeed2 out of
    ditherGenerator into locals, then for each in-range sample advances both
    seeds with seed = seed * 196314165 + 907633515, shifts each right by
    PA_DITHER_SHIFT_ and adds the two results.
*/
static void Float32_To_Int32_DitherClip(
    void *destinationBuffer, signed int destinationStride,
    void *sourceBuffer, signed int sourceStride,
    unsigned int count, PaUtilTriangularDitherGenerator *ditherGenerator )
{
    /*
    float *src = (float*)sourceBuffer;
    signed long *dest = (signed long*)destinationBuffer;

    while( count-- )
    {
        // REVIEW
        double dither  = PaUtil_GenerateFloatTriangularDither( ditherGenerator );
        // use smaller scaler to prevent overflow when we add the dither
        double dithered = ((double)*src * (2147483646.0)) + dither;
        PA_CLIP_( dithered, -2147483648., 2147483647. );
        *dest = (signed long) dithered;

        src += sourceStride;
        dest += destinationStride;
    }
    */

    short savedFpuControlWord;

    // spill storage:
    signed long sourceByteStride;
    signed long highpassedDither;

    // dither state:
    unsigned long ditherPrevious = ditherGenerator->previous;
    unsigned long ditherRandSeed1 = ditherGenerator->randSeed1;
    unsigned long ditherRandSeed2 = ditherGenerator->randSeed2;

    __asm{
        // esi -> source ptr
        // eax -> source byte stride
        // edi -> destination ptr
        // ebx -> destination byte stride
        // ecx -> source end ptr
        // edx -> temp

        mov     esi, sourceBuffer

        mov     edx, 4                  // sizeof float32 and int32
        mov     eax, sourceStride
        imul    eax, edx

        mov     ecx, count
        imul    ecx, eax
        add     ecx, esi

        mov     edi, destinationBuffer

        mov     ebx, destinationStride
        imul    ebx, edx

        fwait
        fstcw   savedFpuControlWord
        fldcw   fpuControlWord_

        fld     ditheredInt32Scaler_    // stack:  int scaler

    Float32_To_Int32_DitherClip_loop:

        mov     edx, dword ptr [esi]    // load floating point value into integer register

        and     edx, 0x7FFFFFFF         // mask off sign
        cmp     edx, 0x3F800000         // greater than 1.0 or less than -1.0

        jg      Float32_To_Int32_DitherClip_clamp

        // load unscaled value into st(0)
        fld     dword ptr [esi]         // stack:  value, int scaler
        add     esi, eax                // increment source ptr
        //lea     esi, [esi+eax]
        fmul    st(0), st(1)            // st(0) *= st(1), stack:  value*(int scaler), int scaler

        /*
        // call PaUtil_GenerateFloatTriangularDither with C calling convention
        mov     sourceByteStride, eax   // save eax
        mov     sourceEnd, ecx          // save ecx
        push    ditherGenerator         // pass ditherGenerator parameter on stack
        call    PaUtil_GenerateFloatTriangularDither  // stack:  dither, value*(int scaler), int scaler
        pop     edx                     // clear parameter off stack
        mov     ecx, sourceEnd          // restore ecx
        mov     eax, sourceByteStride   // restore eax
        */

        // generate dither
        mov     sourceByteStride, eax   // save eax
        mov     edx, 196314165
        mov     eax, ditherRandSeed1
        mul     edx                     // eax:edx = eax * 196314165
        //add     eax, 907633515
        lea     eax, [eax+907633515]
        mov     ditherRandSeed1, eax
        mov     edx, 196314165
        mov     eax, ditherRandSeed2
        mul     edx                     // eax:edx = eax * 196314165
        //add     eax, 907633515
        lea     eax, [eax+907633515]
        mov     edx, ditherRandSeed1
        shr     edx, PA_DITHER_SHIFT_
        mov     ditherRandSeed2, eax
        shr     eax, PA_DITHER_SHIFT_
        //add     eax, edx                // eax -> current
        lea     eax, [eax+edx]
        // NOTE(review): the source available for review ends at this point,
        // part-way through the dither generation; the remainder of this
        // function is not shown here.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -