📄 gpufftw.cpp
字号:
/******************************************************************************\
Copyright 2005 The University of North Carolina at Chapel Hill.
All Rights Reserved.
Permission to use, copy, modify and distribute this software and its
documentation for educational, research and non-profit purposes, without
fee, and without a written agreement is hereby granted, provided that the
above copyright notice and the following three paragraphs appear in all
copies. Any use in a commercial organization requires a separate license.
IN NO EVENT SHALL THE UNIVERSITY OF NORTH CAROLINA AT CHAPEL HILL BE LIABLE
TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL
DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND
ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF NORTH CAROLINA HAVE BEEN
ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
Permission to use, copy, modify and distribute this software and its
documentation for educational, research and non-profit purposes, without
fee, and without a written agreement is hereby granted, provided that the
above copyright notice and the following three paragraphs appear in all
copies.
THE UNIVERSITY OF NORTH CAROLINA SPECIFICALLY DISCLAIM ANY WARRANTIES,
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN
"AS IS" BASIS, AND THE UNIVERSITY OF NORTH CAROLINA HAS NO OBLIGATION TO
PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
---------------------------------
|Please send all BUG REPORTS to: |
| |
| geom@cs.unc.edu |
| |
---------------------------------
The author may be contacted via:
US Mail: N. Govindaraju
Department of Computer Science
Sitterson Hall, CB #3175
University of North Carolina
Chapel Hill, NC 27599-3175
Add functions manycgpufft1d and manyscgpufft1d
by Simon Potvin and Jerome Genest, July 2006
Centre d'optique, photonique et laser (COPL)
Universite Laval
Quebec, Canada
Mail: simon.potvin.1@ulaval.ca
\*****************************************************************************/
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <math.h>
#include <GL/glut.h>
#include <shader.h>
#include <defines.h>
#include <GPUFFTW.h>
#if(defined _WIN64 || defined _WIN32)
typedef BOOL (*NvCplGetDataIntType)(long, long*);
#endif
GLenum buffers1[] = {GL_COLOR_ATTACHMENT0_EXT, GL_COLOR_ATTACHMENT2_EXT};
GLenum buffers2[] = {GL_COLOR_ATTACHMENT1_EXT, GL_COLOR_ATTACHMENT3_EXT};
GLenum buffers41[] = {GL_COLOR_ATTACHMENT0_EXT, GL_COLOR_ATTACHMENT2_EXT, GL_COLOR_ATTACHMENT1_EXT, GL_COLOR_ATTACHMENT3_EXT};
GLenum buffers42[] = {GL_COLOR_ATTACHMENT1_EXT, GL_COLOR_ATTACHMENT3_EXT, GL_COLOR_ATTACHMENT0_EXT, GL_COLOR_ATTACHMENT2_EXT};
GLenum buffers4[] = {GL_COLOR_ATTACHMENT0_EXT, GL_COLOR_ATTACHMENT1_EXT, GL_COLOR_ATTACHMENT2_EXT, GL_COLOR_ATTACHMENT3_EXT};
GPUFFTW::GPUFFTW(){
//Create OpenGL context
glutInitDisplayMode( GLUT_SINGLE | GLUT_RGBA );
glutCreateWindow( "NVToolkit" );
InitFBOExts();
fullscreenrowwidthfp.Load("fullscreenrowwidthFFTptr", fullscreenrowwidthfptext);
fftstages_twicerowwidth_to_n_fp.Load("maxptrrg", fftstages_twicerowwidth_to_n_fptext);
fftstages3_to_rowwidthfp.Load("minptrrg", fftstages3_to_rowwidthfptext);
fullscreenfft1ststage.Load("fullscreenfft1ststage", fullscreenfft1ststagetext);
fullscreenfft2ndstagefp.Load("fullscreenfft2ndstagefp",fullscreenfft2ndstagefptext);
copyfp.Load("copy", copyfptext);
glGenFramebuffersEXT(1, &fb);
glGenTextures(1, &textureid[0]);
glGenTextures(1, &textureid[1]);
glGenTextures(1, &textureid[2]);
glGenTextures(1, &textureid[3]);
glGenTextures(1, &textureid[4]);
glGenTextures(1, &textureid[5]);
bool VRAMdetected=false;
long int videoMemorySize;
#if(defined _WIN64 || defined WIN32)
// Load NVCPL library
HINSTANCE hLib = ::LoadLibrary("NVCPL.dll");
if (hLib == 0) {
printf("Unable to load NVCPL.dll\n");
//return -1;
}
#ifdef GPU_DEBUG
printf("\nNVCPL.dll successfully loaded\n\n");
#endif
NvCplGetDataIntType NvCplGetDataInt = (NvCplGetDataIntType)::GetProcAddress(hLib, "NvCplGetDataInt");
if (NvCplGetDataInt == 0)
printf("- Unable to get a pointer to NvCplGetDataInt\n");
if (NvCplGetDataInt(2, &videoMemorySize) == FALSE){
printf("Unable to retrieve Video Memory Size!\n");
}
else{
#ifdef GPU_DEBUG
printf("Detected Video Memory Size = %d MBytes\n", videoMemorySize);
#endif
VRAMdetected=true;
}
// Free NVCPL library
::FreeLibrary(hLib);
#endif
if(!VRAMdetected){
printf("WARNING: Video Memory Size could not be detected. Assuming 256 MB.\n");
videoMemorySize=256;
}
N=videoMemorySize<<16;
//LOGN represents Log(texture width* texture height) rather than log of number of elements.
for(LOGN=0;N>>LOGN;LOGN++);
LOGN=LOGN-4;//added for MRTs
LOGN--;//added for temporary buffers
W = (1 << (LOGN - (int) (LOGN/2)) );
H = (1 << (int) (LOGN/2));
#ifdef GPU_DEBUG
printf("Largest texture which can be allocated = %d * %d\n",W,H);
#endif
glActiveTextureARB( GL_TEXTURE2_ARB );
glBindTexture(GL_TEXTURE_RECTANGLE_NV, textureid[0]);
glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_FLOAT_RGBA32_NV, W, H, 0, GL_RGBA, GL_FLOAT, NULL);
glActiveTextureARB( GL_TEXTURE2_ARB );
glBindTexture(GL_TEXTURE_RECTANGLE_NV, textureid[1]);
glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_FLOAT_RGBA32_NV, W, H, 0, GL_RGBA, GL_FLOAT, NULL);
glActiveTextureARB( GL_TEXTURE3_ARB );
glBindTexture(GL_TEXTURE_RECTANGLE_NV, textureid[2]);
glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_FLOAT_RGBA32_NV, W, H, 0, GL_RGBA, GL_FLOAT, NULL);
glActiveTextureARB( GL_TEXTURE4_ARB );
glBindTexture(GL_TEXTURE_RECTANGLE_NV, textureid[3]);
glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_FLOAT_RGBA32_NV, W, H, 0, GL_RGBA, GL_FLOAT, NULL);
glActiveTextureARB( GL_TEXTURE2_ARB );
glBindTexture(GL_TEXTURE_RECTANGLE_NV, textureid[4]);
glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_FLOAT_RGBA32_NV, W, H, 0, GL_RGBA, GL_FLOAT, NULL);
glActiveTextureARB( GL_TEXTURE3_ARB );
glBindTexture(GL_TEXTURE_RECTANGLE_NV, textureid[5]);
glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_FLOAT_RGBA32_NV, W, H, 0, GL_RGBA, GL_FLOAT, NULL);
GLuint error=glGetError();
if(error!=GL_NO_ERROR){
printf("Error while creating two textures of size %d * %d. Cannot proceed further.\n",W,H);
}
#ifdef GPU_DEBUG
else{
printf("Successfully created two textures of size %d * %d. \n",W,H);
}
#endif
MAX_SIZE = 4*W*H; // Dependent on the available video memory ON THE GPU.
MIN_SIZE = 16; //MIN_SIZE should not be less than 16 so that data is packed correctly in the FFT routine
//This is for cache-optimization. TILE_SIZE should be a power of 2 and the performance improvement based on TILE_SIZE
//can be dependent on the underlying graphics processor
TILE_SIZE=64;
glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, fb);
glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT,
GL_COLOR_ATTACHMENT0_EXT,
GL_TEXTURE_RECTANGLE_NV, textureid[0], 0);
glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT,
GL_COLOR_ATTACHMENT1_EXT,
GL_TEXTURE_RECTANGLE_NV, textureid[1], 0);
glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT,
GL_COLOR_ATTACHMENT2_EXT,
GL_TEXTURE_RECTANGLE_NV, textureid[2], 0);
glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT,
GL_COLOR_ATTACHMENT3_EXT,
GL_TEXTURE_RECTANGLE_NV, textureid[3], 0);
glViewport(0,0,W, H);
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
gluOrtho2D(0, W, 0, H);
glMatrixMode( GL_MODELVIEW );
glLoadIdentity();
glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0);
}
GPUFFTW::~GPUFFTW(){
glDeleteTextures(1, &textureid[0]);
glDeleteTextures(1, &textureid[1]);
glDeleteFramebuffersEXT(1, &fb);
glutDestroyWindow(glutGetWindow());
}
void GPUFFTW::CheckErrors(){
GLenum error = glGetError();
if( error != GL_NO_ERROR ) {
fprintf( stderr, "\nGL Error: %s\n", gluErrorString( error ) );
assert(0);
}
}
/** Prints a formatted error message for error codes returned from setFFTParams()
*/
char *GPUFFTW::FFTErrorString(FFT_ERROR error){
char *err = new char[1000];
sprintf(err,"GPUFFTW: ");
switch(error){
case FFT_SUCCESS: sprintf(err,"No error."); break;
case FFT_ARRAY_TOO_BIG: sprintf(err,"Not enough video RAM to FFT. Maximum array size is %d", MAX_SIZE); break;
case FFT_ARRAY_TOO_SMALL: sprintf(err,"Input Array is too small. Minimum size: %d",MIN_SIZE); break;
case FFT_NON_POWER_OF_TWO: sprintf(err,"Non power of two array - not handled"); break;
default: sprintf(err,"Unknown error code. This should not happen."); break;
}
return(err);
}
/** Sets internal parameters for performing fft on a given sized array. Returns the actual number
of fft elements
*/
FFT_ERROR GPUFFTW::setFFTParams(int size){
N=size;
if(N<MIN_SIZE) return(FFT_ARRAY_TOO_SMALL);
if(N>MAX_SIZE) return(FFT_ARRAY_TOO_BIG);
//LOGN represents Log(texture width* texture height) rather than log of number of elements.
for(LOGN=0;N>>LOGN;LOGN++);
LOGN=LOGN-3;
Width = (1 << (LOGN - (int) (LOGN/2)));
Height = (1 << (int) (LOGN/2));
TILE_SIZE=64;
if(TILE_SIZE > Width) TILE_SIZE=Width;
if(TILE_SIZE > Height) TILE_SIZE=Height;
#ifdef GPU_DEBUG
printf("\nGPUFFTW: Size of array = %d. Width = %d, height = %d, LOGN=%d, Tile Size = %d\n",size,Width,Height,LOGN,TILE_SIZE);
#endif
return(FFT_SUCCESS);
}
#include "stopwatch.hpp"
FFT_ERROR GPUFFTW::cgpufft1d(float *real, float *imag, int size, int sign, int isreal){
if(size < MIN_SIZE){
return (FFT_ARRAY_TOO_SMALL);
}
//amount of data to be ffted on the GPU
int gpuSize=0;
//Find largest power of 2 smaller than size
int logSize=0;
for(;size>>logSize;logSize++);
logSize--;
gpuSize=1<<logSize;
int cpuSize=size-gpuSize;
if(cpuSize==0){
FFT_ERROR err = setFFTParams(gpuSize);
if(err!=FFT_SUCCESS) return(err);
glActiveTextureARB( GL_TEXTURE3_ARB );
glBindTexture(GL_TEXTURE_RECTANGLE_NV, textureid[3]);
glTexSubImage2D(GL_TEXTURE_RECTANGLE_NV, 0, 0, 0, Width, Height, GL_RGBA, GL_FLOAT, imag);
uploadData(real);
Stopwatch bitonic_timer;
glFinish();
bitonic_timer.Start();
//for(int dummy=0;dummy<100;dummy++)
if(isreal){
//rearrange
}
cgpufft1d(sign);
if(isreal){
//rearrange
}
glFinish();
bitonic_timer.Stop();
printf("time:%4.5f \n", bitonic_timer.GetTime());
readbackData(imag);
glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, fb);
if(target==1)
glReadBuffer( GL_COLOR_ATTACHMENT1_EXT);
else
glReadBuffer( GL_COLOR_ATTACHMENT0_EXT);
glReadPixels(0, 0, Width, Height, GL_RGBA, GL_FLOAT, real);
glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0);
if(sign==-1){
for(int d=0;d<size;d++){
real[d]/=size;
imag[d]/=size;
}
}
}
else return(FFT_NON_POWER_OF_TWO);
return(FFT_SUCCESS);
}
FFT_ERROR GPUFFTW::scgpufft1d(float *real, int size, int sign){
float *reals = new float[size/2];
float *imags = new float[size/2];
FFT_ERROR result;
unsigned long i,i1,i2,i3,i4,np3;
float c1=0.5,c2,h1r,h1i,h2r,h2i;
double wr,wi,wpr,wpi,wtemp,theta; //Double precision for the trigonometric recurrences.
theta=3.14159265358979323846/(double) (size>>1); //Initialize the recurrence.
int nsize=size/2;
//convert the 1D real FFT into a complex FFT of half the size -- this can be speeded up using a fragment program and is slow on CPUs
if (sign == 1) {
c2 = -0.5;
for(i=0;i<nsize;i++){
reals[i] = real[2*i];
imags[i] = real[2*i+1];
}
result=cgpufft1d(reals,imags, size/2, sign, 1);
for(i=0;i<nsize;i++){
real[2*i] = reals[i];
real[2*i+1] = imags[i];
}
delete reals;
delete imags;
}
else {
c2=0.5; //Otherwise set up for an inverse trans
theta= -theta; //form.
}
wtemp=sin(0.5*theta);
wpr = -2.0*wtemp*wtemp;
wpi=sin(theta);
wr=1.0+wpr;
wi=wpi;
np3=size+3;
for (i=2;i<=(size>>2);i++) { //Case i=1 done separately below.
i4=1+(i3=np3-(i2=1+(i1=i+i-1)));
h1r= c1*(real[i1]+real[i3]); //The two separate transforms are sep
h1i= c1*(real[i2]-real[i4]); //arated out of real.
h2r = -c2*(real[i2]+real[i4]);
h2i=c2*(real[i1]-real[i3]);
real[i1]=h1r+wr*h2r-wi*h2i; //Here they are recombined to form the true transform of the original real real.
real[i2]=h1i+wr*h2i+wi*h2r;
real[i3]=h1r-wr*h2r+wi*h2i;
real[i4] = -h1i+wr*h2i+wi*h2r;
wr=(wtemp=wr)*wpr-wi*wpi+wr; //The recurrence.
wi=wi*wpr+wtemp*wpi+wi;
}
if (sign == 1) {
real[0] = (h1r=real[0])+real[1]; //Squeeze the first and last real together to get them all within the original array.
real[1] = h1r-real[1];
}
else {
real[0]=c1*((h1r=real[0])+real[1]);
real[1]=c1*(h1r-real[1]);
//convert the 1D real FFT into a complex FFT of half the size -- this can be speeded up using a fragment program and is slow on CPUs
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -