📄 gpufftw.cpp

📁 基于GPU进行快速科学计算
💻 CPP
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/******************************************************************************\

  Copyright 2005 The University of North Carolina at Chapel Hill.
  All Rights Reserved.

  Permission to use, copy, modify and distribute this software and its
  documentation for educational, research and non-profit purposes, without
  fee, and without a written agreement is hereby granted, provided that the
  above copyright notice and the following three paragraphs appear in all
  copies. Any use in a commercial organization requires a separate license.

  IN NO EVENT SHALL THE UNIVERSITY OF NORTH CAROLINA AT CHAPEL HILL BE LIABLE
  TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL
  DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND
  ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF NORTH CAROLINA HAVE BEEN
  ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.


  Permission to use, copy, modify and distribute this software and its
  documentation for educational, research and non-profit purposes, without
  fee, and without a written agreement is hereby granted, provided that the
  above copyright notice and the following three paragraphs appear in all
  copies.

  THE UNIVERSITY OF NORTH CAROLINA SPECIFICALLY DISCLAIM ANY WARRANTIES,
  INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS ON AN
  "AS IS" BASIS, AND THE UNIVERSITY OF NORTH CAROLINA HAS NO OBLIGATION TO
  PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.


   ---------------------------------
  |Please send all BUG REPORTS to:  |
  |                                 |
  |   geom@cs.unc.edu               |
  |                                 |
   ---------------------------------


  The author may be contacted via:

  US Mail:         N. Govindaraju 
                       Department of Computer Science
                       Sitterson Hall, CB #3175
                       University of North Carolina
                       Chapel Hill, NC 27599-3175


	Add functions manycgpufft1d and manyscgpufft1d

	by			Simon Potvin and Jerome Genest, July 2006
				Centre d'optique, photonique et laser (COPL)
				Universite Laval
				Quebec, Canada
	Mail:		simon.potvin.1@ulaval.ca
				
\*****************************************************************************/

#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <math.h>

#include <GL/glut.h>
#include <shader.h>
#include <defines.h>
#include <GPUFFTW.h>

#if(defined _WIN64 || defined _WIN32)
typedef BOOL (*NvCplGetDataIntType)(long, long*);
#endif

GLenum buffers1[] = {GL_COLOR_ATTACHMENT0_EXT, GL_COLOR_ATTACHMENT2_EXT};
GLenum buffers2[] = {GL_COLOR_ATTACHMENT1_EXT, GL_COLOR_ATTACHMENT3_EXT};


GLenum buffers41[] = {GL_COLOR_ATTACHMENT0_EXT, GL_COLOR_ATTACHMENT2_EXT, GL_COLOR_ATTACHMENT1_EXT, GL_COLOR_ATTACHMENT3_EXT};
GLenum buffers42[] = {GL_COLOR_ATTACHMENT1_EXT, GL_COLOR_ATTACHMENT3_EXT, GL_COLOR_ATTACHMENT0_EXT, GL_COLOR_ATTACHMENT2_EXT};
GLenum buffers4[] = {GL_COLOR_ATTACHMENT0_EXT, GL_COLOR_ATTACHMENT1_EXT, GL_COLOR_ATTACHMENT2_EXT, GL_COLOR_ATTACHMENT3_EXT};



GPUFFTW::GPUFFTW(){
	
	//Create OpenGL context
	glutInitDisplayMode( GLUT_SINGLE | GLUT_RGBA );
	glutCreateWindow( "NVToolkit" );
  
	InitFBOExts();


	fullscreenrowwidthfp.Load("fullscreenrowwidthFFTptr", fullscreenrowwidthfptext);
	fftstages_twicerowwidth_to_n_fp.Load("maxptrrg", fftstages_twicerowwidth_to_n_fptext);
	fftstages3_to_rowwidthfp.Load("minptrrg", fftstages3_to_rowwidthfptext);
	fullscreenfft1ststage.Load("fullscreenfft1ststage", fullscreenfft1ststagetext);
	fullscreenfft2ndstagefp.Load("fullscreenfft2ndstagefp",fullscreenfft2ndstagefptext);
	copyfp.Load("copy", copyfptext);


	glGenFramebuffersEXT(1, &fb);	

	glGenTextures(1, &textureid[0]);
	glGenTextures(1, &textureid[1]);
	glGenTextures(1, &textureid[2]);
	glGenTextures(1, &textureid[3]);
	glGenTextures(1, &textureid[4]);
	glGenTextures(1, &textureid[5]);

	bool VRAMdetected=false;
	long int videoMemorySize;    
#if(defined _WIN64 || defined WIN32)
	// Load NVCPL library
    HINSTANCE hLib = ::LoadLibrary("NVCPL.dll");
    if (hLib == 0) {
        printf("Unable to load NVCPL.dll\n");
        //return -1;
    }
#ifdef GPU_DEBUG
    printf("\nNVCPL.dll successfully loaded\n\n");
#endif
	NvCplGetDataIntType NvCplGetDataInt = (NvCplGetDataIntType)::GetProcAddress(hLib, "NvCplGetDataInt");
	if (NvCplGetDataInt == 0)
       printf("- Unable to get a pointer to NvCplGetDataInt\n");

    if (NvCplGetDataInt(2, &videoMemorySize) == FALSE){
          printf("Unable to retrieve Video Memory Size!\n");
	}
    else{
#ifdef GPU_DEBUG
          printf("Detected Video Memory Size = %d MBytes\n", videoMemorySize);
#endif
		  VRAMdetected=true;
	}

	// Free NVCPL library
    ::FreeLibrary(hLib);
#endif
	
	if(!VRAMdetected){
		printf("WARNING: Video Memory Size could not be detected. Assuming 256 MB.\n");
		videoMemorySize=256;
	}

	N=videoMemorySize<<16;
	//LOGN represents Log(texture width* texture height) rather than log of number of elements.
	for(LOGN=0;N>>LOGN;LOGN++);
	LOGN=LOGN-4;//added for MRTs
	LOGN--;//added for temporary buffers
	W = (1 << (LOGN - (int) (LOGN/2)) );
	H = (1 << (int) (LOGN/2));

#ifdef GPU_DEBUG
	printf("Largest texture which can be allocated = %d * %d\n",W,H);
#endif
		
	glActiveTextureARB( GL_TEXTURE2_ARB );
	glBindTexture(GL_TEXTURE_RECTANGLE_NV, textureid[0]);
	
	glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_FLOAT_RGBA32_NV, W, H, 0, GL_RGBA, GL_FLOAT, NULL);

	glActiveTextureARB( GL_TEXTURE2_ARB );
	glBindTexture(GL_TEXTURE_RECTANGLE_NV, textureid[1]);
	
	glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_FLOAT_RGBA32_NV, W, H, 0, GL_RGBA, GL_FLOAT, NULL);

	glActiveTextureARB( GL_TEXTURE3_ARB );
	glBindTexture(GL_TEXTURE_RECTANGLE_NV, textureid[2]);
	
	glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_FLOAT_RGBA32_NV, W, H, 0, GL_RGBA, GL_FLOAT, NULL);


	glActiveTextureARB( GL_TEXTURE4_ARB );
	glBindTexture(GL_TEXTURE_RECTANGLE_NV, textureid[3]);
	
	glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_FLOAT_RGBA32_NV, W, H, 0, GL_RGBA, GL_FLOAT, NULL);


	glActiveTextureARB( GL_TEXTURE2_ARB );
	glBindTexture(GL_TEXTURE_RECTANGLE_NV, textureid[4]);
	
	glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_FLOAT_RGBA32_NV, W, H, 0, GL_RGBA, GL_FLOAT, NULL);

	glActiveTextureARB( GL_TEXTURE3_ARB );
	glBindTexture(GL_TEXTURE_RECTANGLE_NV, textureid[5]);
	
	glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_FLOAT_RGBA32_NV, W, H, 0, GL_RGBA, GL_FLOAT, NULL);



	GLuint error=glGetError();
	if(error!=GL_NO_ERROR){
		printf("Error while creating two textures of size %d * %d. Cannot proceed further.\n",W,H);
	}
#ifdef GPU_DEBUG
	else{
		printf("Successfully created two textures of size %d * %d. \n",W,H);
	}
#endif	
	
	MAX_SIZE = 4*W*H; // Dependent on the available video memory ON THE GPU. 
	MIN_SIZE = 16;   //MIN_SIZE should not be less than 16 so that data is packed correctly in the FFT routine
	//This is for cache-optimization. TILE_SIZE should be a power of 2 and the performance improvement based on TILE_SIZE 
	//can be dependent on the underlying graphics processor
	TILE_SIZE=64;
	

	glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, fb);

	glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT,
                                  GL_COLOR_ATTACHMENT0_EXT,
                                  GL_TEXTURE_RECTANGLE_NV, textureid[0], 0);

	glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT,
                                  GL_COLOR_ATTACHMENT1_EXT,
                                  GL_TEXTURE_RECTANGLE_NV, textureid[1], 0);

	glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT,
                                  GL_COLOR_ATTACHMENT2_EXT,
                                  GL_TEXTURE_RECTANGLE_NV, textureid[2], 0);
	
	glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT,
                                  GL_COLOR_ATTACHMENT3_EXT,
                                  GL_TEXTURE_RECTANGLE_NV, textureid[3], 0);
	
	glViewport(0,0,W, H);
	glMatrixMode(GL_PROJECTION);
	glLoadIdentity();
	gluOrtho2D(0, W, 0, H);
	glMatrixMode( GL_MODELVIEW );
	glLoadIdentity();

	glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0);	
}

GPUFFTW::~GPUFFTW(){
	glDeleteTextures(1, &textureid[0]);
	glDeleteTextures(1, &textureid[1]);	
	glDeleteFramebuffersEXT(1, &fb);	
	glutDestroyWindow(glutGetWindow());
}

void GPUFFTW::CheckErrors(){
	GLenum error = glGetError();
	if( error != GL_NO_ERROR ) {
		fprintf( stderr, "\nGL Error: %s\n", gluErrorString( error ) );
		assert(0);
	}
}

/** Prints a formatted error message for error codes returned from setFFTParams()
*/
char *GPUFFTW::FFTErrorString(FFT_ERROR error){
		char *err = new char[1000];
		sprintf(err,"GPUFFTW: ");
		switch(error){
			case FFT_SUCCESS: sprintf(err,"No error."); break;
			case FFT_ARRAY_TOO_BIG: sprintf(err,"Not enough video RAM to FFT. Maximum array size is %d", MAX_SIZE); break;
			case FFT_ARRAY_TOO_SMALL: sprintf(err,"Input Array is too small. Minimum size: %d",MIN_SIZE); break;
			case FFT_NON_POWER_OF_TWO: sprintf(err,"Non power of two array - not handled"); break;
			default: sprintf(err,"Unknown error code. This should not happen."); break;
		}
		return(err);
}

/** Sets internal parameters for performing fft on a given sized array. Returns the actual number
    of fft elements 
*/
FFT_ERROR GPUFFTW::setFFTParams(int size){	
	
	N=size;
	
	if(N<MIN_SIZE) return(FFT_ARRAY_TOO_SMALL);
	if(N>MAX_SIZE) return(FFT_ARRAY_TOO_BIG);
	
	//LOGN represents Log(texture width* texture height) rather than log of number of elements.
	for(LOGN=0;N>>LOGN;LOGN++);
	LOGN=LOGN-3;
	Width = (1 << (LOGN - (int) (LOGN/2)));
	Height = (1 << (int) (LOGN/2));

	TILE_SIZE=64;
	if(TILE_SIZE > Width) TILE_SIZE=Width;
	if(TILE_SIZE > Height) TILE_SIZE=Height;	
	
#ifdef GPU_DEBUG
	printf("\nGPUFFTW: Size of array = %d. Width = %d, height = %d, LOGN=%d, Tile Size = %d\n",size,Width,Height,LOGN,TILE_SIZE);
#endif
	
	return(FFT_SUCCESS);
}
#include "stopwatch.hpp"
FFT_ERROR GPUFFTW::cgpufft1d(float *real, float  *imag, int size, int sign, int isreal){
	if(size < MIN_SIZE){
		return (FFT_ARRAY_TOO_SMALL);
	}
	//amount of data to be ffted on the GPU
	int gpuSize=0;

	//Find largest power of 2 smaller than size
	int logSize=0;
	for(;size>>logSize;logSize++);
	logSize--;
	
	gpuSize=1<<logSize;
	int cpuSize=size-gpuSize;

	if(cpuSize==0){
		FFT_ERROR err = setFFTParams(gpuSize);
		if(err!=FFT_SUCCESS) return(err);	
	
		glActiveTextureARB( GL_TEXTURE3_ARB );
		glBindTexture(GL_TEXTURE_RECTANGLE_NV, textureid[3]);    
		glTexSubImage2D(GL_TEXTURE_RECTANGLE_NV, 0, 0, 0, Width, Height, GL_RGBA, GL_FLOAT, imag);
		
		
		uploadData(real);
		Stopwatch bitonic_timer;
		glFinish();
		bitonic_timer.Start();
		//for(int dummy=0;dummy<100;dummy++)

		if(isreal){
			//rearrange
		}
		cgpufft1d(sign);
		if(isreal){
			//rearrange
		}
		glFinish();
		bitonic_timer.Stop();
		printf("time:%4.5f \n", bitonic_timer.GetTime());



		readbackData(imag);	
		
		glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, fb);
		if(target==1) 
			glReadBuffer( GL_COLOR_ATTACHMENT1_EXT);
		else
			glReadBuffer( GL_COLOR_ATTACHMENT0_EXT);

		glReadPixels(0, 0, Width, Height, GL_RGBA, GL_FLOAT, real);

		glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0);

		if(sign==-1){
			for(int d=0;d<size;d++){
				real[d]/=size;
				imag[d]/=size;
			}

		}

	}
	else return(FFT_NON_POWER_OF_TWO);

	return(FFT_SUCCESS);
}


FFT_ERROR GPUFFTW::scgpufft1d(float *real, int size, int sign){
	float *reals = new float[size/2];
	float *imags = new float[size/2];
	FFT_ERROR result;
	
	


	unsigned long i,i1,i2,i3,i4,np3;
	float c1=0.5,c2,h1r,h1i,h2r,h2i;
	double wr,wi,wpr,wpi,wtemp,theta; //Double precision for the trigonometric recurrences.
	theta=3.14159265358979323846/(double) (size>>1); //Initialize the recurrence.
	int nsize=size/2;

	//convert the 1D real FFT into a complex FFT of half the size -- this can be speeded up using a fragment program and is slow on CPUs
	if (sign == 1) {
		c2 = -0.5;
		for(i=0;i<nsize;i++){
			reals[i] = real[2*i];
			imags[i] = real[2*i+1];

		}
		result=cgpufft1d(reals,imags, size/2, sign, 1);
		for(i=0;i<nsize;i++){
			real[2*i] = reals[i];
			real[2*i+1] = imags[i];

		}
		delete reals;
		delete imags;
	} 
	else {
		c2=0.5; //Otherwise set up for an inverse trans
		theta= -theta; //form.
	}
	wtemp=sin(0.5*theta);
	wpr = -2.0*wtemp*wtemp;
	wpi=sin(theta);
	wr=1.0+wpr;
	wi=wpi;
	np3=size+3;
	for (i=2;i<=(size>>2);i++) { //Case i=1 done separately below.
		i4=1+(i3=np3-(i2=1+(i1=i+i-1)));
		h1r= c1*(real[i1]+real[i3]); //The two separate transforms are sep
		h1i= c1*(real[i2]-real[i4]); //arated out of real.
		h2r = -c2*(real[i2]+real[i4]);
		h2i=c2*(real[i1]-real[i3]);
		real[i1]=h1r+wr*h2r-wi*h2i; //Here they are recombined to form the true transform of the original real real.
		real[i2]=h1i+wr*h2i+wi*h2r;
		real[i3]=h1r-wr*h2r+wi*h2i;
		real[i4] = -h1i+wr*h2i+wi*h2r;
		wr=(wtemp=wr)*wpr-wi*wpi+wr; //The recurrence.
		wi=wi*wpr+wtemp*wpi+wi;
	}
	if (sign == 1) {
		real[0] = (h1r=real[0])+real[1]; //Squeeze the first and last real together to get them all within the original array.
		real[1] = h1r-real[1];
	} 
	else {
		real[0]=c1*((h1r=real[0])+real[1]);
		real[1]=c1*(h1r-real[1]);

		//convert the 1D real FFT into a complex FFT of half the size -- this can be speeded up using a fragment program and is slow on CPUs
12 3 下一页
💿 文件大小 255 K
👤 上传用户 jianmingcao
📂 所属分类数学计算
🏷️ 相关标签

#GPU #计算
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -