montecarlo_kernel.cuh

A Monte Carlo option-pricing simulation implemented with the GPU general-purpose computing API (CUDA).
/*
 * Copyright 1993-2007 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws.  Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
 * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
 * OR PERFORMANCE OF THIS SOURCE CODE.
 *
 * U.S. Government End Users.   This source code is a "commercial item" as
 * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
 * "commercial computer  software"  and "commercial computer software
 * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
 * and is provided to the U.S. Government only as a commercial end item.
 * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 * source code with only those rights set forth herein.
 *
 * Any use of this source code in individual and commercial software must
 * include, in the user documentation and internal comments to the code,
 * the above Disclaimer and U.S. Government End Users Notice.
 */



#ifndef MONTECARLO_KERNEL_CUH
#define MONTECARLO_KERNEL_CUH



////////////////////////////////////////////////////////////////////////////////
// Global types
////////////////////////////////////////////////////////////////////////////////
#include <stdlib.h>
#include <stdio.h>
#include <cutil.h>
#include "realtype.h"
#include "MonteCarlo_common.h"



////////////////////////////////////////////////////////////////////////////////
// Helper reduction template
// Please see the "reduction" CUDA SDK sample for more information
////////////////////////////////////////////////////////////////////////////////
#include "MonteCarlo_reduction.cuh"



////////////////////////////////////////////////////////////////////////////////
// Internal GPU-side data structures
////////////////////////////////////////////////////////////////////////////////
#define MAX_OPTIONS 2048

//Preprocessed input option data
typedef struct{
    real S;
    real X;
    real MuByT;
    real VBySqrtT;
} __TOptionData;
static __device__ __constant__ __TOptionData d_OptionData[MAX_OPTIONS];
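//Note: d_OptionData lives in constant memory, so MAX_OPTIONS * sizeof(__TOptionData)
//must fit within the device's 64 KB constant memory limit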

//GPU outputs before CPU postprocessing
typedef struct{
    real Expected;
    real Confidence;
} __TOptionValue;
static __device__ __TOptionValue d_CallValue[MAX_OPTIONS];



////////////////////////////////////////////////////////////////////////////////
// Overloaded shortcut payoff functions for different precision modes
////////////////////////////////////////////////////////////////////////////////
#ifndef DOUBLE_PRECISION
__device__ inline float endCallValue(float S, float X, float r, float MuByT, float VBySqrtT){
    float callValue = S * __expf(MuByT + VBySqrtT * r) - X;
    return (callValue > 0) ? callValue : 0;
}
#else
__device__ inline double endCallValue(double S, double X, double r, double MuByT, double VBySqrtT){
    double callValue = S * exp(MuByT + VBySqrtT * r) - X;
    return (callValue > 0) ? callValue : 0;
}
#endif
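
//endCallValue() maps one standard-normal sample r (d_Samples is assumed to hold
//N(0, 1) draws) to the call payoff at expiry under a lognormal price model:
//S_T = S * exp(MuByT + VBySqrtT * r), payoff = max(S_T - X, 0), with MuByT and
//VBySqrtT precomputed on the host in MonteCarloGPU() below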


////////////////////////////////////////////////////////////////////////////////
// This kernel computes partial integrals over all paths using multiple thread
// blocks per option. It is used when a single thread block per option would not
// be enough to keep the GPU busy. Execution of this kernel is followed by
// MonteCarloReduce() to get the complete integral for each option.
////////////////////////////////////////////////////////////////////////////////
#define THREAD_N 256

static __global__ void MonteCarloKernel(
    __TOptionValue *d_Buffer,
    float *d_Samples,
    int pathN
){
    const int optionIndex = blockIdx.y;

    const real        S = d_OptionData[optionIndex].S;
    const real        X = d_OptionData[optionIndex].X;
    const real    MuByT = d_OptionData[optionIndex].MuByT;
    const real VBySqrtT = d_OptionData[optionIndex].VBySqrtT;

    //One thread per partial integral
    const int   iSum = blockIdx.x * blockDim.x + threadIdx.x;
    const int accumN = blockDim.x * gridDim.x;

    //Cycle through the entire samples array:
    //derive end stock price for each path
    //accumulate into intermediate global memory array
    __TOptionValue sumCall = {0, 0};
    for(int i = iSum; i < pathN; i += accumN){
        real              r = d_Samples[i];
        real      callValue = endCallValue(S, X, r, MuByT, VBySqrtT);
        sumCall.Expected   += callValue;
        sumCall.Confidence += callValue * callValue;
    }
    d_Buffer[optionIndex * accumN + iSum] = sumCall;
}
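
//MonteCarloKernel() is launched by MonteCarloGPU() on a 2D grid:
//gridDim.x = blocksPerOption, gridDim.y = optionCount, THREAD_N threads per block,
//so each option gets accumN = THREAD_N * blocksPerOption partial sums in d_Buffer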



////////////////////////////////////////////////////////////////////////////////
// This kernel computes the integral over all paths using a single thread block
// per option. It is fastest when the number of thread blocks times the work per
// block is high enough to keep the GPU busy. When this is not the case, using
// more blocks per option is faster, so we use MonteCarloKernel() plus
// MonteCarloReduce() instead.
////////////////////////////////////////////////////////////////////////////////
static __global__ void MonteCarloOneBlockPerOption(
    float *d_Samples,
    int pathN
){
    const int SUM_N = THREAD_N;
    __shared__ real s_SumCall[SUM_N];
    __shared__ real s_Sum2Call[SUM_N];

    const int optionIndex = blockIdx.x;
    const real        S = d_OptionData[optionIndex].S;
    const real        X = d_OptionData[optionIndex].X;
    const real    MuByT = d_OptionData[optionIndex].MuByT;
    const real VBySqrtT = d_OptionData[optionIndex].VBySqrtT;

    //Cycle through the entire samples array:
    //derive end stock price for each path
    //accumulate partial integrals into intermediate shared memory buffer
    for(int iSum = threadIdx.x; iSum < SUM_N; iSum += blockDim.x){
        __TOptionValue sumCall = {0, 0};
        for(int i = iSum; i < pathN; i += SUM_N){
            real              r = d_Samples[i];
            real      callValue = endCallValue(S, X, r, MuByT, VBySqrtT);
            sumCall.Expected   += callValue;
            sumCall.Confidence += callValue * callValue;
        }
        s_SumCall[iSum]  = sumCall.Expected;
        s_Sum2Call[iSum] = sumCall.Confidence;
    }

    //Reduce shared memory accumulators
    //and write final result to global memory
    sumReduce<real, SUM_N, THREAD_N>(s_SumCall, s_Sum2Call);
    if(threadIdx.x == 0){
        __TOptionValue t = {s_SumCall[0], s_Sum2Call[0]};
        d_CallValue[optionIndex] = t;
    }
}
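
//MonteCarloOneBlockPerOption() is launched by MonteCarloGPU() with one block per
//option (<<<optionCount, THREAD_N>>>) when pathN / optionCount < 8192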



////////////////////////////////////////////////////////////////////////////////
//Final reduction for MonteCarloKernel():
//sums the per-block partial accumulators into one result per option
////////////////////////////////////////////////////////////////////////////////
static __global__ void MonteCarloReduce(
    __TOptionValue *d_Buffer,
    int accumN
){
    const int SUM_N = THREAD_N;
    __shared__ real s_SumCall[SUM_N];
    __shared__ real s_Sum2Call[SUM_N];
    __TOptionValue *d_SumBase = &d_Buffer[blockIdx.x * accumN];

    //Reduce global memory accumulators array for current option
    //to a set fitting into shared memory
    for(int iSum = threadIdx.x; iSum < SUM_N; iSum += blockDim.x){
        __TOptionValue sumCall = {0, 0};
        for(int pos = iSum; pos < accumN; pos += SUM_N){
            __TOptionValue    t = d_SumBase[pos];
            sumCall.Expected   += t.Expected;
            sumCall.Confidence += t.Confidence;
        }
        s_SumCall[iSum]  = sumCall.Expected;
        s_Sum2Call[iSum] = sumCall.Confidence;
    }

    //Reduce shared memory accumulators
    //and write final result to global memory
    sumReduce<real, SUM_N, THREAD_N>(s_SumCall, s_Sum2Call);
    if(threadIdx.x == 0){
        __TOptionValue t = {s_SumCall[0], s_Sum2Call[0]};
        d_CallValue[blockIdx.x] = t;
    }
}
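
//MonteCarloReduce() is launched with one block per option
//(<<<optionCount, THREAD_N>>>); accumN equals THREAD_N * blocksPerOption,
//matching the number of partial sums written by MonteCarloKernel()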



////////////////////////////////////////////////////////////////////////////////
// Host-side interface to GPU Monte Carlo
////////////////////////////////////////////////////////////////////////////////
//Allocate internal device memory
static void initMonteCarloGPU(TOptionPlan *plan){
    const int doMultiBlock = (plan->pathN / plan->optionCount) >= 8192;
    if(doMultiBlock){
        const int blocksPerOption = (plan->optionCount < 16) ? 64 : 16;
        const int          accumN = THREAD_N * blocksPerOption;
        CUDA_SAFE_CALL( cudaMalloc(
            (void **)&plan->d_Buffer,
            accumN * plan->optionCount * sizeof(__TOptionValue)
        ) );
    }
}
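
//No allocation is needed on the single-block path: MonteCarloOneBlockPerOption()
//writes directly to d_CallValue without an intermediate buffer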

//Deallocate internal device memory
static void closeMonteCarloGPU(TOptionPlan *plan){
    const int doMultiBlock = (plan->pathN / plan->optionCount) >= 8192;
    if(doMultiBlock) CUDA_SAFE_CALL( cudaFree(plan->d_Buffer) );
}

//Main computations
static void MonteCarloGPU(TOptionPlan *plan){
    __TOptionData h_OptionData[MAX_OPTIONS];
    __TOptionValue h_CallValue[MAX_OPTIONS];

    if(plan->optionCount <= 0 || plan->optionCount > MAX_OPTIONS){
        printf("MonteCarloGPU(): bad option count.\n");
        return;
    }

    for(int i = 0; i < plan->optionCount; i++){
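        //Per-option drift and diffusion terms of the terminal stock price,
        //S_T = S * exp(MuByT + VBySqrtT * Z) with Z ~ N(0, 1)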
        const double           T = plan->optionData[i].T;
        const double           R = plan->optionData[i].R;
        const double           V = plan->optionData[i].V;
        const double       MuByT = (R - 0.5 * V * V) * T;
        const double    VBySqrtT = V * sqrt(T);
        h_OptionData[i].S        = (real)plan->optionData[i].S;
        h_OptionData[i].X        = (real)plan->optionData[i].X;
        h_OptionData[i].MuByT    = (real)MuByT;
        h_OptionData[i].VBySqrtT = (real)VBySqrtT;
    }

    CUDA_SAFE_CALL( cudaMemcpyToSymbol(
        d_OptionData,
        h_OptionData,
        plan->optionCount * sizeof(__TOptionData)
    ) );

    const int doMultiBlock = (plan->pathN / plan->optionCount) >= 8192;
    if(doMultiBlock){
        const int blocksPerOption = (plan->optionCount < 16) ? 64 : 16;
        const int          accumN = THREAD_N * blocksPerOption;
        const dim3 gridMain(blocksPerOption, plan->optionCount);
        MonteCarloKernel<<<gridMain, THREAD_N>>>(
            (__TOptionValue *)plan->d_Buffer,
            plan->d_Samples,
            plan->pathN
        );
        CUT_CHECK_ERROR("MonteCarloKernel() execution failed\n");
        MonteCarloReduce<<<plan->optionCount, THREAD_N>>>(
            (__TOptionValue *)plan->d_Buffer,
            accumN
        );
        CUT_CHECK_ERROR("MonteCarloReduce() execution failed\n");
    }else{
        MonteCarloOneBlockPerOption<<<plan->optionCount, THREAD_N>>>(
            plan->d_Samples,
            plan->pathN
        );
        CUT_CHECK_ERROR("MonteCarloOneBlockPerOption() execution failed\n");
    }
    CUDA_SAFE_CALL( cudaMemcpyFromSymbol(
        h_CallValue,
        d_CallValue,
        plan->optionCount * sizeof(__TOptionValue)
    ) );

    for(int i = 0; i < plan->optionCount; i++){
        const double    RT = plan->optionData[i].R * plan->optionData[i].T;
        const double   sum = h_CallValue[i].Expected;
        const double  sum2 = h_CallValue[i].Confidence;
        const double pathN = plan->pathN;
        //Derive the average from the total sum and discount by the risk-free rate
        plan->callValue[i].Expected = (float)(exp(-RT) * sum / pathN);
        //Standard deviation
        double stdDev = sqrt((pathN * sum2 - sum * sum)/ (pathN * (pathN - 1)));
        //Width of the 95% confidence interval: the theoretical value lies within these bounds in 95% of cases
        plan->callValue[i].Confidence = (float)(exp(-RT) * 1.96 * stdDev / sqrt(pathN));
    }
}
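
/*
 * Usage sketch (an outline only; the full TOptionPlan layout is defined in
 * MonteCarlo_common.h, not shown here): the caller fills plan.optionCount,
 * plan.pathN and plan.optionData[], points plan.d_Samples at pathN
 * pre-generated N(0, 1) samples in device memory, then calls
 * initMonteCarloGPU(&plan), MonteCarloGPU(&plan) and closeMonteCarloGPU(&plan);
 * results are read back into plan.callValue[].
 */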



#endif //MONTECARLO_KERNEL_CUH
