병렬기법
홈 > CUDA > 병렬기법 > SIMD nbody 예제

SIMD nbody 예제

 본 예제는 NVIDIA에서 제공하는 CUDA SDK에 수록되어 있습니다.

 

 

CPU 코드와 CUDA 코드를 비교해 볼 수 있습니다.

 

 

 CPU코드

#include <math.h>

 

////////////////////////////////////////////////////////////////////////////////

// export C interface
// NOTE: the prototype must match the definition below — it previously
// omitted the trailing `softeningSquared` parameter, so callers compiled
// against this declaration would pass too few arguments.
extern "C"
void computeGold( float* force, float* pos, const unsigned int numBodies, float softeningSquared);

 

// Accumulate into accel[] the softened gravitational pull between the two
// bodies.  posMass{0,1} are packed as {x, y, z, mass}; the contribution is
// (posMass0 - posMass1) scaled by posMass1's mass over the cubed softened
// distance.  softeningSquared keeps the denominator finite at zero range.
void
bodyBodyInteraction(float accel[3], float posMass0[4], float posMass1[4], float softeningSquared)
{
    // Separation vector posMass0 - posMass1  [3 FLOPS]
    float dx = posMass0[0] - posMass1[0];
    float dy = posMass0[1] - posMass1[1];
    float dz = posMass0[2] - posMass1[2];

    // Softened squared distance  [6 FLOPS]
    float softDistSqr = dx * dx + dy * dy + dz * dz + softeningSquared;

    // 1 / dist^3 via a single sqrt and divide  [4 FLOPS (2 mul, 1 sqrt, 1 inv)]
    float rcpDist  = 1.0f / sqrtf(softDistSqr);
    float rcpDist3 = rcpDist * rcpDist * rcpDist;

    // Scale factor: mass of body 1 over the cubed softened distance  [1 FLOP]
    float scale = posMass1[3] * rcpDist3;

    // Fold this pair's contribution into the accumulator  [6 FLOPS]
    accel[0] += dx * scale;
    accel[1] += dy * scale;
    accel[2] += dz * scale;
}

 

////////////////////////////////////////////////////////////////////////////////

//! Compute reference data set

//! Each element is multiplied with the number of threads / array length

//! @param reference  reference data, computed but preallocated

//! @param idata      input data as provided to device

//! @param len        number of elements in reference / idata

////////////////////////////////////////////////////////////////////////////////

void

computeGold( float* force, float* pos, const unsigned int numBodies, float softeningSquared)

{

    for(unsigned int i = 0; i < numBodies; ++i)

    {

        force[i*4  ] = 0;

               force[i*4+1] = 0;

               force[i*4+2] = 0;

               force[i*4+3] = 0;

    }

 

    for(unsigned int i = 0; i < numBodies; ++i)

    {

        for(unsigned int j = 0; j < numBodies; ++j)

            {

                float f[4];          

                bodyBodyInteraction(f, &pos[j*4], &pos[i*4], softeningSquared);    

                for (int k = 0; k < 3; ++k)

                {

                    force[i*4+k] += f[k];

                }

            }

    }

}

 

 

 

CUDA 코드

#ifndef _NBODY_KERNEL_H_

#define _NBODY_KERNEL_H_

 

#include <math.h>

 

#define BLOCKDIM 256

#define LOOP_UNROLL 4

 

__constant__ float softeningSquared;

 

// Macros to simplify shared memory addressing

#define SX(i) sharedPos[i+blockDim.x*threadIdx.y]

// This macro is only used when multithreadBodies is true (below)

#define SX_SUM(i,j) sharedPos[i+blockDim.x*j]

 

 

// One pairwise interaction: fold the contribution of the (bi, bj) pair into
// the running acceleration ai and return the updated value.  bi/bj pack
// {x, y, z, mass}; softeningSquared (constant memory) keeps the denominator
// finite when the two bodies coincide.
__device__ float3
bodyBodyInteraction(float3 ai, float4 bi, float4 bj)
{
    // Separation vector bi - bj  [3 FLOPS]
    float3 d;
    d.x = bi.x - bj.x;
    d.y = bi.y - bj.y;
    d.z = bi.z - bj.z;

    // Softened squared distance: dot(d, d) + EPS^2  [6 FLOPS]
    float softDistSqr = d.x * d.x + d.y * d.y + d.z * d.z + softeningSquared;

    // 1 / dist^3 via a single sqrt and divide  [4 FLOPS (2 mul, 1 sqrt, 1 inv)]
    float rcpDist  = 1.0f / sqrtf(softDistSqr);
    float rcpDist3 = rcpDist * rcpDist * rcpDist;

    // Scale by the .w (mass) component of bj  [1 FLOP]
    float scale = bj.w * rcpDist3;

    // a_i = a_i + scale * d  [6 FLOPS]
    ai.x += d.x * scale;
    ai.y += d.y * scale;
    ai.z += d.z * scale;

    return ai;
}

 

 

// This is the "tile_calculation" function from the GPUG3 article: fold the
// interactions with all blockDim.x bodies currently staged in shared memory
// (one tile) into this thread's private acceleration accumulator.
// NOTE: each pass performs LOOP_UNROLL interactions without re-checking
// `counter`, so blockDim.x must be a multiple of LOOP_UNROLL.
__device__ float3 gravitation(float4 myPos, float3 accel)
{
    extern __shared__ float4 sharedPos[];

    // The CUDA 1.1 compiler cannot determine that i is not going to
    // overflow in the loop below.  Therefore if int is used on 64-bit linux
    // or windows (or long instead of long long on win64), the compiler
    // generates suboptimal code.  Therefore we use long long on win64 and
    // long on everything else. (Workaround for Bug ID 347697)
    //
    // BUG FIX: the guard used "#ifdef _Win64", but the MSVC predefined macro
    // is spelled _WIN64 (all caps), so the win64 branch was never compiled
    // and the workaround above was silently defeated.
#ifdef _WIN64
    unsigned long long i = 0;
#else
    unsigned long i = 0;
#endif

    // Here we unroll the loop manually with LOOP_UNROLL interactions per pass.

    // Note that having an unsigned int loop counter and an unsigned
    // long index helps the compiler generate efficient code on 64-bit
    // OSes.  The compiler can't assume the 64-bit index won't overflow
    // so it incurs extra integer operations.  This is a standard issue
    // in porting 32-bit code to 64-bit OSes.

    for (unsigned int counter = 0; counter < blockDim.x; )
    {
        accel = bodyBodyInteraction(accel, SX(i++), myPos);
        counter++;
#if LOOP_UNROLL > 1
        accel = bodyBodyInteraction(accel, SX(i++), myPos);
        counter++;
#endif
#if LOOP_UNROLL > 2
        accel = bodyBodyInteraction(accel, SX(i++), myPos);
        accel = bodyBodyInteraction(accel, SX(i++), myPos);
        counter += 2;
#endif
#if LOOP_UNROLL > 4
        accel = bodyBodyInteraction(accel, SX(i++), myPos);
        accel = bodyBodyInteraction(accel, SX(i++), myPos);
        accel = bodyBodyInteraction(accel, SX(i++), myPos);
        accel = bodyBodyInteraction(accel, SX(i++), myPos);
        counter += 4;
#endif
    }

    return accel;
}

 

// WRAP is used to force each block to start working on a different

// chunk (and wrap around back to the beginning of the array) so that

// not all multiprocessors try to read the same memory locations at

// once.

#define WRAP(x,m) (((x)<m)?(x):(x-m))  // Mod without divide, works on values from 0 up to 2m

 

// Computes the total acceleration on one body (bodyPos) from all bodies in
// `positions`, staged tile-by-tile through dynamic shared memory.
// Requires dynamic shared memory of blockDim.x * blockDim.y * sizeof(float4)
// (the 3rd <<<>>> launch argument).  numTiles uses integer division, so
// numBodies is presumably expected to be a multiple of
// blockDim.x * blockDim.y — any remainder bodies are never loaded.
template <bool multithreadBodies>
__device__ float3
computeBodyAccel(float4 bodyPos, float4* positions, int numBodies)
{
    // Per-block tile staging buffer.
    extern __shared__ float4 sharedPos[];

    float3 acc = {0.0f, 0.0f, 0.0f};

    int p = blockDim.x;   // threads per tile row
    int q = blockDim.y;   // rows of threads (threads per body when multithreadBodies)
    int n = numBodies;
    int numTiles = n / (p * q);

    // Each block starts at a different tile (offset by blockIdx.y) and WRAP()
    // folds the index back into range without a divide, so the
    // multiprocessors don't all read the same memory locations at once.
    for (int tile = blockIdx.y; tile < numTiles + blockIdx.y; tile++)
    {
        // Every thread loads one body of the current tile into shared memory.
        sharedPos[threadIdx.x+blockDim.x*threadIdx.y] =
            multithreadBodies ?
            positions[WRAP(blockIdx.x + q * tile + threadIdx.y, gridDim.x) * p + threadIdx.x] :
            positions[WRAP(blockIdx.x + tile,                   gridDim.x) * p + threadIdx.x];

        __syncthreads();   // tile fully staged before any thread reads it

        // This is the "tile_calculation" function from the GPUG3 article.
        acc = gravitation(bodyPos, acc);

        __syncthreads();   // all reads done before the next tile overwrites the buffer
    }

    // When the numBodies / thread block size is < # multiprocessors (16 on G80), the GPU is
    // underutilized.  For example, with a 256 threads per block and 1024 bodies, there will only
    // be 4 thread blocks, so the GPU will only be 25% utilized. To improve this, we use multiple
    // threads per body.  We still can use blocks of 256 threads, but they are arranged in q rows
    // of p threads each.  Each thread processes 1/q of the forces that affect each body, and then
    // 1/q of the threads (those with threadIdx.y==0) add up the partial sums from the other
    // threads for that body.  To enable this, use the "--p=" and "--q=" command line options to
    // this example. e.g.: "nbody.exe --n=1024 --p=64 --q=4" will use 4 threads per body and 256
    // threads per block. There will be n/p = 16 blocks, so a G80 GPU will be 100% utilized.

    // We use a bool template parameter to specify when the number of threads per body is greater
    // than one, so that when it is not we don't have to execute the more complex code required!
    if (multithreadBodies)
    {
        // Publish this thread's partial acceleration so row y==0 can combine them.
        SX_SUM(threadIdx.x, threadIdx.y).x = acc.x;
        SX_SUM(threadIdx.x, threadIdx.y).y = acc.y;
        SX_SUM(threadIdx.x, threadIdx.y).z = acc.z;

        __syncthreads();

        // Row 0 reduces the q partial sums for its body; other rows return
        // their partial acc, which the caller is expected to ignore.
        if (threadIdx.y == 0)
        {
            for (int i = 1; i < blockDim.y; i++)
            {
                acc.x += SX_SUM(threadIdx.x,i).x;
                acc.y += SX_SUM(threadIdx.x,i).y;
                acc.z += SX_SUM(threadIdx.x,i).z;
            }
        }
    }

    return acc;
}

 

// Euler integration kernel: one thread per body.  Accumulates acceleration
// over all bodies, advances velocity (with damping) and position by
// deltaTime, and writes the new state to newPos/newVel.
// Launch as a 1-D grid of blocks whose total thread count equals numBodies;
// there is deliberately NO `index < numBodies` tail guard, because
// computeBodyAccel contains __syncthreads() barriers that every thread of
// the block must reach — so numBodies must be a multiple of the block size.
template<bool multithreadBodies>
__global__ void
integrateBodies(float4* newPos, float4* newVel,
                float4* oldPos, float4* oldVel,
                float deltaTime, float damping,
                int numBodies)
{
    // Plain 32-bit multiply instead of the legacy __mul24 intrinsic, which
    // silently truncates its operands to 24 bits.
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    float4 pos = oldPos[index];

    // BUG FIX: computeBodyAccel is a template on multithreadBodies and the
    // bool cannot be deduced from the arguments, so the parameter must be
    // forwarded explicitly for the call to compile.
    float3 accel = computeBodyAccel<multithreadBodies>(pos, oldPos, numBodies);

    // acceleration = force / mass;
    // new velocity = old velocity + acceleration * deltaTime
    // note we factor out the body's mass from the equation, here and in bodyBodyInteraction
    // (because they cancel out).  Thus here force == acceleration
    float4 vel = oldVel[index];

    vel.x += accel.x * deltaTime;
    vel.y += accel.y * deltaTime;
    vel.z += accel.z * deltaTime;

    // Velocity damping; damping == 1.0f leaves the velocity unchanged.
    vel.x *= damping;
    vel.y *= damping;
    vel.z *= damping;

    // new position = old position + velocity * deltaTime
    pos.x += vel.x * deltaTime;
    pos.y += vel.y * deltaTime;
    pos.z += vel.z * deltaTime;

    // store new position and velocity
    newPos[index] = pos;
    newVel[index] = vel;
}

 

 

#endif // #ifndef _NBODY_KERNEL_H_