     병렬기법 홈 > CUDA > 병렬기법 > SIMD nbody 예제

SIMD nbody 예제

본 예제는 NVIDIA에서 제공하는 CUDA SDK에 수록되어 있습니다.

CPU 코드와 CUDA 코드를 비교해 볼 수 있습니다.

CPU 코드

 #include   //////////////////////////////////////////////////////////////////////////////// // export C interface extern "C" void computeGold( float* reference, float* idata, const unsigned int len);   void bodyBodyInteraction(float accel, float posMass0, float posMass1, float softeningSquared) {     float r;       // r_01  [3 FLOPS]     r = posMass0 - posMass1;     r = posMass0 - posMass1;     r = posMass0 - posMass1;       // d^2 + e^2 [6 FLOPS]     float distSqr = r * r + r * r + r * r;     distSqr += softeningSquared;       // invDistCube =1/distSqr^(3/2)  [4 FLOPS (2 mul, 1 sqrt, 1 inv)]     float invDist = 1.0f / sqrtf(distSqr);         float invDistCube =  invDist * invDist * invDist;       // s = m_j * invDistCube [1 FLOP]     float s = posMass1 * invDistCube;       // (m_1 * r_01) / (d^2 + e^2)^(3/2)  [6 FLOPS]     accel += r * s;     accel += r * s;     accel += r * s; }   //////////////////////////////////////////////////////////////////////////////// //! Compute reference data set //! Each element is multiplied with the number of threads / array length //! @param reference  reference data, computed but preallocated //! @param idata      input data as provided to device //! 
@param len        number of elements in reference / idata //////////////////////////////////////////////////////////////////////////////// void computeGold( float* force, float* pos, const unsigned int numBodies, float softeningSquared) {     for(unsigned int i = 0; i < numBodies; ++i)     {         force[i*4  ] = 0;                force[i*4+1] = 0;                force[i*4+2] = 0;                force[i*4+3] = 0;     }       for(unsigned int i = 0; i < numBodies; ++i)     {         for(unsigned int j = 0; j < numBodies; ++j)             {                 float f;                           bodyBodyInteraction(f, &pos[j*4], &pos[i*4], softeningSquared);                     for (int k = 0; k < 3; ++k)                 {                     force[i*4+k] += f[k];                 }             }     } }

CUDA 코드