병렬기법
홈 > CUDA > 병렬기법 > SIMD Reverse 예제

SIMD Reverse 예제

 본 reverse 예제는 NVIDIA에서 CUDA 교육을 담당하고 계시는 유현곤 대리님께서 제공해 주셨습니다. 

 교육 목적으로 코드의 일부를 빈칸으로 처리하였습니다. 본 예제는 CUDA 기초 교육에서 실제로 사용되는 예제입니다.

 

 

 

#include <assert.h>

#include <stdio.h>

#include <stdlib.h>

 

// Simple utility function to check for CUDA runtime errors.
// Reads the last error via cudaGetLastError(); if it is not cudaSuccess,
// prints `msg` together with the CUDA error string to stderr and terminates
// the program with EXIT_FAILURE. Returns normally otherwise.

void checkCUDAError(const char* msg);

 

// Part2: implement the kernel

// Exercise kernel: each thread performs exactly one copy, d_out[out] = d_in[in].
// The four index expressions below are intentionally left blank for the
// student to complete (see "Part2" above).
//
//   d_out : destination array (main() passes the cudaMalloc'ed buffer d_b)
//   d_in  : source array      (main() passes the cudaMalloc'ed buffer d_a)
//
// Launch shape used by main(): dim3 dimGrid(numBlocks) and
// dim3 dimBlock(numThreadsPerBlock), with numThreadsPerBlock = 256.
//
// NOTE(review): the kernel receives no element count and has no bounds guard,
// so every launched thread must compute indices that are valid for both
// arrays. main() allocates both buffers as numBlocks * numThreadsPerBlock ints.
// After the launch main() checks h_a[i] == dimA - 1 - i for every i, so the
// completed kernel has to produce a full reversal of the input.
__global__ void reverseArrayBlock(int *d_out, int *d_in)

{

    // TODO(exercise): presumably the index in d_in where this block's chunk starts.
    int inOffset  = 

    // TODO(exercise): presumably the index in d_out where this block's chunk lands.
    int outOffset = 

    // TODO(exercise): presumably this thread's element index in d_in.
    int in  = 

    // TODO(exercise): presumably this thread's element index in d_out.
    int out = 

    // One element copied per thread.
    d_out[out] = d_in[in];

}

 

////////////////////////////////////////////////////////////////////////////////

// Program main

////////////////////////////////////////////////////////////////////////////////

// Host driver for the array-reversal exercise.
//
// Steps: allocate a host buffer and two device buffers, fill the host buffer
// with 0..dimA-1, copy it to the device, launch reverseArrayBlock, copy the
// result back and verify that element i equals dimA - 1 - i.
//
// Returns 0 after printing "Correct!" when verification passes. Returns
// EXIT_FAILURE when verification finds a mismatch. Exits with EXIT_FAILURE
// (via checkCUDAError or the host-allocation check) on any allocation, copy
// or kernel error.
//
// The "Part 1" initializer is intentionally left blank: it is part of the
// exercise and must be completed before this file compiles.
int main( int argc, char** argv)
{
    // pointer for host memory and size
    int *h_a;
    int dimA = 256 * 1024; // 256K elements (1MB total)

    // pointers for device memory: d_a is the kernel input, d_b the kernel output
    int *d_b, *d_a;

    // define grid and block size
    int numThreadsPerBlock = 256;

    // Part 1: compute number of blocks needed based on array size and desired block size
    // (left blank on purpose -- fill in as part of the exercise)
    int numBlocks = 

    // allocate host and device memory
    // NOTE(review): the buffers are sized from the launch configuration, not
    // from dimA. Only the first dimA host elements are initialized and
    // verified below, so this presumably relies on
    // numBlocks * numThreadsPerBlock == dimA -- confirm once Part 1 is filled in.
    size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int);

    h_a = (int *) malloc(memSize);
    if (h_a == NULL)
    {
        fprintf(stderr, "Host allocation of %lu bytes failed.\n", (unsigned long) memSize);
        exit(EXIT_FAILURE);
    }

    // Check each device allocation right away so a failure is reported at its
    // source instead of surfacing later as a confusing kernel or memcpy error.
    cudaMalloc( (void **) &d_a, memSize );
    checkCUDAError("cudaMalloc d_a");

    cudaMalloc( (void **) &d_b, memSize );
    checkCUDAError("cudaMalloc d_b");

    // Initialize input array on host
    for (int i = 0; i < dimA; ++i)
    {
        h_a[i] = i;
    }

    // Copy host array to device array
    cudaMemcpy( d_a, h_a, memSize, cudaMemcpyHostToDevice );
    checkCUDAError("memcpy host to device");

    // launch kernel
    dim3 dimGrid(numBlocks);
    dim3 dimBlock(numThreadsPerBlock);
    reverseArrayBlock<<< dimGrid, dimBlock >>>( d_b, d_a );

    // block until the device has completed
    // (cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its replacement)
    cudaDeviceSynchronize();

    // check if kernel execution generated an error
    checkCUDAError("kernel invocation");

    // device to host copy
    cudaMemcpy( h_a, d_b, memSize, cudaMemcpyDeviceToHost );

    // Check for any CUDA errors
    checkCUDAError("memcpy");

    // verify the data returned to the host is correct
    // An explicit comparison is used instead of assert() so the check is not
    // compiled out when NDEBUG is defined.
    for (int i = 0; i < dimA; i++)
    {
        if (h_a[i] != dimA - 1 - i)
        {
            fprintf(stderr, "Verification failed at index %d: got %d, expected %d.\n",
                    i, h_a[i], dimA - 1 - i);
            cudaFree(d_a);
            cudaFree(d_b);
            free(h_a);
            return EXIT_FAILURE;
        }
    }

    // free device memory
    cudaFree(d_a);
    cudaFree(d_b);

    // free host memory
    free(h_a);

    // If the program makes it this far, then the results are correct and
    // there are no run-time errors.  Good work!
    printf("Correct!\n");

    return 0;
}

 

// Aborts the program if the CUDA runtime has a pending error.
//
//   msg : caller-supplied label printed in front of the CUDA error string.
//
// Calls cudaGetLastError(); when the result is cudaSuccess the function simply
// returns. Otherwise it writes "Cuda error: <msg>: <error string>." to stderr
// and terminates the process with EXIT_FAILURE.
void checkCUDAError(const char *msg)
{
    const cudaError_t status = cudaGetLastError();

    // Nothing pending: return to the caller.
    if (status == cudaSuccess)
    {
        return;
    }

    const char *description = cudaGetErrorString(status);
    fprintf(stderr, "Cuda error: %s: %s.\n", msg, description);
    exit(EXIT_FAILURE);
}