Jacket for MATLAB
Ȩ > SWÁ¦Ç° > Jacket for MATLAB

NV-MEX °­ÁÂ

 Matlab¿¡¼­ NV-MEX¸¦ ÀÌ¿ëÇϸé CUDA¿Í MatlabÀ» ¿¬µ¿ÇÒ ¼ö ÀÖ½À´Ï´Ù.

 

MEX ±¸Á¶

 

MEX´Â Matlab EXecutableÀÇ ¾àÀÚ·Î, MatlabÀÇ ÀåÁ¡ÀÎ ±×·¡ÇÈ Á¦¾î µîÀÇ ÆíÀ̼º ¹× º¤ÅÍ °ü¸®ÀÇ ÀåÁ¡°ú C¾ð¾îÀÇ ÀåÁ¡ÀÎ ¼Óµµ¿Í È®À强À» µ¿½Ã¿¡ ÃæÁ·½ÃŰ´Â À¯¿ëÇÑ ÅøÀÌ´Ù.

 

MEX´Â ´ÙÀ½°ú °°ÀÌ ÀÔ·ÂÆÄ¶ó¹ÌÅÍ ºÎºÐÀÎ RHS(right hand side)°ú °á°úÆÄ¶ó¹ÌÅ͸¦ Ãâ·ÂÇϴ LHS(left hand side)°ú ÇÔ¼ö¸íÀ¸·Î ÀÌÁÖ¾îÁ® ÀÖ´Ù. 

 

> LHS = function (RHS)

 

½ÇÁ¦ ÇÔ¼ö ¼±¾ðÀº ´ÙÀ½°ú °°Àº ±¸Á¶¸¦ °¡Áö°í Àִµ¥,

void mexFunction(int nlhs, mxArray *plhs[],
                            int nrhs, const mxArray *prhs[]) {   } 

 

Ç¥ÁØ C¾ð¾îÀÇ void main(int argc, char* argv[]){   } ¿Í À¯»çÇÑ ±¸Á¶ÀÓÀ» ½±°Ô ¾Ë ¼ö ÀÖ´Ù.

 

MEXÆÄÀÏÀº C/fortran ÄÄÆÄÀÏ·¯¸¦ ¿¬°áÇØÁÖ´Â perl script ±¸¹®°ú Matlab ÇÔ¼ö¿ÍÀÇ ¿¬°á°ü·Ã API¸¦ Á¦°øÇÏ´Â MEX header ¹× ¶óÀ̺귯¸® ÆÄÀÏ·Î ±¸¼ºµÇ¾î ÀÖ´Ù.  µû¶ó¼­, »ç¿ëÀÚ´Â »õ·Î¿î ÇÔ¼ö¸¦ MEX API ±Ô¾à¿¡ ¸Â°Ô ÇÔ¼ö¸¦ Á¤ÀÇÇÏ¿© ÄÄÆÄÀÏÇÑ ÈÄ matlab¿¡¼­ ÇÔ¼ö¸¦ ·ÎµåÇÏ¿© »ç¿ëÇÒ ¼ö ÀÖ´Ù. 

 

NV-MEX ±¸Á¶

MEX´Â gcc, lcc, cl µîÀÇ ÄÄÆÄÀÏ·¯¸¦ »ç¿ëÇϴµ¥, CUDA°¡¼Ó±â´ÉÀ» MEX¸¦ ÀÌ¿ëÇϱâ À§Çؼ­´Â CUDA ÄÄÆÄÀÏÀÌ °¡´ÉÇÑ nvcc ÄÄÆÄÀÏ·¯¸¦ »ç¿ëÇÏ¿©¾ß ÇÑ´Ù. ±âÁ¸ÀÇ Matlab MEX¿¡¼­´Â nvcc ÄÄÆÄÀÏ·¯¸¦ Áö¿øÇÏÁö ¾ÊÀ¸¹Ç·Î CUDA°ü·Ã ÄÄÆÄÀÏÀÌ °¡´ÉÇϵµ·Ï Á¶Á¤ÇØÁÖ¾î¾ß ÇÑ´Ù. NVIDIA¿¡¼­´Â MEX perl ½ºÅ©¸³Æ®¸¦ º¯ÇüÇÑ NV-MEX perl ½ºÅ©¸³Æ®¸¦ Á¦°øÇϰí ÀÖÀ¸¸ç, À̸¦ ÀÌ¿ëÇÏ¿© nvcc¿Í MatlabÀÇ ¿¬µ¿ÀÌ °¡´ÉÇÏ´Ù.

 

¼¼ÆÃ¼ø¼­

NV-MEXÀÇ È¯°æ ¼³Á¤ ¼ø¼­´Â ´ÙÀ½°ú °°´Ù.

1. Matlab ¼³Ä¡

2. (WindowsÀÇ °æ¿ì) Visual Studio 2005/2008 ¼³Ä¡

2. CUDA driver, toolkit, SDK ¼³Ä¡

3. NV-MEX ´Ù¿î·Îµå ¹× ¼³Ä¡

4. Matlab µ¥¸ð¿¹Á¦ È®ÀÎ

5. (optional) Jacket 1.2 ¼³Ä¡

6. Matlab + CUDA °³¹ß

 

¸µÅ©¸ñ·Ï

http://www.mathworks.com/support/tech-notes/1600/1605.html : Matlab MEX °­ÁÂ

http://www.nvidia.com/object/matlab_acceleration.html : CUDA+Matlab ¼º°ø½ºÅ丮

http://developer.nvidia.com/object/matlab_cuda.html : NVMEX ÆÄÀÏ ´Ù¿î·Îµå

http://www.cs.ucf.edu/~janaka/gpu/using_nvmex.htm : NVMEX ÆÄÀÏ ´Ù¿î·Îµå

 

 

´ÙÀ½Àº NVMEX »ùÇÿ¹Á¦·Î, Matlab¿¡¼­ CUDA¸¦ Àû¿ë½ÃŰ´Â ¹æ¹ýÀ» È®ÀÎÇÒ ¼ö ÀÖ´Ù.

 

#include "mex.h"

#include "cufft.h"

#include "cuda_runtime.h"

 

/**************************************************************************/

 

/* MATLAB stores complex numbers in separate arrays for the real and

   imaginary parts.  The following functions take the data in

   this format and pack it into a complex work array, or

   unpack it, respectively. 

   We are using cufftComplex defined in cufft.h to  handle complex on Windows and Linux

 

*/

 

void pack_r2c(cufftComplex  *output_float,

              double *input_re,

              int Ntot)

{

    int i;

    for (i = 0; i < Ntot; i++)

              {

               output_float[i].x = input_re[i];

               output_float[i].y = 0.0f;

              }

}

 

void pack_c2c(cufftComplex  *output_float,

              double *input_re,

              double *input_im,

              int Ntot)

{

    int i;

    for (i = 0; i < Ntot; i++)

             {

               output_float[i].x = input_re[i];

               output_float[i].y = input_im[i];

             }

}

 

 

void pack_r2c_expand(cufftComplex  *output_float,

                     double *input_re,

                     int originalM,

                     int originalN,

                     int M,

                     int N)

{

     int i, j;

    for (i = 0; i < originalM; i++)

     for (j = 0; j < originalN; j++)

     {

               output_float[i+M*j].x = input_re[i+originalM*j];

               output_float[i+M*j].y = 0.0f;

     }

}

 

void pack_c2c_expand(cufftComplex  *output_float,

                     double *input_re,

                     double *input_im,

                     int originalM,

                     int originalN,

                     int M,

                     int N)

{

     int i, j;

 

    for (i = 0; i < originalM; i++)

     for (j = 0; j < originalN; j++)

     {

               output_float[i+M*j].x = input_re[i+originalM*j];

               output_float[i+M*j].y = input_im[i+originalM*j];

     }

}

 

void unpack_c2c(cufftComplex  *input_float,

                double *output_re,

                double *output_im, 

                int Ntot)

{

    int i;

    for (i = 0; i < Ntot; i++)

    {

               output_re[i] = input_float[i].x;

               output_im[i] = input_float[i].y;

    }

 

}

 

 

/**************************************************************************/

 

void mexFunction( int nlhs, mxArray *plhs[],

                  int nrhs, const mxArray *prhs[])

{

  int originalM, originalN, M, N;

  double *ar,*ai;

  cufftComplex *input_single ;

  cufftHandle            plan;

  cufftComplex *rhs_complex_d;

 

  /*

     Find out the  dimension of the array we want to transform:

 

     prhs(originalM,originalN)

     originalM = Number of rows    in the mxArray prhs

     originalN = Number of columns in the mxArray prhs

 

  */

 

    originalM = mxGetM(prhs[0]);

    originalN = mxGetN(prhs[0]);

 

    M = originalM;

    N = originalN;

 

 

  /*

    Find out if the result is of the same size of the input.

 

    plhs(M,N)

    M = The number of rows in the mxArray plhs

    N = The number of columns in the mxArray plhs

 

  */

   if (nrhs == 3)

   {

    M = mxGetScalar(prhs[1]);

    N = mxGetScalar(prhs[2]);

   }

 

  

  /*

    Find out if the input array was real or complex.

    Matlab is passing two separate pointers for the real and imaginary part.

    The current version of  CUDAFFT is expecting interleaved data.

    We will need to pack and unpack the data.

 

    The current version of CUDA supports only single precision,

    so the original double precision data need to be converted.

  */

 

  /* Allocating working array on host */

    if(nrhs == 1) input_single  = (cufftComplex*) mxMalloc(sizeof(cufftComplex)*N*M);

    if(nrhs == 3) input_single  = (cufftComplex*) mxCalloc(N*M,sizeof(cufftComplex));

 

 

  /* Pointer for the real part of the input */

    ar =  (double *) mxGetData(prhs[0]);

 

  if(mxIsComplex(prhs[0]))

  {

   /* The input array is complex */

   ai =  (double *) mxGetImagData(prhs[0]);

  

   /* Input and output have same dimensions */

   if(nrhs ==1) pack_c2c(input_single, ar, ai, N*M);

 

   /* Input and output have different dimensions */

   if(nrhs ==3) pack_c2c_expand(input_single, ar, ai, originalM, originalN, M, N);

 

  }

  else

  {

   /* The input array is real */

 

   /* Input and output have same dimensions */

   if(nrhs ==1) pack_r2c(input_single, ar, N*M);

 

   /* Input and output have different dimensions */

   if(nrhs ==3) pack_r2c_expand(input_single, ar, originalM, originalN, M, N);

  }

 

  /* Allocate array on device */

  cudaMalloc( (void **) &rhs_complex_d,sizeof(cufftComplex)*N*M);

 

  /* Copy input array in interleaved format to the device */

  cudaMemcpy( rhs_complex_d, input_single, sizeof(cufftComplex)*N*M, cudaMemcpyHostToDevice);

 

 

  /* Create plan for CUDA FFT */

  cufftPlan2d(&plan, N, M, CUFFT_C2C) ;

 

  /* Execute FFT on device */

    cufftExecC2C(plan, rhs_complex_d, rhs_complex_d, CUFFT_FORWARD) ;

 

  /* Destroy plan */

    cufftDestroy(plan);

 

  /* Copy result back to host */

  cudaMemcpy( input_single, rhs_complex_d, sizeof(cufftComplex)*N*M, cudaMemcpyDeviceToHost);

 

 

  plhs[0]=mxCreateDoubleMatrix(M,N,mxCOMPLEX);

 

  ar = mxGetPr(plhs[0]);

  ai = mxGetPi(plhs[0]);

  unpack_c2c(input_single, ar, ai, N*M);

 

  mxFree(input_single);

  cudaFree(rhs_complex_d);

 

  return;

}