#include "mex.h"
#include "cufft.h"
#include "cuda_runtime.h"
/**************************************************************************/
/* MATLAB stores complex numbers in separate arrays for the real and
imaginary parts. The following functions take the data in
this format and pack it into a complex work array, or
unpack it, respectively.
We are using cufftComplex defined in cufft.h to handle complex on Windows and Linux
*/
void pack_r2c(cufftComplex *output_float,
double *input_re,
int Ntot)
{
int i;
for (i = 0; i < Ntot; i++)
{
output_float[i].x = input_re[i];
output_float[i].y = 0.0f;
}
}
void pack_c2c(cufftComplex *output_float,
double *input_re,
double *input_im,
int Ntot)
{
int i;
for (i = 0; i < Ntot; i++)
{
output_float[i].x = input_re[i];
output_float[i].y = input_im[i];
}
}
void pack_r2c_expand(cufftComplex *output_float,
double *input_re,
int originalM,
int originalN,
int M,
int N)
{
int i, j;
for (i = 0; i < originalM; i++)
for (j = 0; j < originalN; j++)
{
output_float[i+M*j].x = input_re[i+originalM*j];
output_float[i+M*j].y = 0.0f;
}
}
void pack_c2c_expand(cufftComplex *output_float,
double *input_re,
double *input_im,
int originalM,
int originalN,
int M,
int N)
{
int i, j;
for (i = 0; i < originalM; i++)
for (j = 0; j < originalN; j++)
{
output_float[i+M*j].x = input_re[i+originalM*j];
output_float[i+M*j].y = input_im[i+originalM*j];
}
}
void unpack_c2c(cufftComplex *input_float,
double *output_re,
double *output_im,
int Ntot)
{
int i;
for (i = 0; i < Ntot; i++)
{
output_re[i] = input_float[i].x;
output_im[i] = input_float[i].y;
}
}
/**************************************************************************/
void mexFunction( int nlhs, mxArray *plhs[],
int nrhs, const mxArray *prhs[])
{
int originalM, originalN, M, N;
double *ar,*ai;
cufftComplex *input_single ;
cufftHandle plan;
cufftComplex *rhs_complex_d;
/*
Find out the dimension of the array we want to transform:
prhs(originalM,originalN)
originalM = Number of rows in the mxArray prhs
originalN = Number of columns in the mxArray prhs
*/
originalM = mxGetM(prhs[0]);
originalN = mxGetN(prhs[0]);
M = originalM;
N = originalN;
/*
Find out if the result is of the same size of the input.
plhs(M,N)
M = The number of rows in the mxArray plhs
N = The number of columns in the mxArray plhs
*/
if (nrhs == 3)
{
M = mxGetScalar(prhs[1]);
N = mxGetScalar(prhs[2]);
}
/*
Find out if the input array was real or complex.
Matlab is passing two separate pointers for the real and imaginary part.
The current version of CUDAFFT is expecting interleaved data.
We will need to pack and unpack the data.
The current version of CUDA supports only single precision,
so the original double precision data need to be converted.
*/
/* Allocating working array on host */
if(nrhs == 1) input_single = (cufftComplex*) mxMalloc(sizeof(cufftComplex)*N*M);
if(nrhs == 3) input_single = (cufftComplex*) mxCalloc(N*M,sizeof(cufftComplex));
/* Pointer for the real part of the input */
ar = (double *) mxGetData(prhs[0]);
if(mxIsComplex(prhs[0]))
{
/* The input array is complex */
ai = (double *) mxGetImagData(prhs[0]);
/* Input and output have same dimensions */
if(nrhs ==1) pack_c2c(input_single, ar, ai, N*M);
/* Input and output have different dimensions */
if(nrhs ==3) pack_c2c_expand(input_single, ar, ai, originalM, originalN, M, N);
}
else
{
/* The input array is real */
/* Input and output have same dimensions */
if(nrhs ==1) pack_r2c(input_single, ar, N*M);
/* Input and output have different dimensions */
if(nrhs ==3) pack_r2c_expand(input_single, ar, originalM, originalN, M, N);
}
/* Allocate array on device */
cudaMalloc( (void **) &rhs_complex_d,sizeof(cufftComplex)*N*M);
/* Copy input array in interleaved format to the device */
cudaMemcpy( rhs_complex_d, input_single, sizeof(cufftComplex)*N*M, cudaMemcpyHostToDevice);
/* Create plan for CUDA FFT */
cufftPlan2d(&plan, N, M, CUFFT_C2C) ;
/* Execute FFT on device */
cufftExecC2C(plan, rhs_complex_d, rhs_complex_d, CUFFT_FORWARD) ;
/* Destroy plan */
cufftDestroy(plan);
/* Copy result back to host */
cudaMemcpy( input_single, rhs_complex_d, sizeof(cufftComplex)*N*M, cudaMemcpyDeviceToHost);
plhs[0]=mxCreateDoubleMatrix(M,N,mxCOMPLEX);
ar = mxGetPr(plhs[0]);
ai = mxGetPi(plhs[0]);
unpack_c2c(input_single, ar, ai, N*M);
mxFree(input_single);
cudaFree(rhs_complex_d);
return;
}
|