//a example mex c-file for use as a basic template
//[C]=cMEXexample(A,B)
//A and B are matrices of any size
/*
matdir='c:\MATLAB\'
outdir=pwd
compile=['nvmex -f ' matdir 'cCode\CUDA\nvmexoptsxp64_VS2008SDK7.bat cudaMEXexample.cu -IC:\CUDA\include -LC:\CUDA\lib64 -lcudart -output ' outdir 'cudaMEXexample']
eval(compile)
*/

//basic includes, others may be needed depending on application
#include <stdlib.h>        //memory allocation, process control, conversions
#include <string.h>        //macro definitions, constants, and declarations of functions and types used not only for string handling but also various memory handling functions
#include "mex.h"           //MATLAB Executable header file, must have
#include "math.h"          //basic mathematical operations
#include "cuda_runtime.h"  //CUDA header file

#define MEM 700 //Preallocated memory size for each matrix-set copied into shared memory for each block
#define BSZ 8   //Block size: how many threads for each block

//claim functions at top
//float *a :pointer type float variable
__global__ void kernel_multiply(float *, float *, float *, int);

/*
 * CUDAERRROR - report the most recent CUDA runtime error, if any, on the
 * MATLAB command line.
 *
 * instr: short label naming the step that was just executed (for example
 *        "kernel"). It is printed in front of the CUDA error text so the
 *        failing step can be identified. May be NULL.
 *
 * Prints nothing when cudaGetLastError() reports cudaSuccess. Returns nothing;
 * the caller is not interrupted.
 */
void CUDAERRROR(const char *instr)
{
    const cudaError_t errornum = cudaGetLastError();

    // Nothing pending: stay silent.
    if (errornum == cudaSuccess) {
        return;
    }

    // Prefix the runtime's message with the caller's label; the label used to
    // be accepted but silently dropped.
    mexPrintf("%s: %s\n", instr ? instr : "CUDA", cudaGetErrorString(errornum));
    // Fixed text only: no conversion specifiers, so no extra arguments.
    mexPrintf("You should clear this function in MATLAB for proper operation.\n");
}

// This function is mandatory. It takes the place of 'main' in c.
//
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
    float *pA, *pB; // pointers to incoming and outgoing data
    int ii; //number of pixels
    const mwSize *pSizeA, *pSizeB; // size array
    int Adims, Bdims;
    int numl;
    int num_bytes;
    int sz; //one size of incoming Matrix

    /* Check for proper number of arguments. */
    if (nrhs != 2) {
        mexErrMsgTxt("two inputs required.");
    }

    /* Assign pointers to each input. */
    pA=(float *)mxGetData(prhs[0]);
    pB=(float *)mxGetData(prhs[1]);
    printf("Check! 
pA0: %f\n", pA[0]); //pC=(float *) mxGetData(plhs[0]); //get dimensions and sizes Adims=mxGetNumberOfDimensions(prhs[0]); Bdims=mxGetNumberOfDimensions(prhs[1]); pSizeA = mxGetDimensions(prhs[0]); pSizeB = mxGetDimensions(prhs[1]); printf("Check! dim: %d\n", Adims); //sz=pSizeA[0]; sz=8; // this writes to the matlab command line for the input data dimension // for input A mexPrintf("size A: "); for (ii = 0;ii>> is mandatory for every cuda code. // The following line start GPU calculation using d_A, d_B, d_C in global memory kernel_multiply<<>>( d_A, d_B, d_C,sz ); // processing goes to kernel_multiply defined below // copy result d_C from global memory to host memory cudaMemcpy( mxGetData(plhs[0]), d_C, numl*sizeof(float), cudaMemcpyDeviceToHost ); CUDAERRROR("kernel"); mexEvalString("pause(.001)"); // free those alocated global memory cudaFree( d_A); cudaFree( d_B); cudaFree( d_C); } // kernel_mutiply function is excuted by each thread __global__ void kernel_multiply(float *A, float *B, float *C, int sz) { int ii,jj; // create space in shared memory in each block __shared__ float sa_data[MEM]; __shared__ float sb_data[MEM]; __shared__ float sc_data[MEM]; // copy input from global memory into shared memory in each block for (ii=0;ii