//An example CUDA MEX file for use as a basic template
//[C]=cudaMEXexample(A,B)
//A and B are single-precision arrays of identical size; C is their element-wise product

/*
matdir='c:\MATLAB\'
outdir=pwd
compile=['nvmex -f ' matdir 'cCode\CUDA\nvmexoptsxp64_VS2008SDK7.bat cudaMEXexample.cu -IC:\CUDA\include -LC:\CUDA\lib64 -lcudart -output ' outdir 'cudaMEXexample']    
eval(compile)
*/

//basic includes, others may be needed depending on application
#include <stdlib.h> //memory allocation, process control, conversions
#include <string.h> //string handling and raw memory helpers (memcpy, memset, ...)
#include "mex.h"    //MATLAB MEX API header; required by every MEX file
#include "math.h"   //basic mathematical operations
#include "cuda_runtime.h" //CUDA runtime API header
#define MEM 700 //Number of floats in each __shared__ array that kernel_multiply declares per block
#define BSZ 8   //Block size: number of threads in each block

//forward declarations
//kernel_multiply(A, B, C, sz): mexFunction passes three device float buffers and the tile side length

__global__ void kernel_multiply(float *, float *, float *, int);

// Reports the most recent CUDA runtime error, if any, on the MATLAB command line.
//
// instr : short label naming the step that was just executed (e.g. "kernel");
//         it is included in the message so the failing step can be identified.
//         May be NULL, in which case only the CUDA error text is printed.
//
// cudaGetLastError() also resets the stored error, so a second call right
// after this one reports nothing.
void CUDAERRROR(const char *instr)
{
    const cudaError_t errornum = cudaGetLastError();

    if (errornum != cudaSuccess)
    {
        const char *str = cudaGetErrorString(errornum);

        if (instr != NULL)
            mexPrintf("CUDA error (%s): %s\n", instr, str);
        else
            mexPrintf("%s\n", str);

        mexPrintf("You should clear this function in MATLAB for proper operation.\n");
    }
}

// Frees the three device buffers. cudaFree accepts null pointers, so this can
// be called at any point of the allocation sequence.
static void releaseDeviceBuffers(float *d_A, float *d_B, float *d_C)
{
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
}

// Aborts the MEX call with a MATLAB error when 'status' is not cudaSuccess.
// The device buffers are released first because mexErrMsgIdAndTxt does not
// return to the caller.
static void failOnCudaError(cudaError_t status, const char *what,
                            float *d_A, float *d_B, float *d_C)
{
    if (status != cudaSuccess) {
        releaseDeviceBuffers(d_A, d_B, d_C);
        mexErrMsgIdAndTxt("cudaMEXexample:cudaError", "%s failed: %s",
                          what, cudaGetErrorString(status));
    }
}

// Prints "<label>d0 X d1 X ... X dn" followed by a newline.
static void printDims(const char *label, const mwSize *dims, mwSize ndims)
{
    mwSize ii;

    mexPrintf("%s", label);
    for (ii = 0; ii < ndims; ii++) {
        // mwSize may be wider than int, so cast explicitly to match the format
        mexPrintf("%lu", (unsigned long)dims[ii]);
        if (ii + 1 < ndims)
            mexPrintf(" X ");
        else
            mexPrintf("\n");
    }
}

// This function is mandatory.  It takes the place of 'main' in c.
//
// MATLAB usage: C = f(A, B), with A and B single-precision arrays of identical
// size. C is a single-precision array of the same size holding A.*B, computed
// on the GPU by kernel_multiply.
//
// nlhs / plhs : number of requested outputs / output array (plhs[0] is set)
// nrhs / prhs : number of inputs (must be 2) / input arrays A and B
//
// Raises a MATLAB error on wrong argument count, wrong class, mismatching
// sizes, or any failing CUDA runtime call.
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {

    const int sz = 8;                                // side length of the sz x sz tile each thread processes
    const size_t tileElems = (size_t)sz * sz * BSZ;  // floats processed by one block
    const float *pA, *pB;                            // host pointers to the input data
    const mwSize *pSizeA, *pSizeB;                   // dimension arrays of A and B
    mwSize Adims, Bdims, ii;
    size_t numl;                                     // number of elements in A (and B)
    size_t numBlocks, paddedElems, dataBytes, num_bytes;
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;     // device buffers

    /* Check for proper number of arguments. */
    if (nrhs != 2) {
        mexErrMsgTxt("two inputs required.");
    }
    if (nlhs > 1) {
        mexErrMsgTxt("at most one output is produced.");
    }

    // check datatype before the data is ever interpreted as float
    if (mxGetClassID(prhs[0])!=mxSINGLE_CLASS)
        mexErrMsgTxt("Data must be comprised of single floats!\n");

    if (mxGetClassID(prhs[1])!=mxSINGLE_CLASS)
        mexErrMsgTxt("Data must be comprised of single floats!\n");

    // every thread uses sz*sz floats of each MEM-sized shared array in the kernel
    if (tileElems > (size_t)MEM) {
        mexErrMsgTxt("sz*sz*BSZ exceeds the shared memory reserved by MEM.");
    }

    //get dimensions and sizes
    Adims = mxGetNumberOfDimensions(prhs[0]);
    Bdims = mxGetNumberOfDimensions(prhs[1]);
    pSizeA = mxGetDimensions(prhs[0]);
    pSizeB = mxGetDimensions(prhs[1]);

    mexPrintf("Check! dim: %lu\n", (unsigned long)Adims);

    // this writes to the matlab command line for the input data dimension
    printDims("size A: ", pSizeA, Adims);
    printDims("size B: ", pSizeB, Bdims);

    //check dimensions
    if (Adims != Bdims) {
        mexErrMsgTxt("A and B must be same size.");
    }

    //check sizes and count the elements
    numl = 1;
    for (ii = 0; ii < Adims; ii++) {
        if (pSizeA[ii] != pSizeB[ii]) {
            mexErrMsgTxt("A and B must be same size.");
        }
        numl *= (size_t)pSizeA[ii];
    }

    // The output array has to exist before values can be copied into it.
    // mxCreateDoubleMatrix is not used because the kernel works on single data.
    plhs[0] = mxCreateNumericArray(Adims, pSizeA, mxSINGLE_CLASS, mxREAL);

    // Nothing to compute for empty inputs; a zero-block launch would be invalid.
    if (numl == 0) {
        return;
    }

    /* Assign pointers to each input. */
    pA = (const float *)mxGetData(prhs[0]);
    pB = (const float *)mxGetData(prhs[1]);
    mexPrintf("Check! pA0: %f\n", pA[0]);

    // The device buffers are padded up to a whole number of blocks because the
    // kernel has no bounds check: every thread of every block reads and writes
    // a full sz*sz tile. Integer ceil-division avoids float rounding.
    numBlocks   = (numl + tileElems - 1) / tileElems;
    paddedElems = numBlocks * tileElems;
    if (paddedElems > (size_t)0x7fffffff) {
        // the kernel computes its indices in 32-bit arithmetic
        mexErrMsgTxt("Input is too large for this kernel.");
    }
    dataBytes = numl * sizeof(float);         // bytes actually present on the host
    num_bytes = paddedElems * sizeof(float);  // bytes allocated on the device

    // check our result by printing them on screen
    mexPrintf("Check! numl: %lu\n", (unsigned long)numl);
    mexPrintf("Check! sz*sz*BSZ: %d\n", sz*sz*BSZ);
    mexPrintf("Check! num_bytes: %lu\n", (unsigned long)num_bytes);

    // Allocate the padded device buffers and zero them, so the padding region
    // holds defined values when the last block processes it.
    failOnCudaError(cudaMalloc((void**)&d_A, num_bytes), "cudaMalloc(A)", d_A, d_B, d_C);
    failOnCudaError(cudaMalloc((void**)&d_B, num_bytes), "cudaMalloc(B)", d_A, d_B, d_C);
    failOnCudaError(cudaMalloc((void**)&d_C, num_bytes), "cudaMalloc(C)", d_A, d_B, d_C);
    failOnCudaError(cudaMemset(d_A, 0, num_bytes), "cudaMemset(A)", d_A, d_B, d_C);
    failOnCudaError(cudaMemset(d_B, 0, num_bytes), "cudaMemset(B)", d_A, d_B, d_C);
    failOnCudaError(cudaMemset(d_C, 0, num_bytes), "cudaMemset(C)", d_A, d_B, d_C);

    // Copy only the bytes the host arrays really contain; copying the padded
    // size would read past the end of the MATLAB buffers.
    failOnCudaError(cudaMemcpy(d_A, pA, dataBytes, cudaMemcpyHostToDevice),
                    "cudaMemcpy(A, host to device)", d_A, d_B, d_C);
    failOnCudaError(cudaMemcpy(d_B, pB, dataBytes, cudaMemcpyHostToDevice),
                    "cudaMemcpy(B, host to device)", d_A, d_B, d_C);

    // 'dim3' is the CUDA type describing block and grid sizes
    dim3 grid, block;
    block.x = BSZ;
    grid.x  = (unsigned int)numBlocks;

    mexPrintf("Check! blocksize: %u\n", block.x);
    mexPrintf("Check! gridsize: %u\n", grid.x);

    // Launch the kernel on the device buffers; processing continues in
    // kernel_multiply. A launch does not return a status, so the launch error
    // is fetched explicitly.
    kernel_multiply<<<grid.x, block.x>>>( d_A, d_B, d_C, sz );
    failOnCudaError(cudaGetLastError(), "kernel_multiply launch", d_A, d_B, d_C);

    // Copy the result back to the MATLAB output array.
    failOnCudaError(cudaMemcpy(mxGetData(plhs[0]), d_C, dataBytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(C, device to host)", d_A, d_B, d_C);

    // Report and clear any error still pending in the runtime.
    CUDAERRROR("kernel");
    // NOTE(review): presumably gives MATLAB a chance to flush printed output - confirm
    mexEvalString("pause(.001)");

    // free the allocated device memory
    releaseDeviceBuffers(d_A, d_B, d_C);
}

//  kernel_multiply: element-wise product C = A .* B, executed by every thread.
//
//  Each thread handles one contiguous tile of sz*sz floats. The tile of thread
//  t in block b starts at element sz*sz*(b*BSZ + t) of A, B and C, so the
//  buffers must hold at least gridDim.x * BSZ * sz*sz floats (there is no
//  bounds check). The tile is staged through the per-block shared arrays, where
//  thread t uses the slice starting at sz*sz*t; sz*sz*blockDim.x must therefore
//  not exceed MEM. No barrier is used because a thread never touches another
//  thread's slice.
__global__ void kernel_multiply(float *A, float *B, float *C, int sz)
{
    int col, row;

    //  per-block staging buffers in shared memory
    __shared__ float tileA[MEM];
    __shared__ float tileB[MEM];
    __shared__ float tileC[MEM];

    //  first element of this thread's slice in the shared arrays ...
    const unsigned int sharedBase = sz*sz*threadIdx.x;
    //  ... and of its tile in the global buffers
    const unsigned int globalBase = sz*sz*blockIdx.x*BSZ + sharedBase;

    //  stage A: global -> shared
    for (col = 0; col < sz; col++) {
        for (row = 0; row < sz; row++) {
            const unsigned int offset = sz*row + col;
            tileA[sharedBase + offset] = A[globalBase + offset];
        }
    }

    //  stage B: global -> shared
    for (col = 0; col < sz; col++) {
        for (row = 0; row < sz; row++) {
            const unsigned int offset = sz*row + col;
            tileB[sharedBase + offset] = B[globalBase + offset];
        }
    }

    //  element-wise product of this thread's tile
    for (col = 0; col < sz; col++) {
        for (row = 0; row < sz; row++) {
            const unsigned int at = sharedBase + sz*row + col;
            tileC[at] = tileA[at] * tileB[at];
        }
    }

    //  write the result tile: shared -> global
    for (col = 0; col < sz; col++) {
        for (row = 0; row < sz; row++) {
            const unsigned int offset = sz*row + col;
            C[globalBase + offset] = tileC[sharedBase + offset];
        }
    }
}