720 likes | 906 Views
CUDA Advanced Memory Usage and Optimization. Yukai Hung a0934147@gmail.com Department of Mathematics National Taiwan University. Register as Cache?. Volatile Qualifier. Volatile qualifier. __global__ void kernelFunc(int* result) { int temp1; int temp2;
E N D
CUDA Advanced Memory Usage and Optimization Yukai Hung a0934147@gmail.com Department of Mathematics National Taiwan University
Volatile Qualifier • Volatile qualifier __global__ void kernelFunc(int* result) { int temp1; int temp2; if(threadIdx.x<warpSize) { temp1=array[threadIdx.x]; array[threadIdx.x+1]=2; temp2=array[threadIdx.x]; result[threadIdx.x]=temp1*temp2; } } identical reads compiler optimized this read away 3
Volatile Qualifier • Volatile qualifier __global__ void kernelFunc(int* result) { int temp1; int temp2; if(threadIdx.x<warpSize) { • int temp=array[threadIdx.x]; temp1=temp; array[threadIdx.x+1]=2; temp2=temp; result[threadIdx.x]=temp1*temp2; } } 4
Volatile Qualifier • Volatile qualifier __global__ void kernelFunc(int* result) { • int temp1; • int temp2; if(threadIdx.x<warpSize) { temp1=array[threadIdx.x]*1; array[threadIdx.x+1]=2; __syncthreads(); temp2=array[threadIdx.x]*2; result[threadIdx.x]=temp1*temp2; } } 5
Volatile Qualifier • Volatile qualifier __global__ void kernelFunc(int* result) { • volatile int temp1; • volatile int temp2; if(threadIdx.x<warpSize) { temp1=array[threadIdx.x]*1; array[threadIdx.x+1]=2; temp2=array[threadIdx.x]*2; result[threadIdx.x]=temp1*temp2; } } 6
Data Prefetch • Hide memory latency by overlapping loading and computing • - double buffer is traditional software pipeline technique Nd load blue block to shared memory compute blue block on shared memory and load next block to shared memory Pd Md Pdsub 8
Data Prefetch • Hide memory latency by overlapping loading and computing • - double buffer is traditional software pipeline technique for loop { load data from global to shared memory synchronize block compute data in the shared memory synchronize block } 9
Data Prefetch • Hide memory latency by overlapping loading and computing • - double buffer is traditional software pipeline technique load data from global memory to registers for loop { store data from register to shared memory synchronize block load data from global memory to registers compute data in the shared memory synchronize block } very small overhead both memory are very fast computing and loading overlap register and shared are independent 10
Data Prefetch • Matrix-matrix multiplication 11
Constant Memory • Where is constant memory? • - data is stored in the device global memory • - read data through multiprocessor constant cache • - 64KB constant memory and 8KB cache for each multiprocessor • How about the performance? • - optimized when warp of threads read same location • - 4 bytes per cycle through broadcasting to warp of threads • - serialized when warp of threads read in different location • - very slow when cache miss (read data from global memory) • - access latency can range from one to hundreds clock cycles 13
Constant Memory • How to use constant memory? • - declare constant memory on the file scope (global variable) • - copy data to constant memory by host (because it is constant!!) //declare constant memory __constant__ float cst_ptr[size]; //copy data from host to constant memory • cudaMemcpyToSymbol(cst_ptr,host_ptr,data_size); 14
Constant Memory //declare constant memory __constant__ float cangle[360]; int main(int argc,char** argv) { int size=3200; float* darray; • float hangle[360]; //allocate device memory cudaMalloc((void**)&darray,sizeof(float)*size); //initialize allocated memory cudaMemset(darray,0,sizeof(float)*size); //initialize angle array on host for(int loop=0;loop<360;loop++) hangle[loop]=acos(-1.0f)*loop/180.0f; //copy host angle data to constant memory cudaMemcpyToSymbol(cangle,hangle,sizeof(float)*360); 15
Constant Memory //execute device kernel test_kernel<<<size/64,64>>>(darray); //free device memory cudaFree(darray); return 0; } __global__ void test_kernel(float* darray) { int index; //calculate each thread global index index=blockIdx.x*blockDim.x+threadIdx.x; #pragma unroll 10 for(int loop=0;loop<360;loop++) darray[index]=darray[index]+cangle[loop]; return; } 16
Texture Memory • Texture mapping 18
Texture Memory • Texture mapping 19
Texture Memory • Texture filtering nearest-neighborhood interpolation 20
Texture Memory • Texture filtering linear/bilinear/trilinear interpolation 21
Texture Memory • Texture filtering two times bilinear interpolation 22
SP SP SP SP SP SP SP SP SP SP SP SP SP SP SP SP TF TF TF TF TF TF TF TF L1 L1 L1 L1 L1 L1 L1 L1 Host Input Assembler Setup / Rstr / ZCull Vtx Thread Issue Work Distribution Pixel Thread Issue Thread Processor L2 L2 L2 L2 L2 L2 FB FB FB FB FB FB Texture Memory these units perform graphical texture operations 23
Texture Memory two SMs are cooperated as texture processing cluster scalable units on graphics texture specific unit only available for texture 24
Texture Memory texture specific unit texture address units compute texture addresses texture filtering units compute data interpolation read only texture L1 cache 25
SP SP SP SP SP SP SP SP SP SP SP SP SP SP SP SP TF TF TF TF TF TF TF TF L1 L1 L1 L1 L1 L1 L1 L1 Host Input Assembler Setup / Rstr / ZCull Vtx Thread Issue Work Distribution Pixel Thread Issue Thread Processor L2 L2 L2 L2 L2 L2 FB FB FB FB FB FB Texture Memory read only texture L2 cache for all TPC read only texture L1 cache for each TPC 26
Texture Memory texture specific units 27
SP SP SP SP SP SP SP SP SP SP SP SP SP SP SP SP TF TF TF TF TF TF TF TF Texture Memory • Texture is an object for reading data • - data is stored on the device global memory • - global memory is bound with texture cache Thread Processor L1 L1 L1 L1 L1 L1 L1 L1 L2 L2 L2 L2 L2 L2 FB global memory FB FB FB FB 28
SP SP SP SP SP SP SP SP SP SP SP SP SP SP SP SP TF TF TF TF TF TF TF TF L1 L1 L1 L1 L1 L1 L1 L1 L2 L2 L2 L2 L2 L2 FB FB FB FB FB FB Texture Memory • Data caching • - helpful when global memory coalescing is the main bottleneck Thread Processor 30
Texture Memory • Data filtering • - support linear/bilinear and trilinear hardware interpolation texture specific unit intrinsic interpolation cudaFilterModePoint cudaFilterModeLinear 31
Texture Memory • Accesses modes • - clamp and wrap memory accessing for out-of-bound addresses wrap boundary texture specific unit cudaAddressModeWrap clamp boundary cudaAddressModeClamp 32
Texture Memory • Bound to linear memory • - only support 1-dimension problems • - only get the benefits from texture cache • - not support addressing modes and filtering • Bound to cuda array • - support float addressing • - support addressing modes • - support hardware interpolation • - support 1/2/3-dimension problems 33
Texture Memory • Host code • - allocate global linear memory or cuda array • - create and set the texture reference on file scope • - bind the texture reference to the allocated memory • - unbind the texture reference to free cache resource • Device code • - fetch data by indicating texture reference • - fetch data by using texture fetch function 34
Texture Memory • Texture memory constrain 35
Texture Memory • Measuring texture cache miss or hit number • - latest visual profiler can count cache miss or hit • - need device compute capability higher than 1.2 36
Texture Memory //declare texture reference texture<float,1,cudaReadModeElementType>texreference; int main(int argc,char** argv) { int size=3200; float* harray; float* diarray; float* doarray; //allocate host and device memory harray=(float*)malloc(sizeof(float)*size); cudaMalloc((void**)&diarray,sizeof(float)*size); cudaMalloc((void**)&doarray,sizeof(float)*size); //initialize host array before usage for(int loop=0;loop<size;loop++) harray[loop]=(float)rand()/(float)(RAND_MAX-1); //copy array from host to device memory cudaMemcpy(diarray,harray,sizeof(float)*size,cudaMemcpyHostToDevice); 38
Texture Memory • //bind texture reference with linear memory • cudaBindTexture(0,texreference,diarray,sizeof(float)*size); //execute device kernel kernel<<<(int)ceil((float)size/64),64>>>(doarray,size); //unbind texture reference to free resource cudaUnbindTexture(texreference); //copy result array from device to host memory cudaMemcpy(harray,doarray,sizeof(float)*size,cudaMemcpyDeviceToHost); //free host and device memory free(harray); cudaFree(diarray); cudaFree(doarray); return 0; } 39
Texture Memory __global__ void kernel(float* doarray,int size) { int index; //calculate each thread global index index=blockIdx.x*blockDim.x+threadIdx.x; //fetch global memory through texture reference doarray[index]=tex1Dfetch(texreference,index); return; } 40
Texture Memory __global__ void offsetCopy(float* idata,float* odata,int offset) { • //compute each thread global index int index=blockIdx.x*blockDim.x+threadIdx.x; //copy data from global memory odata[index]=idata[index+offset]; } 41
Texture Memory __global__ void offsetCopy(float* idata,float* odata,int offset) { • //compute each thread global index int index=blockIdx.x*blockDim.x+threadIdx.x; //copy data from global memory odata[index]=tex1Dfetch(texreference,index+offset); } 42
Texture Memory #define size 3200 //declare texture reference texture<float,2,cudaReadModeElementType>texreference; int main(int argc,char** argv) { dim3 blocknum; dim3 blocksize; float* hmatrix; float* dmatrix; cudaArray* carray; cudaChannelFormatDesc channel; //allocate host and device memory hmatrix=(float*)malloc(sizeof(float)*size*size); cudaMalloc((void**)&dmatrix,sizeof(float)*size*size); //initialize host matrix before usage for(int loop=0;loop<size*size;loop++) • hmatrix[loop]=(float)rand()/(float)(RAND_MAX-1); 44
Texture Memory //create channel to describe data type channel=cudaCreateChannelDesc<float>(); • //allocate device memory for cuda array cudaMallocArray(&carray,&channel,size,size); • //copy matrix from host to device memory bytes=sizeof(float)*size*size; cudaMemcpyToArray(carray,0,0,hmatrix,bytes,cudaMemcpyHostToDevice); //set texture filter mode property //use cudaFilterModePoint or cudaFilterModeLinear texreference.filterMode=cudaFilterModePoint; //set texture address mode property //use cudaAddressModeClamp or cudaAddressModeWrap texreference.addressMode[0]=cudaAddressModeWrap; texreference.addressMode[1]=cudaAddressModeClamp; 45
Texture Memory //bind texture reference with cuda array cudaBindTextureToArray(texreference,carray); blocksize.x=16; blocksize.y=16; blocknum.x=(int)ceil((float)size/16); blocknum.y=(int)ceil((float)size/16); • //execute device kernel • kernel<<<blocknum,blocksize>>>(dmatrix,size); • //unbind texture reference to free resource • cudaUnbindTexture(texreference); • //copy result matrix from device to host memory • cudaMemcpy(hmatrix,dmatrix,bytes,cudaMemcpyDeviceToHost); • //free host and device memory free(hmatrix); cudaFree(dmatrix); cudaFreeArray(carray); return 0; } 46
Texture Memory __global__ void kernel(float* dmatrix,int size) { int xindex; int yindex; //calculate each thread global index xindex=blockIdx.x*blockDim.x+threadIdx.x; yindex=blockIdx.y*blockDim.y+threadIdx.y; //fetch cuda array through texture reference dmatrix[yindex*size+xindex]=tex2D(texreference,xindex,yindex); return; } 47
Texture Memory #define size 256 //declare texture reference texture<float,3,cudaReadModeElementType>texreference; int main(int argc,char** argv) { dim3 blocknum; dim3 blocksize; float* hmatrix; float* dmatrix; cudaArray* cudaarray; • cudaExtent volumesize; cudaChannelFormatDesc channel; • cudaMemcpy3DParms copyparms={0}; • //allocate host and device memory • hmatrix=(float*)malloc(sizeof(float)*size*size*size); • cudaMalloc((void**)&dmatrix,sizeof(float)*size*size*size); 49
Texture Memory • //initialize host matrix before usage • for(int loop=0;loop<size*size*size;loop++) • hmatrix[loop]=(float)rand()/(float)(RAND_MAX-1); • //set cuda array volume size • volumesize=make_cudaExtent(size,size,size); • //create channel to describe data type • channel=cudaCreateChannelDesc<float>(); • //allocate device memory for cuda array • cudaMalloc3DArray(&cudaarray,&channel,volumesize); • //set cuda array copy parameters • copyparms.extent=volumesize; • copyparms.dstArray=cudaarray; • copyparms.kind=cudaMemcpyHostToDevice; • copyparms.srcPtr= • make_cudaPitchPtr((void*)hmatrix,sizeof(float)*size,size,size); • cudaMemcpy3D(©parms); 50