
CUDA Advanced Memory Usage and Optimization



  1. CUDA Advanced Memory Usage and Optimization Yukai Hung, a0934147@gmail.com, Department of Mathematics, National Taiwan University

  2. Register as Cache?

  3. Volatile Qualifier • Volatile qualifier

     __global__ void kernelFunc(int* result)
     {
         int temp1;
         int temp2;
         if(threadIdx.x<warpSize)
         {
             temp1=array[threadIdx.x];        //first read
             array[threadIdx.x+1]=2;
             temp2=array[threadIdx.x];        //identical read: the compiler optimizes this read away
             result[threadIdx.x]=temp1*temp2;
         }
     }

  4. Volatile Qualifier • Volatile qualifier

     __global__ void kernelFunc(int* result)
     {
         int temp1;
         int temp2;
         if(threadIdx.x<warpSize)
         {
             int temp=array[threadIdx.x];     //the compiler keeps a single read in a register
             temp1=temp;
             array[threadIdx.x+1]=2;          //so the update made between the two reads is never seen
             temp2=temp;
             result[threadIdx.x]=temp1*temp2;
         }
     }

  5. Volatile Qualifier • Volatile qualifier

     __global__ void kernelFunc(int* result)
     {
         int temp1;
         int temp2;
         if(threadIdx.x<warpSize)
         {
             temp1=array[threadIdx.x]*1;
             array[threadIdx.x+1]=2;
             __syncthreads();                 //barrier forces the second read to go back to memory
             temp2=array[threadIdx.x]*2;
             result[threadIdx.x]=temp1*temp2;
         }
     }

  6. Volatile Qualifier • Volatile qualifier

     __global__ void kernelFunc(int* result)
     {
         volatile int temp1;                  //volatile prevents the reads from being cached in registers
         volatile int temp2;
         if(threadIdx.x<warpSize)
         {
             temp1=array[threadIdx.x]*1;
             array[threadIdx.x+1]=2;
             temp2=array[threadIdx.x]*2;
             result[threadIdx.x]=temp1*temp2;
         }
     }
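  For reference, a minimal self-contained variant of the example above. The shared buffer array that the slides read from is not shown in the transcript, so it is assumed here, and volatile is placed on the shared declaration, which is the more common idiom for warp-synchronous code (block assumed to be launched with 64 threads):

     __global__ void kernelFunc(int* result)
     {
         //volatile on the shared declaration forces every access to go to
         //shared memory instead of being cached in a register
         volatile __shared__ int array[64];

         //fill the buffer before the warp-level accesses
         array[threadIdx.x]=threadIdx.x;
         __syncthreads();

         if(threadIdx.x<warpSize)
         {
             int temp1=array[threadIdx.x];    //first read from shared memory
             array[threadIdx.x+1]=2;          //neighboring thread updates array[threadIdx.x]
             int temp2=array[threadIdx.x];    //re-read is not optimized away
             result[threadIdx.x]=temp1*temp2;
         }
     }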

  7. Data Prefetch

  8. Data Prefetch • Hide memory latency by overlapping loading and computing • - double buffering is a traditional software-pipelining technique [figure: tiled matrix multiplication Md x Nd = Pd; load one tile (blue block) into shared memory, then compute on that tile while loading the next tile]

  9. Data Prefetch • Hide memory latency by overlapping loading and computing • - double buffering is a traditional software-pipelining technique

     for loop
     {
         load data from global memory to shared memory
         synchronize block
         compute data in the shared memory
         synchronize block
     }

  10. Data Prefetch • Hide memory latency by overlapping loading and computing • - double buffering is a traditional software-pipelining technique (a CUDA sketch follows below)

     load data from global memory to registers
     for loop
     {
         store data from registers to shared memory
             (very small overhead: both memories are very fast)
         synchronize block
         load next data from global memory to registers
         compute data in the shared memory
             (computing and loading overlap: registers and shared memory are independent)
         synchronize block
     }
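  A minimal sketch of this register double-buffer pattern, assuming a 1-D input that each block processes tile by tile; the names prefetchKernel, TILE, and the dummy per-tile computation are illustrative, not from the slides. Launch would look like prefetchKernel<<<numBlocks,TILE>>>(d_in,d_out,ntiles), with numBlocks*ntiles*TILE input elements:

     #define TILE 256

     __global__ void prefetchKernel(const float* in,float* out,int ntiles)
     {
         __shared__ float tile[TILE];

         //each block walks over ntiles consecutive tiles of TILE elements
         int base=blockIdx.x*ntiles*TILE;

         //prefetch the first tile element from global memory into a register
         float next=in[base+threadIdx.x];

         for(int t=0;t<ntiles;t++)
         {
             //store the prefetched value from the register into shared memory
             tile[threadIdx.x]=next;
             __syncthreads();

             //issue the next global load early: its latency is hidden by the
             //computation on the tile already held in shared memory
             if(t+1<ntiles)
                 next=in[base+(t+1)*TILE+threadIdx.x];

             //compute on the shared-memory tile (dummy work for illustration)
             float sum=0.0f;
             for(int i=0;i<TILE;i++)
                 sum+=tile[i];
             out[base+t*TILE+threadIdx.x]=sum;

             //make sure all threads are done reading before the tile is overwritten
             __syncthreads();
         }
     }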

  11. Data Prefetch • Matrix-matrix multiplication

  12. Constant Memory

  13. Constant Memory • Where is constant memory? • - data is stored in the device global memory • - data is read through the per-multiprocessor constant cache • - 64KB of constant memory and 8KB of cache for each multiprocessor • How about the performance? • - optimized when a warp of threads reads the same location • - 4 bytes per cycle, broadcast to the whole warp • - serialized when the threads of a warp read different locations • - very slow on a cache miss (data is read from global memory) • - access latency can range from one to hundreds of clock cycles

  14. Constant Memory • How to use constant memory? • - declare constant memory at file scope (global variable) • - copy data to constant memory from the host (because it is constant!!)

     //declare constant memory
     __constant__ float cst_ptr[size];

     //copy data from host to constant memory
     cudaMemcpyToSymbol(cst_ptr,host_ptr,data_size);

  15. Constant Memory

     //declare constant memory
     __constant__ float cangle[360];

     int main(int argc,char** argv)
     {
         int size=3200;
         float* darray;
         float hangle[360];

         //allocate device memory
         cudaMalloc((void**)&darray,sizeof(float)*size);

         //initialize allocated memory
         cudaMemset(darray,0,sizeof(float)*size);

         //initialize angle array on host
         for(int loop=0;loop<360;loop++)
             hangle[loop]=acos(-1.0f)*loop/180.0f;

         //copy host angle data to constant memory
         cudaMemcpyToSymbol(cangle,hangle,sizeof(float)*360);

  16. Constant Memory

         //execute device kernel
         test_kernel<<<size/64,64>>>(darray);

         //free device memory
         cudaFree(darray);

         return 0;
     }

     __global__ void test_kernel(float* darray)
     {
         int index;

         //calculate each thread global index
         index=blockIdx.x*blockDim.x+threadIdx.x;

         #pragma unroll 10
         for(int loop=0;loop<360;loop++)
             darray[index]=darray[index]+cangle[loop];

         return;
     }

  17. Texture Memory

  18. Texture Memory • Texture mapping

  19. Texture Memory • Texture mapping

  20. Texture Memory • Texture filtering: nearest-neighbor interpolation

  21. Texture Memory • Texture filtering: linear/bilinear/trilinear interpolation

  22. Texture Memory • Texture filtering: two times bilinear interpolation

  23. Texture Memory [figure: G80 architecture block diagram with SPs, texture filtering (TF) units, texture L1/L2 caches, and frame buffer (FB) partitions; the TF units perform graphical texture operations]

  24. Texture Memory [figure: texture processing cluster] • two SMs cooperate as one texture processing cluster (TPC), the scalable unit of the graphics pipeline • the texture-specific units are available only for texture operations

  25. Texture Memory [figure: texture-specific unit] • texture address units compute texture addresses • texture filtering units compute data interpolation • read-only texture L1 cache

  26. Texture Memory [figure: G80 architecture block diagram] • read-only texture L2 cache shared by all TPCs • read-only texture L1 cache inside each TPC

  27. Texture Memory [figure: texture-specific units]

  28. Texture Memory • Texture is an object for reading data • - data is stored in the device global memory • - global memory is bound with the texture cache [figure: G80 block diagram highlighting the path from global memory (FB) through the texture caches to the thread processors]

  29. What are the advantages of texture?

  30. Texture Memory • Data caching • - helpful when global memory coalescing is the main bottleneck [figure: G80 block diagram highlighting the texture L1/L2 caches]

  31. Texture Memory • Data filtering • - supports linear/bilinear/trilinear hardware interpolation, performed intrinsically by the texture-specific units • - filter mode is selected with cudaFilterModePoint or cudaFilterModeLinear
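  A minimal sketch of using hardware linear filtering on a 1-D float texture bound to a cuda array (the reference name linearTex, the kernel, and the coordinate choice are illustrative, not from the slides; the full allocation and binding boilerplate appears in the later examples):

     //legacy texture-reference API, as used throughout these slides;
     //read mode must return float for filtering to apply
     texture<float,1,cudaReadModeElementType> linearTex;

     __global__ void sampleKernel(float* out,int n)
     {
         int index=blockIdx.x*blockDim.x+threadIdx.x;

         if(index<n-1)
             //fractional coordinate: with cudaFilterModeLinear the hardware blends
             //the two neighboring texels, so this single fetch returns the average
             //of texels index and index+1
             out[index]=tex1D(linearTex,index+1.0f);
     }

     //host side, after cudaMallocArray / cudaMemcpyToArray as in the later examples:
     //    linearTex.filterMode=cudaFilterModeLinear;
     //    cudaBindTextureToArray(linearTex,carray);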

  32. Texture Memory • Address modes • - clamp or wrap out-of-bound addresses, handled by the texture-specific units • - wrap boundary: cudaAddressModeWrap • - clamp boundary: cudaAddressModeClamp

  33. Texture Memory • Bound to linear memory • - only supports 1-dimensional problems • - only gets the benefit of the texture cache • - does not support addressing modes or filtering • Bound to cuda array • - supports floating-point addressing • - supports addressing modes • - supports hardware interpolation • - supports 1/2/3-dimensional problems

  34. Texture Memory • Host code • - allocate global linear memory or a cuda array • - create and set the texture reference at file scope • - bind the texture reference to the allocated memory • - unbind the texture reference to free the cache resource • Device code • - fetch data by indicating the texture reference • - fetch data by using the texture fetch functions

  35. Texture Memory • Texture memory constraints

  36. Texture Memory • Measuring texture cache misses and hits • - the latest Visual Profiler can count cache misses and hits • - requires device compute capability higher than 1.2

  37. Example: 1-dimension linear memory

  38. Texture Memory

     //declare texture reference
     texture<float,1,cudaReadModeElementType> texreference;

     int main(int argc,char** argv)
     {
         int size=3200;
         float* harray;
         float* diarray;
         float* doarray;

         //allocate host and device memory
         harray=(float*)malloc(sizeof(float)*size);
         cudaMalloc((void**)&diarray,sizeof(float)*size);
         cudaMalloc((void**)&doarray,sizeof(float)*size);

         //initialize host array before usage
         for(int loop=0;loop<size;loop++)
             harray[loop]=(float)rand()/(float)(RAND_MAX-1);

         //copy array from host to device memory
         cudaMemcpy(diarray,harray,sizeof(float)*size,cudaMemcpyHostToDevice);

  39. Texture Memory

         //bind texture reference with linear memory
         cudaBindTexture(0,texreference,diarray,sizeof(float)*size);

         //execute device kernel
         kernel<<<(int)ceil((float)size/64),64>>>(doarray,size);

         //unbind texture reference to free resource
         cudaUnbindTexture(texreference);

         //copy result array from device to host memory
         cudaMemcpy(harray,doarray,sizeof(float)*size,cudaMemcpyDeviceToHost);

         //free host and device memory
         free(harray);
         cudaFree(diarray);
         cudaFree(doarray);

         return 0;
     }

  40. Texture Memory

     __global__ void kernel(float* doarray,int size)
     {
         int index;

         //calculate each thread global index
         index=blockIdx.x*blockDim.x+threadIdx.x;

         //fetch global memory through texture reference
         doarray[index]=tex1Dfetch(texreference,index);

         return;
     }

  41. Texture Memory

     __global__ void offsetCopy(float* idata,float* odata,int offset)
     {
         //compute each thread global index
         int index=blockIdx.x*blockDim.x+threadIdx.x;

         //copy data from global memory (misaligned, uncoalesced read when offset != 0)
         odata[index]=idata[index+offset];
     }

  42. Texture Memory

     __global__ void offsetCopy(float* idata,float* odata,int offset)
     {
         //compute each thread global index
         int index=blockIdx.x*blockDim.x+threadIdx.x;

         //copy data through the texture cache, which absorbs the misalignment
         odata[index]=tex1Dfetch(texreference,index+offset);
     }

  43. Example: 2-dimension cuda array

  44. Texture Memory

     #define size 3200

     //declare texture reference
     texture<float,2,cudaReadModeElementType> texreference;

     int main(int argc,char** argv)
     {
         dim3 blocknum;
         dim3 blocksize;
         float* hmatrix;
         float* dmatrix;
         cudaArray* carray;
         cudaChannelFormatDesc channel;

         //allocate host and device memory
         hmatrix=(float*)malloc(sizeof(float)*size*size);
         cudaMalloc((void**)&dmatrix,sizeof(float)*size*size);

         //initialize host matrix before usage
         for(int loop=0;loop<size*size;loop++)
             hmatrix[loop]=(float)rand()/(float)(RAND_MAX-1);

  45. Texture Memory

         //create channel to describe data type
         channel=cudaCreateChannelDesc<float>();

         //allocate device memory for cuda array
         cudaMallocArray(&carray,&channel,size,size);

         //copy matrix from host to device memory
         int bytes=sizeof(float)*size*size;
         cudaMemcpyToArray(carray,0,0,hmatrix,bytes,cudaMemcpyHostToDevice);

         //set texture filter mode property
         //use cudaFilterModePoint or cudaFilterModeLinear
         texreference.filterMode=cudaFilterModePoint;

         //set texture address mode property
         //use cudaAddressModeClamp or cudaAddressModeWrap
         texreference.addressMode[0]=cudaAddressModeWrap;
         texreference.addressMode[1]=cudaAddressModeClamp;

  46. Texture Memory

         //bind texture reference with cuda array
         cudaBindTextureToArray(texreference,carray);

         blocksize.x=16;
         blocksize.y=16;
         blocknum.x=(int)ceil((float)size/16);
         blocknum.y=(int)ceil((float)size/16);

         //execute device kernel
         kernel<<<blocknum,blocksize>>>(dmatrix,size);

         //unbind texture reference to free resource
         cudaUnbindTexture(texreference);

         //copy result matrix from device to host memory
         cudaMemcpy(hmatrix,dmatrix,bytes,cudaMemcpyDeviceToHost);

         //free host and device memory
         free(hmatrix);
         cudaFree(dmatrix);
         cudaFreeArray(carray);

         return 0;
     }

  47. Texture Memory

     __global__ void kernel(float* dmatrix,int size)
     {
         int xindex;
         int yindex;

         //calculate each thread global index
         xindex=blockIdx.x*blockDim.x+threadIdx.x;
         yindex=blockIdx.y*blockDim.y+threadIdx.y;

         //fetch cuda array through texture reference
         dmatrix[yindex*size+xindex]=tex2D(texreference,xindex,yindex);

         return;
     }

  48. Example: 3-dimension cuda array

  49. Texture Memory

     #define size 256

     //declare texture reference
     texture<float,3,cudaReadModeElementType> texreference;

     int main(int argc,char** argv)
     {
         dim3 blocknum;
         dim3 blocksize;
         float* hmatrix;
         float* dmatrix;
         cudaArray* cudaarray;
         cudaExtent volumesize;
         cudaChannelFormatDesc channel;
         cudaMemcpy3DParms copyparms={0};

         //allocate host and device memory
         hmatrix=(float*)malloc(sizeof(float)*size*size*size);
         cudaMalloc((void**)&dmatrix,sizeof(float)*size*size*size);

  50. Texture Memory

         //initialize host matrix before usage
         for(int loop=0;loop<size*size*size;loop++)
             hmatrix[loop]=(float)rand()/(float)(RAND_MAX-1);

         //set cuda array volume size
         volumesize=make_cudaExtent(size,size,size);

         //create channel to describe data type
         channel=cudaCreateChannelDesc<float>();

         //allocate device memory for cuda array
         cudaMalloc3DArray(&cudaarray,&channel,volumesize);

         //set cuda array copy parameters
         copyparms.extent=volumesize;
         copyparms.dstArray=cudaarray;
         copyparms.kind=cudaMemcpyHostToDevice;
         copyparms.srcPtr=
             make_cudaPitchedPtr((void*)hmatrix,sizeof(float)*size,size,size);
         cudaMemcpy3D(&copyparms);
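  The transcript stops at this slide. A minimal sketch of how the 3-dimensional example would typically continue, following the same pattern as the 2-dimensional case; the binding, launch configuration, and kernel below are an assumption for illustration, not the original slides:

         //set texture filter and address mode properties (as in the 2-D example)
         texreference.filterMode=cudaFilterModePoint;
         texreference.addressMode[0]=cudaAddressModeWrap;
         texreference.addressMode[1]=cudaAddressModeWrap;
         texreference.addressMode[2]=cudaAddressModeWrap;

         //bind texture reference with the 3-D cuda array
         cudaBindTextureToArray(texreference,cudaarray);

         blocksize.x=16;
         blocksize.y=16;
         blocknum.x=(int)ceil((float)size/16);
         blocknum.y=(int)ceil((float)size/16);

         //execute device kernel
         kernel<<<blocknum,blocksize>>>(dmatrix,size);

         //unbind texture reference, copy back the result, and free memory
         cudaUnbindTexture(texreference);
         cudaMemcpy(hmatrix,dmatrix,sizeof(float)*size*size*size,cudaMemcpyDeviceToHost);
         free(hmatrix);
         cudaFree(dmatrix);
         cudaFreeArray(cudaarray);

         return 0;
     }

     __global__ void kernel(float* dmatrix,int size)
     {
         //calculate each thread global x/y index; loop over the z dimension
         int xindex=blockIdx.x*blockDim.x+threadIdx.x;
         int yindex=blockIdx.y*blockDim.y+threadIdx.y;

         for(int zindex=0;zindex<size;zindex++)
             //fetch 3-D cuda array through texture reference
             dmatrix[(zindex*size+yindex)*size+xindex]=
                 tex3D(texreference,xindex,yindex,zindex);
     }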
