CUDA性能优化----线程配置
CUDA性能優化----線程配置
2017-01-12 14:19:29 | 分類: HPC&CUDA優化 | 標簽: cuda gpu hpc | 舉報 | 字號 訂閱
下載LOFTER 我的照片書 | 前言: CUDA線程的組織形式(block的維度配置)對程序的性能影響是至關重要的。 線程索引: 矩陣在memory中是row-major線性存儲的。在kernel里,線程的唯一索引非常有用;為了確定一個線程的索引,需要(以2D為例):
- 線程和block索引
- 矩陣中元素坐標
- 線性global memory 的偏移
下面我們以2D矩陣相加為例,來測試CUDA線程配置(block的大小和數量)對程序性能的影響,這里以2D grid和2D block為例。 測試環境:Tesla M2070一塊,CUDA 6.0, 操作系統:Red Hat 4.1.2-50,gcc version 4.1.2 20080704 測試代碼:
// Threads assign test: times a 2D matrix addition on the GPU so that different
// block shapes (set via dimx/dimy in main) can be compared.
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define PRECISION 1e-5
#define HANDLE_ERROR(err) (HandleError( err, __FILE__, __LINE__ ))

/*
 * Aborts the program with a diagnostic if a CUDA runtime call failed.
 *
 * err  - status returned by the CUDA runtime call
 * file - source file of the call site (supplied by HANDLE_ERROR)
 * line - source line of the call site (supplied by HANDLE_ERROR)
 */
static void HandleError(cudaError_t err, const char *file, int line)
{
    if (err != cudaSuccess) {
        /* Errors go to stderr, like the verification failure message in main. */
        fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}

/*
 * Element-wise matrix addition: d_MatC = d_MatA + d_MatB.
 *
 * Launch layout: 2D grid of 2D blocks; each thread handles at most one
 * element. Element (idx, idy) lives at linear offset nx * idy + idx, i.e. the
 * matrices are stored row by row with nx elements per row. Threads that fall
 * outside the nx x ny extent do nothing, so the grid may overshoot the matrix.
 * No shared memory is used.
 */
__global__ void sumMatrix2DKernel(float *d_MatA, float *d_MatB, float *d_MatC, int nx, int ny)
{
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    int idy = threadIdx.y + blockDim.y * blockIdx.y;

    if (idx < nx && idy < ny) {
        /* Offset is computed in size_t, and only for in-range threads, so it
           cannot wrap for large matrices. */
        size_t tid = (size_t)nx * (size_t)idy + (size_t)idx;
        d_MatC[tid] = d_MatA[tid] + d_MatB[tid];
    }
}

/*
 * Host reference implementation: hostRef[i] = h_A[i] + h_B[i] for all
 * nx * ny elements. Does nothing when either dimension is not positive.
 */
void sumMatrix2DOnHost(float *h_A, float *h_B, float *hostRef, int nx, int ny)
{
    if (nx <= 0 || ny <= 0)
        return;

    size_t count = (size_t)nx * (size_t)ny;
    for (size_t i = 0; i < count; i++)
        hostRef[i] = h_A[i] + h_B[i];
}

/*
 * Runs the matrix addition on the host and on the GPU, prints the elapsed
 * time of each, and verifies that both results agree within PRECISION.
 * Returns 0 when verification passes, EXIT_FAILURE otherwise.
 */
int main(int argc, char **argv)
{
    printf("%s Program Starting...\n", argv[0]);

    // set up device
    int devID = 0;
    cudaDeviceProp deviceProp;
    HANDLE_ERROR(cudaGetDeviceProperties(&deviceProp, devID));
    printf("Using Device %d: %s\n", devID, deviceProp.name);
    HANDLE_ERROR(cudaSetDevice(devID));

    // set up data size of matrix
    int nx = 1 << 14;
    int ny = 1 << 14;
    int nxy = nx * ny;
    /* Byte count is kept in size_t: an int would overflow as soon as the
       matrix grows to 1<<15 per side. */
    size_t nBytes = (size_t)nxy * sizeof(float);
    printf("Matrix size: nx= %d, ny= %d\n", nx, ny);

    // malloc host memory
    float *h_A     = (float *)malloc(nBytes);
    float *h_B     = (float *)malloc(nBytes);
    float *hostRef = (float *)malloc(nBytes);
    float *gpuRef  = (float *)malloc(nBytes);
    if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_A);
        free(h_B);
        free(hostRef);
        free(gpuRef);
        return EXIT_FAILURE;
    }

    // initialize data at host side
    for (int i = 0; i < nxy; i++) {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }
    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // add matrix at host side for result checks
    clock_t iStart = clock();
    sumMatrix2DOnHost(h_A, h_B, hostRef, nx, ny);
    clock_t iEnd = clock();
    double iElaps = (double)(iEnd - iStart) / CLOCKS_PER_SEC;
    printf("--sumMatrix2DOnHost() elapsed %f sec..\n", iElaps);

    // malloc device global memory
    float *d_MatA, *d_MatB, *d_MatC;
    HANDLE_ERROR(cudaMalloc((void **)&d_MatA, nBytes));
    HANDLE_ERROR(cudaMalloc((void **)&d_MatB, nBytes));
    HANDLE_ERROR(cudaMalloc((void **)&d_MatC, nBytes));

    // transfer data from host to device
    HANDLE_ERROR(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice));

    // invoke kernel at host side
    int dimx = 32;
    int dimy = 32;
    //int dimy = 16;
    dim3 block(dimx, dimy);
    /* Ceil-divide so the grid covers the matrix even when the block shape
       does not divide nx/ny evenly. */
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

    iStart = clock();
    sumMatrix2DKernel <<< grid, block >>>(d_MatA, d_MatB, d_MatC, nx, ny);
    /* A launch does not return a status itself; pick up configuration errors
       here and execution errors at the synchronize below. */
    HANDLE_ERROR(cudaGetLastError());
    HANDLE_ERROR(cudaDeviceSynchronize());
    iEnd = clock();
    iElaps = (double)(iEnd - iStart) / CLOCKS_PER_SEC;
    printf("--sumMatrix2DOnGPU<<<(%d,%d),(%d,%d)>>> elapsed %f sec..\n",
           grid.x, grid.y, block.x, block.y, iElaps);

    // copy kernel result back to host side
    HANDLE_ERROR(cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost));

    // check device results; remember the first mismatch so cleanup still runs
    int firstBad = -1;
    for (int i = 0; i < nxy; i++) {
        if (fabs(gpuRef[i] - hostRef[i]) > PRECISION) {
            firstBad = i;
            break;
        }
    }
    if (firstBad >= 0)
        fprintf(stderr, "Result verification failed at elemnt %d\n", firstBad);

    // free device global memory
    HANDLE_ERROR(cudaFree(d_MatA));
    HANDLE_ERROR(cudaFree(d_MatB));
    HANDLE_ERROR(cudaFree(d_MatC));

    // free host memory
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    // reset device
    HANDLE_ERROR(cudaDeviceReset());

    if (firstBad >= 0)
        return EXIT_FAILURE;

    printf("Test Passed..\n");
    return 0;
}
編譯運行: $ nvcc -arch=sm_20 sumMatrix2D.cu -o sumMatrix2D $ ./sumMatrix2D 程序輸出:./sumMatrix2D Program Starting... Using Device 0: Tesla M2070 Matrix size: nx= 16384, ny= 16384 --sumMatrix2DOnHost() elapsed 1.410000 sec.. --sumMatrix2DOnGPU<<<(512,512),(32,32)>>> elapsed 0.070000 sec.. Test Passed..
現在我們將block的大小改成(32, 16),此時block數量為512*1024,再次編譯運行,會發現:./sumMatrix2D Program Starting... Using Device 0: Tesla M2070 Matrix size: nx= 16384, ny= 16384 --sumMatrix2DOnHost() elapsed 1.410000 sec.. --sumMatrix2DOnGPU<<<(512,1024),(32,16)>>> elapsed 0.040000 sec.. Test Passed..
可以看到,程序性能提升了將近1倍,直觀來看是第二次線程配置比第一次配置block的數量增加了1倍,實際上也正是由于block數量增加了的緣故。但是如果繼續增加block的數量,性能反而又會下降。 現在我們將block的大小改為(16,16),此時block數量為1024*1024,再次編譯運行,會發現:./sumMatrix2D Program Starting... Using Device 0: Tesla M2070 Matrix size: nx= 16384, ny= 16384 --sumMatrix2DOnHost() elapsed 1.400000 sec.. --sumMatrix2DOnGPU<<<(1024,1024),(16,16)>>> elapsed 0.050000 sec.. Test Passed..
關于線程塊配置的性能分析參考后續章節。總結
以上是生活随笔為你收集整理的CUDA性能优化----线程配置的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: CUDA学习----sp, sm, th
- 下一篇: python爬虫下载模块_python爬