當前位置：首頁 > 编程资源 > 编程问答 >内容正文

编程问答

cuda编程实例

發布時間：2023/12/20 编程问答 17 豆豆

生活随笔收集整理的這篇文章主要介紹了 cuda编程实例，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

矢量求和

#include "error.cuh"   // provides CHECK (not shown here; presumably aborts/reports on a non-success cudaError_t)
#include <stdio.h>
#include <cuda_runtime.h>

#define N 100

/*
 * Element-wise vector addition: c[i] = a[i] + b[i] for i in [0, N).
 *
 * Expected launch layout: a 1D grid with one thread per block; the element
 * index is taken from blockIdx.x only. Blocks whose index is >= N do nothing.
 * a, b and c must each point to at least N ints of device memory.
 */
__global__ void add( int *a, int *b, int *c ) {
    int tid = blockIdx.x;    // each block handles the element at its block index
    if (tid < N)
        c[tid] = a[tid] + b[tid];
}

/*
 * Fills two host arrays, adds them on the GPU and prints every result line.
 */
int main( void ) {
    int a[N], b[N], c[N];
    int *dev_a, *dev_b, *dev_c;

    // allocate the memory on the GPU
    CHECK( cudaMalloc( (void**)&dev_a, N * sizeof(int) ) );
    CHECK( cudaMalloc( (void**)&dev_b, N * sizeof(int) ) );
    CHECK( cudaMalloc( (void**)&dev_c, N * sizeof(int) ) );

    // fill the arrays 'a' and 'b' on the CPU
    for (int i = 0; i < N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }

    // copy the arrays 'a' and 'b' to the GPU
    CHECK( cudaMemcpy( dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice ) );
    CHECK( cudaMemcpy( dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice ) );

    // N blocks of one thread each
    add<<<N,1>>>( dev_a, dev_b, dev_c );
    // a kernel launch returns no status; query the launch error explicitly
    CHECK( cudaGetLastError() );

    // copy the array 'c' back from the GPU to the CPU
    // (blocking copy, so it also waits for the kernel to finish)
    CHECK( cudaMemcpy( c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost ) );

    // display the results
    for (int i = 0; i < N; i++) {
        printf( "%d + %d = %d\n", a[i], b[i], c[i] );
    }

    // free the memory allocated on the GPU
    CHECK( cudaFree( dev_a ) );
    CHECK( cudaFree( dev_b ) );
    CHECK( cudaFree( dev_c ) );

    return 0;
}

julia集

在這里插入代碼片

任意長度矢量求和

#include "error.cuh"   // provides CHECK (not shown here; presumably aborts/reports on a non-success cudaError_t)
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define N (33 * 1024)

/*
 * Element-wise vector addition for an array longer than the launched grid:
 * c[i] = a[i] + b[i] for i in [0, N).
 *
 * Launch layout: 1D grid of 1D blocks, any size. Each thread starts at its
 * global index and advances by the total number of launched threads
 * (grid-stride loop), so all N elements are covered exactly once.
 * With the launch used in main (128 blocks x 128 threads = 16384 threads)
 * and N = 33792, each thread processes two or three elements.
 * a, b and c must each point to at least N ints of device memory.
 */
__global__ void add( int *a, int *b, int *c ) {
    int stride = blockDim.x * gridDim.x;   // total number of threads in the grid
    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += stride) {
        c[i] = a[i] + b[i];
    }
}

/*
 * Builds two host vectors, adds them on the GPU and verifies the result
 * against a host-side sum.
 */
int main( void ) {
    // host pointers and device pointers
    int *a, *b, *c;
    int *dev_a, *dev_b, *dev_c;

    // allocate host memory
    a = (int*)malloc( N * sizeof(int) );
    b = (int*)malloc( N * sizeof(int) );
    c = (int*)malloc( N * sizeof(int) );
    if (a == NULL || b == NULL || c == NULL) {
        fprintf( stderr, "Host memory allocation failed\n" );
        free( a );   // free(NULL) is a no-op
        free( b );
        free( c );
        return 1;
    }

    // allocate device memory
    CHECK( cudaMalloc( (void**)&dev_a, N * sizeof(int) ) );
    CHECK( cudaMalloc( (void**)&dev_b, N * sizeof(int) ) );
    CHECK( cudaMalloc( (void**)&dev_c, N * sizeof(int) ) );

    // fill 'a' and 'b' on the CPU
    for (int i = 0; i < N; i++) {
        a[i] = i;
        b[i] = 2 * i;
    }

    // copy 'a' and 'b' from host to device
    CHECK( cudaMemcpy( dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice ) );
    CHECK( cudaMemcpy( dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice ) );

    // launch: 1D grid of 128 blocks, 1D blocks of 128 threads
    add<<<128,128>>>( dev_a, dev_b, dev_c );
    // a kernel launch returns no status; query the launch error explicitly
    CHECK( cudaGetLastError() );

    // copy the result from device to host (blocking, waits for the kernel)
    CHECK( cudaMemcpy( c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost ) );

    // verify the result
    bool success = true;
    for (int i = 0; i < N; i++) {
        if ((a[i] + b[i]) != c[i]) {
            success = false;
        }
    }
    if (success)
        printf( "We did it!\n" );
    else
        printf("Not Pass!Error!!!\n");

    // release device memory
    CHECK( cudaFree( dev_a ) );
    CHECK( cudaFree( dev_b ) );
    CHECK( cudaFree( dev_c ) );

    // release host memory
    free( a );
    free( b );
    free( c );

    return 0;
}

矩陣處理

#include "error.cuh"   // provides CHECK (not shown here; presumably aborts/reports on a non-success cudaError_t)
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define N 1000
#define BLOCK_SIZE 32

// Statically allocated managed arrays, accessed from both host and device code below.
__managed__ int input_Matrix[N][N];
__managed__ int output_GPU[N][N];
__managed__ int output_CPU[N][N];

/*
 * Per-element matrix transform on the GPU.
 *
 * For every element (row y, column x) of the N x N input:
 *   - if both x and y are even, the output is the square of the input;
 *   - otherwise the output is the input minus 1.
 *
 * Launch layout: 2D grid of 2D blocks; x indexes columns and y indexes rows.
 * Threads that fall outside the N x N matrix do nothing, so the grid may
 * overshoot N in either dimension.
 */
__global__ void kernel(int input_M[N][N], int output_M[N][N]) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;   // column index
    int y = blockIdx.y * blockDim.y + threadIdx.y;   // row index
    // x varies fastest across threads, so it plays the role of the inner loop
    if (x < N && y < N) {
        if (x % 2 == 0 && y % 2 == 0) {
            output_M[y][x] = input_M[y][x] * input_M[y][x];
        } else {
            output_M[y][x] = input_M[y][x] - 1;
        }
    }
}

/*
 * Host reference implementation of the same transform as kernel():
 * square the element when both indices are even, otherwise subtract 1.
 */
void cpu_kernel(int input_M[N][N], int output_CPU[N][N]) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            if (j % 2 == 0 && i % 2 == 0) {
                output_CPU[i][j] = input_M[i][j] * input_M[i][j];
            } else {
                output_CPU[i][j] = input_M[i][j] - 1;
            }
        }
    }
}

/*
 * Fills the input matrix with pseudo-random values in [0, 3000], runs the
 * transform on the CPU and on the GPU (timing both with CUDA events) and
 * compares the two results element by element.
 */
int main(int argc, char const *argv[]) {
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            input_Matrix[i][j] = rand() % 3001;
        }
    }

    // CPU run, timed with CUDA events
    cudaEvent_t start_cpu, stop_cpu;
    CHECK(cudaEventCreate(&start_cpu));
    CHECK(cudaEventCreate(&stop_cpu));
    CHECK(cudaEventRecord(start_cpu));
    printf("\n***********CPU RUN**************\n");
    cpu_kernel(input_Matrix, output_CPU);
    CHECK(cudaEventRecord(stop_cpu));
    CHECK(cudaEventSynchronize(stop_cpu));
    float elapsed_time_cpu;
    CHECK(cudaEventElapsedTime(&elapsed_time_cpu, start_cpu, stop_cpu));
    printf("Time_CPU = %g ms.\n", elapsed_time_cpu);
    CHECK(cudaEventDestroy(start_cpu));
    CHECK(cudaEventDestroy(stop_cpu));

    // GPU run, timed with CUDA events
    cudaEvent_t start, stop_gpu;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop_gpu));
    CHECK(cudaEventRecord(start));
    // ceil-division: grid * block covers at least N elements in each dimension
    unsigned int grid_rows = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_cols = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 dimGrid(grid_cols, grid_rows);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    printf("\n***********GPU RUN**************\n");
    kernel<<<dimGrid, dimBlock>>>(input_Matrix, output_GPU);
    // a kernel launch returns no status; query the launch error explicitly
    CHECK(cudaGetLastError());
    // wait for the kernel before the host reads the managed output array
    CHECK(cudaDeviceSynchronize());
    CHECK(cudaEventRecord(stop_gpu));
    CHECK(cudaEventSynchronize(stop_gpu));
    float elapsed_time_gpu;
    CHECK(cudaEventElapsedTime(&elapsed_time_gpu, start, stop_gpu));
    printf("Time_GPU = %g ms.\n", elapsed_time_gpu);
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop_gpu));

    // verification: integer results must match exactly
    printf("\n***********Check result**************\n");
    int ok = 1;
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            if (output_GPU[i][j] != output_CPU[i][j]) {
                ok = 0;
            }
        }
    }
    if (ok) {
        printf("Pass!!!\n");
    } else {
        printf("Error!!!\n");
    }

    // no explicit free: all arrays are statically declared __managed__ variables
    return 0;
}

矩陣轉置

點積

常量內存

總結

以上是生活随笔為你收集整理的cuda编程实例的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

实例
CUDA

上一篇： mysql触发器更新前触发_mysql触
下一篇： win10 UEFI引导恢复