CUDA编程实例
矢量求和
#include "error.cuh"
#include <stdio.h>
#include <cuda_runtime.h>

#define N 100

/*
 * Element-wise vector addition: c[i] = a[i] + b[i] for i in [0, N).
 *
 * Launch layout: one element per block. The index is taken from blockIdx.x
 * only, so the kernel must be launched with at least N blocks and is meant to
 * run with a single thread per block (extra threads in a block would just
 * repeat the same element).
 *
 * a, b : device pointers to N input ints
 * c    : device pointer to N output ints
 */
__global__ void add( int *a, int *b, int *c ) {
    int tid = blockIdx.x;    // this block handles the element at its block index
    if (tid < N)             // guard in case more than N blocks are launched
        c[tid] = a[tid] + b[tid];
}

/*
 * Host driver: fills two N-element arrays on the CPU, adds them on the GPU
 * and prints every "a + b = c" line.
 *
 * CHECK comes from error.cuh, which is not shown here; it is assumed to
 * report and abort on a non-success cudaError_t.
 */
int main( void ) {
    int a[N], b[N], c[N];
    int *dev_a, *dev_b, *dev_c;

    // allocate the memory on the GPU
    CHECK( cudaMalloc( (void**)&dev_a, N * sizeof(int) ) );
    CHECK( cudaMalloc( (void**)&dev_b, N * sizeof(int) ) );
    CHECK( cudaMalloc( (void**)&dev_c, N * sizeof(int) ) );

    // fill the arrays 'a' and 'b' on the CPU
    for (int i = 0; i < N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }

    // copy the arrays 'a' and 'b' to the GPU
    CHECK( cudaMemcpy( dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice ) );
    CHECK( cudaMemcpy( dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice ) );

    // N blocks of one thread each: block i computes element i
    add<<<N,1>>>( dev_a, dev_b, dev_c );
    // a kernel launch returns no status itself; fetch launch-configuration errors explicitly
    CHECK( cudaGetLastError() );

    // copy the array 'c' back from the GPU to the CPU
    // (this blocking copy also waits for the kernel to finish)
    CHECK( cudaMemcpy( c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost ) );

    // display the results
    for (int i = 0; i < N; i++) {
        printf( "%d + %d = %d\n", a[i], b[i], c[i] );
    }

    // free the memory allocated on the GPU
    CHECK( cudaFree( dev_a ) );
    CHECK( cudaFree( dev_b ) );
    CHECK( cudaFree( dev_c ) );

    return 0;
}

// Next section heading in the original article: "julia集" (Julia set)
(此處尚未插入代碼片)
任意長度矢量求和
#include "error.cuh"
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define N (33 * 1024)

/*
 * Element-wise vector addition for an array longer than the launched grid:
 * c[i] = a[i] + b[i] for i in [0, N).
 *
 * Launch layout: 1D grid of 1D blocks, any size. Each thread starts at its
 * flat global index and advances by the total number of launched threads, so
 * every element is processed exactly once regardless of the launch
 * configuration. With the <<<128,128>>> launch used in main there are 16384
 * threads for 33792 elements, i.e. two or three elements per thread.
 *
 * a, b : device pointers to N input ints
 * c    : device pointer to N output ints
 */
__global__ void add( int *a, int *b, int *c ) {
    // flat global thread index and total thread count of the grid
    int tid    = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;

    // The previous "method 1" scanned all N elements in every thread and tested
    // tid == i % blockDim.x * gridDim.x, which parses as
    // (i % blockDim.x) * gridDim.x: only threads whose id is a multiple of
    // gridDim.x ever did any work. Striding by the grid size spreads the work
    // over all threads and costs N / stride iterations per thread.
    while (tid < N) {
        c[tid] = a[tid] + b[tid];
        tid += stride;
    }
}

/*
 * Host driver: builds two N-element vectors, adds them on the GPU and checks
 * the result against the CPU sum. Returns 0 when the run completes and 1 if
 * host memory could not be allocated.
 *
 * CHECK comes from error.cuh, which is not shown here; it is assumed to
 * report and abort on a non-success cudaError_t.
 */
int main( void ) {
    // host pointers and device pointers
    int *a, *b, *c;
    int *dev_a, *dev_b, *dev_c;

    // allocate host memory
    a = (int*)malloc( N * sizeof(int) );
    b = (int*)malloc( N * sizeof(int) );
    c = (int*)malloc( N * sizeof(int) );
    if (a == NULL || b == NULL || c == NULL) {
        fprintf( stderr, "host malloc failed\n" );
        // free(NULL) is a no-op, so releasing all three is safe
        free( a );
        free( b );
        free( c );
        return 1;
    }

    // allocate device memory
    CHECK( cudaMalloc( (void**)&dev_a, N * sizeof(int) ) );
    CHECK( cudaMalloc( (void**)&dev_b, N * sizeof(int) ) );
    CHECK( cudaMalloc( (void**)&dev_c, N * sizeof(int) ) );

    // fill 'a' and 'b' on the CPU
    for (int i = 0; i < N; i++) {
        a[i] = i;
        b[i] = 2 * i;
    }

    // copy 'a' and 'b' from host to device
    CHECK( cudaMemcpy( dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice ) );
    CHECK( cudaMemcpy( dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice ) );

    // launch: 1D grid of 128 blocks, 128 threads per block
    add<<<128,128>>>( dev_a, dev_b, dev_c );
    // a kernel launch returns no status itself; fetch launch-configuration errors explicitly
    CHECK( cudaGetLastError() );

    // copy the result from device to host
    // (this blocking copy also waits for the kernel to finish)
    CHECK( cudaMemcpy( c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost ) );

    // verify against the CPU result
    bool success = true;
    for (int i = 0; i < N; i++) {
        if ((a[i] + b[i]) != c[i]) {
            success = false;
        }
    }
    if (success)
        printf( "We did it!\n" );
    else
        printf("Not Pass!Error!!!\n");

    // release device memory
    CHECK( cudaFree( dev_a ) );
    CHECK( cudaFree( dev_b ) );
    CHECK( cudaFree( dev_c ) );

    // release host memory
    free( a );
    free( b );
    free( c );

    return 0;
}

// Next section heading in the original article: "矩陣處理" (matrix processing)
矩陣轉置
點積
常量內存
總結
- 上一篇: mysql触发器更新前触发_mysql触
- 下一篇: win10 UEFI引导恢复