CUDA Notes: Convolution with Multiple Threads in One Block
I've been learning CUDA recently and have found that my math knowledge is not quite sufficient, and my C knowledge also needs strengthening.
This note records how to compute the convolution of a matrix.
The logic is as follows (a CPU reference for the convolution step is sketched right after this list):
1. First generate a 16*16 matrix on the GPU with CUDA (using cuRAND);
2. Pad this 16*16 matrix with a border of zeros, turning it into an 18*18 matrix;
3. Then launch 18*18 threads in a single block to convolve the matrix.
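Before the GPU code, here is a minimal CPU sketch of the padding-and-convolution step, which I added only as a way to check the GPU output; convolve_cpu and its parameter names are illustrative, not part of the original program. The 3*3 kernel {1, 1, 1, 1, -8, 1, 1, 1, 1} is the same one used in the CUDA kernel below.

#define N 16   // same matrix size as in the full program below

// CPU reference: convolve the zero-padded (N+2)*(N+2) matrix `padded`
// with the 3*3 kernel and write the N*N result into `out`.
void convolve_cpu(const float *padded, float *out)
{
    const float k[9] = { 1, 1, 1, 1, -8, 1, 1, 1, 1 };
    for (int y = 1; y <= N; ++y) {          // rows 1..N of the padded matrix
        for (int x = 1; x <= N; ++x) {      // columns 1..N of the padded matrix
            float sum = 0.0f;
            for (int dy = -1; dy <= 1; ++dy)
                for (int dx = -1; dx <= 1; ++dx)
                    sum += k[(dy + 1) * 3 + (dx + 1)]
                         * padded[(y + dy) * (N + 2) + (x + dx)];
            out[(y - 1) * N + (x - 1)] = sum;
        }
    }
}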
Screenshot of the program run (image omitted):
The source code is as follows:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "curand.h"
#include "curand_kernel.h"
#include <stdio.h>
#include <iostream>

using namespace std;

#pragma comment(lib, "cudart.lib")
#pragma comment(lib, "curand.lib")

#define N 16

__global__ void Matrix_convolution(float *a, float *b)
{
    int x = threadIdx.x;
    int y = threadIdx.y;   // the kernel is launched with a 1-D block, so y is always 0

    // Stage the padded (N+2)*(N+2) matrix in shared memory, one element per thread.
    __shared__ float shared[(N + 2) * (N + 2)];
    shared[y * (N + 2) + x] = a[y * (N + 2) + x];
    __syncthreads();

    int convolution[9] = { 1, 1, 1, 1, -8, 1, 1, 1, 1 };

    // Convolve the shared-memory copy.
    int pos = y * (N + 2) + x;
    // Threads in the first and last (padding) rows produce no output.
    if (pos < N + 2 || pos > (N + 2) * (N + 1)) {
        return;
    }
    // Threads in the leftmost and rightmost (padding) columns produce no output.
    if (pos % (N + 2) == 0 || pos % (N + 2) == N + 1) {
        return;
    }

    // The 9 values covered by the 3*3 window.
    float a00 = shared[pos - (N + 2) - 1];
    float a01 = shared[pos - (N + 2)];
    float a02 = shared[pos - (N + 2) + 1];
    float a10 = shared[pos - 1];
    float a11 = shared[pos];
    float a12 = shared[pos + 1];
    float a20 = shared[pos + (N + 2) - 1];
    float a21 = shared[pos + (N + 2)];
    float a22 = shared[pos + (N + 2) + 1];
    float ret = convolution[0] * a00 + convolution[1] * a01 + convolution[2] * a02
              + convolution[3] * a10 + convolution[4] * a11 + convolution[5] * a12
              + convolution[6] * a20 + convolution[7] * a21 + convolution[8] * a22;

    // Which padded row this thread is in; subtract the padding offset to map
    // the padded index back to an index in the N*N output matrix.
    int rowCount = pos / (N + 2);
    int posOffset = (N + 2) + (rowCount - 1) * 2 + 1;
    b[pos - posOffset] = ret;
}

void Matrix_init_gpu(float *a)
{
    // Fill the N*N device buffer with uniform random floats using cuRAND.
    curandGenerator_t gen;
    curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MRG32K3A);
    curandSetPseudoRandomGeneratorSeed(gen, 11ULL);
    curandGenerateUniform(gen, a, N * N);
    curandDestroyGenerator(gen);
}

int main()
{
    float *p_original_gpu;   // N*N random matrix on the device
    float *p_original_cpu;   // host copy of the random matrix
    float *p_final_cpu;      // host (N+2)*(N+2) zero-padded matrix
    float *p_final_gpu;      // device copy of the padded matrix
    float *p_ret_gpu;        // device N*N convolution result
    float *p_ret_cpu;        // host copy of the result

    p_original_cpu = (float*)malloc(N * N * sizeof(float));
    p_final_cpu = (float*)malloc((N + 2) * (N + 2) * sizeof(float));
    p_ret_cpu = (float*)malloc(N * N * sizeof(float));
    cudaMalloc((void**)&p_original_gpu, N * N * sizeof(float));
    cudaMalloc((void**)&p_final_gpu, (N + 2) * (N + 2) * sizeof(float));
    cudaMalloc((void**)&p_ret_gpu, N * N * sizeof(float));

    // 1. Generate the 16*16 random matrix on the GPU and print it.
    Matrix_init_gpu(p_original_gpu);
    cudaMemcpy(p_original_cpu, p_original_gpu, N * N * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < N * N; i++) {
        if (i % N == 0) printf("\n");
        cout << p_original_cpu[i] << " ";
    }

    // 2. Build the zero-padded (N+2)*(N+2) matrix on the host.
    for (int i = 0; i < (N + 2) * (N + 2); i++) {
        p_final_cpu[i] = 0;
    }
    int pos = 0;
    for (int i = N + 2; i < (N + 2) * (N + 1); i++) {
        if (i % (N + 2) != 0 && i % (N + 2) != N + 1) {
            p_final_cpu[i] = p_original_cpu[pos++];
        }
    }
    cout << "\n\nPadded matrix:" << endl;
    for (int i = 0; i < (N + 2) * (N + 2); i++) {
        if (i % (N + 2) == 0) printf("\n");
        cout << p_final_cpu[i] << " ";
    }

    // 3. Copy the padded matrix to the device, convolve it with one block
    //    of (N+2)*(N+2) threads, and print the N*N result.
    cout << "\n\nFinal result:" << endl;
    cudaMemcpy(p_final_gpu, p_final_cpu, (N + 2) * (N + 2) * sizeof(float), cudaMemcpyHostToDevice);
    Matrix_convolution<<<1, (N + 2) * (N + 2)>>>(p_final_gpu, p_ret_gpu);
    cudaMemcpy(p_ret_cpu, p_ret_gpu, N * N * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < N * N; i++) {
        if (i % N == 0) printf("\n");
        cout << p_ret_cpu[i] << " ";
    }

    cudaFree(p_original_gpu);
    cudaFree(p_final_gpu);
    cudaFree(p_ret_gpu);
    free(p_original_cpu);
    free(p_final_cpu);
    free(p_ret_cpu);
    getchar();
    return 0;
}
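The kernel above reads threadIdx.y even though the launch uses a flat one-dimensional block, so y is always 0 and the pos arithmetic does all the index work. If the block were declared two-dimensional instead, the border test and the output index become more direct. Below is a sketch of that variant, my own rewrite for illustration rather than part of the original post; Matrix_convolution_2d and the dim3 launch configuration are names and choices I introduced.

__global__ void Matrix_convolution_2d(const float *a, float *b)
{
    int x = threadIdx.x;   // column in the padded (N+2)*(N+2) matrix
    int y = threadIdx.y;   // row in the padded matrix

    __shared__ float shared[(N + 2) * (N + 2)];
    shared[y * (N + 2) + x] = a[y * (N + 2) + x];
    __syncthreads();

    // Border threads only help load the padding; they produce no output.
    if (x == 0 || x == N + 1 || y == 0 || y == N + 1) return;

    const float k[9] = { 1, 1, 1, 1, -8, 1, 1, 1, 1 };
    float sum = 0.0f;
    for (int dy = -1; dy <= 1; ++dy)
        for (int dx = -1; dx <= 1; ++dx)
            sum += k[(dy + 1) * 3 + (dx + 1)] * shared[(y + dy) * (N + 2) + (x + dx)];

    // Interior thread (x, y) of the padded matrix maps to element (x-1, y-1)
    // of the N*N result.
    b[(y - 1) * N + (x - 1)] = sum;
}

// Launched with a 2-D block instead of a flat one:
// dim3 threads(N + 2, N + 2);
// Matrix_convolution_2d<<<1, threads>>>(p_final_gpu, p_ret_gpu);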