一、前言
本文介紹CUDA編程的共享內存和同步。共享內存中的變量(核函數中用__shared__聲明),在GPU上啟動的每個線程塊,編譯器都創建該變量的副本,若啟動N個線程塊,則有N個該變量副本,為每個線程塊私有;同步則是使線程塊中所有的線程能夠在執行完某些語句后,才執行后續語句。
二、線程塊、線程索引
以下為線程塊與線程的層次結構圖
每個線程均獨自執行核函數,若在核函數中聲明共享變量,則每個線程塊均擁有該變量的一個副本,且該副本為該線程塊內的所有線程所共享。
三、共享變量和同步例子
(1)以下程序實現了點積運算,計算公式為 f(n) = 0·0 + 1·1 + 2·2 + … + (n-1)·(n-1)(即對 i 從 0 到 n-1 求 i² 之和),使用共享變量計算各個線程塊內所有線程的求和運算結果。
// main1.cu -- dot product via a per-block shared-memory tree reduction.
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include "book.h"
using namespace std;

#define N (33 * 1024)               // number of elements in each input vector
const int threadsPerBlock = 64;     // threads per block (must be a power of two for the reduction)
const int blocksPerGrid = 64;       // blocks in the 1-D grid

// Computes per-block partial dot products of a and b.
// Each thread accumulates a grid-stride partial sum, the block reduces those
// partials in shared memory, and one result per block is written to
// c[blockIdx.x]; the host finishes the reduction.
// Expects a 1-D launch of <<<blocksPerGrid, threadsPerBlock>>>.
__global__ void add(float *a, float *b, float *c)
{
    // One copy per block, shared by all threads of that block; holds each
    // thread's partial dot-product sum.
    __shared__ float cache[threadsPerBlock];

    int index = threadIdx.x + blockIdx.x * blockDim.x; // flat global index
    int cacheIdx = threadIdx.x;

    // Grid-stride loop: each thread handles every (gridDim.x*blockDim.x)-th element.
    float temp = 0;
    while (index < N) {
        temp += a[index] * b[index];
        index += gridDim.x * blockDim.x;
    }
    cache[cacheIdx] = temp;

    // All partial sums must land in shared memory before the reduction reads them.
    __syncthreads();

    // Tree reduction: halve the active range each round; requires blockDim.x
    // to be a power of two. The final sum ends up in cache[0].
    int i = blockDim.x / 2;
    while (i != 0) {
        if (cacheIdx < i)
            cache[cacheIdx] += cache[i + cacheIdx];
        __syncthreads(); // all threads finish this round before the next halving
        i /= 2;
    }

    // Exactly one thread publishes the block's result. (The original tested
    // cacheIdx == 2, which only works by accident because the loop's final
    // barrier has already run; thread 0 is the conventional choice.)
    if (cacheIdx == 0)
        c[blockIdx.x] = cache[0];
}

int main()
{
    // Heap-backed storage: two 132 KB arrays would be risky on the stack.
    vector<float> a(N), b(N);
    vector<float> c(blocksPerGrid);
    float *dev_a, *dev_b, *dev_c;

    // Device buffers; dev_c only needs one float per block (the original
    // over-allocated N floats for it).
    HANDLE_ERROR(cudaMalloc((void**)&dev_a, N * sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_b, N * sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_c, blocksPerGrid * sizeof(float)));

    // Initialize the inputs: a[i] = b[i] = i.
    for (int i = 0; i < N; ++i) {
        a[i] = i;
        b[i] = i;
    }

    // Copy the inputs to the GPU (error-checked; the original skipped the check here).
    HANDLE_ERROR(cudaMemcpy(dev_a, a.data(), N * sizeof(float), cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(dev_b, b.data(), N * sizeof(float), cudaMemcpyHostToDevice));

    // Fixed: the launch was written `<<< ... >> >`, which does not compile.
    add<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_c);
    HANDLE_ERROR(cudaGetLastError()); // surface launch-configuration errors

    // Blocking copy back; this also synchronizes with the kernel.
    HANDLE_ERROR(cudaMemcpy(c.data(), dev_c, blocksPerGrid * sizeof(float), cudaMemcpyDeviceToHost));

    // Final host-side reduction over the per-block partial sums.
    double sums = 0.0;
    for (int i = 0; i < blocksPerGrid; ++i)
        sums += c[i];
    cout << "gpu dot compute result:" << sums << "\n"; // fixed: "/n" printed literally

    // CPU reference: sum of i*i for i in [0, N).
    sums = 0.0;
    for (int i = 0; i < N; ++i)
        sums += (double)i * i;
    cout << "cpu dot compute result:" << sums << "\n";

    // Release device memory (host vectors clean up automatically; the
    // original leaked via `delete c` on a new[] allocation).
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return 0;
}
運行結果
(2)以下程序使用二維程序塊共享變量計算圖像數據,生成圖像
// main2.cu -- renders a 2-D sine interference pattern into a grayscale image,
// staging each block's 16x16 tile in shared memory.
#include <cuda_runtime.h>
#include <iostream>
#include "book.h"
#include <opencv2/opencv.hpp>
using namespace cv;
using namespace std;

#define PI 3.1415926f   // float literal: the original double PI silently promoted the sine math to double
#define DIM 1024        // image width and height in pixels

// Fills _ptr (DIM x DIM, row-major, one byte per pixel) with a sine pattern.
// Each block computes a 16x16 tile in shared memory; after the barrier each
// thread reads the mirrored tile entry, flipping the pattern within the tile.
// Expects a launch of <<<dim3(DIM/16, DIM/16), dim3(16, 16)>>>.
__global__ void kernel(uchar *_ptr)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int idx = x + y * gridDim.x * blockDim.x; // flat pixel offset

    // One tile per block. The +1 padding on the inner dimension removes the
    // 16-way shared-memory bank conflict the [threadIdx.x][threadIdx.y]
    // access pattern would otherwise cause (row stride 17 instead of 16).
    __shared__ float shared[16][16 + 1];

    const float period = 128.0f;
    shared[threadIdx.x][threadIdx.y] =
        255 * (sinf(x * 2.0f * PI / period) + 1.0f) *
              (sinf(y * 2.0f * PI / period) + 1.0f) / 4.0f;

    // Every tile entry must be written before any thread reads another
    // thread's entry below.
    __syncthreads();

    _ptr[idx] = shared[15 - threadIdx.x][15 - threadIdx.y];
}

int main()
{
    Mat src(DIM, DIM, CV_8UC1, Scalar::all(0));
    uchar *ptr_dev;
    HANDLE_ERROR(cudaMalloc((void**)&ptr_dev, DIM * DIM * sizeof(uchar)));

    dim3 blocks(DIM / 16, DIM / 16); // one block per 16x16 tile
    dim3 threads(16, 16);
    // Fixed: the launch was written `<< < ... >> >`, which does not compile.
    kernel<<<blocks, threads>>>(ptr_dev);
    HANDLE_ERROR(cudaGetLastError()); // surface launch-configuration errors

    // Blocking copy: also waits for the kernel to finish.
    HANDLE_ERROR(cudaMemcpy(src.data, ptr_dev, DIM * DIM * sizeof(uchar), cudaMemcpyDeviceToHost));
    cudaFree(ptr_dev);

    namedWindow("Demo", 0);
    imshow("Demo", src);
    waitKey(0);
    return 0;
}
運行結果：
|
新聞熱點
疑難解答