CUDA并行編程的基本思路是把一個很大的任務劃分成N個簡單重復的操作,創建N個線程分別執行,每個網格(Grid)的每一維最多可以包含65535個線程塊,每個線程塊(Block)一般最多可以創建512個并行線程,在第一個CUDA程序中對核函數的調用是:
addKernel<<<1, size>>>(dev_c, dev_a, dev_b);
這里的<<<>>>運算符內是核函數的執行參數,告訴編譯器運行時如何啟動核函數,用于說明內核函數中的線程數量,以及線程是如何組織的。
<<<>>>運算符完整的執行配置參數形式是<<<Dg, Db, Ns, S>>>
參數Dg用于定義整個grid的維度和尺寸,即一個grid有多少個block。為dim3類型。dim3 Dg(Dg.x, Dg.y, 1)表示grid中每行有Dg.x個block,每列有Dg.y個block,第三維恒為1(目前一個核函數只有一個grid)。整個grid中共有Dg.x*Dg.y個block,其中Dg.x和Dg.y最大值為65535。參數Db用于定義一個block的維度和尺寸,即一個block有多少個thread。為dim3類型。dim3 Db(Db.x, Db.y, Db.z)表示整個block中每行有Db.x個thread,每列有Db.y個thread,高度為Db.z。Db.x和Db.y最大值為512,Db.z最大值為62。 一個block中共有Db.x*Db.y*Db.z個thread。計算能力為1.0,1.1的硬件該乘積的最大值為768,計算能力為1.2,1.3的硬件支持的最大值為1024。參數Ns是一個可選參數,用于設置每個block除了靜態分配的shared Memory以外,最多能動態分配的shared memory大小,單位為byte。不需要動態分配時該值為0或省略不寫。參數S是一個cudaStream_t類型的可選參數,初始值為零,表示該核函數處在哪個流之中。在第一個CUDA程序中使用了1個線程塊,每個線程塊包含size個并行線程,每個線程的索引是threadIdx.x。也可以選擇創建size個線程塊,每個線程塊包含1個線程,核函數的調用更改為:addKernel<<<size, 1>>>(dev_c, dev_a, dev_b);線程的索引更改為blockIdx.x。完整程序如下:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

// Element-wise vector add. Launched as <<<size, 1>>>: one block per element,
// one thread per block, so blockIdx.x is the element index.
// Precondition: the grid has exactly `size` blocks (no bounds guard here).
__global__ void addKernel(int *c, const int *a, const int *b)
{
    int i = blockIdx.x;
    c[i] = a[i] + b[i];
}

int main()
{
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");  // fixed: was "fPRintf" (typo, would not compile)
        return 1;
    }

    // fixed: "/n" was a scrape-mangled "\n" escape
    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
        c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    getchar();
    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
// Allocates device buffers for a, b, c, copies the inputs to the device,
// launches addKernel with `size` blocks of 1 thread, and copies c back.
// Returns the first failing cudaError_t, or cudaSuccess.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    //addKernel<<<1, size>>>(dev_c, dev_a, dev_b);
    addKernel<<<size, 1>>>(dev_c, dev_a, dev_b);

    // Check for any errors launching the kernel (launch-config errors are
    // only reported via cudaGetLastError, not by the launch itself).
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));  // fixed: "/n" -> "\n"
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);  // fixed: "/n" -> "\n"
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}

執行結果一致:更普遍的情況是需要創建多個線程塊,每個線程塊包含多個并行線程,這種情況下線程索引的計算為:int tid=threadIdx.x+blockIdx.x*blockDim.x;blockIdx代表線程塊在網格中的索引值,blockDim代表線程塊的尺寸大小,另外還有gridDim代表網格的尺寸大小。如果有N個并行的任務,我們希望每個線程塊固定包含6個并行的線程,則可以使用以下的核函數調用:addKernel<<<(N+5)/6, 6>>>(dev_c, dev_a, dev_b);把第一個CUDA程序的向量個數增加到15個,修改成以上調用方式:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

// Element-wise vector add with the general index formula: the flat global
// thread index is threadIdx.x + blockIdx.x * blockDim.x.
// The guard protects the tail when gridDim.x * blockDim.x > element count.
// NOTE(review): the bound 15 is hardcoded and must match arraySize in main;
// passing the size as a kernel parameter would be more robust.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    int i = threadIdx.x + blockIdx.x*blockDim.x;
    if (i < 15)
        c[i] = a[i] + b[i];
}

int main()
{
    const int arraySize = 15;
    const int a[arraySize] = { 1, 2, 3, 4, 5,6,7,8,9,10,11,12,13,14,15 };
    const int b[arraySize] = { 10, 20, 30, 40, 50,60,70,80,90,100,110,120,130,140,150 };
    int c[arraySize] = { 0 };

    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    // fixed: "/n" was a scrape-mangled "\n" escape (three occurrences)
    printf("{ 1, 2, 3, 4, 5,6,7,8,9,10,11,12,13,14,15}+\n{ 10, 20, 30, 40, 50,60,70,80,90,100,110,120,130,140,150}=\n{%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d}\n",
        c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], c[9], c[10], c[11], c[12], c[13], c[14]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    getchar();
    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
// Same structure as the first program, but launches ceil(size/6) blocks of
// 6 threads each, so the last block may have idle threads (kernel guard).
// Returns the first failing cudaError_t, or cudaSuccess.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    // (size + 5) / 6 is the ceiling division: enough 6-thread blocks to cover size.
    //addKernel<<<1, size>>>(dev_c, dev_a, dev_b);
    addKernel<<<(size + 5) / 6, 6>>>(dev_c, dev_a, dev_b);

    // Check for any errors launching the kernel.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));  // fixed: "/n" -> "\n"
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);  // fixed: "/n" -> "\n"
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}

執行結果:以下CUDA和OpenCV混合編程,對一幅圖像上每個像素點的顏色執行一次運算,生成一幅規則的圖形。新建了一個 dim3類型的變量grid(DIM, DIM),代表一個二維的網格,尺寸大小是DIM*DIM個線程塊:
#include "cuda_runtime.h"
#include <highgui.hpp>

using namespace cv;

#define DIM 600  // image width and height in pixels

// Fills a DIM x DIM BGR bitmap with an arithmetic pattern.
// Launched as <<<dim3(DIM, DIM), 1>>>: one block per pixel, so
// (blockIdx.x, blockIdx.y) is the pixel position and gridDim.x is the row
// stride. ptr is a device buffer of 3*DIM*DIM bytes, 3 bytes (B, G, R) per pixel.
__global__ void kernel(unsigned char *ptr)
{
    // map from blockIdx to pixel position
    int x = blockIdx.x;
    int y = blockIdx.y;
    int offset = x + y * gridDim.x;

    // BGR channel values (arbitrary integer patterns; % 255 keeps them in range)
    ptr[offset * 3 + 0] = 999 * x*y % 255;
    ptr[offset * 3 + 1] = 99 * x*x*y*y % 255;
    ptr[offset * 3 + 2] = 9 * offset*offset % 255;
}

// globals needed by the update routine
struct DataBlock
{
    unsigned char *dev_bitmap;
};

int main(void)
{
    DataBlock data;
    cudaError_t error;
    // NOTE(review): `error` is assigned but never checked below; real code
    // should test each cudaError_t against cudaSuccess.

    Mat image = Mat(DIM, DIM, CV_8UC3, Scalar::all(0));

    // fixed: removed the dead store `data.dev_bitmap = image.data;` — it wrote
    // the HOST pointer into the struct and was immediately overwritten with
    // the device pointer below, which was misleading.
    unsigned char *dev_bitmap;
    error = cudaMalloc((void**)&dev_bitmap, 3 * image.cols*image.rows);
    data.dev_bitmap = dev_bitmap;

    dim3 grid(DIM, DIM);  // a 2D grid of DIM*DIM thread blocks
    kernel<<<grid, 1>>>(dev_bitmap);

    // Blocking copy back to the Mat's host buffer (implicitly synchronizes
    // with the kernel on the default stream).
    error = cudaMemcpy(image.data, dev_bitmap, 3 * image.cols*image.rows, cudaMemcpyDeviceToHost);

    error = cudaFree(dev_bitmap);

    imshow("CUDA Grid/Block/Thread)", image);
    waitKey();
}

執行效果:
新聞熱點
疑難解答