我編寫了一個內核,用於使用約簡計算約100,000個浮點數的最小值和最大值(請參見下面的代碼)。我使用線程塊將1024個值的塊減少爲單個值(在共享內存中),然後在CPU上的塊之間進行最終減少。CUDA中的最小/最大浮點比CPU版本慢。爲什麼?
然後,我將它與僅在CPU上進行的串行計算進行了比較。 CUDA版本需要2.2ms,CPU版本需要0.21ms。爲什麼CUDA版本要慢得多?數組大小不夠大,無法利用並行性,或者我的代碼沒有經過優化?
這是Udacity並行編程課程練習的一部分。我通過他們的網站運行它,所以我不知道確切的硬件是什麼,但他們聲稱代碼在實際的GPU上運行。
這裏是CUDA代碼:
__global__ void min_max_kernel(const float* const d_logLuminance,
const size_t length,
float* d_min_logLum,
float* d_max_logLum) {
// Shared working memory
extern __shared__ float sh_logLuminance[];
int blockWidth = blockDim.x;
int x = blockDim.x * blockIdx.x + threadIdx.x;
float* min_logLuminance = sh_logLuminance;
float* max_logLuminance = sh_logLuminance + blockWidth;
// Copy this block's chunk of the data to shared memory
// We copy twice so we compute min and max at the same time
if (x < length) {
min_logLuminance[threadIdx.x] = d_logLuminance[x];
max_logLuminance[threadIdx.x] = min_logLuminance[threadIdx.x];
}
else {
// Pad if we're out of range
min_logLuminance[threadIdx.x] = FLT_MAX;
max_logLuminance[threadIdx.x] = -FLT_MAX;
}
__syncthreads();
// Reduce
for (int s = blockWidth/2; s > 0; s /= 2) {
if (threadIdx.x < s) {
if (min_logLuminance[threadIdx.x + s] < min_logLuminance[threadIdx.x]) {
min_logLuminance[threadIdx.x] = min_logLuminance[threadIdx.x + s];
}
if (max_logLuminance[threadIdx.x + s] > max_logLuminance[threadIdx.x]) {
max_logLuminance[threadIdx.x] = max_logLuminance[threadIdx.x + s];
}
}
__syncthreads();
}
// Write to global memory
if (threadIdx.x == 0) {
d_min_logLum[blockIdx.x] = min_logLuminance[0];
d_max_logLum[blockIdx.x] = max_logLuminance[0];
}
}
size_t get_num_blocks(size_t inputLength, size_t threadsPerBlock) {
return inputLength/threadsPerBlock +
((inputLength % threadsPerBlock == 0) ? 0 : 1);
}
/*
* Compute min, max over the data by first reducing on the device, then
* doing the final reducation on the host.
*/
void compute_min_max(const float* const d_logLuminance,
float& min_logLum,
float& max_logLum,
const size_t numRows,
const size_t numCols) {
// Compute min, max
printf("\n=== computing min/max ===\n");
const size_t blockWidth = 1024;
const size_t numPixels = numRows * numCols;
size_t numBlocks = get_num_blocks(numPixels, blockWidth);
printf("Num min/max blocks = %d\n", numBlocks);
float* d_min_logLum;
float* d_max_logLum;
int alloc_size = sizeof(float) * numBlocks;
checkCudaErrors(cudaMalloc(&d_min_logLum, alloc_size));
checkCudaErrors(cudaMalloc(&d_max_logLum, alloc_size));
min_max_kernel<<<numBlocks, blockWidth, sizeof(float) * blockWidth * 2>>>
(d_logLuminance, numPixels, d_min_logLum, d_max_logLum);
float* h_min_logLum = (float*) malloc(alloc_size);
float* h_max_logLum = (float*) malloc(alloc_size);
checkCudaErrors(cudaMemcpy(h_min_logLum, d_min_logLum, alloc_size, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(h_max_logLum, d_max_logLum, alloc_size, cudaMemcpyDeviceToHost));
min_logLum = FLT_MAX;
max_logLum = -FLT_MAX;
// Reduce over the block results
// (would be a bit faster to do it on the GPU, but it's just 96 numbers)
for (int i = 0; i < numBlocks; i++) {
if (h_min_logLum[i] < min_logLum) {
min_logLum = h_min_logLum[i];
}
if (h_max_logLum[i] > max_logLum) {
max_logLum = h_max_logLum[i];
}
}
printf("min_logLum = %.2f\nmax_logLum = %.2f\n", min_logLum, max_logLum);
checkCudaErrors(cudaFree(d_min_logLum));
checkCudaErrors(cudaFree(d_max_logLum));
free(h_min_logLum);
free(h_max_logLum);
}
這裏是主機版本:
void compute_min_max_on_host(const float* const d_logLuminance, size_t numPixels) {
int alloc_size = sizeof(float) * numPixels;
float* h_logLuminance = (float*) malloc(alloc_size);
checkCudaErrors(cudaMemcpy(h_logLuminance, d_logLuminance, alloc_size, cudaMemcpyDeviceToHost));
float host_min_logLum = FLT_MAX;
float host_max_logLum = -FLT_MAX;
printf("HOST ");
for (int i = 0; i < numPixels; i++) {
if (h_logLuminance[i] < host_min_logLum) {
host_min_logLum = h_logLuminance[i];
}
if (h_logLuminance[i] > host_max_logLum) {
host_max_logLum = h_logLuminance[i];
}
}
printf("host_min_logLum = %.2f\nhost_max_logLum = %.2f\n",
host_min_logLum, host_max_logLum);
free(h_logLuminance);
}
爲什麼不直接以不同的數組大小運行代碼並查看相對性能差異是否更改? – talonmies