2013-05-18 28 views
0
using namespace std; 
#include <iostream> 
#include <stdio.h> 
#include <stdlib.h> 

const int threadsPerBlock = 256; 
const int N = 40000; 

void generateArray(double *data, int count) { 
    for (int i = 0; i < count; i++) 
     data[i] = rand()/((rand() + rand())/2.0 + 1); 
} 

double maxCPU(double *arr, int count) { 

    int max = arr[0]; 

    for (int i = 0; i < count; i++) 
     if (arr[i] > max) 
      max = arr[i]; 
    return max; 
} 

__global__ void MaxGPU(double *a, int count, double *result){ 

    __shared__ double cache[threadsPerBlock]; 

    int tid = threadIdx.x + blockIdx.x * blockDim.x; 
    int cacheIndex = threadIdx.x; 

    int temp = a[tid]; 
    tid+= blockDim.x * gridDim.x; 

    while(tid < count){ 
     if(a[tid] > temp) 
      temp = a[tid]; 
     tid+= blockDim.x * gridDim.x; 
    } 

    cache[cacheIndex] = temp; 

    __syncthreads(); 

    int i = blockDim.x/2; 
    while(i!=0){ 
     if(cacheIndex < i) 
      if(cache[cacheIndex + i] > cache[cacheIndex]) 
       cache[cacheIndex] = cache[cacheIndex + i]; 
     __syncthreads(); 
     i/=2; 
    } 

    if(cacheIndex == 0) 
     result[blockIdx.x] = cache[0]; 
} 

int main(void) { 
    double *arr = new double[N], resultGPU; 
    generateArray(arr, N); 
    double *devA, *dev_partial_result; 

    double resultCPU = maxCPU(arr, N); 

    cudaMalloc((void**)&devA, N * sizeof(double)); 
    cudaMalloc((void**)&dev_partial_result, 512 * sizeof(double)); 

    cudaMemcpy(devA, arr, N * sizeof(double), cudaMemcpyHostToDevice); 

    MaxGPU<<<1, 256>>>(devA, N, dev_partial_result); 

    cudaMemcpy(&resultGPU, dev_partial_result,sizeof(double), cudaMemcpyDeviceToHost); 

    cout << "Max CPU: " << resultCPU << endl; 
    cout << "Max GPU: " << resultGPU << endl; 

    cudaFree(devA); 
    cudaFree(dev_partial_result); 

    delete [] arr; 
    return 0; 
} 

我寫了上面的代碼。我不是爲什麼,但它只適用於一個街區。它不適用於256或512塊。爲什麼?怎麼了?最大GPU內核功能只能使用一個塊

+0

你可能已經錯過的是,這個代碼產生每塊*一個最大值*,所以你需要分配並複製足夠的內存來保存來自您啓動的每個塊的結果。 – talonmies

回答

1

嘗試改變

double resultGPU; to 
double* resultGPU = new double[blocks_count]; 

cudaMemcpy(&resultGPU, dev_partial_result,sizeof(double), cudaMemcpyDeviceToHost); to 
cudaMemcpy(resultGPU, dev_partial_result,blocks_count*sizeof(double), cudaMemcpyDeviceToHost);