我寫了一段代碼來調用內核gpugem3前綴掃描gpugems3中的CUDA示例代碼是否正確?
但我得到的結果是一堆負數而不是前綴掃描。我想知道我的內核調用是錯誤的還是gpugem3代碼有問題?
這裏是我的代碼:
#include <stdio.h>
#include <sys/time.h>
#include <cuda.h>
__global__ void kernel(int *g_odata, int *g_idata, int n, int dim)
{
extern __shared__ int temp[];// allocated on invocation
int thid = threadIdx.x;
int offset = 1;
temp[2*thid] = g_idata[2*thid]; // load input into shared memory
temp[2*thid+1] = g_idata[2*thid+1];
for (int d = n>>1; d > 0; d >>= 1) // build sum in place up the tree
{
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
temp[bi] += g_idata[ai];
}
offset *= 2;
}
if (thid == 0) { temp[n - 1] = 0; } // clear the last element
for (int d = 1; d < n; d *= 2) // traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
int t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] = temp[2*thid]; // write results to device memory
g_odata[2*thid+1] = temp[2*thid+1];
}
void Initialize(int *h_in,int num_items)
{
int j;
for(j=0;j<num_items;j++)
h_in[j]=j;
printf(" input: ");
printf("\n\n");
}
int main(int argc, char** argv)
{
int num_items = 512;
int* h_in = new int[num_items];
// Initialize problem
Initialize(h_in, num_items);
int *d_in = NULL;
cudaMalloc((void**)&d_in, sizeof(int) * num_items);
if(cudaSuccess!= cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice)) fprintf(stderr,"could not copy to gpu");
// Allocate device output array
int *d_out = NULL;
cudaMalloc((void**)&d_out, sizeof(int) * (num_items+1));
kernel<<<1,256,num_items*sizeof(int)>>>(d_out, d_in,num_items, 2);
int* h_out= new int[num_items+1];
if(cudaSuccess !=cudaMemcpy(h_out,d_out,sizeof(int)*(num_items+1),cudaMemcpyDeviceToHost))fprintf(stderr,"could not copy back");
int i;
printf(" \n");
for(i=0;i<num_items;i++)
printf(" ,%d ",h_out[i]);
// Cleanup
if (h_in) delete[] h_in;
if (h_out) delete[] h_out;
if (d_in) cudaFree(d_in);
if (d_out) cudaFree(d_out);
printf("\n\n");
return 0;
}
你的內核使用動態共享內存和你甚至有「在調用上分配的註釋」,但是你不在內核調用上分配共享內存,它應該是內核調用的第3個參數,參見http://docs.nvidia.com/cuda/cuda-c-programming- guide/index.html#execution-configuration –
所以我通過調用kernel <<< 1,256,256 * sizeof(int)>>>(g_odata,g_idata,n)在內核調用上分配共享內存,但仍然得到一些負數作爲我的結果。有誰知道我做錯了什麼? – dibid
請發佈您正在運行到您的問題的*實際*代碼。在發佈的問題中,版本中存在很多嚴重錯誤,但顯然這不是您現在正在運行的內容。 – talonmies