我有一個大小爲3000的數組，其中只包含0和1。我想從第0個索引開始，找到第一個存儲着1的位置。這個數組是在設備（GPU）上計算出來的，我把它傳輸到主機，然後在主機上順序查找該索引。在我的程序中，這個計算需要重複4000次或更多次，我想減少這個過程所花費的時間。有沒有其他方式可以做到這一點？由於這個數組實際上是在GPU上計算的，所以我每次都必須傳輸它。如何減少 cudaMemcpy 的開銷？
// Repeats the measurement 4000 times: copies the 3000-element array from the
// device buffer dev_A into the host buffer A, then scans A on the host for the
// first element equal to 1 and prints its index.
// A and dev_A are declared outside this snippet.
// Returns 0 on success, 1 if a device-to-host copy fails.
int main()
{
    for (int i = 0; i < 4000; i++)
    {
        // A failed copy would leave A holding stale data, so stop rather than scan it.
        cudaError_t err = cudaMemcpy(A, dev_A, sizeof(int) * 3000, cudaMemcpyDeviceToHost);
        if (err != cudaSuccess)
        {
            fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(err));
            return 1;
        }

        // Linear scan; k ends at 3000 when no element equals 1.
        int k;
        for (k = 0; k < 3000; k++)
        {
            if (A[k] == 1)
            {
                break;
            }
        }
        printf("got k is %d", k);
    }
    return 0;
}
完整代碼是這樣的：
#include "cuda.h"
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#define SIZE 2688
#define BLOCKS 14
#define THREADS 192
// Finds the smallest index i in [0, SIZE) with A[i] == INT_MIN and stores it
// in *d_pos.
//
// Launch layout: 1D grid of 1D blocks. Any grid/block size is valid because
// each thread walks the array with a stride equal to the total thread count.
// Precondition: *d_pos must hold a value >= SIZE (e.g. INT_MAX) before the
// launch; it is left unchanged when no element matches.
__global__ void kernel(int *A,int *d_pos)
{
    const int stride = gridDim.x * blockDim.x;
    for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < SIZE; idx += stride)
    {
        if (A[idx] == INT_MIN)
        {
            // Several threads may find matches at different indices, so a plain
            // store would keep whichever write landed last. atomicMin keeps the
            // smallest. Indices visited by this thread only grow, so its first
            // hit is its best candidate and it can stop here.
            atomicMin(d_pos, idx);
            return;
        }
    }
}
// One thread per element: thread i tests A[i] and, when it equals INT_MIN,
// lowers *d_pos to i with atomicMin, so *d_pos ends up as the smallest
// matching index.
//
// Launch layout: 1D grid of 1D blocks with at least SIZE threads in total;
// threads whose index is >= SIZE do nothing.
// Precondition: *d_pos must hold a value >= SIZE (e.g. INT_MAX) before the
// launch; it is left unchanged when no element matches.
__global__ void kernel1(int *A,int *d_pos)
{
    int thread_id = threadIdx.x + blockIdx.x * blockDim.x;
    // Bounds guard: a launch that rounds the thread count up past SIZE must
    // not read beyond the end of A.
    if (thread_id < SIZE && A[thread_id] == INT_MIN)
    {
        atomicMin(d_pos, thread_id);
    }
}
// Aborts with a file/line diagnostic when a CUDA runtime call fails, so that a
// failure is reported where it happens instead of surfacing in a later call.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t cuda_check_err_ = (call);                             \
        if (cuda_check_err_ != cudaSuccess) {                             \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(cuda_check_err_));                 \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// Benchmark driver. Fills a SIZE-element host array with 78, places INT_MIN in
// the last slot, uploads it, and then times three steps with CUDA events:
//   1. kernel1 locating the smallest index that holds INT_MIN,
//   2. copying the whole array back to the host,
//   3. copying only the single int result back to the host.
// Finally prints the index that was found. Returns 0 on success; any CUDA
// failure terminates the process through CUDA_CHECK.
int main()
{
    // INT_MAX is the "nothing found yet" seed that atomicMin lowers.
    int pos = INT_MAX;
    int A[SIZE];
    for (int i = 0; i < SIZE; i++)
    {
        A[i] = 78;
    }
    A[SIZE - 1] = INT_MIN;

    int *d_pos = NULL;
    int *d_A = NULL;
    CUDA_CHECK(cudaMalloc((void**)&d_pos, sizeof(int)));
    CUDA_CHECK(cudaMemcpy(d_pos, &pos, sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMalloc((void**)&d_A, sizeof(int) * SIZE));
    CUDA_CHECK(cudaMemcpy(d_A, A, sizeof(int) * SIZE, cudaMemcpyHostToDevice));

    // One event pair is reused for all three measurements.
    cudaEvent_t start, stop;
    float elapsedMs = 0.0f;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // 1. Kernel execution time.
    CUDA_CHECK(cudaEventRecord(start, 0));
    kernel1<<<BLOCKS,THREADS>>>(d_A, d_pos);
    CUDA_CHECK(cudaEventRecord(stop, 0));
    // A launch does not return a status; an invalid configuration is reported here.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&elapsedMs, start, stop));
    printf("\nTime taken by kernel is %f\n", elapsedMs);
    // Surfaces any asynchronous kernel execution error before the copies are timed.
    CUDA_CHECK(cudaDeviceSynchronize());

    // 2. Time to copy the whole array back to the host.
    CUDA_CHECK(cudaEventRecord(start, 0));
    CUDA_CHECK(cudaMemcpy(A, d_A, sizeof(int) * SIZE, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&elapsedMs, start, stop));
    printf("\ntime taken by copy of an array is %f\n", elapsedMs);

    // 3. Time to copy only the one-int result back to the host.
    CUDA_CHECK(cudaEventRecord(start, 0));
    CUDA_CHECK(cudaMemcpy(&pos, d_pos, sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&elapsedMs, start, stop));
    printf("\ntime taken by copy of a variable is %f\n", elapsedMs);

    // pos already holds the result from the timed copy above; no second copy is needed.
    printf("\nminimum index is %d\n", pos);

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_pos));
    return 0;
}
我該如何減少這段代碼所花費的總時間？還有哪些性能更好的做法可以選擇？
生成設備數組內容的那個內核，其速度相對於複製操作如何？是更快還是更慢？ – talonmies
目前形式的代碼並不合理。因此，我假設在循環中調用 `cudaMemcpy` *之前*，會先啓動一個內核（每次都用新數據填充 `dev_A`）——這是正確的嗎？ – Marco13
是否可以交替更新設備數組？ – hubs