我想計算 '出來=阿爾法* PX +的β*吡啶', 'PX' 和 'PY' 是數組*CUDA循環展開添加
我有一個簡單的內核:
__global__
void saxpyGPU2(float *out, const float *px, const float *py, size_t N, float alpha,float beta)
{
size_t i = blockDim.x*blockIdx.x + threadIdx.x;
while (i < N)
{
out[i] = alpha * px[i] + beta * py[i];
i += blockDim.x*gridDim.x;
}
}
它的工作原理,所以我想循環展開。
在CUDA的手冊中的代碼是:
template<const int n>
__device__
void saxpy_unrolled(float *out, const float *px, const float *py, size_t N, float alpha,float beta)
{
float x[n], y[n];
size_t i;
for (i = n*blockIdx.x*blockDim.x+threadIdx.x; i < N-n*blockDim.x*gridDim.x; i += n*blockDim.x*gridDim.x) {
for (int j = 0; j < n; j++) {
size_t index = i+j*blockDim.x;
x[j] = px[index];
y[j] = py[index];
}
for (int j = 0; j < n; j++) {
size_t index = i+j*blockDim.x;
out[index] = alpha*x[j]+beta* y[j];
}
}
// to avoid the (index<N) conditional in the inner loop,
// we left off some work at the end
for (int j = 0; j < n; j++) {
for (int j = 0; j < n; j++) {
size_t index = i+j*blockDim.x;
if (index<N) {
x[j] = px[index];
y[j] = py[index];
}
}
for (int j = 0; j < n; j++) {
size_t index = i+j*blockDim.x;
if (index<N) out[index] = alpha*x[j]+beta* y[j];
}
}
}
__global__
void saxpyGPU(float *out, const float *px, const float *py, size_t N, float alpha,float beta)
{
saxpy_unrolled<4>(out, px, py, N, alpha ,beta);
}
我不在第二分支理解當我> N-正* blockDim.x * gridDim.x。爲什麼要使用外循環
for (int j = 0; j < n; j++) { for (int j = 0; j < n; j++)....}
我測試這兩個內核,第一個是OK的,但第二個我從書上覆制不正確。
我初始化了兩個數組while(i<1024) a[i] = i; b[i] = 10*i;i++
,我想用上面的兩個內核計算c = alpha * a + beta * b,但是對於c中的所有元素,循環展開內核的結果是4.3e8。
這是我的測試代碼:
int main(){
int arraySize = 1024;
float* a =new float[arraySize];
float* b =new float[arraySize];
float* c =new float[arraySize];
for (int i =0;i<arraySize;i++)
{
a[i] = 1.0* i;
b[i] = 10.0*i;
c[i] = 0.0;
}
float* d_a;
float* d_b;
float* d_c;
cudaMalloc((void**)&d_a,sizeof(float)*arraySize);
cudaMemcpy(d_a,a,sizeof(float)*arraySize,cudaMemcpyHostToDevice);
cudaMalloc((void**)&d_b,sizeof(float)*arraySize);
cudaMemcpy(d_b,b,sizeof(float)*arraySize,cudaMemcpyHostToDevice);
cudaMalloc((void**)&d_c,sizeof(float)*arraySize);
for (int i=0;i<arraySize;i++)
{
c[i] = a[i] + b[i];
}
dim3 block_size(256,1,1);
dim3 grid_size((arraySize -1)/block_size.x+1,1,1);
float alpha = 1.0;
float beta = 1.0;
bool flag = true;
if(flag)
{
saxpyGPU<<<grid_size,block_size>>>(d_c,d_a,d_b,arraySize,alpha,beta);
float* temp = new float[arraySize];
cudaMemcpy(temp,d_c,arraySize*sizeof(float),cudaMemcpyDeviceToHost);
for (int i = 0;i<arraySize;i++)
{
cout<<(temp[i] - c[i])<<",";
}
}
else
{
saxpyGPU2<<<grid_size,block_size>>>(d_c,d_a,d_b,arraySize,alpha,beta);
cudaMemcpy(temp,d_c,arraySize*sizeof(float),cudaMemcpyDeviceToHost);
for (int i = 0;i<arraySize;i++)
{
cout<<(temp[i] - c[i])<<",";
}
這兩個內核表現出不同的結果
你的問題是什麼? – talonmies
@talonmies,我編輯了問題 – Zziggurats
您添加的測試代碼不完整,無法編譯。 CUDA手冊中的代碼沒有任何問題。它工作正常。 – talonmies