我很困惑,爲什麼我的紋理版本比我的全局內存的版本更慢,因爲質地版本應該充分利用空間局部性慢紋理內存版本。我正試圖在下面的情況下計算點積。因此,如果一個線程訪問索引i,則其鄰居應訪問i + 1。因此,我們看到空間局部性。爲什麼下面的程序比全局內存版
下面是紋理內存版本:
#include<cuda_runtime.h>
#include<cuda.h>
#include<stdio.h>
#include<stdlib.h>
#define intMin(a,b) ((a<b)?a:b)
//Threads per block
#define TPB 128
//blocks per grid
#define BPG intMin(128, ((n+TPB-1)/TPB))
texture<float> arr1;
texture<float> arr2;
const int n = 4;
__global__ void addVal(float *c){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
//Using shared memory to temporary store results
__shared__ float cache[TPB];
float temp = 0;
while(tid < n){
temp += tex1Dfetch(arr1,tid) * tex1Dfetch(arr2,tid);
tid += gridDim.x * blockDim.x;
}
cache[threadIdx.x] = temp;
__syncthreads();
int i = blockDim.x/2;
while(i !=0){
if(threadIdx.x < i){
cache[threadIdx.x] = cache[threadIdx.x] +cache[threadIdx.x + i] ;
}
__syncthreads();
i = i/2;
}
if(threadIdx.x == 1){
c[blockIdx.x ] = cache[0];
}
}
int main(){
float a[n] , b[n] , c[BPG];
float *deva, *devb, *devc;
int i;
//Filling with random values to test
for(i =0; i< n; i++){
a[i] = i;
b[i] = i*2;
}
printf("Not using constant memory\n");
cudaMalloc((void**)&deva, n * sizeof(float));
cudaMalloc((void**)&devb, n * sizeof(float));
cudaMalloc((void**)&devc, BPG * sizeof(float));
cudaMemcpy(deva, a, n *sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(devb, b, n *sizeof(float), cudaMemcpyHostToDevice);
cudaBindTexture(NULL,arr1, deva,sizeof(float) * n); // note: deva shd be in gpu
cudaBindTexture(NULL,arr2, devb,sizeof(float) * n); // note: deva shd be in gpu
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//Call function to do dot product
addVal<<<BPG, TPB>>>(devc);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float time;
cudaEventElapsedTime(&time,start, stop);
printf("The elapsed time is: %f\n", time);
//copy result back
cudaMemcpy(c, devc, BPG * sizeof(float), cudaMemcpyDeviceToHost);
float sum =0 ;
for (i = 0 ; i< BPG; i++){
sum+=c[i];
}
//display answer
printf("%f\n",sum);
cudaUnbindTexture(arr1);
cudaUnbindTexture(arr2);
cudaFree(devc);
getchar();
return 0;
}
全球內存的版本:
#include<cuda_runtime.h>
#include<cuda.h>
#include<stdio.h>
#include<stdlib.h>
#define intMin(a,b) ((a<b)?a:b)
//Threads per block
#define TPB 128
//blocks per grid
#define BPG intMin(128, ((n+TPB-1)/TPB))
const int n = 4;
__global__ void addVal(float *a, float *b, float *c){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
//Using shared memory to temporary store results
__shared__ float cache[TPB];
float temp = 0;
while(tid < n){
temp += a[tid] * b[tid];
tid += gridDim.x * blockDim.x;
}
cache[threadIdx.x] = temp;
__syncthreads();
int i = blockDim.x/2;
while(i !=0){
if(threadIdx.x < i){
cache[threadIdx.x] = cache[threadIdx.x] +cache[threadIdx.x + i] ;
}
__syncthreads();
i = i/2;
}
if(threadIdx.x == 1){
c[blockIdx.x ] = cache[0];
}
}
int main(){
float a[n] , b[n] , c[BPG];
float *deva, *devb, *devc;
int i;
//Filling with random values to test
for(i =0; i< n; i++){
a[i] = i;
b[i] = i*2;
}
printf("Not using constant memory\n");
cudaMalloc((void**)&deva, n * sizeof(float));
cudaMalloc((void**)&devb, n * sizeof(float));
cudaMalloc((void**)&devc, BPG * sizeof(float));
cudaMemcpy(deva, a, n *sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(devb, b, n *sizeof(float), cudaMemcpyHostToDevice);
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//Call function to do dot product
addVal<<<BPG, TPB>>>(deva, devb, devc);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float time;
cudaEventElapsedTime(&time,start, stop);
printf("The elapsed time is: %f\n", time);
//copy result back
cudaMemcpy(c, devc, BPG * sizeof(float), cudaMemcpyDeviceToHost);
float sum =0 ;
for (i = 0 ; i< BPG; i++){
sum+=c[i];
}
//display answer
printf("%f\n",sum);
getchar();
return 0;
}
有兩個問題:爲什麼在內核工作在浮點時使用整型紋理?爲什麼你的代碼不包含任何錯誤檢查? – talonmies
@Talons:謝謝你的回覆。使紋理浮動幫助。另外,我的cudaBindTexture調用是錯誤的。然而,雖然我的程序現在正在運行,但我很困惑爲什麼我的紋理版本比我的全局內存版本慢,因爲紋理版本應該利用空間局部性。我已經發布了上述兩個程序。請看看 – Programmer
@Talonmies:我沒有任何錯誤處理,因爲直到現在,我不知道任何錯誤處理。如果你能教我一些,我會很高興知道:) – Programmer