talonmies has already answered your question, so I would just like to share some code below, inspired by Part I of the presentation by V. Volkov mentioned in the answer above.
Here is the code:
#include<stdio.h>
#define N_ITERATIONS 8192
//#define DEBUG
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/********************************************************/
/* KERNEL0 - NO INSTRUCTION LEVEL PARALLELISM (ILP = 0) */
/********************************************************/
__global__ void kernel0(int *d_a, int *d_b, int *d_c, unsigned int N) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x ;
if (tid < N) {
int a = d_a[tid];
int b = d_b[tid];
int c = d_c[tid];
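// Dependent multiply-add chain: each iteration needs the result of the previous one, so a single thread exposes no instruction level parallelism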
for(unsigned int i = 0; i < N_ITERATIONS; i++) {
a = a * b + c;
}
d_a[tid] = a;
}
}
/*****************************************************/
/* KERNEL1 - INSTRUCTION LEVEL PARALLELISM (ILP = 2) */
/*****************************************************/
__global__ void kernel1(int *d_a, int *d_b, int *d_c, unsigned int N) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N/2) {
int a1 = d_a[tid];
int b1 = d_b[tid];
int c1 = d_c[tid];
int a2 = d_a[tid+N/2];
int b2 = d_b[tid+N/2];
int c2 = d_c[tid+N/2];
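// Two independent multiply-add chains per thread (ILP = 2)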
for(unsigned int i = 0; i < N_ITERATIONS; i++) {
a1 = a1 * b1 + c1;
a2 = a2 * b2 + c2;
}
d_a[tid] = a1;
d_a[tid+N/2] = a2;
}
}
/*****************************************************/
/* KERNEL2 - INSTRUCTION LEVEL PARALLELISM (ILP = 4) */
/*****************************************************/
__global__ void kernel2(int *d_a, int *d_b, int *d_c, unsigned int N) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N/4) {
int a1 = d_a[tid];
int b1 = d_b[tid];
int c1 = d_c[tid];
int a2 = d_a[tid+N/4];
int b2 = d_b[tid+N/4];
int c2 = d_c[tid+N/4];
int a3 = d_a[tid+N/2];
int b3 = d_b[tid+N/2];
int c3 = d_c[tid+N/2];
int a4 = d_a[tid+3*N/4];
int b4 = d_b[tid+3*N/4];
int c4 = d_c[tid+3*N/4];
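// Four independent multiply-add chains per thread (ILP = 4)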
for(unsigned int i = 0; i < N_ITERATIONS; i++) {
a1 = a1 * b1 + c1;
a2 = a2 * b2 + c2;
a3 = a3 * b3 + c3;
a4 = a4 * b4 + c4;
}
d_a[tid] = a1;
d_a[tid+N/4] = a2;
d_a[tid+N/2] = a3;
d_a[tid+3*N/4] = a4;
}
}
/********/
/* MAIN */
/********/
int main() {
const int N = 1024;
int *h_a = (int*)malloc(N*sizeof(int));
int *h_a_result_host = (int*)malloc(N*sizeof(int));
int *h_a_result_device = (int*)malloc(N*sizeof(int));
int *h_b = (int*)malloc(N*sizeof(int));
int *h_c = (int*)malloc(N*sizeof(int));
for (int i=0; i<N; i++) {
h_a[i] = 2;
h_b[i] = 1;
h_c[i] = 2;
h_a_result_host[i] = h_a[i];
for(unsigned int k = 0; k < N_ITERATIONS; k++) {
h_a_result_host[i] = h_a_result_host[i] * h_b[i] + h_c[i];
}
}
int *d_a; gpuErrchk(cudaMalloc((void**)&d_a, N*sizeof(int)));
int *d_b; gpuErrchk(cudaMalloc((void**)&d_b, N*sizeof(int)));
int *d_c; gpuErrchk(cudaMalloc((void**)&d_c, N*sizeof(int)));
gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_b, h_b, N*sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_c, h_c, N*sizeof(int), cudaMemcpyHostToDevice));
// --- Creating events for timing
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
/***********/
/* KERNEL0 */
/***********/
cudaEventRecord(start, 0);
kernel0<<<1, N>>>(d_a, d_b, d_c, N);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
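// --- Each loop iteration (one integer multiply-add) is counted as a single operation, so "GFlops" here really means billions of operations per second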
printf("GFlops = %f\n", (1.e-6)*(float)(N*N_ITERATIONS)/time);
gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(int), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) if(h_a_result_device[i] != h_a_result_host[i]) { printf("Error at i=%i! Host = %i; Device = %i\n", i, h_a_result_host[i], h_a_result_device[i]); return 1; }
/***********/
/* KERNEL1 */
/***********/
gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(int), cudaMemcpyHostToDevice));
cudaEventRecord(start, 0);
kernel1<<<1, N/2>>>(d_a, d_b, d_c, N);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("GFlops = %f\n", (1.e-6)*(float)(N*N_ITERATIONS)/time);
gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(int), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) if(h_a_result_device[i] != h_a_result_host[i]) { printf("Error at i=%i! Host = %i; Device = %i\n", i, h_a_result_host[i], h_a_result_device[i]); return 1; }
/***********/
/* KERNEL2 */
/***********/
gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(int), cudaMemcpyHostToDevice));
cudaEventRecord(start, 0);
kernel2<<<1, N/4>>>(d_a, d_b, d_c, N);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("GFlops = %f\n", (1.e-6)*(float)(N*N_ITERATIONS)/time);
gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(int), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) if(h_a_result_device[i] != h_a_result_host[i]) { printf("Error at i=%i! Host = %i; Device = %i\n", i, h_a_result_host[i], h_a_result_device[i]); return 1; }
cudaDeviceReset();
return 0;
}
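For reference, the listing above compiles as a single .cu file with nvcc; for example (the file name is arbitrary, and -arch=sm_21 is used here only because the GT540M below is a compute capability 2.1 device):
nvcc -O3 -arch=sm_21 ilp_test.cu -o ilp_test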
On my GeForce GT540M, the result is
kernel0 GFlops = 21.069281 Occupancy = 66%
kernel1 GFlops = 21.183354 Occupancy = 33%
kernel2 GFlops = 21.224517 Occupancy = 16.7%
which means that kernels with lower occupancy can still achieve high performance, provided that Instruction Level Parallelism (ILP) is exploited.
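A rough way to see why this happens (the latency figure below is only an assumed ballpark for a Fermi-class GPU, used for illustration): by Little's law, the amount of independent work that must be in flight to keep an arithmetic pipeline busy is roughly latency x throughput, e.g. about 20 cycles x 1 instruction per cycle ≈ 20 independent multiply-adds per scheduler. Those independent instructions can come either from many resident warps that each issue one dependent multiply-add (high occupancy, as in kernel0), or from fewer warps that each expose 2 or 4 independent multiply-adds per loop iteration (as in kernel1 and kernel2), which is why the measured throughput stays essentially flat while occupancy drops from 66% to 16.7%.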
Nice answer. Occupancy is only a serious issue for hiding global memory access latency; for compute-bound threads, a few active threads per SP should be enough. Is that your understanding? – Patrick87
I don't really think so, Patrick. That is not true for all kinds of kernels. For compute-bound kernels, higher occupancy may still improve performance. How many active warps are needed to hide arithmetic latency is not that simple; it depends on the type of the operations and on how they are interleaved with each other. – Zk1001