Strange behaviour of a __shared__ variable in CUDA

Consider the following code, which runs in 9 us on a K20:
__global__ void histogram(unsigned char *inputPointer, int *outputPointer)
{
    __shared__ unsigned char localDispersedHistogram[256][32];
    __shared__ unsigned int partHist[256];

    int tx = threadIdx.x;
    int pixelOffset = (blockIdx.x * blockDim.x) + threadIdx.x;

    uint8_t val      = inputPointer[pixelOffset];
    uint8_t data     = val / 8;
    uint8_t position = val % 8;

    /* trying to avoid loops, that's why this code */
    localDispersedHistogram[tx][tx % 32] = 0;
    __syncthreads();

    TURN_ON(localDispersedHistogram[tx][data], position);
    __syncthreads();

    partHist[tx] = 0;

    /* loop body only increments a local counter */
    int k = 0;
    for (int i = 0; i < 256; i++) {
        k++;
    }
}
Now the following code, where the loop accesses a shared variable, takes 72 us:
__global__ void histogram(unsigned char *inputPointer, int *outputPointer)
{
    __shared__ unsigned char localDispersedHistogram[256][32];
    __shared__ unsigned int partHist[256];

    int tx = threadIdx.x;
    int pixelOffset = (blockIdx.x * blockDim.x) + threadIdx.x;

    uint8_t val      = inputPointer[pixelOffset];
    uint8_t data     = val / 8;
    uint8_t position = val % 8;

    /* trying to avoid loops, that's why this code */
    localDispersedHistogram[tx][tx % 32] = 0;
    __syncthreads();

    TURN_ON(localDispersedHistogram[tx][data], position);
    __syncthreads();

    partHist[tx] = 0;

    /* same loop, but now it increments a shared-memory element */
    for (int i = 0; i < 256; i++) {
        partHist[tx]++;
    }
}
Why does the shared-memory access make such a huge difference? I understand that shared accesses are more expensive than register accesses, but in the code above the line

TURN_ON(localDispersedHistogram[tx][data], position);

also touches a shared variable. How can the manipulation of localDispersedHistogram cost so little time, while the partHist accesses alone take a crazy amount of time?

Help.
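For reference, the numbers above are per kernel launch. Here is a minimal sketch of how a single launch can be timed with CUDA events (the event API is standard, but that this matches the harness which produced the 9 us / 72 us figures is my assumption; it uses the device buffers from the full code below):

/* time one launch of histogram with cudaEvent */
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
histogram<<<256, 256>>>(device_inputPointer, device_outputPointer);
cudaEventRecord(stop);
cudaEventSynchronize(stop);

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);   /* elapsed time in milliseconds */
printf("kernel time: %.1f us\n", ms * 1000.0f);

cudaEventDestroy(start);
cudaEventDestroy(stop);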
Update, with my apologies: my kernel configuration is <<<256, 256>>>.

Full code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <conio.h>
#define SIZE_OF_OUTPUT_ARRAY 256 * 256 * 256
#define SIZE_OF_INPUT_ARRAY 256 * 256
#define TURN_ON(DATA,POSITION) (DATA|=(1<<(POSITION)))
__global__ void histogram(unsigned char *inputPointer, int *outputPointer)
{
#if 1
__shared__ unsigned char localDispersedHistogram[ 256 ] [ 32 ];
__shared__ long long partHist[ 256 ] ;
int i ;
int tx = threadIdx.x;
int pixelOffset = (blockIdx.x * blockDim.x) + threadIdx.x;
uint8_t val = inputPointer[ pixelOffset ];
uint8_t data = val/ 8 ;
uint8_t position = val % 8 ;
for (int j = 0 ; j < 32 ; j++) {
localDispersedHistogram[ tx ] [ j ] = 0;
}
__syncthreads();
TURN_ON(localDispersedHistogram [ tx ] [ data ] , position);
__syncthreads();
//partHist[ tx ] = 0;
int sum = 0 ;
for (int i = 0 ; i < 256 ; i ++) {
sum += (localDispersedHistogram [ i ] [ tx/ 8 ] & (1 << (tx % 8))) >> (tx % 8) ;
}
partHist[ tx ] = sum;
atomicAdd(&outputPointer[ tx ] , partHist[ tx ] );
#endif
}
int main()
{
#if 1
    printf(" Code Name, Sad buddy 17 ");

    unsigned char *inputPointer = (unsigned char *)malloc(SIZE_OF_INPUT_ARRAY);
    for (int i = 0; i < SIZE_OF_INPUT_ARRAY; i++) {
        int t = rand() % 256;
        //int t = 0;
        inputPointer[i] = t;
    }

    unsigned char *device_inputPointer;
    int *device_outputPointer;

    cudaMalloc((void **)&device_inputPointer, SIZE_OF_INPUT_ARRAY);
    cudaMemcpy(device_inputPointer, inputPointer, SIZE_OF_INPUT_ARRAY, cudaMemcpyHostToDevice);
    cudaMalloc((void **)&device_outputPointer, 256 * sizeof(int));
    cudaMemset(device_outputPointer, 0, 256 * sizeof(int));

    histogram<<<256, 256>>>(device_inputPointer, device_outputPointer);

    unsigned int *output = (unsigned int *)malloc(256 * sizeof(int));
    cudaMemcpy(output, device_outputPointer, 256 * sizeof(int), cudaMemcpyDeviceToHost);

    unsigned int CPUHist[256];
    unsigned int GPUHist[256];
    for (int i = 0; i < 256; i++) {
        CPUHist[i] = 0;
        GPUHist[i] = 0;
        //printf(" %d ", inputPointer[i]);
    }
    for (int i = 0; i < SIZE_OF_INPUT_ARRAY; i++) {
        CPUHist[inputPointer[i]]++;
    }

    int flag = 0;
    for (int i = 0; i < 256; i++) {
        printf(" %d GPUHist %d CPUHist\n", output[i], CPUHist[i]);
        if (output[i] != CPUHist[i]) {
            flag = 1;
        }
    }

    printf("\n\n======================\n\n");
    if (flag) {
        printf("TEST CASE FAIL ");
    }
    else {
        printf("TEST CASE Pass");
    }
    printf("\n\n======================\n\n");

    cudaDeviceReset();
#endif
    getch();
    return 0;
}
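A side note while chasing these timings: the launch above is never checked for errors, so a kernel that fails to launch would also look suspiciously fast. A minimal check, which is my addition rather than part of the original program, could look like this:

histogram<<<256, 256>>>(device_inputPointer, device_outputPointer);

cudaError_t err = cudaGetLastError();     /* configuration / launch errors */
if (err != cudaSuccess) {
    printf("launch error: %s\n", cudaGetErrorString(err));
}
err = cudaDeviceSynchronize();            /* errors raised while the kernel ran */
if (err != cudaSuccess) {
    printf("kernel error: %s\n", cudaGetErrorString(err));
}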
How did you set up your thread blocks? Maybe it is a bank-conflict problem? – hubs
Did you take the timing only once, or did you repeat the same benchmark several times and average the results? –
What is TURN_ON and what does it do? – talonmies