0
我被困在這一整天。 以下程序將給出「超出範圍的共享或本地地址」錯誤。 評論這條線將解決這個問題。cuda奇怪的內存訪問錯誤共享內存
hist[tidx] = 0;
但是,我不認爲分配大小爲88 * 4字節的共享內存將是任何問題。
註釋掉此行也將解決這個問題
NVMatrix Acts(acts, true);
看來,如果我分配在全局存儲器的行爲矩陣,共享內存的行爲異常。任何想法?
int main(int argc, char ** argv)
{
float * act = new float[2985984];
for (int i=0; i<2985984; i++)
act[i] = 0.0001*(i+1);
Matrix acts(act, 23328, 128); // use act as the data to initialize the 23328x128, matrix in cpu
NVMatrix Acts(acts, true); // create a Acts Matrix which uses GPU global memory, and copies the value from CPU to GPU
// If comment out this line, there is no problem to execute the program
float cost = Calculate();
}
float Calculate()
{
dim3 blocks(4,96);
dim3 threads(32,8);
cudaFuncSetCacheConfig(createShare<8, 32>, cudaFuncCachePreferShared);
int numLabels = 88;
createShare<8, 32><<<blocks, threads, numLabels>>>(numLabels);
return 0;
}
template <int B_Y, int B_X>
__global__ void createShare(int numLabels)
{
extern __shared__ float hist[];
int tidx = threadIdx.y * B_X + threadIdx.x;
if (tidx<numLabels) {
printf("block %d %d %d\n", blockIdx.x, blockIdx.y, tidx);
hist[tidx] = 0;
}
}