我正在編寫我的第一個內核並設置一個簡單的程序來將線索索引分配給數組中的該位置,它適用於第一個1/4數組,即它的大小爲100時停止在25和大小爲50的時候停止。然後剩下的都是零。我進行了測試,以確保blockdim正確地出來,而且,這個程序非常簡單,並且緊跟着例子,我不知道它會有什麼問題。CUDA線索索引爲非常簡單的內核VS 2015更新返回意外的結果3
__global__ void index_initialize(int* data)
// set data at index to index
data[threadIdx.x] = threadIdx.x;
void zero_initialize(int* data, int size)
for (int i = 0; i < size; i++)
data[i] = 0;
void print_array(int* data, int size)
for (int i = 0; i < size; i++)
std::cout << data[i] << " ";
if (i % 20 == 0 && i > 0) std::cout << '\n';
int main()
GpuTimer timer;
// size
int size = 100;
// host array
int* host = new int[size];
// device array
int* device = new int[size];
// zero out device and host
zero_initialize(host, size);
zero_initialize(device, size);
// allocate size ints on device
cudaMalloc(&device, size * sizeof(int));
// call kernel on one thread block of size
index_initialize<<<1, size>>> (device);
// synchronize
// copy device to host
cudaMemcpy(host, device, size, cudaMemcpyDeviceToHost);
// reset device
// print out host
print_array(host, size);
// free memory
#include <iostream>
#include <typeinfo>
#include "cs344\Lesson Code Snippets\Lesson 2 Code Snippets\gputimer.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
__global__ void add_arrays(int* A, int* B, int* C)
C[threadIdx.x] = A[threadIdx.x] + B[threadIdx.x];
__global__ void index_initialize(int* data)
// set data at index to index
data[threadIdx.x] = threadIdx.x;
void zero_initialize(int* data, int size)
for (int i = 0; i < size; i++)
data[i] = 0;
void print_array(int* data, int size)
for (int i = 0; i < size; i++)
std::cout << data[i] << " ";
if (i % 20 == 0 && i > 0) std::cout << '\n';
int main()
GpuTimer timer;
// size
int size = 100;
// host arrays
int* hostA = new int[size];
int* hostB = new int[size];
int* hostC = new int[size];
// device arrays
int* deviceA = new int[size];
int* deviceB = new int[size];
int* deviceC = new int[size];
// zero out host
zero_initialize(hostA, size);
zero_initialize(hostB, size);
zero_initialize(hostC, size);
// set to index
for (int i = 0; i < size; i++)
hostB[i] = i;
// allocate size ints on device
cudaMalloc(&deviceA, size * sizeof(int));
cudaMalloc(&deviceB, size * sizeof(int));
cudaMalloc(&deviceC, size * sizeof(int));
cudaMemcpy(deviceA, hostA, size, cudaMemcpyHostToDevice);
cudaMemcpy(deviceB, hostB, size, cudaMemcpyHostToDevice);
// call kernel on one thread block of size
//index_initialize<<<1, size>>> (device);
// call add kernel
add_arrays<<< 1, size >>> (deviceA, deviceB, deviceC);
// synchronize
// copy device to host
cudaMemcpy(hostC, deviceC, size, cudaMemcpyDeviceToHost);
// reset device
// print out host
print_array(hostC, size);
// free memory
'cudaMemcpy(主機,設備,大小,cudaMemcpyDeviceToHost);' - 你只能複製四分之一的數組。 [SO]不是一個免費的小錯誤發現服務,請不要把它當作一個。 – talonmies
謝謝,我意識到我在做什麼 –