-1
我正在編寫我的第一個內核並設置一個簡單的程序來將線索索引分配給數組中的該位置,它適用於第一個1/4數組,即它的大小爲100時停止在25和大小爲50的時候停止。然後剩下的都是零。我進行了測試,以確保blockdim正確地出來,而且,這個程序非常簡單,並且緊跟着例子,我不知道它會有什麼問題。CUDA線索索引爲非常簡單的內核VS 2015更新返回意外的結果3
計算機是將我的線程分成四個塊還是什麼?
__global__ void index_initialize(int* data)
{
// set data at index to index
data[threadIdx.x] = threadIdx.x;
}
void zero_initialize(int* data, int size)
{
for (int i = 0; i < size; i++)
{
data[i] = 0;
}
}
void print_array(int* data, int size)
{
for (int i = 0; i < size; i++)
{
std::cout << data[i] << " ";
if (i % 20 == 0 && i > 0) std::cout << '\n';
}
}
int main()
{
GpuTimer timer;
// size
int size = 100;
// host array
int* host = new int[size];
// device array
int* device = new int[size];
// zero out device and host
zero_initialize(host, size);
zero_initialize(device, size);
// allocate size ints on device
cudaMalloc(&device, size * sizeof(int));
// call kernel on one thread block of size
index_initialize<<<1, size>>> (device);
// synchronize
cudaDeviceSynchronize();
// copy device to host
cudaMemcpy(host, device, size, cudaMemcpyDeviceToHost);
// reset device
cudaDeviceReset();
// print out host
print_array(host, size);
// free memory
cudaFree(device);
}
該程序做同樣的事情,它停止在25
#include <iostream>
#include <typeinfo>
#include "cs344\Lesson Code Snippets\Lesson 2 Code Snippets\gputimer.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
__global__ void add_arrays(int* A, int* B, int* C)
{
C[threadIdx.x] = A[threadIdx.x] + B[threadIdx.x];
}
__global__ void index_initialize(int* data)
{
// set data at index to index
data[threadIdx.x] = threadIdx.x;
}
void zero_initialize(int* data, int size)
{
for (int i = 0; i < size; i++)
{
data[i] = 0;
}
}
void print_array(int* data, int size)
{
for (int i = 0; i < size; i++)
{
std::cout << data[i] << " ";
if (i % 20 == 0 && i > 0) std::cout << '\n';
}
}
int main()
{
GpuTimer timer;
// size
int size = 100;
// host arrays
int* hostA = new int[size];
int* hostB = new int[size];
int* hostC = new int[size];
// device arrays
int* deviceA = new int[size];
int* deviceB = new int[size];
int* deviceC = new int[size];
// zero out host
zero_initialize(hostA, size);
zero_initialize(hostB, size);
zero_initialize(hostC, size);
// set to index
for (int i = 0; i < size; i++)
{
hostB[i] = i;
}
// allocate size ints on device
cudaMalloc(&deviceA, size * sizeof(int));
cudaMalloc(&deviceB, size * sizeof(int));
cudaMalloc(&deviceC, size * sizeof(int));
cudaMemcpy(deviceA, hostA, size, cudaMemcpyHostToDevice);
cudaMemcpy(deviceB, hostB, size, cudaMemcpyHostToDevice);
// call kernel on one thread block of size
//index_initialize<<<1, size>>> (device);
// call add kernel
add_arrays<<< 1, size >>> (deviceA, deviceB, deviceC);
// synchronize
cudaDeviceSynchronize();
// copy device to host
cudaMemcpy(hostC, deviceC, size, cudaMemcpyDeviceToHost);
// reset device
cudaDeviceReset();
// print out host
print_array(hostC, size);
// free memory
cudaFree(deviceA);
cudaFree(deviceB);
cudaFree(deviceC);
}
'cudaMemcpy(主機,設備,大小,cudaMemcpyDeviceToHost);' - 你只能複製四分之一的數組。 [SO]不是一個免費的小錯誤發現服務,請不要把它當作一個。 – talonmies
謝謝,我意識到我在做什麼 –