0
我目前工作的一個網格的插值多線程,並具有多線程方面的一些問題。該代碼假設讀取由2x2矩陣表示的映射,然後對其進行插值以將點數增加100倍。在內核中使用for循環時,它的效果很好。從循環更改爲內核
插值之前:http://bildr.no/view/OWV1UDRO
插值後:http://bildr.no/view/eTlmNmpo
當我試圖改變一個與線程循環,它產生了一些怪異的結果。在數字代替,它與-1填充生成的矩陣。#QNAN
這裏是我工作的代碼與for循環內核
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <fstream>
#include "cuda.h"
using namespace std;
float Z[41][41];
// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call) \
do { \
cudaError_t err = call; \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
} while (0)
// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR() \
do { \
/* Check synchronous errors, i.e. pre-launch */ \
cudaError_t err = cudaGetLastError(); \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
/* Check asynchronous errors, i.e. kernel failed (ULF) */ \
err = cudaThreadSynchronize(); \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
} while (0)
texture<float, 2, cudaReadModeElementType> tex;
__global__ void kernel (int m, int n, float *f, float numberOfInterpolationsPerSquare)
{
int k = sqrt(numberOfInterpolationsPerSquare);
for (float i=0; i<n*k; i++)
{
for (float j=0; j<m*k; j++)
{
f[(int)(j+(m*k*i))] = tex2D (tex, j/k+0.5f, i/k+0.5f);
}
}
}
int main (void)
{
// Start timer
clock_t tStart = clock();
// Size of map
int n=41;
int m=41;
int g = 0;
float numberOfInterpolationsPerSquare = 100;
float numberOfElements = pow(sqrt(numberOfInterpolationsPerSquare)*n,2);
size_t pitch, tex_ofs;
float *f;
float *r;
float *map_d = 0;
// Build read-Streams
ifstream map;
//Create and open a txt file for MATLAB
ofstream file;
// Open data
map.open("Map.txt", ios_base::in);
file.open("Bilinear.txt");
// Store the map in a 2D array
for (int i=0; i<n; i++)
{
for (int j=0; j<m; j++)
{
map >> Z[i][j];
}
}
// Allocate memory on host and device
CUDA_SAFE_CALL(cudaMallocPitch((void**)&map_d,&pitch,n*sizeof(*map_d),m));
CUDA_SAFE_CALL(cudaMalloc((void**)&f, numberOfElements*sizeof(float)));
r = (float*)malloc(numberOfElements*sizeof(float));
// Copy map from host to device
CUDA_SAFE_CALL(cudaMemcpy2D(map_d, pitch, Z, n*sizeof(Z[0][0]), n*sizeof(Z[0][0]),m,cudaMemcpyHostToDevice));
// Set texture mode to bilinear interpolation
tex.normalized = false;
tex.filterMode = cudaFilterModeLinear;
// Bind the map to texture
CUDA_SAFE_CALL (cudaBindTexture2D (&tex_ofs, &tex, map_d, &tex.channelDesc, n, m, pitch));
// Checking for offset
if (tex_ofs !=0) {
printf ("tex_ofs = %zu\n", tex_ofs);
return EXIT_FAILURE;
}
// Launch Kernel
kernel <<< 1,1 >>> (m, n, f, numberOfInterpolationsPerSquare);
CHECK_LAUNCH_ERROR();
CUDA_SAFE_CALL (cudaDeviceSynchronize());
// Copy result from device to host
cudaMemcpy(r, f, numberOfElements*sizeof(float), cudaMemcpyDeviceToHost);
// Write results to file
for(int h=0;h<numberOfElements;h++)
{
if(g==sqrt(numberOfElements))
{
file << endl;
g=0;
}
file << r[h] << " ";
g++;
}
// Free memory
CUDA_SAFE_CALL (cudaUnbindTexture (tex));
CUDA_SAFE_CALL (cudaFree (map_d));
CUDA_SAFE_CALL (cudaFree (f));
free(r);
// Print out execution time
printf("Time taken: %.3fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
return EXIT_SUCCESS;
}
這裏的多線程內核,不工作
__global__ void kernel (int m, int n, float *f, float numberOfInterpolationsPerSquare)
{
int k = sqrt(numberOfInterpolationsPerSquare);
int i= blockIdx.x * blockDim.x + threadIdx.x;
int j= blockIdx.y * blockDim.y + threadIdx.y;
if(i>=n*k || j>=m*k)
return;
f[(int)(j+(m*k*i))] = tex2D (tex, j/k+0.5f, i/k+0.5f);
}
有誰知道爲什麼多線程版本不起作用?
問候
鬆德雷
如果有人想嘗試的代碼,這裏的地圖:http://codepad.org/fe8aWGMt – user2594166
你是如何啓動多線程內核?另外,在第二個內核中,'i'和'j'是'int'而不是'float'。所以'tex2D'中的'j/k'和'i/k'將會導致整數除法。考慮將'k'聲明爲'float'。 – sgarizvi
謝謝你的回覆!我改變了i和j以浮動,現在它產生了一些數字,但仍然是-1。#QNAN。 內核被稱爲是這樣的: //查找塊 INT來確定nthreads = 1024的號碼; int blocksize = 512; INT Nblocks屬=小區((N * M * numberOfInterpolationsPerSquare)/來確定nthreads); //啓動內核 內核<<< Nblocks屬,塊大小>>>(M,N,F,numberOfInterpolationsPerSquare); – user2594166