在GPU上爲動態結構數組分配內存

我在把結構數組傳遞給GPU內核時遇到了問題。我參考了這個主題——「cudaMemcpy segmentation fault」——並寫了類似下面的代碼：

#include <stdio.h> 
#include <stdlib.h> 

// Element type handed to the kernel. Each element refers to its own,
// separately allocated char buffer through `array`; the kernel dereferences
// this pointer, so on the device copy it has to refer to device memory.
struct Test { 
    char *array; 
}; 

// Debug kernel: prints the first five characters referenced by element 0 of
// dev_test, one per line. No thread index is used, so every launched thread
// prints the same lines; the host launches it with a single thread.
__global__ void kernel(Test *dev_test) {
    const char *chars = dev_test[0].array;
    int idx = 0;
    while (idx < 5) {
        printf("Kernel[0][i]: %c \n", chars[idx]);
        ++idx;
    }
}

// Question code (intentionally left as posted): tries to mirror an array of
// n Test elements, each with its own char buffer, on the device and print
// element 0 from a kernel. The NOTE(review) comments mark where it goes wrong.
int main(void) { 

    int n = 4, size = 5; 
    Test *dev_test, *test; 

    // Host side: n structs, each pointing at a host buffer of `size` chars.
    test = (Test*)malloc(sizeof(Test)*n); 
    for(int i = 0; i < n; i++) 
     test[i].array = (char*)malloc(size * sizeof(char)); 

    // Fill every host buffer with 'a'..'e'.
    for(int i=0; i < n; i++) { 
     char temp[] = { 'a', 'b', 'c', 'd' , 'e' }; 
     memcpy(test[i].array, temp, size * sizeof(char)); 
    } 

    // Device array of structs; this copy transfers the structs as they are,
    // i.e. with `array` still holding host addresses.
    cudaMalloc((void**)&dev_test, n * sizeof(Test)); 
    cudaMemcpy(dev_test, test, n * sizeof(Test), cudaMemcpyHostToDevice); 
    for(int i=0; i < n; i++) { 
     // NOTE(review): this stores the new device address in test[i].array,
     // overwriting the host pointer set above, so the host buffer holding
     // 'a'..'e' is no longer reachable (and is leaked).
     cudaMalloc((void**)&(test[i].array), size * sizeof(char)); 
     // NOTE(review): this copies `size` (5) bytes of the pointer variable
     // itself into dev_test[i].array rather than sizeof(char *) bytes, and
     // the character data is never copied into the device buffer at all.
     cudaMemcpy(&(dev_test[i].array), &(test[i].array), size * sizeof(char), cudaMemcpyHostToDevice); 
    } 

    // Single-thread launch; the synchronize waits for the kernel so its
    // printf output is flushed before the program exits.
    kernel<<<1, 1>>>(dev_test); 
    cudaDeviceSynchronize(); 

    // memory free 
    return 0; 
}

沒有報錯，但內核中顯示的值不正確。我做錯了什麼？提前感謝您的幫助。

來源

2015-05-06 Bakus123

爲什麼是 `cudaMalloc((void**)&(test[i].array), size * sizeof(char));` 而不是 `cudaMalloc((void**)&(dev_test[i].array), size * sizeof(char));`？另外，它應該是 `cudaMemcpy(dev_test[i].array, test[i].array, size * sizeof(char), cudaMemcpyHostToDevice);`。 – francis

@francis，它不起作用（分段錯誤（核心轉儲））。在GPU上，我們不能以標準方式分配內存。 – Bakus123

另一個友好的建議：除非您已經理解提問者面臨的問題，否則不要只從代碼中挑出片段來評論……如果我的建議無效，很抱歉。我的建議是爲 `dev_test[i].array` 分配內存，而不是 `test[i].array = (char*)malloc(size * sizeof(char));`。 – francis

這一行分配了一個指向主機內存的新指針：

test[i].array = (char*)malloc(size * sizeof(char));

這一行把數據複製到主機內存中的該區域：

memcpy(test[i].array, temp, size * sizeof(char));

這一行用一個指向設備內存的新指針覆蓋了先前分配的、指向主機內存的指針（來自上面的步驟1）：
```
cudaMalloc((void**)&(test[i].array), size * sizeof(char)); 
```

在步驟3之後，步驟2中設置的數據就完全丟失了，再也無法以任何方式訪問。請參照您所鏈接的問題/答案中的步驟3和步驟4：

3.創建主機上的一個單獨的INT指針，姑且稱之爲myhostptr

4.使用cudaMalloc在設備上爲myhostptr分配int存儲空間

你並沒有這樣做。你沒有創建一個單獨的指針，而是重用（擦除、覆蓋）了一個現有指針，而該指針指向的正是主機上你所關心的數據。這個問題/答案（您所鏈接的答案中也鏈接了它）幾乎完整地給出了您需要遵循的步驟，並附有代碼。

下面是您的代碼的修改版本，它正確實現了您沒有按照所鏈接的問題/答案正確實現的步驟3和4（以及5）：（請參閱代碼中標出步驟3、4、5的註釋）

$ cat t755.cu 
#include <stdio.h> 
#include <stdlib.h> 

// Element type handed to the kernel. `array` refers to a separately
// allocated char buffer; in the device copy of the struct it must hold a
// device address, because the kernel dereferences it.
struct Test { 
    char *array; 
}; 

// Debug kernel: prints the first five characters referenced by element 0 of
// dev_test, one per line. No thread index is used, so every launched thread
// prints the same lines; the host launches it with a single thread.
__global__ void kernel(Test *dev_test) {
    const char *chars = dev_test[0].array;
    int idx = 0;
    while (idx < 5) {
        printf("Kernel[0][i]: %c \n", chars[idx]);
        ++idx;
    }
}

// Aborts the program with a diagnostic when a CUDA runtime call did not
// succeed. `what` names the call for the error message.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Builds n host-side Test elements, mirrors them on the device (including the
// buffers their embedded pointers refer to), runs the kernel with a single
// thread, then releases all host and device memory.
int main(void) {

    // Compile-time constants, so temp_data below is an ordinary fixed-size
    // array rather than a variable-length array (a compiler extension in C++).
    const int n = 4, size = 5;
    Test *dev_test, *test;

    // Host side: n structs, each pointing at a host buffer of `size` chars.
    test = (Test*)malloc(sizeof(Test)*n);
    if (test == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }
    for (int i = 0; i < n; i++) {
        test[i].array = (char*)malloc(size * sizeof(char));
        if (test[i].array == NULL) {
            fprintf(stderr, "host allocation failed\n");
            return EXIT_FAILURE;
        }
    }

    // Fill every host buffer with 'a'..'e'.
    for (int i = 0; i < n; i++) {
        char temp[] = { 'a', 'b', 'c', 'd' , 'e' };
        memcpy(test[i].array, temp, size * sizeof(char));
    }

    // Device array of structs. This copy transfers the structs as they are,
    // so the embedded pointers still hold host addresses until Step 5
    // overwrites them.
    checkCuda(cudaMalloc((void**)&dev_test, n * sizeof(Test)), "cudaMalloc dev_test");
    checkCuda(cudaMemcpy(dev_test, test, n * sizeof(Test), cudaMemcpyHostToDevice),
              "cudaMemcpy structs");

    // Step 3: separate host-side pointers that will hold the device addresses,
    // so the host pointers in test[i].array are left untouched.
    char *temp_data[n];
    // Step 4: allocate one device buffer per element.
    for (int i = 0; i < n; i++)
        checkCuda(cudaMalloc((void**)&(temp_data[i]), size*sizeof(char)), "cudaMalloc buffer");
    // Step 5: patch each device struct's embedded pointer with the device
    // address. What is copied is the pointer value itself, hence sizeof(char *).
    for (int i = 0; i < n; i++)
        checkCuda(cudaMemcpy(&(dev_test[i].array), &(temp_data[i]), sizeof(char *),
                             cudaMemcpyHostToDevice),
                  "cudaMemcpy pointer");
    // now copy the embedded data:
    for (int i = 0; i < n; i++)
        checkCuda(cudaMemcpy(temp_data[i], test[i].array, size*sizeof(char),
                             cudaMemcpyHostToDevice),
                  "cudaMemcpy data");

    kernel<<<1, 1>>>(dev_test);
    // Launch-configuration errors are reported here; faults raised while the
    // kernel runs surface at the synchronize below.
    checkCuda(cudaGetLastError(), "kernel launch");
    checkCuda(cudaDeviceSynchronize(), "kernel execution");

    // memory free: device buffers first, then the device struct array, then
    // the host copies.
    for (int i = 0; i < n; i++)
        checkCuda(cudaFree(temp_data[i]), "cudaFree buffer");
    checkCuda(cudaFree(dev_test), "cudaFree dev_test");
    for (int i = 0; i < n; i++)
        free(test[i].array);
    free(test);
    return 0;
}

$ nvcc -o t755 t755.cu 
$ cuda-memcheck ./t755 
========= CUDA-MEMCHECK 
Kernel[0][i]: a 
Kernel[0][i]: b 
Kernel[0][i]: c 
Kernel[0][i]: d 
Kernel[0][i]: e 
========= ERROR SUMMARY: 0 errors 
$

由於上述方法對初學者來說可能具有挑戰性，通常的建議是不要這樣做，而是把數據結構「扁平化」。扁平化通常意味着重新安排數據的存儲方式，從而去除必須單獨分配的嵌入式指針。

扁平化這個數據結構的一個簡單例子是改用如下定義：

// Flattened variant: the five chars are stored inline in the struct instead
// of behind a pointer, so no separate per-element allocation is needed.
struct Test { 
    char array[5]; 
};

當然，必須承認這種特定做法並不適用於所有用途，但它應該能說明總體思路/意圖。以這一修改爲例，代碼變得簡單得多：

$ cat t755.cu 
#include <stdio.h> 
#include <stdlib.h> 

// Flattened element type: the five chars live inline in the struct, so
// copying an array of Test with one cudaMemcpy also copies the characters.
struct Test { 
    char array[5]; 
}; 

// Debug kernel: prints the five characters stored in element 0 of dev_test,
// one per line. No thread index is used, so every launched thread prints the
// same lines; the host launches it with a single thread.
__global__ void kernel(Test *dev_test) {
    const char *chars = dev_test[0].array;
    int idx = 0;
    while (idx < 5) {
        printf("Kernel[0][i]: %c \n", chars[idx]);
        ++idx;
    }
}

// Aborts the program with a diagnostic when a CUDA runtime call did not
// succeed. `what` names the call for the error message.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Fills n flattened Test elements on the host, copies the whole array to the
// device in a single transfer, runs the kernel with one thread, then releases
// the host and device allocations.
int main(void) {

    const int n = 4, size = 5;
    Test *dev_test, *test;

    test = (Test*)malloc(sizeof(Test)*n);
    if (test == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }

    // Fill every element's inline array with 'a'..'e'; `size` equals the
    // declared length of Test::array.
    for (int i = 0; i < n; i++) {
        char temp[] = { 'a', 'b', 'c', 'd' , 'e' };
        memcpy(test[i].array, temp, size * sizeof(char));
    }

    // The structs hold their data inline, so one allocation and one copy
    // move everything the kernel needs.
    checkCuda(cudaMalloc((void**)&dev_test, n * sizeof(Test)), "cudaMalloc dev_test");
    checkCuda(cudaMemcpy(dev_test, test, n * sizeof(Test), cudaMemcpyHostToDevice),
              "cudaMemcpy structs");

    kernel<<<1, 1>>>(dev_test);
    // Launch-configuration errors are reported here; faults raised while the
    // kernel runs surface at the synchronize below.
    checkCuda(cudaGetLastError(), "kernel launch");
    checkCuda(cudaDeviceSynchronize(), "kernel execution");

    // memory free
    checkCuda(cudaFree(dev_test), "cudaFree dev_test");
    free(test);
    return 0;
}
$ nvcc -o t755 t755.cu 
$ cuda-memcheck ./t755 
========= CUDA-MEMCHECK 
Kernel[0][i]: a 
Kernel[0][i]: b 
Kernel[0][i]: c 
Kernel[0][i]: d 
Kernel[0][i]: e 
========= ERROR SUMMARY: 0 errors 
$

來源

2015-05-06 18:11:18

非常感謝。「扁平你的數據結構」是什麼意思？ – Bakus123

更新了我對這個問題的回答。但是，如果您在CUDA標籤上搜索，您會發現許多「展平」參考和示例。 –

在GPU上爲動態結構數組分配內存

回答

相關問題