在OpenCL中實現

我一直在嘗試編寫一個計算Cantor集合中的有理數的程序，並使用特定的分母。我發現用我的電腦需要20個小時或更長的時間來計算3^14到3^15之間的數字。我認爲，由於這是測試大量單獨的值，因此使用OpenCL在圖形卡上實現將是一件好事。當我試圖實現它時，雖然我的性能比我的CPU實現慢了幾個數量級。這是我的嘗試代碼。在OpenCL中實現

#define __CL_ENABLE_EXCEPTIONS 
#include <CL/cl.hpp> 
#include <functional> 
#include <ctime> 
#include <iostream> 
#include <fstream> 
#include <exception> 
#include <cstdlib> 
#include <vector> 
#include <thread> 
#include <cmath> 
#include <string> 
#include <algorithm> 
#include <thread> 
#include <cmath> 
#include <sstream> 

#define SUCCESS 0 
#define FAILURE 1 
#define EXPECTED_FAILURE 2 

const int NUM_ELEMENTS = 32768; 

void printOutput(unsigned long long start, unsigned long long *values){ 
    for(unsigned int i = 0; i < NUM_ELEMENTS; i++) 
     if (values[i] != 0) 
      std::cout << start+i << ',' << values[i] << std::endl; 
} 

void newList(unsigned long long start, unsigned long long *dataList){ 
    for(int i=0; i < NUM_ELEMENTS; ++i) 
     dataList[i] = start + i; 
} 

using namespace cl; 

Kernel kernelA; 
Context context; 
CommandQueue queue; 
Buffer inputBuffer; 
Buffer outputBuffer; 

int init() { 
    cl_int status = 0; 
    const char* buildOption ="-x clc++ "; 
    std::vector<Platform> platforms; 
    status = Platform::get(&platforms); 
    if (status != CL_SUCCESS){ 
     std::cout<<"Error: Getting platforms!"<<std::endl; 
     return FAILURE; 
    } 
    std::vector<cl::Platform>::iterator iter; 
    for(iter = platforms.begin(); iter != platforms.end(); ++iter) 
     if(!strcmp((*iter).getInfo<CL_PLATFORM_VENDOR>().c_str(), "Advanced Micro Devices, Inc.")) 
      break; 
    cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(*iter)(), 0}; 
    bool gpuNotFound = false; 
    try{ 
     context = cl::Context(CL_DEVICE_TYPE_GPU, cps, NULL, NULL, &status); 
    } 
    catch(std::exception e){ 
     gpuNotFound = true; 
    } 
    if(gpuNotFound){ 
     std::cout<<"GPU not found, falling back to CPU!"<<std::endl; 
     context = cl::Context(CL_DEVICE_TYPE_CPU, cps, NULL, NULL, &status); 
     if (status != CL_SUCCESS){ 
      std::cout<<"Error: Creating context!"<<std::endl; 
      return FAILURE; 
     } 
    } 
    Program program; 
    try{ 
     std::vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(); 
     queue = CommandQueue(context, devices[0]); 
     std::ifstream sourceFile("Rationals.cl"); 
     std::string sourceCode(
      std::istreambuf_iterator<char>(sourceFile), 
      (std::istreambuf_iterator<char>())); 
     Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length()+1)); 
     program = Program(context, source); 
     program.build(devices, buildOption); 
     kernelA = Kernel(program, "countRationals"); 
     inputBuffer = Buffer(context, CL_MEM_READ_WRITE, NUM_ELEMENTS * sizeof(unsigned long long)); 
     outputBuffer = Buffer(context, CL_MEM_READ_WRITE, NUM_ELEMENTS * sizeof(unsigned long long)); 
    }catch(cl::Error e){ 
     std::cout << e.what() << std::endl; 
     std::cout << "Build Status: " << program.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(cl::Device::getDefault()) << std::endl; 
     std::cout << "Build Options:\t" << program.getBuildInfo<CL_PROGRAM_BUILD_OPTIONS>(cl::Device::getDefault()) << std::endl; 
     std::cout << "Build Log:\t " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(cl::Device::getDefault()) << std::endl; 
     return FAILURE; 
    } 
    return SUCCESS; 
} 

int execute(unsigned long long* inputList, unsigned long long* outputList) { 
    try{ 
     queue.enqueueWriteBuffer(inputBuffer, CL_TRUE, 0, NUM_ELEMENTS * sizeof(unsigned long long), inputList); 
     kernelA.setArg(0, inputBuffer); 
     kernelA.setArg(1, outputBuffer); 
     NDRange global(NUM_ELEMENTS/2); 
     NDRange local(256); 
     queue.enqueueNDRangeKernel(kernelA, NullRange, global, local); 
     queue.enqueueReadBuffer(outputBuffer, CL_TRUE, 0, NUM_ELEMENTS * sizeof(unsigned long long), outputList); 
    }catch(cl::Error e){ 
     std::cout << "Line "<< __LINE__<<": Error in "<<e.what() <<std::endl; 
     return FAILURE; 
    } 
    return SUCCESS; 
} 

using namespace std; 

int main(int argc, char* argv[]){ 
    unsigned long long minNum, maxNum; 
    if (argc == 2){ 
     minNum = pow(3, atoi(argv[1])); 
     maxNum = pow(3, atoi(argv[1]) + 1); 
    } 
    else if (argc == 3){ 
     minNum = pow(3, atoi(argv[1])); 
     maxNum = pow(3, atoi(argv[2])); 
    } 
    else if (argc == 4){ 
     minNum = pow(3, atoi(argv[1])); 
     maxNum = pow(3, atoi(argv[2])); 
    } 
    else return -1; 
    unsigned long long *q = nullptr, *result = nullptr, *old = nullptr, *newq = nullptr; 
    thread workThread, outThread, genThread; 
    q = new unsigned long long[NUM_ELEMENTS]; 
    newList(minNum, q); 
    result = new unsigned long long[NUM_ELEMENTS]; 
    newq = new unsigned long long[NUM_ELEMENTS]; 
    init(); 
    genThread = thread(newList, minNum+NUM_ELEMENTS, newq); 
    workThread = thread(execute, q, result); 
    workThread.join(); 
    genThread.join(); 
    for(unsigned long long i = minNum + NUM_ELEMENTS; i < maxNum + NUM_ELEMENTS; i += NUM_ELEMENTS){ 
     old = result; 
     q = newq; 
     result = new unsigned long long[NUM_ELEMENTS]; 
     newq = new unsigned long long[NUM_ELEMENTS]; 
     genThread = thread(newList, i+NUM_ELEMENTS, newq); 
     workThread = thread(execute, q, result); 
     outThread = thread(printOutput, i-NUM_ELEMENTS, old); 
     workThread.join(); 
     outThread.join(); 
     genThread.join(); 
     delete[] old; 
     delete[] q; 
     q = old = nullptr; 
    } 
    delete[] newq; 
    delete[] result; 
    return 0; 
}

和內核代碼

bool testCantor(unsigned long p, unsigned long q){ 
    while(q % 3 == 0){ 
     q /= 3; 
     if (p/q == 1) return p==q; 
     p %= q; 
    } 
    unsigned long p_start = p; 
    do{ 
     unsigned long p3 = p * 3; 
     if(p3/q == 1) return false; 
     p = p3 % q; 
    } while(p != p_start); 
    return true; 
} 

int coprime(unsigned long a, unsigned long b){ 
    unsigned long c; 
    while (a != 0){ 
     c = a; 
     a = b % a; 
     b = c; 
    } 
    return 2*((b == 1)&1); 
} 

__kernel 
void countRationals(__global unsigned long *input, __global unsigned long *output){ 
    int gid = get_global_id(0); 
    unsigned long q = input[gid], p = 1; 
    output[gid] = 0; 
    for(p = 1; p <= q/3; p++){ 
     if(p % 3 != 0 && testCantor(p, q)) 
      for(unsigned long i = p; i <= q/3; i *= 3) 
        output[gid] += coprime(i,q); 
    } 
    gid = 32767 - get_global_id(0); 
    q = input[gid]; 
    output[gid] = 0; 
    for(p = 1; p <= q/3; p++){ 
     if(p % 3 != 0 && testCantor(p, q)) 
      for(unsigned long i = p; i <= q/3; i *= 3) 
        output[gid] += coprime(i,q); 
    } 
}

有沒有更好的辦法，我實現這個？我對OpenCL相當陌生（我不到24小時前就開始使用它），所以可能會犯一些相當明顯的錯誤。

編輯：我想通了，我只是產卵2線程。我已經改變它產生32個線程與每個256 q。它現在從13到14運行時崩潰，我不知道爲什麼。它不會從10崩潰到11

編輯2：我實現了大部分的建議（無法弄清楚如何刪除if（coprime（p，q））），現在它運行得更快一點在n = 10時小於第二差）。我能做些什麼來加速它嗎？它在同一項任務上的運行速度只比我的處理器快33％。

EDIT3：設法實現它與位twiddling。不知道是否有任何其他條件我可以做到這一點。仍然沒有看到非常大的性能提升（有什麼建議？）

來源

2014-02-12 ruler501

int execute(unsigned long long* inputList, unsigned long long* outputList) { 
    try 
    { 
     ... 
    } 
    catch(cl::Error e) 
    { 
     ... 
    } 
    return SUCCESS;

正在創建緩衝區。如果你多次使用execute（），它會有一個緩衝區創建/垃圾收集開銷。此外，您的全局範圍僅爲本地範圍的兩倍，這意味着您的GPU只有兩個計算單元將被使用。如果你的卡有20個計算單位，那麼全球範圍應至少爲40 *本地範圍。只有512個元素不足以保持gpu繁忙。至少有一半的核心。 for（p = 1; p < = q/3; p ++）循環對於所有內核都不相同。有些核心數量爲10，而另一個核心數量爲100，這破壞了核心之間的執行順序。你應該做一個更平衡的內核。例如：

提供第一個內核來計算第一個和最後一個元素，第二個內核計算第二個和第N-1個......因此所有內核都做幾乎相同的工作，而不是空閒地等待後面的內核。

__kernel 
void countRationals(__global unsigned long *input, __global unsigned long *output) 
{ 
    // computing first element (least workload among the array) 
    int gid = get_global_id(0); 
    unsigned long q = input[gid], p = 1; 
    output[gid] = 0; 
    for(p = 1; p <= q/3; p++) // counts to 10 .... 
    { 
     if(p % 3 != 0 && testCantor(p, q)) 
      for(unsigned long i = p; i <= q/3; i *= 3) 
       if(coprime(i,q)) 
        output[gid] += 2; 
    } 

    //+ computing (N-gid) element (heaviest workload among the array) 
    int N_gid = findOtherIndex(get_global_id(0)); 
    unsigned long N_q = input[N_gid], N_p = 1; 
    output[N_gid] = 0; 
    for(N_p = 1; N_p <= N_q/3; N_p++) // counts to 100? 
    { 
     if(N_p % 3 != 0 && testCantor(N_p, N_q)) 
      for(unsigned long i = p; i <= q/3; i *= 3) 
       if(coprime(i,N_q)) 
        output[N_gid] += 2; 
    } 

    //this way, adjacent cores will have "closer to equal" work. 

}

所以，如果你有4096元，第一核將計算第1次和4096元，第二個核心將計算第2次和第4095元素，... 64局部範圍，而4096全球範圍應該罰款一個開始。如果你使用了太多的「if」，那麼你應該爲它們中的每一個添加一個「else」來做假工作以保持核心之間的計算順序。或者你可以刪除一些「如果」，如果他們是那樣簡單：

if(a>b)c+=d;

可以截獲

c+=d*bitTwiddle_and_absoluteValue(a,b); // does only computation, not branching is good for gpu. 
implement bitTwiddle_and_absoluteValue(a,b) such that it returns zero when a<=b and 1 when a>b

編輯：

giving global size a multiple of number of cores of GPU could give an extre performance.

編輯：讓優化

for(p = 1; p <= q/3; p++){ 
     if(p % 3 != 0 && testCantor(p, q)) 
      for(unsigned long i = p; i <= q/3; i *= 3) 
        output[gid] += coprime(i,q); 
    }

p％3！= 0僅表示1或2。

P％3 == 1由P = 1,4,7,10，... =納>我們的第一環路

P％3 == 2用p滿足= 2,5， 8,11 ... =>我們的第二個循環

讓串連這些：

for(p = 1; p <= q/3; p+=3){ // p%3==1 is satisfied 
     if(testCantor(p, q)) // so no need for testing modulus 
      for(unsigned long i = p; i <= q/3; i *= 3) 
        output[gid] += coprime(i,q); 
    } 

for(p = 2; p <= q/3; p+=3){ // p%3==2 is satisfied 
     if(testCantor(p, q)) // so no need for testing modulus 
      for(unsigned long i = p; i <= q/3; i *= 3) 
        output[gid] += coprime(i,q); 
    } 

//so we got rid of this part: 
for(p = 0; p <= q/3; p+=3){  // p%3==0 is not engaging "if" so we dont need 
      if(testCantor(p, q))      // this loop anymore lol :D 
       for(unsigned long i = p; i <= q/3; i *= 3) 
         output[gid] += coprime(i,q); 
     }

作爲獎勵，總循環迭代的1/3，應提高一點點減少。

編輯：while循環有一個模數，並且不使用GPU的浮點電位。

//here convert integers to floats a,b,c 
while (a != 0){ // this will need a tolerance range, exact zero is nearly impossible 
     c = a; 
     a = b % a; //emulate this using fp 
     // example: 5%3 --> 5.0/3.0 gives 1.yyy so we have 1 at least 
     // then we subtract like: 5.0 - floor(5.0/3.0)*3.0 
     // we have 2.0 which is 5%3 
     // this is just a single condition 
     // looks like b%a can be b-floor(b/a)*a but Im not sure 
     // good luck! 
     b = c; 
    } 
// here convert floats back to integers again 

How can one emulate modulus with using only fp arithmetics without losing precision?

來源

2014-02-12 06:41:54

我不知道如何實現bitTwiddle，但否則我實現了大部分的建議，並獲得了相當大的速度提升。 – ruler501

因此，「線程數組」上的「數據數組摺疊」工作？ Bittwiddling需要一些帶有絕對值函數的signum函數，就像這個問題一樣。http://stackoverflow.com/questions/21465367/how-to-optimize-this-cuda-kernel/21484128#21484128。 –

我設法用2 *（（b == 1）＆1）來實現它。 OpenCL格式布爾值真的很奇怪 – ruler501

在OpenCL中實現

回答

相關問題