我一直在嘗試編寫一個計算Cantor集合中的有理數的程序,並使用特定的分母。我發現用我的電腦需要20個小時或更長的時間來計算3^14到3^15之間的數字。我認爲,由於這是測試大量單獨的值,因此使用OpenCL在圖形卡上實現將是一件好事。當我試圖實現它時,雖然我的性能比我的CPU實現慢了幾個數量級。這是我的嘗試代碼。在OpenCL中實現
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <functional>
#include <ctime>
#include <iostream>
#include <fstream>
#include <exception>
#include <cstdlib>
#include <vector>
#include <thread>
#include <cmath>
#include <string>
#include <algorithm>
#include <thread>
#include <cmath>
#include <sstream>
#define SUCCESS 0
#define FAILURE 1
#define EXPECTED_FAILURE 2
const int NUM_ELEMENTS = 32768;
void printOutput(unsigned long long start, unsigned long long *values){
for(unsigned int i = 0; i < NUM_ELEMENTS; i++)
if (values[i] != 0)
std::cout << start+i << ',' << values[i] << std::endl;
}
void newList(unsigned long long start, unsigned long long *dataList){
for(int i=0; i < NUM_ELEMENTS; ++i)
dataList[i] = start + i;
}
using namespace cl;
Kernel kernelA;
Context context;
CommandQueue queue;
Buffer inputBuffer;
Buffer outputBuffer;
int init() {
cl_int status = 0;
const char* buildOption ="-x clc++ ";
std::vector<Platform> platforms;
status = Platform::get(&platforms);
if (status != CL_SUCCESS){
std::cout<<"Error: Getting platforms!"<<std::endl;
return FAILURE;
}
std::vector<cl::Platform>::iterator iter;
for(iter = platforms.begin(); iter != platforms.end(); ++iter)
if(!strcmp((*iter).getInfo<CL_PLATFORM_VENDOR>().c_str(), "Advanced Micro Devices, Inc."))
break;
cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(*iter)(), 0};
bool gpuNotFound = false;
try{
context = cl::Context(CL_DEVICE_TYPE_GPU, cps, NULL, NULL, &status);
}
catch(std::exception e){
gpuNotFound = true;
}
if(gpuNotFound){
std::cout<<"GPU not found, falling back to CPU!"<<std::endl;
context = cl::Context(CL_DEVICE_TYPE_CPU, cps, NULL, NULL, &status);
if (status != CL_SUCCESS){
std::cout<<"Error: Creating context!"<<std::endl;
return FAILURE;
}
}
Program program;
try{
std::vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
queue = CommandQueue(context, devices[0]);
std::ifstream sourceFile("Rationals.cl");
std::string sourceCode(
std::istreambuf_iterator<char>(sourceFile),
(std::istreambuf_iterator<char>()));
Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length()+1));
program = Program(context, source);
program.build(devices, buildOption);
kernelA = Kernel(program, "countRationals");
inputBuffer = Buffer(context, CL_MEM_READ_WRITE, NUM_ELEMENTS * sizeof(unsigned long long));
outputBuffer = Buffer(context, CL_MEM_READ_WRITE, NUM_ELEMENTS * sizeof(unsigned long long));
}catch(cl::Error e){
std::cout << e.what() << std::endl;
std::cout << "Build Status: " << program.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(cl::Device::getDefault()) << std::endl;
std::cout << "Build Options:\t" << program.getBuildInfo<CL_PROGRAM_BUILD_OPTIONS>(cl::Device::getDefault()) << std::endl;
std::cout << "Build Log:\t " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(cl::Device::getDefault()) << std::endl;
return FAILURE;
}
return SUCCESS;
}
int execute(unsigned long long* inputList, unsigned long long* outputList) {
try{
queue.enqueueWriteBuffer(inputBuffer, CL_TRUE, 0, NUM_ELEMENTS * sizeof(unsigned long long), inputList);
kernelA.setArg(0, inputBuffer);
kernelA.setArg(1, outputBuffer);
NDRange global(NUM_ELEMENTS/2);
NDRange local(256);
queue.enqueueNDRangeKernel(kernelA, NullRange, global, local);
queue.enqueueReadBuffer(outputBuffer, CL_TRUE, 0, NUM_ELEMENTS * sizeof(unsigned long long), outputList);
}catch(cl::Error e){
std::cout << "Line "<< __LINE__<<": Error in "<<e.what() <<std::endl;
return FAILURE;
}
return SUCCESS;
}
using namespace std;
int main(int argc, char* argv[]){
unsigned long long minNum, maxNum;
if (argc == 2){
minNum = pow(3, atoi(argv[1]));
maxNum = pow(3, atoi(argv[1]) + 1);
}
else if (argc == 3){
minNum = pow(3, atoi(argv[1]));
maxNum = pow(3, atoi(argv[2]));
}
else if (argc == 4){
minNum = pow(3, atoi(argv[1]));
maxNum = pow(3, atoi(argv[2]));
}
else return -1;
unsigned long long *q = nullptr, *result = nullptr, *old = nullptr, *newq = nullptr;
thread workThread, outThread, genThread;
q = new unsigned long long[NUM_ELEMENTS];
newList(minNum, q);
result = new unsigned long long[NUM_ELEMENTS];
newq = new unsigned long long[NUM_ELEMENTS];
init();
genThread = thread(newList, minNum+NUM_ELEMENTS, newq);
workThread = thread(execute, q, result);
workThread.join();
genThread.join();
for(unsigned long long i = minNum + NUM_ELEMENTS; i < maxNum + NUM_ELEMENTS; i += NUM_ELEMENTS){
old = result;
q = newq;
result = new unsigned long long[NUM_ELEMENTS];
newq = new unsigned long long[NUM_ELEMENTS];
genThread = thread(newList, i+NUM_ELEMENTS, newq);
workThread = thread(execute, q, result);
outThread = thread(printOutput, i-NUM_ELEMENTS, old);
workThread.join();
outThread.join();
genThread.join();
delete[] old;
delete[] q;
q = old = nullptr;
}
delete[] newq;
delete[] result;
return 0;
}
和內核代碼
bool testCantor(unsigned long p, unsigned long q){
while(q % 3 == 0){
q /= 3;
if (p/q == 1) return p==q;
p %= q;
}
unsigned long p_start = p;
do{
unsigned long p3 = p * 3;
if(p3/q == 1) return false;
p = p3 % q;
} while(p != p_start);
return true;
}
int coprime(unsigned long a, unsigned long b){
unsigned long c;
while (a != 0){
c = a;
a = b % a;
b = c;
}
return 2*((b == 1)&1);
}
__kernel
void countRationals(__global unsigned long *input, __global unsigned long *output){
int gid = get_global_id(0);
unsigned long q = input[gid], p = 1;
output[gid] = 0;
for(p = 1; p <= q/3; p++){
if(p % 3 != 0 && testCantor(p, q))
for(unsigned long i = p; i <= q/3; i *= 3)
output[gid] += coprime(i,q);
}
gid = 32767 - get_global_id(0);
q = input[gid];
output[gid] = 0;
for(p = 1; p <= q/3; p++){
if(p % 3 != 0 && testCantor(p, q))
for(unsigned long i = p; i <= q/3; i *= 3)
output[gid] += coprime(i,q);
}
}
有沒有更好的辦法,我實現這個?我對OpenCL相當陌生(我不到24小時前就開始使用它),所以可能會犯一些相當明顯的錯誤。
編輯:我想通了,我只是產卵2線程。我已經改變它產生32個線程與每個256 q。它現在從13到14運行時崩潰,我不知道爲什麼。它不會從10崩潰到11
編輯2:我實現了大部分的建議(無法弄清楚如何刪除if(coprime(p,q))),現在它運行得更快一點在n = 10時小於第二差)。我能做些什麼來加速它嗎?它在同一項任務上的運行速度只比我的處理器快33%。
EDIT3:設法實現它與位twiddling。不知道是否有任何其他條件我可以做到這一點。仍然沒有看到非常大的性能提升(有什麼建議?)
我不知道如何實現bitTwiddle,但否則我實現了大部分的建議,並獲得了相當大的速度提升。 – ruler501
因此,「線程數組」上的「數據數組摺疊」工作? Bittwiddling需要一些帶有絕對值函數的signum函數,就像這個問題一樣。http://stackoverflow.com/questions/21465367/how-to-optimize-this-cuda-kernel/21484128#21484128。 –
我設法用2 *((b == 1)&1)來實現它。 OpenCL格式布爾值真的很奇怪 – ruler501