我很好奇,看到使用OpenGL配置紋理上傳的性能,並注意到我認爲很奇怪的事情。我使用glTexStorage2D
創建了一個4K紋理,格式爲GL_RGBA8
。然後,我使用每個幀glTexSubImage2D
重新上傳靜態圖像緩衝區到紋理。基於幀速率,我得到大約5.19GB/s。接下來,我將紋理格式更改爲GL_SRGB8_ALPHA8
並重新嘗試實驗。這一次我得到了2.81GB/s,顯着下降。這看起來很奇怪,因爲據我所知,上傳sRGB數據和上傳RGB數據應該沒有什麼不同,因爲沒有應該發生的轉換(在採樣過程中,sRGB轉換應該在着色器中進行)。glTexSubImage2D性能怪異
一些附加信息。對於第一次測試,我在撥打glTexSubImage2D
時使用GL_RGBA
和GL_UNSIGNED_INT_8_8_8_8_REV
,因爲這是驅動程序(通過glGetInternalformativ
)告訴我的理想選擇。根據司機的建議,我使用GL_UNSIGNED_INT_8_8_8_8
進行第二次測試。一些測試證實這些是分別使用的最快格式。這是使用332.21驅動程序在Windows 7 x64上使用Nvidia GeForce GTX 760。
#include <GL/glew.h>
#include <GLFW/glfw3.h>
#include <vector>
#include <cstdlib>
#include <cstdio>
#define SCREEN_SIZE_X 1024
#define SCREEN_SIZE_Y 1024
#define GLSL(src) "#version 440 core\n" #src
const char* vertex_shader = GLSL(
const vec2 data[4] = vec2[]
(
vec2(-1.0, 1.0),
vec2(-1.0, -1.0),
vec2(1.0, 1.0),
vec2(1.0, -1.0)
);
void main()
{
gl_Position = vec4(data[gl_VertexID], 0.0, 1.0);
}
);
const char* fragment_shader = GLSL(
layout(location = 0) uniform sampler2D texture0;
layout(location = 1) uniform vec2 screenSize;
out vec4 frag_color;
void main()
{
frag_color = texture(texture0, gl_FragCoord.xy/screenSize);
}
);
int main(int argc, char *argv[])
{
if(!glfwInit())
exit(EXIT_FAILURE);
glfwWindowHint(GLFW_RESIZABLE, GL_FALSE);
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4);
glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 4);
glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE);
GLFWwindow* window = glfwCreateWindow(SCREEN_SIZE_X, SCREEN_SIZE_Y, "OpenGL Texture Upload", nullptr, nullptr);
if(!window)
{
glfwTerminate();
exit(EXIT_FAILURE);
}
glfwMakeContextCurrent(window);
glfwSwapInterval(0);
glewExperimental = GL_TRUE;
if(glewInit() != GLEW_OK)
{
glfwTerminate();
exit(EXIT_FAILURE);
}
GLuint vao = 0;
glGenVertexArrays(1, &vao);
glBindVertexArray(vao);
GLuint vs = glCreateShader(GL_VERTEX_SHADER);
glShaderSource(vs, 1, &vertex_shader, nullptr);
glCompileShader(vs);
GLuint fs = glCreateShader(GL_FRAGMENT_SHADER);
glShaderSource(fs, 1, &fragment_shader, nullptr);
glCompileShader(fs);
GLuint shader_program = glCreateProgram();
glAttachShader(shader_program, fs);
glAttachShader(shader_program, vs);
glLinkProgram(shader_program);
glUseProgram(shader_program);
glProgramUniform2f(shader_program, 1, SCREEN_SIZE_X, SCREEN_SIZE_Y);
GLuint texture = 0;
glGenTextures(1, &texture);
#ifdef USE_SRGB
glTextureStorage2DEXT(texture, GL_TEXTURE_2D, 1, GL_SRGB8_ALPHA8, 4096, 4096);
#else
glTextureStorage2DEXT(texture, GL_TEXTURE_2D, 1, GL_RGBA8, 4096, 4096);
#endif
glTextureParameteriEXT(texture, GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
glTextureParameteriEXT(texture, GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
glTextureParameteriEXT(texture, GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
glTextureParameteriEXT(texture, GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glBindMultiTextureEXT(GL_TEXTURE0, GL_TEXTURE_2D, texture);
glProgramUniform1i(shader_program, 0, 0);
std::vector<unsigned int> image_buffer(4096*4096, 0xFF0000FFul);
double lastTime = glfwGetTime();
double nbFrames = 0;
while(!glfwWindowShouldClose(window))
{
double currentTime = glfwGetTime();
nbFrames++;
if (currentTime - lastTime >= 1.0)
{
char cbuffer[50];
snprintf(cbuffer, sizeof(cbuffer), "OpenGL Texture Upload [%.1f fps, %.3f ms]", nbFrames, 1000.0/nbFrames);
glfwSetWindowTitle(window, cbuffer);
nbFrames = 0;
lastTime++;
}
#ifdef USE_SRGB
glTextureSubImage2DEXT(texture, GL_TEXTURE_2D, 0, 0, 0, 4096, 4096, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, image_buffer.data());
#else
glTextureSubImage2DEXT(texture, GL_TEXTURE_2D, 0, 0, 0, 4096, 4096, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, image_buffer.data());
#endif
glClear(GL_COLOR_BUFFER_BIT);
glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
glfwSwapBuffers(window);
glfwPollEvents();
}
glfwDestroyWindow(window);
glfwTerminate();
exit(EXIT_SUCCESS);
}
這不是解釋。首先,'glGetInternalformativ'在兩種情況下推薦'GL_RGBA',而不是'GL_BGRA'。其次,實驗表明,在任何情況下'GL_BGRA'都不會更快。 –