Lite2D 是我从零编写的轻量级 2D 游戏引擎。本文记录在引擎中引入独立渲染线程的尝试——将逻辑更新与 GL 渲染分离到不同线程,通过渲染命令队列进行通信。
项目地址:github.com/dreamyouxi/Lite2D
整体架构
⚠️ GL 线程约束
- OpenGL 上下文只能绑定到一个线程,纹理创建和初始化必须在渲染线程完成
- 双缓冲默认开启垂直同步,可通过 `glfwSwapInterval` 调整
核心思路:逻辑线程生成渲染指令,渲染线程消费并执行,两者通过命令队列解耦。
逻辑线程 (Main Thread)
遍历场景树 Node
▼
封装 RenderCmd
顶点 · 纹理坐标 · 透明度
▼
addRenderCmd()
▼
NextTick() 信号
clear() 清空上一帧
▼
addFuncToRenderThread()
注册 GL 回调(纹理创建等)
std::mutex
⟶
RenderCmdQueue
vector<RenderCmd*>
atomic<bool> isNextTick
vector<function> 回调队列
⟶
lock_guard
渲染线程 (GL Thread)
QueryPerformanceCounter
高精度帧率控制 (120 FPS)
▼
检查 isNextTick
▼
render() 执行命令
glBindTexture · glBegin(GL_QUADS)
▼
SwapBuffers
tex_pool 纹理池
unordered_map<string, Texture2D*>
▼
processOtherThreadFunc()
执行逻辑线程注册的 GL 回调
多线程渲染架构 — 逻辑线程与渲染线程通过 RenderCmdQueue 解耦通信
具体流程:
- 渲染线程维护一份独立的纹理池(从逻辑线程的纹理拷贝而来)
- 逻辑线程遍历场景树时,将每个节点的渲染参数封装为 `RenderCmd`,放入 `RenderCmdQueue`
- 所有 GL 相关操作通过回调在渲染线程中处理;逻辑线程通过 `NextTick` 信号与渲染线程同步
运行截图 — 逻辑帧与渲染帧分离
60
逻辑帧数
11999
渲染帧数
0.47μs
逻辑延时
测试平台:i7 4790K + GTX 950
帧数限制
渲染线程通过高精度计时器(QueryPerformanceCounter)控制最大帧率。当渲染速度快于逻辑更新时自动丢帧,保证每次渲染的始终是最新帧。
/// Render-thread entry point.
///
/// Initialises the GL view on the calling thread, then loops forever: each time
/// more than one frame period (1 / DEFAULT_FPS seconds, measured with
/// QueryPerformanceCounter) has passed since the last rendered frame, it calls
/// render() on the queue and reports the measured frame rate via setRenderFPS().
///
/// The loop never sleeps (the sleep call was left disabled by the author), so it
/// spins between frames, and it never returns.
///
/// nFreq and perFrame are not declared in this function; they come from the
/// enclosing file.
///
/// @param reder  Queue whose commands are rendered by this thread.
static void ThreadFunc(RenderCmdQueue * reder)
{
    // init gl in render thread
    Director::getInstance()->getGLView()->init();

    LARGE_INTEGER nLast;
    LARGE_INTEGER nNow;

    // Convert the target frame period into performance-counter ticks.
    QueryPerformanceFrequency(&nFreq);
    perFrame.QuadPart = (LONGLONG)(1.0 / DEFAULT_FPS* nFreq.QuadPart);
    QueryPerformanceCounter(&nLast);

    for (;;)
    {
        QueryPerformanceCounter(&nNow);
        const LONGLONG elapsedTicks = nNow.QuadPart - nLast.QuadPart;
        if (elapsedTicks <= perFrame.QuadPart)
        {
            // Frame period not over yet: poll the counter again.
            continue;
        }

        // Seconds elapsed since the previous rendered frame.
        const float delta = (float)elapsedTicks / (float)nFreq.QuadPart;
        reder->render();
        reder->setRenderFPS(1.0f / delta + 1);
        nLast.QuadPart = nNow.QuadPart;
    }
}
Demo 运行效果
渲染命令(RenderCmd)
RenderCmd 是渲染指令的抽象基类,封装了顶点坐标、纹理坐标、透明度等参数。RenderCmd_Quad 是最常用的实现,负责绘制一个带纹理的四边形。
//.h
/// Abstract render command: the per-node data needed to issue one draw.
///
/// NOTE(review): destruction presumably goes through Ref's reference counting
/// (the queue calls release() on commands); confirm Ref has a virtual destructor.
class RenderCmd : public Ref
{
public:
    /// Executes the command using the texture passed in by the caller.
    virtual void exec(Texture2D*) = 0;

    /// Texture coordinates; RenderCmd_Quad::exec reads indices 0..7 (u,v per corner).
    /// Not owned as far as this declaration shows - TODO confirm lifetime with the producer.
    float *_coord2f = nullptr;

    /// Positions of the four corners.
    Vec2 _vertex[4];

    /// Alpha value; RenderCmd_Quad::exec passes it to glColor4f.
    float _opacity = 0.0f;

    /// Texture attached by the producer; RenderCmdQueue::render uses its file
    /// name as the key into the render thread's texture pool.
    Texture2D *tex = nullptr;
};
class RenderCmd_Quad :public RenderCmd
{
public:
virtual void exec(Texture2D*)override;
};
//.cpp
/// Draws this command's quad with the given texture using immediate-mode GL.
///
/// Enables alpha blending and alpha testing for the duration of the draw and
/// modulates the texture with (1, 1, 1, _opacity).
///
/// @param tex  Texture to bind. Note that this parameter shadows the `tex`
///             member; the member is not used here. A null texture, or a
///             command without texture coordinates, is skipped.
void RenderCmd_Quad::exec(Texture2D *tex)
{
    // Nothing sensible can be drawn without a texture or texture coordinates.
    if (tex == nullptr || _coord2f == nullptr)
    {
        return;
    }

    const GLuint id = tex->getTextureID();

    glLoadIdentity();
    glBindTexture(GL_TEXTURE_2D, id);

    // Alpha blending for textures that carry an alpha channel.
    glEnable(GL_BLEND);
    glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);

    // Discard fully transparent fragments and apply the node's opacity.
    glEnable(GL_ALPHA_TEST);
    glAlphaFunc(GL_GREATER, 0.0f);
    glTexEnvf(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_MODULATE);
    glColor4f(1.0f, 1.0f, 1.0f, _opacity);

    glBegin(GL_QUADS);
    for (int corner = 0; corner < 4; ++corner)
    {
        glTexCoord2f(_coord2f[2 * corner], _coord2f[2 * corner + 1]);
        glVertex2f(_vertex[corner].x, _vertex[corner].y);
    }
    glEnd();

    // State changes are not allowed between glBegin and glEnd (they raise
    // GL_INVALID_OPERATION and are ignored), so restore state only after glEnd.
    glDisable(GL_BLEND);
    glDisable(GL_ALPHA_TEST);
}
渲染队列(RenderCmdQueue)
RenderCmdQueue 是整个多线程渲染的核心——它持有渲染命令列表和纹理池,通过 std::mutex 保证线程安全。
| 方法 | 线程 | 职责 |
|---|---|---|
| `addRenderCmd` | 逻辑线程 | 将渲染命令压入队列 |
| `NextTick` | 逻辑线程 | 通知渲染线程可以开始绘制 |
| `clear` | 逻辑线程 | 清空上一帧命令,准备下一帧 |
| `render` | 渲染线程 | 执行所有命令,SwapBuffers |
| `addFuncToRenderThread` | 逻辑线程 | 注册需要在 GL 线程执行的回调(如纹理创建) |
类声明:
/// Queue of render commands shared between the logic thread (producer) and the
/// render thread (consumer). Access to the command list and the callback lists
/// is serialised with _mutex.
///
/// NOTE(review): the implementation also uses render(), setRenderFPS(),
/// getRenderFPS() and the _tick_status / _cache_* members, none of which are
/// declared in this excerpt - confirm against the real header.
class RenderCmdQueue
{
public:
    /// Registers a callback to be run on the render thread; the texture it
    /// returns is stored in the texture pool under its file name.
    void addFuncToRenderThread(const std::function<Texture2D*(void)> &func);

    /// Registers an arbitrary callback to be run on the render thread.
    void addCustomFuncToRenderThread(const std::function<void(void)> &func);

private:
    /// Textures known to the render thread, keyed by file name.
    std::unordered_map<std::string , Texture2D*> tex_pool;

    /// Pending texture-producing callbacks (see addFuncToRenderThread).
    std::vector<std::function<Texture2D*(void)>> _func;

    /// Pending plain callbacks (see addCustomFuncToRenderThread).
    std::vector <std::function<void(void)>> _func1;

    /// Runs and then discards all pending callbacks.
    void processOtherThreadFunc();

    /// Guards _queue, _func and _func1.
    std::mutex _mutex;

public:
    /// Allocates a queue and starts the render thread for it.
    static RenderCmdQueue*create();

    /// Appends a command to the current frame.
    void addRenderCmd(RenderCmd*cmd);

    /// Lowers the frame-ready flag and releases all queued commands.
    void clear();

    /// Raises the frame-ready flag.
    void NextTick();//thread safe

    void draw();

    /// Switches the swap interval on or off through glfwSwapInterval.
    void setVerticalSynchronization(bool enable);

private:
    /// True once the logic thread has finished queuing a frame.
    /// Brace-initialised: copy-initialising a std::atomic (`= false`) is
    /// ill-formed before C++17 because std::atomic is not copyable.
    std::atomic<bool> isNextTick{false};

    /// Commands of the current frame.
    std::vector<RenderCmd*> _queue;

    /// Calls release() on every queued command and empties the queue.
    void clearAllRenderCmd();

    /// Private: instances are obtained through create().
    RenderCmdQueue()
    {
        _queue.reserve(200);
    }
};
完整实现:
//cpp
/// Render-thread entry point.
///
/// Initialises the GL view on the calling thread, then loops forever: each time
/// more than 1/120 s (measured with QueryPerformanceCounter) has passed since
/// the last rendered frame, it calls render() on the queue and reports the
/// measured frame rate via setRenderFPS().
///
/// The loop never sleeps (the sleep call was left disabled by the author), so it
/// spins between frames, and it never returns.
///
/// nFreq and perFrame are not declared in this function; they come from the
/// enclosing file.
///
/// @param reder  Queue whose commands are rendered by this thread.
static void ThreadFunc(RenderCmdQueue * reder)
{
    // init gl in render thread
    Director::getInstance()->getGLView()->init();

    LARGE_INTEGER nLast;
    LARGE_INTEGER nNow;

    // Convert the 1/120 s frame period into performance-counter ticks.
    QueryPerformanceFrequency(&nFreq);
    perFrame.QuadPart = (LONGLONG)(1.0 / 120 * nFreq.QuadPart);
    QueryPerformanceCounter(&nLast);

    for (;;)
    {
        QueryPerformanceCounter(&nNow);
        const LONGLONG elapsedTicks = nNow.QuadPart - nLast.QuadPart;
        if (elapsedTicks <= perFrame.QuadPart)
        {
            // Frame period not over yet: poll the counter again.
            continue;
        }

        // Seconds elapsed since the previous rendered frame.
        const float delta = (float)elapsedTicks / (float)nFreq.QuadPart;
        reder->render();
        reder->setRenderFPS(1.0f / delta + 1);
        nLast.QuadPart = nNow.QuadPart;
    }
}
/// Allocates a RenderCmdQueue and starts a detached render thread that runs
/// ThreadFunc on it. GL initialisation happens inside ThreadFunc, i.e. on the
/// render thread, not here.
///
/// @return The new queue. It is never destroyed by this code; the detached
///         thread keeps using the pointer for as long as it runs.
RenderCmdQueue* RenderCmdQueue::create()
{
    RenderCmdQueue*ret = new RenderCmdQueue;

    // The thread object must not be a function-local static: a static is only
    // constructed on the first call, so a second create() would call detach()
    // on an already detached thread (std::system_error) and the new queue
    // would get no render thread at all.
    std::thread(&ThreadFunc, ret).detach();

    return ret;
}
/// Appends a render command to the current frame's queue.
///
/// @param cmd  Command to enqueue. The queue later calls release() on it in
///             clearAllRenderCmd().
void RenderCmdQueue::addRenderCmd(RenderCmd*cmd)
{
    // RAII lock: push_back may throw on allocation failure, and a manual
    // unlock() would then be skipped, leaving _mutex locked forever.
    std::lock_guard<std::mutex> lock(_mutex);
    _queue.push_back(cmd);
}
/// Starts a new frame: lowers the frame-ready flag, then, under the queue
/// mutex, releases and removes all commands of the previous frame.
///
/// On every sixth call it also refreshes the cached statistics (_cache_fps,
/// _cache_last_fps, _cache_redertime) from the Director and this queue.
void RenderCmdQueue::clear()
{
    // Lower the flag before taking the lock so render() can bail out early.
    isNextTick = false;

    // RAII lock so the mutex is released even if something below throws.
    std::lock_guard<std::mutex> lock(_mutex);

    ++_tick_status;
    if (_tick_status > 5)
    {
        _tick_status = 0;
        auto dir = Director::getInstance();
        _cache_fps = dir->getFPS();
        _cache_last_fps = this->getRenderFPS();
        _cache_redertime = dir->getRenderTime();
    }

    this->clearAllRenderCmd();
}
/// Turns vertical synchronisation on or off via glfwSwapInterval.
///
/// @param enable  true: wait for one screen refresh per buffer swap;
///                false: swap immediately.
///
/// NOTE(review): glfwSwapInterval acts on the context current on the calling
/// thread, so this presumably has to run on the render thread - confirm callers.
void RenderCmdQueue::setVerticalSynchronization(bool enable)
{
    // The argument is a refresh count, not a boolean mask: the previous 0xff
    // requested 255 refreshes between swaps. One refresh is ordinary vsync.
    glfwSwapInterval(enable ? 1 : 0);
}
void RenderCmdQueue::NextTick()//thread safe
{
this->isNextTick = true;
}
void RenderCmdQueue::clearAllRenderCmd()
{
for (int i = 0; i < _queue.size(); i++)
{
_queue[i]->release();
}
_queue.clear();
}
void RenderCmdQueue::render()
{
do
{
if (isNextTick == false)break;
std::lock_guard<std::mutex> lock(_mutex);
// if (_queue.empty())break;
glClear(GL_COLOR_BUFFER_BIT);
// glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
glEnable(GL_TEXTURE_2D);
for (int i = 0; i < _queue.size(); i++)
{
RenderCmd *cmd = _queue[i];
auto it = tex_pool.find(cmd->tex->getFileName());
if (it == tex_pool.end())
{
Texture2D* tex = new Texture2D;
auto image = new Image;
image->initWithFile(cmd->tex->getFileName());
tex->initWithImage(image);
cmd->exec(tex);
tex_pool[cmd->tex->getFileName()] = tex;
}
else
{
cmd->exec((*it).second);
}
}
glDisable(GL_TEXTURE_2D);
Director::getInstance()->getGLView()->swapBuffers();
} while (false);
this->processOtherThreadFunc();
}
void RenderCmdQueue::processOtherThreadFunc()
{
_mutex.lock();
for (auto &func : _func)
{
Texture2D* tex = func();
tex_pool[tex->getFileName()] = tex;
}
_func.clear();
for (auto &func : _func1)
{
func();
}
_func1.clear();
_mutex.unlock();
}
/// Registers a texture-producing callback for the render thread.
///
/// @param func  Callback copied into the pending list; processOtherThreadFunc()
///              later invokes it and stores the returned texture in tex_pool.
void RenderCmdQueue::addFuncToRenderThread(const std::function<Texture2D*(void)> &func)
{
    // RAII lock: copying a std::function may allocate and throw, and a manual
    // unlock() would then be skipped.
    std::lock_guard<std::mutex> lock(_mutex);
    _func.push_back(func);
}
/// Registers an arbitrary callback for the render thread.
///
/// @param func  Callback copied into the pending list; processOtherThreadFunc()
///              later invokes it.
void RenderCmdQueue::addCustomFuncToRenderThread(const std::function<void(void)> &func)
{
    // RAII lock: copying a std::function may allocate and throw, and a manual
    // unlock() would then be skipped.
    std::lock_guard<std::mutex> lock(_mutex);
    _func1.push_back(func);
}
多对象渲染测试
性能结论
⚠️ 多线程不一定更快
实测发现,当场景中对象数量很多时(例如 1 万个同纹理对象),多线程渲染效率反而低于单线程:单线程可维持 50+ FPS,多线程版本只有 20+ FPS。
原因在于:大量对象意味着大量渲染命令的生成、入队和锁竞争开销。对于 2D 引擎这种 draw call 密集的场景,线程间同步的成本抵消了并行带来的收益。多线程渲染更适合 draw call 少但单次渲染耗时长的场景。