From fb0a45c4fd42114950c73ee71c59dd33af01689b Mon Sep 17 00:00:00 2001 From: yumemiso Date: Mon, 22 Jun 2026 21:57:06 +0800 Subject: [PATCH] add client endpoint, ckpt, hook methods for gpu virtualization in cloud --- .../gpu-remoting/include/asyncRequest.h | 93 +++ .../gpu-remoting/include/elasticscheduler.h | 358 ++++++++++++ .../gpu-remoting/include/gpuIdMap.h | 239 ++++++++ .../gpu-remoting/include/mapper.h | 331 +++++++++++ .../gpu-remoting/include/msghandler.h | 184 ++++++ .../gpu-remoting/include/ptxExtractor.h | 411 +++++++++++++ .../gpu-remoting/include/registerIOV.h | 236 ++++++++ .../gpu-remoting/include/requestBuffer.h | 238 ++++++++ .../gpu-remoting/include/requestIOV.h | 307 ++++++++++ .../gpu-remoting/include/scheduler.h | 544 ++++++++++++++++++ .../gpu-remoting/include/serverEndpoint.h | 422 ++++++++++++++ .../src/client/CMakeLists.txt(clientApp) | 25 + .../gpu-remoting/src/client/clientApp.cc.bak | 278 +++++++++ .../gpu-remoting/src/client/clientCkpt.cc | 275 +++++++++ .../gpu-remoting/src/client/clientEndpoint.cc | 345 +++++++++++ .../gpu-remoting/src/client/clientHook.cc | 224 ++++++++ 16 files changed, 4510 insertions(+) create mode 100644 GPU-Virtual-Service/gpu-remoting/include/asyncRequest.h create mode 100644 GPU-Virtual-Service/gpu-remoting/include/elasticscheduler.h create mode 100644 GPU-Virtual-Service/gpu-remoting/include/gpuIdMap.h create mode 100644 GPU-Virtual-Service/gpu-remoting/include/mapper.h create mode 100644 GPU-Virtual-Service/gpu-remoting/include/msghandler.h create mode 100644 GPU-Virtual-Service/gpu-remoting/include/ptxExtractor.h create mode 100644 GPU-Virtual-Service/gpu-remoting/include/registerIOV.h create mode 100644 GPU-Virtual-Service/gpu-remoting/include/requestBuffer.h create mode 100644 GPU-Virtual-Service/gpu-remoting/include/requestIOV.h create mode 100644 GPU-Virtual-Service/gpu-remoting/include/scheduler.h create mode 100644 GPU-Virtual-Service/gpu-remoting/include/serverEndpoint.h create mode 100644 GPU-Virtual-Service/gpu-remoting/src/client/CMakeLists.txt(clientApp) create mode 100644 GPU-Virtual-Service/gpu-remoting/src/client/clientApp.cc.bak create mode 100644 GPU-Virtual-Service/gpu-remoting/src/client/clientCkpt.cc create mode 100644 GPU-Virtual-Service/gpu-remoting/src/client/clientEndpoint.cc create mode 100644 GPU-Virtual-Service/gpu-remoting/src/client/clientHook.cc diff --git a/GPU-Virtual-Service/gpu-remoting/include/asyncRequest.h b/GPU-Virtual-Service/gpu-remoting/include/asyncRequest.h new file mode 100644 index 0000000..e028a99 --- /dev/null +++ b/GPU-Virtual-Service/gpu-remoting/include/asyncRequest.h @@ -0,0 +1,93 @@ +#ifndef ASYNC_REQUEST_H +#define ASYNC_REQUEST_H + +#include "configure.h" +#include +#include +#include +#include + +class AsyncRequest{ + private: + const char* myName_ = "AsyncRequest"; + boost::mutex _mutex; + boost::condition_variable _cv; // wake up waiting thread + bool _stop; // stop the worker thread + bool _newReq; // whether there is a new request + boost::thread* _worker = nullptr; // worker thread + + public: + AsyncRequest() : _stop(false), _newReq(false) { + } + + void Start(boost::function func){ + boost::thread_attributes attrs; + attrs.set_stack_size(THREAD_STACK_SIZE); + _worker = new boost::thread(attrs, func); + } + + inline bool CheckStart() { + return _worker != nullptr; + } + + void Wait() { + boost::unique_lock lock(_mutex); + _cv.wait(lock, [this] { + return !_newReq; // wait until there is a new request + }); + } + + void Notify() { + boost::unique_lock lock(_mutex); + _newReq = !_newReq; // set new request flag + _cv.notify_one(); // notify the worker thread + } + + bool Check() { + boost::unique_lock lock(_mutex); + _cv.wait(lock, [this] { + return _newReq || _stop; + }); + if (_stop && !_newReq) { // receive stop signal and no new request + return false; + } + return true; + } + + bool CheckStop(boost::chrono::seconds timeout) { + boost::unique_lock lock(_mutex); + bool res = _cv.wait_for(lock, timeout, [this] { + return _stop; + }); + return res; + } + + void Stop() { + boost::unique_lock lock(_mutex); + _stop = true; + _cv.notify_one(); + } + + void Lock() { + _mutex.lock(); + } + + void Unlock() { + _mutex.unlock(); + } + + ~AsyncRequest(){ + Stop(); + tool::Logging(LOG_DEBUG, myName_, "ready to destroy\n"); + if (_worker != nullptr) { + if (_worker->joinable()) { + _worker->join(); + } + tool::Logging(LOG_DEBUG, myName_, "worker thread is joined\n"); + delete _worker; + } + tool::Logging(LOG_DEBUG, myName_, "finish destroying\n"); + } +}; + +#endif // ASYNC_REQUEST_H \ No newline at end of file diff --git a/GPU-Virtual-Service/gpu-remoting/include/elasticscheduler.h b/GPU-Virtual-Service/gpu-remoting/include/elasticscheduler.h new file mode 100644 index 0000000..a7ca013 --- /dev/null +++ b/GPU-Virtual-Service/gpu-remoting/include/elasticscheduler.h @@ -0,0 +1,358 @@ +#include "configure.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "constVar.h" +#include "define.h" +#include +#include +using namespace std; + +#define GPU_MAX_NUMe 2 +//高低优先级宏定义,可修改成其他值 +#define HighPriority 0//高优先级 +#define LowPriority 1//低优先级 +#define ProfilingIter 20//Profiling的Iteration数 + +typedef struct eJob_Info{ + uint64_t client_id; + int device; + int priority; + int lastIt;//记录上一次迭代数 + int curIt;//记录当前迭代数 + int totalIt;//记录总迭代数 + unordered_map coloThrougput;//记录与当前高优先级作业的共置吞吐量 + chrono::steady_clock::time_point job_start_time;//记录当前作业开始时间 + int highColoIndex = 0;//高优先级作业用于记录当前profiling到第几个低优先级作业 + int finishProFlag = 0;//低优先级作业用于记录当前作业是否已经profiling + //dummy + int tmpFlag; + +}eJob_Info_; + + +class eScheduler{ + private: + const char* myName_ = "eScheduler"; + static set jobs; + static chrono::steady_clock::time_point timers[GPU_MAX_NUMe];//计时器,判断处理哪个队列 + static unordered_map JobIsRunning;//判断任务是否运行,JobIsRunning[job] = 1表示job正在运行 + static std::mutex mtx_used[GPU_MAX_NUMe];//一堆锁 + static std::mutex mtx_time[GPU_MAX_NUMe]; + static std::mutex mtx_low[GPU_MAX_NUMe]; + static std::mutex mtx_high[GPU_MAX_NUMe]; + static std::mutex mtx_del[GPU_MAX_NUMe]; + static vector HighPriorityQueue[GPU_MAX_NUMe];//高优先级队列,存放高优先级任务 + static vector LowPriorityQueue[GPU_MAX_NUMe];//低优先级队列,存放低优先级任务 + static eJob_Info_* nowHighjob[GPU_MAX_NUMe];//当前 <正在运行> 的 <高优先级任务> + static int HighPriorityRunningFlag[GPU_MAX_NUMe];//当前有几个高优先级任务在运行 + static int LowPriorityRunningFlag[GPU_MAX_NUMe];//当前有几个低优先级任务在运行 + static int ColoRunningFlag[GPU_MAX_NUMe];//当前是否有任务在 <共置> 运行 + static int FinishProfiling[GPU_MAX_NUMe];//当前高优先级任务是否已经完成了所有的低优先级任务的profiling + static int OneLastLowJob[GPU_MAX_NUMe];//当前是否只有一个低优先级任务,用于最后一个低优先级任务唤醒自己 + + static vector LowPriorityWaitingQueue[GPU_MAX_NUMe];//低优先级队列,存放低优先级任务 + + eJob_Info_* job; + + public: + eScheduler(uint64_t clientId){ + tool::Logging(LOG_INFO,myName_,"eScheduler::eScheduler(): client_ID:%lu\n",clientId); + job = new eJob_Info_(); + job->client_id = clientId; + } + + ~eScheduler(){ + tool::Logging(LOG_INFO,myName_,"eScheduler::~eScheduler(): client_ID:%lu\n",job->client_id); + delete job; + } + + void enqueue(uint64_t client_id, int device, int priority){//每次接受一个劫持的API就调用一次 + if(jobs.count(job) == 0){//作业第一次到达。 + job->device = device; + job->priority = priority; + jobs.insert(job);//插入所有作业的队列 + if(job->priority == HighPriority){//高优先级任务 + HighPriorityQueue[device].push_back(job); + JobIsRunning[job] = 0; + if(HighPriorityRunningFlag[device] == 0){//如果当前没有高优先级任务在运行,启动当前任务 + HighPriorityRunningFlag[device]++; + JobIsRunning[job] = 1; + nowHighjob[device] = job; + } + // nowHighjob[device] = job; + tool::Logging(LOG_INFO,myName_,"device:%d, client %lu insert HighPriorityQueue, device:%d, HighPriorityQueue_size:%d\n",device, client_id, device ,HighPriorityQueue[device].size()); + }else if(job->priority == LowPriority){//低优先级任务 + // LowPriorityQueue[device].push_back(job); + LowPriorityWaitingQueue[device].push_back(job);//低优先级任务先放在等待队列中 + tool::Logging(LOG_INFO,myName_,"device:%d, client %lu insert LowPriorityWaitingQueue, device:%d, LowPriorityWaitingQueue_size:%d\n",device, client_id, device ,LowPriorityWaitingQueue[device].size()); + while(LowPriorityQueue[device].size() >= 3 && LowPriorityQueue[device].front() != job){//如果当前有低优先级任务在运行,直接返回 + sleep(2); + } + LowPriorityQueue[device].push_back(job);//低优先级任务放入低优先级队列中 + LowPriorityWaitingQueue[device].erase(LowPriorityWaitingQueue[device].begin());//从等待队列中删除 + tool::Logging(LOG_INFO,myName_,"device:%d, client %lu insert LowPriorityQueue, device:%d, LowPriorityQueue_size:%d, LowPriorityWaitingQueue_size:%d\n",device, client_id, device ,LowPriorityQueue[device].size(), LowPriorityWaitingQueue[device].size()); + if(FinishProfiling[device] == 1){ + FinishProfiling[device] = 0; + } + if(LowPriorityQueue[device].size() > 1){ + OneLastLowJob[device] = 0; + } + } + }else{ + while(job->priority == HighPriority && JobIsRunning[job] == 0){ + sleep(5); + } + if(job->curIt < 5){//前5个Iteration属于初始化阶段,不参与调度。 + return; + } + if(job->curIt == 5 && job->priority == LowPriority){ + job->lastIt = job->curIt; + } + while(job->priority == LowPriority && JobIsRunning[job] == 0 && HighPriorityRunningFlag[device] > 0){//高优先级任务突然到来的情况 + if(OneLastLowJob[device]){//最后一个低优先级任务唤醒自己 + JobIsRunning[job] = 1; + return; + } + sleep(1); + } + if(job->priority == LowPriority && FinishProfiling[device] == 0 && job->finishProFlag == 1 && JobIsRunning[job] == 1){ + JobIsRunning[job] = 0; + LowPriorityRunningFlag[device]--; + tool::Logging(LOG_INFO,myName_,"new low priority job comming, stop 2 let new job profiling\n"); + } + if(job->priority == LowPriority && HighPriorityRunningFlag[device] > 0 && LowPriorityRunningFlag[device] > 1 && JobIsRunning[job] == 1){//高优先级任务突然到来的情况,低优先级作业暂停自己 + job->lastIt = job->curIt; + JobIsRunning[job] = 0; + LowPriorityRunningFlag[device]--; + } + { + std::lock_guard lock(mtx_used[device]); + if(job->priority == HighPriority && FinishProfiling[device] == 1 && LowPriorityRunningFlag[device] == 0){ + ToChooseBestColoJob(device); + } + if(job->priority == HighPriority && job->highColoIndex >= LowPriorityQueue[device].size() && LowPriorityRunningFlag[device] == 0 && FinishProfiling[device] == 0){ + tool::Logging(LOG_INFO,myName_,"HighPriority Job:%lu has new Job to profiling\n", nowHighjob[device]->client_id); + // nowHighjob[device] = HighPriorityQueue[device][0]; + job->highColoIndex = LowPriorityQueue[device].size() - 1; + auto lowJob = LowPriorityQueue[device][job->highColoIndex]; + tool::Logging(LOG_INFO,myName_,"choosing low priority job 2 profiling : %lu\n",lowJob->client_id); + JobIsRunning[lowJob] = 1; + LowPriorityRunningFlag[device]++; + lowJob->lastIt = lowJob->curIt;//记录当前迭代数 + } + if(job->priority == HighPriority && job->highColoIndex < LowPriorityQueue[device].size() && LowPriorityRunningFlag[device] == 0){ + nowHighjob[device] = HighPriorityQueue[device][0]; + auto lowJob = LowPriorityQueue[device][job->highColoIndex]; + tool::Logging(LOG_INFO,myName_,"choosing low priority job 2 profiling : %lu\n",lowJob->client_id); + JobIsRunning[lowJob] = 1; + LowPriorityRunningFlag[device]++; + lowJob->lastIt = lowJob->curIt;//记录当前迭代数 + } + } + if(job->priority == LowPriority && job->curIt - job->lastIt == 0 && HighPriorityRunningFlag[device] != 0){ + if(job->job_start_time == chrono::steady_clock::time_point()){ + tool::Logging(LOG_INFO,myName_,"client id %lu start profiling\n",job->client_id); + job->job_start_time = chrono::steady_clock::now(); + nowHighjob[device]->lastIt = nowHighjob[device]->curIt; + } + }else if(job->priority == LowPriority && job->curIt - job->lastIt == ProfilingIter && job->finishProFlag == 0 && HighPriorityRunningFlag[device] != 0){ + JobIsRunning[job] = 0; + chrono::steady_clock::time_point now_time = chrono::steady_clock::now(); + // int duration = (chrono::duration_cast(now_time - job->job_start_time).count() * 1.0 / 1000); + double duration = (chrono::duration_cast(now_time - job->job_start_time).count() * 1.0 / 1000); + tool::Logging(LOG_INFO,myName_,"--------------------------------------------------------\n"); + tool::Logging(LOG_INFO,myName_,"duration:%f\n",duration); + tool::Logging(LOG_INFO,myName_,"nowHighjob->curIt:%d\n",nowHighjob[device]->curIt - nowHighjob[device]->lastIt); + job->coloThrougput[nowHighjob[device]] = 1.0 * 20 / duration; + tool::Logging(LOG_INFO,myName_,"LowPriority Job:client id %lu coloThrougput:%f\n",job->client_id,job->coloThrougput[nowHighjob[device]]); + nowHighjob[device]->coloThrougput[job] = 1.0 * (nowHighjob[device]->curIt - nowHighjob[device]->lastIt) / duration; + tool::Logging(LOG_INFO,myName_,"HighPriority Job: client id %lu coloThrougput:%f\n",nowHighjob[device]->client_id,nowHighjob[device]->coloThrougput[job]); + nowHighjob[device]->highColoIndex++; + LowPriorityRunningFlag[device]--; + job->job_start_time = chrono::steady_clock::time_point(); + job->finishProFlag = 1; + if(nowHighjob[device]->highColoIndex == LowPriorityQueue[device].size()){ + tool::Logging(LOG_INFO,myName_,"HighPriority Job:%lu Finish all Profiling\n", nowHighjob[device]->client_id); + FinishProfiling[device] = 1; + } + tool::Logging(LOG_INFO,myName_,"LowPriorityRunningFlag:%d\n",LowPriorityRunningFlag[device]); + // std::cout << "finishflag:" << FinishProfiling[device] << "low runing num :"<< LowPriorityRunningFlag[device] << std::endl; + tool::Logging(LOG_INFO,myName_,"--------------------------------------------------------\n"); + } + { + std::lock_guard lock(mtx_low[device]); + if(job->priority == LowPriority && JobIsRunning[job] == 1 && HighPriorityRunningFlag[device] == 0 && LowPriorityRunningFlag[device] == 1 && !OneLastLowJob[device]){ + for(auto it = LowPriorityQueue[device].begin(); it != LowPriorityQueue[device].end(); ++it){//启动所有低优先级作业 + if((*it)->client_id == job->client_id){ + continue; + } + auto tmplowjob = *it; + JobIsRunning[tmplowjob] = 1; + LowPriorityRunningFlag[device]++; + tool::Logging(LOG_INFO,myName_,"starting low job : %lu , LowPriorityRunningFlag:%d\n",tmplowjob->client_id,LowPriorityRunningFlag[device]); + } + tool::Logging(LOG_INFO,myName_,"all low jobs are running\n"); + } + } + + } + } + + void ToChooseBestColoJob(int device){ + std::lock_guard lock(mtx_del[device]); + if(ColoRunningFlag[device] != 0 && LowPriorityRunningFlag[device] != 0){ + return; + } + + if(nowHighjob[device]->coloThrougput.size() == 0){ + return; + } + + auto max_element = std::max_element( + nowHighjob[device]->coloThrougput.begin(), nowHighjob[device]->coloThrougput.end(), + [](const std::pair& a, const std::pair& b) { + return a.second < b.second; + } + ); + auto nowLowJob = max_element->first; + tool::Logging(LOG_INFO,myName_,"choosing best colo job: %lu\n",nowLowJob->client_id); + JobIsRunning[nowLowJob] = 1; + LowPriorityRunningFlag[device]++; + ColoRunningFlag[device]++; + } + + void cal_add_It(uint64_t client_id){ + ++job->curIt; + } + + void get_Iteration(int num){ + job->totalIt = int(num); + } + + void free_jobs(uint64_t client_id, int device){ + if(job->priority == HighPriority){ + std::lock_guard lock(mtx_high[device]); + for(auto it = jobs.begin(); it != jobs.end(); ++it){ + auto tmp = (*it)->coloThrougput.find(job); + if(tmp != (*it)->coloThrougput.end()){ + (*it)->coloThrougput.erase(tmp); + } + } + HighPriorityRunningFlag[device]--; + for(auto it = HighPriorityQueue[device].begin(); it != HighPriorityQueue[device].end(); ++it){ + if((*it)->client_id == client_id && (*it)->device == device){ + HighPriorityQueue[device].erase(it); + tool::Logging(LOG_INFO,myName_,"free clientId:%lu free job\n",client_id); + break; + } + } + if(HighPriorityQueue[device].size() == 0){ + nowHighjob[device] = NULL; + }else{ + nowHighjob[device] = HighPriorityQueue[device][0]; + JobIsRunning[nowHighjob[device]] = 1; + HighPriorityRunningFlag[device]++; + for(auto it = LowPriorityQueue[device].begin(); it != LowPriorityQueue[device].end(); ++it){//启动下一个高优先级作业,对所有低优先作业重新profiling + tool::Logging(LOG_INFO,myName_,"Stopping Low job : %lu\n",(*it)->client_id); + (*it)->finishProFlag = 0; + if(JobIsRunning[(*it)] == 1){ + JobIsRunning[(*it)] = 0; + LowPriorityRunningFlag[device]--; + } + (*it)->lastIt = (*it)->curIt; + } + FinishProfiling[device] = 0; + } + for(auto it = LowPriorityQueue[device].begin(); it != LowPriorityQueue[device].end(); ++it){ + (*it)->coloThrougput.erase(job); + } + }else if(job->priority == LowPriority){ + for(auto it = LowPriorityQueue[device].begin(); it != LowPriorityQueue[device].end(); ++it){ + if((*it)->client_id == client_id && (*it)->device == device){ + LowPriorityQueue[device].erase(it); + tool::Logging(LOG_INFO,myName_,"free clientId:%lu free job\n",client_id); + break; + } + } + { + std::lock_guard lock(mtx_used[device]); + for(auto it = jobs.begin(); it != jobs.end(); ++it){ + auto tmp = (*it)->coloThrougput.find(job); + if(tmp != (*it)->coloThrougput.end()){ + (*it)->coloThrougput.erase(tmp); + } + } + if(LowPriorityRunningFlag[device] > 0){ + LowPriorityRunningFlag[device]--; + } + ColoRunningFlag[device]--; + // if(FinishProfiling[device] == 0 && HighPriorityRunningFlag[device] == 1){//说明该低优先级任务在Profiling的时候就结束了执行 + // nowHighjob[device]->highColoIndex--; + // } + if(LowPriorityQueue[device].size() == 1){ + OneLastLowJob[device] = 1; + } + } + } + + auto it = JobIsRunning.find(job); + if(it != JobIsRunning.end()){ + JobIsRunning.erase(it); + } + + for(auto it = jobs.begin(); it != jobs.end(); ++it){ + auto tmp = (*it)->coloThrougput.find(job); + if(tmp != (*it)->coloThrougput.end()){ + (*it)->coloThrougput.erase(tmp); + } + } + for(auto it = jobs.begin(); it != jobs.end(); ++it){ + if((*it)->client_id == client_id && (*it)->device == device){ + jobs.erase(it); + tool::Logging(LOG_INFO,myName_,"free clientId:%lu free job\n",client_id); + break; + } + } + } + +}; + + + + +inline set eScheduler::jobs; +inline chrono::steady_clock::time_point eScheduler::timers[GPU_MAX_NUMe]; +inline unordered_map eScheduler::JobIsRunning; +inline std::mutex eScheduler::mtx_used[GPU_MAX_NUMe]; +inline std::mutex eScheduler::mtx_time[GPU_MAX_NUMe]; +inline std::mutex eScheduler::mtx_low[GPU_MAX_NUMe]; +inline std::mutex eScheduler::mtx_high[GPU_MAX_NUMe]; +inline std::mutex eScheduler::mtx_del[GPU_MAX_NUMe]; +inline vector eScheduler::HighPriorityQueue[GPU_MAX_NUMe]; +inline vector eScheduler::LowPriorityQueue[GPU_MAX_NUMe]; +inline eJob_Info_* eScheduler::nowHighjob[GPU_MAX_NUMe]; +inline int eScheduler::HighPriorityRunningFlag[GPU_MAX_NUMe]; +inline int eScheduler::LowPriorityRunningFlag[GPU_MAX_NUMe]; +inline int eScheduler::ColoRunningFlag[GPU_MAX_NUMe]; +inline int eScheduler::FinishProfiling[GPU_MAX_NUMe]; +inline int eScheduler::OneLastLowJob[GPU_MAX_NUMe]; +inline vector eScheduler::LowPriorityWaitingQueue[GPU_MAX_NUMe]; \ No newline at end of file diff --git a/GPU-Virtual-Service/gpu-remoting/include/gpuIdMap.h b/GPU-Virtual-Service/gpu-remoting/include/gpuIdMap.h new file mode 100644 index 0000000..62d922a --- /dev/null +++ b/GPU-Virtual-Service/gpu-remoting/include/gpuIdMap.h @@ -0,0 +1,239 @@ +#ifndef GPU_ID_MAP_H +#define GPU_ID_MAP_H + +#include +#include +#include "chunkStructure.h" +#include +#include +#include +#include +#include +#include "define.h" +#include + +struct GpuInfoEntry_t { + int nodeDevIdx; + uint16_t nodePort; + char nodeIp[IP_STRING_LEN]; + uint16_t dataPort; + char dataIp[IP_STRING_LEN]; + cudaDeviceProp devprop; +}; + +class GPUidMap{ + private: + const char* myName_ = "GPUidMap"; + int proxySock_ = 0; + std::vector gpuInfoList_; + uint64_t clientID_; + size_t devNum_; + size_t reqCommIDcnt_ = 1; // request NCCL unique ID count (avoid request for old ID, start from 1) +#ifdef GV_MSGHANDLER + const char* model_; + size_t batch_Size_; +#endif + public: +#ifndef GV_MSGHANDLER + GPUidMap(size_t devNum, uint64_t clientID, const std::string& proxyIP, uint16_t proxyPort) { + clientID_ = clientID; + devNum_ = devNum; + gpuInfoList_.reserve(devNum_); + + struct sockaddr_storage serv_addr; + if ((proxySock_ = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + tool::Logging(LOG_ERROR, myName_, "Socket creation failed\n"); + exit(EXIT_FAILURE); + } + + tool::SetSockAddr(proxyIP.c_str(), proxyPort, &serv_addr, AF_INET); + if (connect(proxySock_, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) { + tool::Logging(LOG_ERROR, myName_, "Connection to GPU Proxy(%s:%d) failed\n", proxyIP.c_str(), proxyPort); + exit(EXIT_FAILURE); + } + else { + tool::Logging(LOG_INFO, myName_, "Connection to GPU Proxy success\n"); + } + + RequestGPU(); + } +#else + GPUidMap(size_t devNum, uint64_t clientID, const std::string& model, size_t batch_Size, const std::string& proxyIP, uint16_t proxyPort) { + clientID_ = clientID; + devNum_ = devNum; + model_ = model.c_str(); + batch_Size_ = batch_Size; + gpuInfoList_.reserve(devNum_); + + struct sockaddr_storage serv_addr; + if ((proxySock_ = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + tool::Logging(LOG_ERROR, myName_, "Socket creation failed\n"); + exit(EXIT_FAILURE); + } + + tool::SetSockAddr(proxyIP.c_str(), proxyPort, &serv_addr, AF_INET); + if (connect(proxySock_, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) { + tool::Logging(LOG_ERROR, myName_, "Connection to GPU Proxy(%s:%d) failed\n", proxyIP.c_str(), proxyPort); + exit(EXIT_FAILURE); + } + else { + tool::Logging(LOG_INFO, myName_, "Connection to GPU Proxy success\n"); + } + + RequestGPU(); + } +#endif + ~GPUidMap() { + tool::Logging(LOG_DEBUG, myName_, "Close socket to GPU Proxy\n"); + close(proxySock_); + } + + void RequestGPU() { +#ifndef GV_MSGHANDLER + std::string sendData = "GPUQuery:" + std::to_string(clientID_) + "," + std::to_string(devNum_); +#else + std::string sendData = "GPUQuery:" + std::to_string(clientID_) + "," + std::to_string(devNum_) + "," + model_ + "," + std::to_string(batch_Size_); +#endif + // + size_t sendDataLen = sendData.size(); + send(proxySock_, &sendDataLen, sizeof(size_t), 0); + send(proxySock_, sendData.c_str(), sendDataLen, 0); + tool::Logging(LOG_DEBUG, myName_, "Send GPU allocation request(%zu bytes) to GPU Proxy\n", sendDataLen); + + for (int i = 0; i < devNum_; i++) { + GpuInfoEntry_t* gpuInfo = (GpuInfoEntry_t*)malloc(sizeof(GpuInfoEntry_t)); + if (!tool::ReadSocketMessage(proxySock_, (uint8_t*)gpuInfo, sizeof(GpuInfoEntry_t))) { + // printf("Failed to read buffer length from GPU Proxy\n"); + tool::Logging(LOG_ERROR, myName_, "Failed to recv gpu info list from GPU Proxy\n"); + exit(EXIT_FAILURE); + } + else { + tool::Logging(LOG_INFO, myName_, "GPU ID: %d, IP Address: %s, Port: %d, DataIP: %s, DataPort: %d, prop->name: %s\n", gpuInfo->nodeDevIdx, gpuInfo->nodeIp, gpuInfo->nodePort, gpuInfo->dataIp, gpuInfo->dataPort, gpuInfo->devprop.name); + } + gpuInfoList_.emplace_back(gpuInfo); + } + } + + void ReallocGPU(){ + tool::Logging(LOG_DEBUG, myName_, "ReallocGPU: ready to realloc GPU\n"); + std::string sendData = "GPURealloc:" + std::to_string(clientID_) + "," + std::to_string(devNum_); + size_t sendDataLen = sendData.size(); + send(proxySock_, &sendDataLen, sizeof(size_t), 0); + send(proxySock_, sendData.c_str(), sendDataLen, 0); + tool::Logging(LOG_DEBUG, myName_, "Send GPU Reallocation request(%zu bytes) to GPU Proxy\n", sendDataLen); + for(int i = 0; i < devNum_; ++i){ + tool::Logging(LOG_INFO, myName_, "Waiting for GPU reallocation\n"); + GpuInfoEntry_t* gpuInfo = (GpuInfoEntry_t*)malloc(sizeof(GpuInfoEntry_t)); + if (!tool::ReadSocketMessage(proxySock_, (uint8_t*)gpuInfo, sizeof(GpuInfoEntry_t))) {//没得到GPU前阻塞 + tool::Logging(LOG_ERROR, myName_, "Failed to recv gpu info list from GPU Proxy\n"); + exit(EXIT_FAILURE); + } + else { + tool::Logging(LOG_INFO, myName_, "Reallocate GPU ID: %d, IP Address: %s, Port: %d, DataIP: %s, DataPort: %d, prop->name: %s\n", gpuInfo->nodeDevIdx, gpuInfo->nodeIp, gpuInfo->nodePort, gpuInfo->dataIp, gpuInfo->dataPort, gpuInfo->devprop.name); + } + gpuInfoList_[i] = gpuInfo; + } + // gpuInfoList_[0]->nodeDevIdx = 1; + + } + + void UpdateUniqueID(const uint8_t* uniqueID, const size_t len) { + std::string sendData = "CommUpdate:" + std::to_string(clientID_) + "," + std::to_string(len); + size_t sendDataLen = sendData.size(); + send(proxySock_, &sendDataLen, sizeof(size_t), 0); + send(proxySock_, sendData.c_str(), sendDataLen, 0); + send(proxySock_, uniqueID, len, 0); + tool::Logging(LOG_DEBUG, myName_, "Send NCCL unique ID update request(%zu bytes) to GPU Proxy\n", sendDataLen); + // for (size_t i = 0; i < len; i++) { + // printf("%02x", uniqueID[i]); + // } + // printf("\n"); + } + + void RequestUniqueID(uint8_t* uniqueID, const size_t len) { + reqCommIDcnt_++; + uint8_t isVaild = false; + std::string sendData = "CommQuery:" + std::to_string(clientID_) + "," + std::to_string(reqCommIDcnt_); + size_t sendDataLen = sendData.size(); + while (!(bool)isVaild) { + send(proxySock_, &sendDataLen, sizeof(size_t), 0); + send(proxySock_, sendData.c_str(), sendDataLen, 0); + tool::Logging(LOG_DEBUG, myName_, "Send NCCL unique ID request to GPU Proxy, waiting for response...\n"); + if (!tool::ReadSocketMessage(proxySock_, &isVaild, sizeof(uint8_t))) { + tool::Logging(LOG_ERROR, myName_, "Failed to recv NCCL unique ID from GPU Proxy\n"); + exit(EXIT_FAILURE); + } + } + + if (!tool::ReadSocketMessage(proxySock_, uniqueID, len)) { + tool::Logging(LOG_ERROR, myName_, "Failed to recv NCCL unique ID from GPU Proxy\n"); + exit(EXIT_FAILURE); + } + else { + tool::Logging(LOG_DEBUG, myName_, "Received NCCL unique ID from GPU Proxy\n"); + } + // for (size_t i = 0; i < len; i++) { + // printf("%02x", uniqueID[i]); + // } + // printf("\n"); + } + + void Print() { + tool::Logging(LOG_INFO, myName_, "GPU ID Map:\n"); + for (int i = 0; i < gpuInfoList_.size(); i++) { + tool::Logging(LOG_INFO, myName_, "[%d] GPU ID: %d, IP Address: %s, Port: %d\n", i, gpuInfoList_[i]->nodeDevIdx, gpuInfoList_[i]->nodeIp, gpuInfoList_[i]->nodePort); + } + } + + + inline bool Check(int virtDevIdx) { + if (virtDevIdx < 0 || virtDevIdx >= gpuInfoList_.size()) { + tool::Logging(LOG_ERROR, myName_, "Invalid virtual device index: %d (max: %d)\n", virtDevIdx, gpuInfoList_.size()); + return false; + } + return true; + } + + bool GetGPUKey(int gpuIdxInNode, int* virtDevIdx) { + tool::Logging(LOG_DEBUG, myName_, "GPUIdGetKey: ready to read the key(gpuIdx=%d) from GPU ID Map\n", gpuIdxInNode); + for (int i = 0; i < gpuInfoList_.size(); i++) { + if (gpuInfoList_[i]->nodeDevIdx == gpuIdxInNode) { + *virtDevIdx = i; + return true; + } + } + return false; + } + + bool GetGPUId(int virtDevIdx, int* gpuIdxInNode) { + tool::Logging(LOG_DEBUG, myName_, "GPUIdGetValue: ready read the key(virDev=%d) from GPU ID Map\n", virtDevIdx); + Check(virtDevIdx); + *gpuIdxInNode = gpuInfoList_[virtDevIdx]->nodeDevIdx; + return true; + } + + bool GetGPUprop(int virtDevIdx, cudaDeviceProp* prop) { + tool::Logging(LOG_DEBUG, myName_, "GetGpuIdMap: ready to read the key(virDev=%d) from GPU ID Map\n", virtDevIdx); + Check(virtDevIdx); + *prop = gpuInfoList_[virtDevIdx]->devprop; + return true; + } + + bool GetGPUinfo(int virtDevIdx, GpuInfoEntry_t** gpuInfo) { + tool::Logging(LOG_DEBUG, myName_, "GetGpuIdMap: ready to read the key(virDev=%d) from GPU ID Map\n", virtDevIdx); + Check(virtDevIdx); + *gpuInfo = gpuInfoList_[virtDevIdx]; + return true; + } + + bool SetGPUinfo(int virtDevIdx, const GpuInfoEntry_t& gpuInfo) { + tool::Logging(LOG_DEBUG, myName_, "SetGpuIdMap: ready to write the key(virDev=%d) to GPU ID Map\n", virtDevIdx); + Check(virtDevIdx); + memcpy(gpuInfoList_[virtDevIdx], &gpuInfo, sizeof(GpuInfoEntry_t)); + return true; + } + +}; + + +#endif \ No newline at end of file diff --git a/GPU-Virtual-Service/gpu-remoting/include/mapper.h b/GPU-Virtual-Service/gpu-remoting/include/mapper.h new file mode 100644 index 0000000..9291077 --- /dev/null +++ b/GPU-Virtual-Service/gpu-remoting/include/mapper.h @@ -0,0 +1,331 @@ +#ifndef MAPPERS_H +#define MAPPERS_H + +#include "configure.h" +#include "./hashing/robin_hood.h" +#include +#include + +class DeviceBlockMapper { +private: + const char* myName_ = "BlockMapper"; + std::vector blocks_; // !: start from 0 + uint64_t baseAddr_; + +public: + DeviceBlockMapper(const size_t reserveSize = 200, uint64_t base = 0x7f12e0a60000) { + blocks_.reserve(reserveSize); + baseAddr_ = base; + } + + const std::vector& GetBlocks() const { + return blocks_; // only read + } + + std::vector& GetBlocks() { + return blocks_; // read and write + } + + void Resize(const size_t size) { + blocks_.resize(size); + } + + uint64_t AddBlock(const uint64_t devPtr, const size_t size, bool essential = false) { + if (blocks_.empty()) { + blocks_.emplace_back(Block_t{.start = baseAddr_, .devPtr = devPtr, .size = size, .valid = true, .essential = essential}); + } + else { + blocks_.emplace_back(Block_t{.start = blocks_.back().start + blocks_.back().size, .devPtr = devPtr, .size = size, .valid = true, .essential = essential}); + } + // printf("AddBlock: devPtr = %p, blockIdx = %zu\n", (void*)devPtr, blocks_.size() - 1); + return blocks_.back().start; + } + + int FindByVirAddr(const uint64_t userAddr, uint64_t& devPtr) { + auto it = std::lower_bound(blocks_.begin(), blocks_.end(), userAddr, + [](const Block_t& block, uint64_t addr) { return block.start + block.size <= addr; }); + + if (it != blocks_.end() && it->start <= userAddr && userAddr < it->start + it->size) { + if (it->valid == false) { + return -1; + } + else { + devPtr = it->devPtr + userAddr - it->start; + return it - blocks_.begin(); + } + + } + return -1; + } + + int FindByRealAddr(const uint64_t devPtr, uint64_t& userAddr) { + for (size_t i = 0; i < blocks_.size(); i++) { + if (blocks_[i].devPtr <= devPtr && devPtr < blocks_[i].devPtr + blocks_[i].size) { + userAddr = blocks_[i].start + devPtr - blocks_[i].devPtr; + return i; + } + } + return -1; + } + + + bool ResetBlock(const uint64_t userAddr, int wantIdx = -1) { + uint64_t devPtr = 0; + int idx = (wantIdx == -1) ? FindByVirAddr(userAddr, devPtr) : wantIdx; + if (idx != -1) { + // printf("ResetBlock: devPtr = %p, blockIdx = %d\n", (void*)devPtr, idx); + blocks_[idx].valid = false; + blocks_[idx].devPtr = 0; + return true; + } + else { + return false; + } + } + + void Print() { + tool::Logging(LOG_INFO, myName_, "BlockManager has %zu blocks\n", blocks_.size()); + for (size_t i = 0; i < blocks_.size(); i++) { + tool::Logging(LOG_INFO, myName_, "Block[%zu]: start = %p, devPtr = %p, size = %zu, valid = %d, essential = %d\n", i, (void*)blocks_[i].start, (void*)blocks_[i].devPtr, blocks_[i].size, blocks_[i].valid, blocks_[i].essential); + } + } +}; + +class HandleMapper { +private: + const char* myName_ = "HandleMapper"; + std::vector handleInfoList; // !: start from 1 + robin_hood::unordered_flat_map mapRealAddr2handleListIdx; + std::queue> freeHandleQueue; + +public: + HandleMapper(const size_t reserveSize = 1000) { + handleInfoList.reserve(reserveSize); + handleInfoList.emplace_back(Handle_t{.handlePtr = (uint64_t)NULL, .type = __CUDA_REGISTER, .valid = true, .stream = 0}); // let the handleNum start from 1 + } + + const std::vector& GetHandleInfoList() const { + return handleInfoList; // only read + } + + std::vector& GetHandleInfoList() { + return handleInfoList; // read and write + } + + void Resize(const size_t size) { + handleInfoList.resize(size); + } + + void Reset(const std::vector>& handleList) { + handleInfoList.resize(handleList.back().first + 1); // handleIdxList is sorted and has at least one element + mapRealAddr2handleListIdx.clear(); + while (!freeHandleQueue.empty()) { + freeHandleQueue.pop(); + } + for (size_t i = 0; i < handleList.size(); i++) { + size_t handleIdx = handleList[i].first; + handleInfoList[handleIdx] = handleList[i].second; + if (i == 0) { + continue; + } + mapRealAddr2handleListIdx.insert({(uint64_t)handleInfoList[handleIdx].handlePtr, handleIdx}); + // no need to push the free space into the queue since the replaying process has not been completed + // if (i == 1 && handleIdx != 1) { + // freeHandleQueue.push({1, handleIdx - 1}); + // } + // else if (i > 1 && handleIdx != handleList[i - 1].first + 1) { + // freeHandleQueue.push({handleList[i - 1].first + 1, handleIdx - 1}); + // } + } + } + + void Shrink() { + tool::Logging(LOG_DEBUG, myName_, "Shrink: handleInfoList size: %zu (mapRealAddr2handleListIdx size: %zu)\n", handleInfoList.size(), mapRealAddr2handleListIdx.size()); + mapRealAddr2handleListIdx.clear(); + while (!freeHandleQueue.empty()) { + freeHandleQueue.pop(); + } + + for (size_t i = 1; i < handleInfoList.size(); i++) { + if (handleInfoList[i - 1].valid == false && handleInfoList[i].valid == false) { + freeHandleQueue.back().second = i; + } + else if (handleInfoList[i - 1].valid == true && handleInfoList[i].valid == false) { + freeHandleQueue.push({i, i}); + } + + if (handleInfoList[i].valid == true) { + mapRealAddr2handleListIdx.insert({(uint64_t)handleInfoList[i].handlePtr, i}); + } + } + tool::Logging(LOG_DEBUG, myName_, "Shrink: mapRealAddr2handleListIdx (valid handles) is reorganized to %zu\n", mapRealAddr2handleListIdx.size()); + } + + void Indexing() { + mapRealAddr2handleListIdx.clear(); + for (size_t i = 1; i < handleInfoList.size(); i++) { + if (handleInfoList[i].valid == true) { + mapRealAddr2handleListIdx.insert({(uint64_t)handleInfoList[i].handlePtr, i}); + } + } + } + + void* AddHandle(void* realAddr, enum API_REQUEST_CODE_SET handleType) { + auto it = mapRealAddr2handleListIdx.find((uint64_t)realAddr); + size_t handleNum = 0; + if (it != mapRealAddr2handleListIdx.end()) { + handleNum = it->second; // find out the address previously created + handleInfoList[handleNum].valid = true; + handleInfoList[handleNum].type = handleType; + } + else { + if (freeHandleQueue.empty()) { // handleInfo List has no free space to add a new handle, so need to expand + handleNum = ( + handleInfoList.emplace_back(Handle_t{.handlePtr = (uint64_t)realAddr, .type = handleType, .valid = true, .stream = 0}), + handleInfoList.size() - 1 + ); + } + else { // reuse the free space in handleInfo List + handleNum = freeHandleQueue.front().first; + freeHandleQueue.front().first++; + if (freeHandleQueue.front().first > freeHandleQueue.front().second) { + freeHandleQueue.pop(); + } + handleInfoList[handleNum].handlePtr = (uint64_t)realAddr; + handleInfoList[handleNum].type = handleType; + handleInfoList[handleNum].valid = true; + handleInfoList[handleNum].stream = 0; + } + mapRealAddr2handleListIdx.insert({(uint64_t)realAddr, handleNum}); + + if (handleInfoList.size() >= HANDLE_MAX_NUM && freeHandleQueue.size() == 0) { + tool::Logging(LOG_INFO, myName_, "AddHandle: mapRealAddr2handleListIdx is too large(%zu), preparing to reorganize\n", mapRealAddr2handleListIdx.size()); + Shrink(); + } + } + // no need to check the handleNum, since ((1LL << 48) - 1) is enough large + uint64_t handleVirAddr = HANDLE_PREFIX | handleNum; + return (void*)handleVirAddr; + } + + void* FindRealAddrByVirAddr(uint64_t userAddr, bool reset) { + if (CHECK_HANDLE_PREFIX(userAddr) == 0) { + return NULL; + } + size_t handleID = GET_HANDLE_ID(userAddr); + if (handleID <= 0 || handleID >= handleInfoList.size() || handleInfoList[handleID].valid == false) { + return NULL; + } + void* realAddr = (void*)(handleInfoList[handleID].handlePtr); + if (reset) { + handleInfoList[handleID].valid = false; + } + return realAddr; + } + + Handle_t* GetHandleInfoByRealAddr(void* realAddr) { + auto it = mapRealAddr2handleListIdx.find((uint64_t)realAddr); + if (it != mapRealAddr2handleListIdx.end()) { + return &handleInfoList[it->second]; + } + else { + // for (size_t i = 1; i < handleInfoList.size(); i++) { + // if (handleInfoList[i].handlePtr == (uint64_t)realAddr) { + // return &handleInfoList[i]; + // } + // } + // tool::Logging(LOG_ERROR, myName_, "GetHandleInfoByRealAddr failed: realAddr(%p) is not in the map\n", realAddr); + return NULL; + } + } + + uint64_t FindIdxByRealAddr(void* realAddr) { + auto it = mapRealAddr2handleListIdx.find((uint64_t)realAddr); + if (it != mapRealAddr2handleListIdx.end()) { + uint64_t virtAddr = HANDLE_PREFIX | it->second; + return virtAddr; + } + else { + for (size_t i = 1; i < handleInfoList.size(); i++) { + if (handleInfoList[i].handlePtr == (uint64_t)realAddr) { + return HANDLE_PREFIX | i; + } + } + tool::Logging(LOG_ERROR, myName_, "FindIdxByRealAddr failed: realAddr(%p) is not in the map\n", realAddr); + return 0; + } + return 0; + } + + Handle_t* GetHandleInfoByVirAddr(uint64_t userAddr) { + if (CHECK_HANDLE_PREFIX(userAddr) == 0) { + return NULL; + } + size_t handleID = GET_HANDLE_ID(userAddr); + if (handleID <= 0 || handleID >= handleInfoList.size()) { + return NULL; + } + return &handleInfoList[handleID]; //! the pointer will be changed + } + + void UpdateHandle(const uint64_t userAddr, void* realAddr, enum API_REQUEST_CODE_SET handleType) { + if (CHECK_HANDLE_PREFIX(userAddr) == 0) { + tool::Logging(LOG_ERROR, myName_, "UpdateHandle failed: userAddr is not a handle(%p)\n", (void*)userAddr); + return; + } + size_t handleID = GET_HANDLE_ID(userAddr); + if (handleID <= 0) { + tool::Logging(LOG_ERROR, myName_, "UpdateHandle failed: handleID(%zu) is out of range\n", handleID); + return; + } + if (handleID == handleInfoList.size()) { + handleInfoList.emplace_back(Handle_t{.handlePtr = (uint64_t)realAddr, .type = handleType, .valid = true, .stream = 0}); + } + if (handleID > handleInfoList.size()) { + handleInfoList.resize(handleID + 1); // expand the boundary + handleInfoList[handleID] = Handle_t{.handlePtr = (uint64_t)realAddr, .type = handleType, .valid = true, .stream = 0}; + } // todo: to be optimized + + // No need to update the mapRealAddr2handleListIdx, since shrink() will be called after the replaying process + // auto it = mapRealAddr2handleListIdx.find((uint64_t)realAddr); + // if (it != mapRealAddr2handleListIdx.end()) { + // it->second = handleID; // update the handleID of the realAddr + // } + // else { + // mapRealAddr2handleListIdx.insert({(uint64_t)realAddr, handleID}); + // } // this handleID was created after the checkpoint, maybe was not in the map + + if (handleID < handleInfoList.size()) { // handleID is located in the existing range (not include the boundary) + handleInfoList[handleID].handlePtr = (uint64_t)realAddr; + handleInfoList[handleID].valid = true; + } + + tool::Logging(LOG_DEBUG, myName_, "UpdateHandle: userAddr=%p, original realAddr=%p, new realAddr=%p\n", (void*)userAddr, (void*)handleInfoList[handleID].handlePtr, realAddr); + } + + size_t GetValidHandleNum() { + size_t validHandleNum = 0; + for (size_t i = 1; i < handleInfoList.size(); i++) { + if (handleInfoList[i].valid == true) { + validHandleNum++; + } + } + return validHandleNum; + } + + std::vector> GetValidHandles() { + std::vector> validHandles; + for (size_t i = 0; i < handleInfoList.size(); i++) { + if (handleInfoList[i].valid == true) { + validHandles.emplace_back(i, handleInfoList[i]); + } + } + return validHandles; + } + + size_t GetCapacity() { + return handleInfoList.size(); + } +}; + +#endif // MAPPERS_H \ No newline at end of file diff --git a/GPU-Virtual-Service/gpu-remoting/include/msghandler.h b/GPU-Virtual-Service/gpu-remoting/include/msghandler.h new file mode 100644 index 0000000..9c92fe5 --- /dev/null +++ b/GPU-Virtual-Service/gpu-remoting/include/msghandler.h @@ -0,0 +1,184 @@ +#ifndef MSG_HANDLER_H +#define MSG_HANDLER_H + +#include "configure.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "constVar.h" +#include "define.h" +#include +#include + +extern Configure config_; +using namespace std; + +typedef struct Jobinfo { + size_t used_mem; // memory + uint sm_util; //SM utilization + double served_time;//按秒算 + double remain_time; + double perIt_time; + int cur_It;// current iterations + int num_It;// the number of iterations + int device; + uint64_t client_id; + bool stop_listen_flag = false; + int priority_; + + + bool operator<(const Jobinfo& other) const { + return remain_time < other.remain_time; + } +} Jobinfo_; + + +class MsgHandler { + private: + Jobinfo_* job; + const char *myName_ = "MsgHandler"; + int dispatch_sock = -1; + int monitor_sock = -1; + + public: + bool replay_flag = false; + bool stop_flag = false; + + + MsgHandler(uint64_t clientId) { + tool::Logging(LOG_INFO, myName_, "MsgHandler() client %lu, MsgHandler is created.\n", clientId); + job = new Jobinfo_(); + job->client_id = clientId; + initialize_connection(config_.GetDpcIp(), config_.GetDpcPort(), dispatch_sock); + + } + + ~MsgHandler() { + send_msg("TypeE:", 1); + tool::Logging(LOG_INFO, myName_, "~MSGHandler() client %lu, MsgHandler is destroyed.\n", job->client_id); + release_connection(dispatch_sock); + delete job; + } + + void initialize_connection(const std::string& ip, int port, int& sock) { + if (sock == -1) { + int tmptid = static_cast(::syscall(SYS_gettid)); + std::cout << "clientId: " << job->client_id << " tid:" << tmptid << " get in initialize_connection " <client_id); + send(sock, msg.c_str(), msg.size(), 0); + + } + } + + void release_connection(int _sock) { + // std::lock_guard lock(conn_mutex); + std::cout << "_sock:" << _sock << std::endl; + int tmptid = static_cast(::syscall(SYS_gettid)); + if (_sock != -1) { + std::cout << "clientId: " << job->client_id <<" tid:" << tmptid << " get in release_connection" << std::endl; + close(_sock); + tool::Logging(LOG_INFO, "Scheduler", "Disconnected from Monitor.\n"); + } + } + + void send_msg(const std::string& message, int sock_flag){ + int tmp_sock = -1; + if(sock_flag == 1){ + tmp_sock = dispatch_sock; + }else if(sock_flag == 2){ + tmp_sock = monitor_sock; + } + string msg = ""; + if (tmp_sock != -1) { + if(message == "TypeE:"){ + tool::Logging(LOG_INFO, myName_,"TypeE msg SendMeg::client %lu\n", job->client_id); + // if(replay_flag){ + // msg = message + to_string(job->client_id) + "," + to_string(1); + // }else{ + // msg = message + to_string(job->client_id) + "," + to_string(0); + // } + msg = message + to_string(job->client_id) + "," + to_string(1); + send(tmp_sock, msg.c_str(), msg.size(), 0); + } + } + } + + + bool receive_message(int sock, std::string& message) { + char buffer[1024] = {0}; + int bytes_received = recv(sock, buffer, sizeof(buffer), 0); + if (bytes_received > 0) { + message = std::string(buffer, bytes_received); + if (message == "stop"){ + // std::cout << "Received stop message." << std::endl; + tool::Logging(LOG_INFO,myName_,"client:%d, Received stop message.\n", job->client_id); + stop_flag = true; + } + return true; + } else if (bytes_received == 0) { + std::cout << "Connection closed by peer." << std::endl; + return false; + } else { + // std::cerr << "Receive error." << std::endl; + tool::Logging(LOG_ERROR,myName_,"client:%d, Receive error.\n", job->client_id); + release_connection(sock); + return false; + } + } + + void aysnc_receive_message() { + std::string message; + while (!job->stop_listen_flag) { + if (receive_message(dispatch_sock, message)) { + tool::Logging(LOG_INFO,myName_,"client:%d, Received message: %s\n", job->client_id, message.c_str()); + if (message == "stop") { + job->stop_listen_flag = true; + } + } + } + tool::Logging(LOG_INFO,myName_,"client:%d, Stop listening.\n", job->client_id); + } + +}; + + + +#endif \ No newline at end of file diff --git a/GPU-Virtual-Service/gpu-remoting/include/ptxExtractor.h b/GPU-Virtual-Service/gpu-remoting/include/ptxExtractor.h new file mode 100644 index 0000000..28f6c9d --- /dev/null +++ b/GPU-Virtual-Service/gpu-remoting/include/ptxExtractor.h @@ -0,0 +1,411 @@ +#ifndef PTX_EXTRACTOR_H +#define PTX_EXTRACTOR_H + +#include "configure.h" +#include "./hashing/robin_hood.h" +#include "./conqueue/readerwriterqueue.h" +#include +#include +#include + +size_t inline SafeStr2Ull(const std::string& str) { + try { + return std::stoull(str); + } catch (...) { + return 0; + } +} + +class PTXExtractor { +private: + const char* myName_ = "PTXExtractor"; + std::mutex mutex_; + boost::asio::thread_pool pool_; + // std::vector imageList_; + +public: + moodycamel::BlockingReaderWriterQueue< std::pair >* _imageQueue; + robin_hood::unordered_flat_map* _kernelDevMap; + volatile bool _readyClosed = false; + volatile bool _finished = false; + + + PTXExtractor(moodycamel::BlockingReaderWriterQueue< std::pair >* iq, + robin_hood::unordered_flat_map* kdm) + : _imageQueue(iq), _kernelDevMap(kdm), pool_(std::thread::hardware_concurrency()) { + // imageList_.reserve(240); + tool::Logging(LOG_INFO, myName_, "PTXExtractor is ready\n"); + } + + ~PTXExtractor() { + tool::Logging(myName_, "ready to close PTXExtractor\n"); + if (_finished == false) { // PTXExtractor is still working + if (_readyClosed == false) { // cudaLaunchKernel has not been called + _readyClosed = true; + } + while (_finished == false) { // waiting the Run() to finish + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + } + // for (auto& image : imageList_) { + // free(image); + // } + delete _imageQueue; + delete _kernelDevMap; + tool::Logging(LOG_INFO, myName_, "close the PTXExtractor\n"); + } + + void Extract(void* image, size_t imageSize) { + // step1: writing the image to a file + + // 创建一个临时文件来存储image内容 + char temp_filename[] = "/tmp/cuda_image_XXXXXX"; + int fd = mkstemp(temp_filename); + if (fd == -1) { + throw std::runtime_error("Failed to create temp file"); + } + // 使用RAII来确保文件描述符被关闭 + struct FDCloser { + void operator()(int *fd) const { if (*fd != -1) close(*fd); } + }; + std::unique_ptr fd_closer(&fd); + // 写入image内容到临时文件 + if (write(fd, image, imageSize) != static_cast(imageSize)) { + throw std::runtime_error("Failed to write image to temp file"); + } + + // step2: disassemble the image to PTX codes + + // 使用cuobjdump命令处理临时文件 + std::string ptxCodes; + ptxCodes.reserve(1024 * 1024); + std::array buffer; + std::string cmd = "cuobjdump -ptx " + std::string(temp_filename); + std::unique_ptr pipe(popen(cmd.c_str(), "r"), pclose); + if (!pipe) { + throw std::runtime_error("popen() failed!"); + } + + // 读取PTX代码 + while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) { + ptxCodes += buffer.data(); + } + + // 删除临时文件 + std::remove(temp_filename); + + // step3: match kernel body and extract dev ptr from kernel params + + // 提取PTX代码中的一系列Kernel + std::vector kernels; + kernels.reserve(150); + ExtractKernelsFromPtx(ptxCodes, kernels); + + for (const auto& kernel : kernels) { + tool::Logging(myName_, "Kernel name: %s\n", kernel.name.c_str()); + + // // if (kernel.name == "_ZN2at6native43_GLOBAL__N__eaa73ff3_10_Dropout_cu_f338191124fused_dropout_kernel_vecIffjLi1ELi4EbEEvNS_4cuda6detail10TensorInfoIT_T1_EES8_NS5_IT4_S7_EES7_T0_NS_15PhiloxCudaStateE" + // if (kernel.name == "_ZN2at6native18elementwise_kernelILi128ELi2EZNS0_15gpu_kernel_implINS0_15CUDAFunctor_addIfEEEEvRNS_18TensorIteratorBaseERKT_EUliE_EEviT1_" + // ) { + // std::ofstream outFile("flexgv-log-complex-ptx", std::ios::out | std::ios::app); + // outFile << kernel.name << std::endl; + // outFile << kernel.body << std::endl; + // outFile.close(); + // } + + /* + // 提取 ld.param.u64 行的寄存器、参数序号和参数偏移量 + robin_hood::unordered_flat_map param_map; + ExtractParamsFromKernelPtx(kernel.body, param_map); + + // 提取 cvta.to.global.u64 行的源寄存器 + auto dev_ptr_list = new std::vector>(); + dev_ptr_list->reserve(param_map.size()); + MatchDevPtrsFromKernelPtx(kernel.body, param_map, dev_ptr_list); + */ + + // 直接提取 ld.param.u64 行的参数序号和参数偏移量 + auto dev_ptr_list = new std::vector>(); + dev_ptr_list->reserve(200); + ExtractDevptrFromKernelPtx(kernel.body, kernel.name, dev_ptr_list); + + // 将Kernel名称和设备内存指针映射关系存入_kernelDevMap,共享给serverEp + mutex_.lock(); + _kernelDevMap->insert({kernel.name, reinterpret_cast(dev_ptr_list)}); + mutex_.unlock(); + + // 打印设备指针列表 + for (const auto& dev_ptr : *dev_ptr_list) { + tool::Logging(myName_, "\tParam #%zu, Offset: %zu\n", dev_ptr.first, dev_ptr.second); + } + } + tool::Logging(myName_, "finished Extract().\n"); + // free(image); + // imageList_.push_back(image); // the image will be loaded by cuModuleLoadData + } + + void Run() { + std::pair image; + while(!_readyClosed || _imageQueue->size_approx() > 0) { + if (_imageQueue->wait_dequeue_timed(image, std::chrono::milliseconds(5))) { + tool::Logging(LOG_DEBUG, myName_, "pop image: %p, size: %zu\n", image.first, image.second); + boost::asio::post(pool_, + std::bind(&PTXExtractor::Extract, this, image.first, image.second)); + } + } + pool_.join(); + _finished = true; + tool::Logging(LOG_INFO, myName_, "finished PTXExtractor work\n"); + } + + // Extract kernel body from PTX codes by matching brackets + size_t ExtractKernelBody(const std::string& ptx_code, size_t start_pos) { + std::stack brackets; + size_t end_pos = start_pos; + + for (; end_pos < ptx_code.length(); ++end_pos) { + if (ptx_code[end_pos] == '{') { + brackets.push('{'); + } else if (ptx_code[end_pos] == '}') { + if (!brackets.empty()) { + brackets.pop(); + } + if (brackets.empty()) { + ++end_pos; // include the last '}' + break; + } + } + } + return end_pos; + } + + // Extract kernel's PTX name and its PTX body from PTX codes + void ExtractKernelsFromPtx(const std::string& ptx_code, std::vector& kernels) { + static const boost::regex kernel_pattern(R"(\.entry\s+(\w+)\s*\()"); + boost::sregex_iterator kernels_begin(ptx_code.begin(), ptx_code.end(), kernel_pattern); + boost::sregex_iterator kernels_end; + + for (auto i = kernels_begin; i != kernels_end; ++i) { + const boost::smatch& match = *i; + const std::string& kernel_name = match[1].str(); + size_t body_start_pos = ptx_code.find('{', match.position()); + if (body_start_pos != std::string::npos) { + size_t body_end_pos = ExtractKernelBody(ptx_code, body_start_pos); + kernels.emplace_back( + kernel_name.c_str(), kernel_name.length(), + ptx_code.c_str() + body_start_pos, body_end_pos - body_start_pos + ); + } + } + } + + // Extract param index and param offset from ld.param.u64 line + static void ExtractDevptrFromKernelPtx(const std::string& kernel_body, const std::string& kernel_name, + std::vector>* dev_ptr_list) { + // static const boost::regex ld_pattern(R"(ld\.param\.u64\s+(%\w+),\s+\[\w+_param_(\d+)(?:\+(\d+))?\];|ld\.param\.u64\s+(%\w+),\s+\[%\w+\+(\d+)\];)"); + static const boost::regex ld_pattern(R"(ld\.param\.u64\s+(%\w+),\s+\[\w+_param_(\d+)(?:\+(\d+))?\];|ld\.param\.u64\s+(%\w+),\s+\[(%\w+)(?:\+(\d+))?\];)"); + static const boost::regex mov_b64_pattern(R"(mov\.b64\s+(%\w+),\s+\w+_param_(\d+);)"); + boost::sregex_iterator params_begin(kernel_body.begin(), kernel_body.end(), ld_pattern); + boost::sregex_iterator params_end; + + for (auto i = params_begin; i != params_end; ++i) { + const boost::smatch& match = *i; + + if (match[1].matched && match[2].matched) { + // First pattern: ld.param.u64 dst_register_name, [Kernel_param_0+offset]; + std::string dst_register_name(match[1].first, match[1].second); + size_t param_index = SafeStr2Ull(match[2].str()); + size_t param_offset = match[3].matched ? SafeStr2Ull(match[3].str()) : 0; + + dev_ptr_list->emplace_back(param_index, param_offset); + } else if (match[4].matched && match[5].matched) { + // Second pattern: ld.param.u64 dst_register_name, [Register+Offset]; + std::string dst_register_name(match[4].first, match[4].second); + size_t param_index = 0; // For this form, need to find the corresponding mov.u64 & mov.b64 + size_t param_offset = match[6].matched ? SafeStr2Ull(match[6].str()) : 0; + std::string related_register(match[5].first, match[5].second); + + // Step 1: Find pre_register based on related_register: mov.u64 pre_register, related_register; + const boost::regex mov_pattern(R"(mov\.u64\s+)" + related_register + R"(,\s+(%\w+);)"); + boost::sregex_iterator mov_begin(kernel_body.begin(), kernel_body.end(), mov_pattern); + boost::sregex_iterator mov_end; + std::string pre_register; + for (auto mov_it = mov_begin; mov_it != mov_end; ++mov_it) { + const boost::smatch& mov_match = *mov_it; + pre_register = mov_match[1].str(); + break; // Assuming only one match needed + } + + // Step 2: Find param_index based on pre_register: mov.b64 pre_register, Kernel_param_x; + if (!pre_register.empty()) { + // static const boost::regex b64_pattern(R"(mov\.b64\s+)" + pre_register + R"(,\s+\w+_param_(\d+);)"); + const boost::regex b64_pattern(R"(mov\.b64\s+)" + pre_register + R"(,\s+\w+_param_(\d+);)"); + boost::sregex_iterator b64_begin(kernel_body.begin(), kernel_body.end(), b64_pattern); + boost::sregex_iterator b64_end; + + for (auto b64_it = b64_begin; b64_it != b64_end; ++b64_it) { + const boost::smatch& b64_match = *b64_it; + param_index = SafeStr2Ull(b64_match[1].str()); + break; // Assuming only one match needed + } + } + + dev_ptr_list->emplace_back(param_index, param_offset); + } + } + + // Handle mov.b64 pattern: mov.b64 dst_register_name, Kernel_param_x; (no offset) + boost::sregex_iterator mov_b64_begin(kernel_body.begin(), kernel_body.end(), mov_b64_pattern); + for (auto i = mov_b64_begin; i != params_end; ++i) { + const boost::smatch& match = *i; + if (match[1].matched && match[2].matched) { + std::string dst_register_name(match[1].first, match[1].second); + size_t param_index = SafeStr2Ull(match[2].str()); + + // special case: gpu_kernel_impl >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&) + if (kernel_name.find("_ZN2at6native18elementwise_kernelILi128ELi2EZNS0_15gpu_kernel_implINS0_15CUDAFunctor_addIfEEEEvRNS_18TensorIteratorBaseERKT_EUliE_EEviT1_") != std::string::npos) { + continue; // skipping the mov.b64 pattern (the memory contains some device pointers, so it's not a real device pointer) + } + + dev_ptr_list->emplace_back(param_index, 0); // No offset in this case + + // special case: CatArrInputTensorMetadata related kernel, https://github.com/pytorch/pytorch/blob/feef057691b357481981fd24483a817ce87c8517/aten/src/ATen/native/cuda/Shape.cu#L155 + if (kernel_name.find("CatArrInputTensorMetadata") != std::string::npos) { + size_t size = 1000; + char realname[size]; + int status; + __cu_demangle(kernel_name.c_str(), realname, &size, &status); + std::string realname_str(realname); + boost::regex bracket_pattern("<[^<>]*>.*?<[^<>]*?,\\s*[^<>]*?,\\s*[^<>]*?,\\s*\\(int\\)(\\d+)"); + boost::smatch newmatch; + if (boost::regex_search(realname_str, newmatch, bracket_pattern)) { + size_t batchSize = SafeStr2Ull(newmatch[1].str()); // matching the Template parameter(batch size) + for (size_t i = 0; i < batchSize; ++i) { // push all the device pointers from input buffer + dev_ptr_list->emplace_back(param_index, i * 8); + } + } + } + } + } + std::sort((*dev_ptr_list).begin(), (*dev_ptr_list).end()); + (*dev_ptr_list).erase(std::unique((*dev_ptr_list).begin(), (*dev_ptr_list).end()), (*dev_ptr_list).end()); // avoid loading the same device pointer repeatedly + } + + // [not used] Extract register, param index and param offset from ld.param.u64 line + void ExtractParamsFromKernelPtx(const std::string& kernel_body, + robin_hood::unordered_flat_map& param_map) { + // static const boost::regex ld_pattern(R"(ld\.param\.u64\s+(%\w+),\s+\[\w+_param_(\d+)(?:\+(\d+))?\];|ld\.param\.u64\s+(%\w+),\s+\[%\w+\+(\d+)\];)"); + static const boost::regex ld_pattern(R"(ld\.param\.u64\s+(%\w+),\s+\[\w+_param_(\d+)(?:\+(\d+))?\];|ld\.param\.u64\s+(%\w+),\s+\[(%\w+)(?:\+(\d+))?\];)"); + boost::sregex_iterator params_begin(kernel_body.begin(), kernel_body.end(), ld_pattern); + boost::sregex_iterator params_end; + + for (auto i = params_begin; i != params_end; ++i) { + const boost::smatch& match = *i; + + if (match[1].matched && match[2].matched) { + // First pattern: ld.param.u64 dst_register_name, [Kernel_param_0+offset]; + std::string dst_register_name(match[1].first, match[1].second); + size_t param_index = SafeStr2Ull(match[2].str()); + size_t param_offset = match[3].matched ? SafeStr2Ull(match[3].str()) : 0; + + param_map.insert({std::move(dst_register_name), LdParamInfo_t(param_index, param_offset)}); + } else if (match[4].matched && match[5].matched) { + // Second pattern: ld.param.u64 dst_register_name, [Register+offset]; + std::string dst_register_name(match[4].first, match[4].second); + size_t param_index = 0; // For this form, need to find the corresponding mov.u64 & mov.b64 + size_t param_offset = match[6].matched ? SafeStr2Ull(match[6].str()) : 0; + std::string related_register(match[5].first, match[5].second); + + // Step 1: Find pre_register based on related_register + const boost::regex mov_pattern(R"(mov\.u64\s+)" + related_register + R"(,\s+(%\w+);)"); + boost::sregex_iterator mov_begin(kernel_body.begin(), kernel_body.end(), mov_pattern); + boost::sregex_iterator mov_end; + std::string pre_register; + for (auto mov_it = mov_begin; mov_it != mov_end; ++mov_it) { + const boost::smatch& mov_match = *mov_it; + pre_register = mov_match[1].str(); + break; // Assuming only one match needed + } + + // Step 2: Find param_index based on pre_register + if (!pre_register.empty()) { + // static const boost::regex b64_pattern(R"(mov\.b64\s+)" + pre_register + R"(,\s+\w+_param_(\d+);)"); + const boost::regex b64_pattern(R"(mov\.b64\s+)" + pre_register + R"(,\s+\w+_param_(\d+);)"); + boost::sregex_iterator b64_begin(kernel_body.begin(), kernel_body.end(), b64_pattern); + boost::sregex_iterator b64_end; + + for (auto b64_it = b64_begin; b64_it != b64_end; ++b64_it) { + const boost::smatch& b64_match = *b64_it; + param_index = SafeStr2Ull(b64_match[1].str()); + break; // Assuming only one match needed + } + } + + param_map.insert({std::move(dst_register_name), LdParamInfo_t(param_index, param_offset)}); + } + } + } + + // [not used] Extract source register from cvta.to.global.u64 line and match with ld.param.u64, to extract the location of devPtr + void MatchDevPtrsFromKernelPtx(const std::string& kernel_body, + robin_hood::unordered_flat_map& param_map, + std::vector>* dev_ptr_list) { + static const boost::regex cvta_pattern(R"(cvta\.to\.global\.u64\s+\%\w+,\s+(%\w+);)"); + boost::sregex_iterator cvta_begin(kernel_body.begin(), kernel_body.end(), cvta_pattern); + boost::sregex_iterator cvta_end; + + for (auto i = cvta_begin; i != cvta_end; ++i) { + const boost::smatch& match = *i; + boost::ssub_match source_register_match = match[1]; + + auto it = param_map.find(std::string(source_register_match.first, source_register_match.second)); + if (it != param_map.end()) { + if (it->second.isUsed) { + continue; // avoid extracting the same source register repeatedly + } + (*dev_ptr_list).emplace_back(it->second.index, it->second.offset); + it->second.isUsed = true; + } + } + + // for (const auto& entry : param_map) { + // (*dev_ptr_list).emplace_back(entry.second.index, entry.second.offset); + // } + + std::sort((*dev_ptr_list).begin(), (*dev_ptr_list).end()); + (*dev_ptr_list).erase(std::unique((*dev_ptr_list).begin(), (*dev_ptr_list).end()), (*dev_ptr_list).end()); // avoid loading the same device pointer repeatedly + } + + // // Extract param index and param offset from ld.param.u64 line + // static void ExtractDevptrFromKernelPtx(const std::string& kernel_body, + // std::vector>* dev_ptr_list) { + // // static const boost::regex ld_pattern(R"(ld\.param\.u64\s+(%\w+),\s+\[\w+_param_(\d+)(?:\+(\d+))?\];|ld\.param\.u64\s+(%\w+),\s+\[%\w+\+(\d+)\];)"); + // static const boost::regex ld_pattern(R"(ld\.param\.u64\s+(%\w+),\s+\[\w+_param_(\d+)(?:\+(\d+))?\];|mov\.b64\s+%\w+,\s+\w+_param_(\d+);)"); + // boost::sregex_iterator params_begin(kernel_body.begin(), kernel_body.end(), ld_pattern); + // boost::sregex_iterator params_end; + + // for (auto i = params_begin; i != params_end; ++i) { + // const boost::smatch& match = *i; + + // if (match[1].matched && match[2].matched) { + // // First pattern: ld.param.u64 dst_register_name, [Kernel_param_0+offset]; + // std::string dst_register_name(match[1].first, match[1].second); + // size_t param_index = SafeStr2Ull(match[2].str()); + // size_t param_offset = match[3].matched ? SafeStr2Ull(match[3].str()) : 0; + + // dev_ptr_list->emplace_back(param_index, param_offset); + // } else if (match[4].matched && match[5].matched) { + // // Second pattern: mov.b64 dst_register_name, Kernel_param_0; + // std::string dst_register_name(match[4].first, match[4].second); + // size_t param_index = SafeStr2Ull(match[5].str()); + // size_t param_offset = match[6].matched ? SafeStr2Ull(match[6].str()) : 0; + // dev_ptr_list->emplace_back(param_index, param_offset); + // } + // } + + // std::sort((*dev_ptr_list).begin(), (*dev_ptr_list).end()); + // (*dev_ptr_list).erase(std::unique((*dev_ptr_list).begin(), (*dev_ptr_list).end()), (*dev_ptr_list).end()); // avoid loading the same device pointer repeatedly + // } +}; + +#endif diff --git a/GPU-Virtual-Service/gpu-remoting/include/registerIOV.h b/GPU-Virtual-Service/gpu-remoting/include/registerIOV.h new file mode 100644 index 0000000..ab44e52 --- /dev/null +++ b/GPU-Virtual-Service/gpu-remoting/include/registerIOV.h @@ -0,0 +1,236 @@ +#ifndef REGISTER_IOV_H +#define REGISTER_IOV_H + +#include "constVar.h" +#include "configure.h" +#include "ucpUtil.h" + +typedef struct { + size_t threadID; + size_t reqNum; + size_t iovNum; +} RegisterHeader_t; + +class RegisterIOV { + private: + const char* myName_ = "RegisterIOV"; + size_t headers_[REG_PARAM_MAX_NUM]; + ucp_dt_iov_t iovs_[REG_PARAM_MAX_NUM]; + int iovIdx_; + int requestType_; + int popIdx_; + + bool recvFlag_; + RegisterHeader_t ucpHeader_; + + public: + RegisterIOV() : iovIdx_(0), popIdx_(0), recvFlag_(false) { + PushRequestType(__CUDA_REGISTER); + ucpHeader_.reqNum = 0; + ucpHeader_.threadID = -1; + ucpHeader_.iovNum = 0; + iovs_[iovIdx_].length = 0; + iovs_[iovIdx_].buffer = headers_; + } + + RegisterIOV(int threadID) : RegisterIOV() { + ucpHeader_.threadID = threadID; + } + + RegisterIOV(const void *header, size_t header_length, void *data) : RegisterIOV() { + recvFlag_ = true; + ucpHeader_.threadID = ((size_t*)header)[0]; + ucpHeader_.reqNum = ((size_t*)header)[1]; + ucpHeader_.iovNum = ((size_t*)header)[2]; + if (unlikely(ucpHeader_.iovNum >= REG_PARAM_MAX_NUM)) { + tool::Logging(LOG_ERROR, myName_, "push failed: already full, iovNum=%zu\n", ucpHeader_.iovNum); + exit(EXIT_FAILURE); + } + + size_t offset = sizeof(size_t) * (ucpHeader_.iovNum + 1); // headers contain iovNum+1 elements (data starts from iovIdx_ = 1) + for (int i = 1; i <= ucpHeader_.iovNum; i++) { + iovIdx_++; // start from iovIdx_ = 1 + headers_[iovIdx_] = *(size_t*)UCS_PTR_BYTE_OFFSET(data, sizeof(size_t) * iovIdx_); + + iovs_[iovIdx_].length = headers_[iovIdx_]; + iovs_[iovIdx_].buffer = UCS_PTR_BYTE_OFFSET(data, offset); + offset += iovs_[iovIdx_].length; + } + tool::Logging(LOG_REGS, myName_, "recvFlag=%d, threadID=%d, reqNum=%d, iovNum=%d, iovIdx=%d\n", recvFlag_, ucpHeader_.threadID, ucpHeader_.reqNum, ucpHeader_.iovNum, iovIdx_); + } + + ~RegisterIOV() { + if (!recvFlag_) { + for (int i = 1; i <= ucpHeader_.iovNum; i++) { // not include the first iov + free(iovs_[i].buffer); + } + } + iovIdx_ = 0; + } + + void Reset() { + iovIdx_ = 0; + } + + int GetNum() { + return iovIdx_ + 1; + } + + ucp_dt_iov_t* GetIOVs() { + return iovs_; + } + + size_t* GetUcpHeaders() { + return (size_t*)&ucpHeader_; + } + + size_t GetUcpHeaderSize() { + return sizeof(ucpHeader_); + } + + void PushRequestType(int reqType){ + requestType_ = reqType; + } + + int GetRequestType() { + return requestType_; + } + + int GetThreadID() { + return ucpHeader_.threadID; + } + + int GetRequestNum() { + return ucpHeader_.reqNum; + } + + void PushThreadID (int threadID) { + ucpHeader_.threadID = threadID; + ucpHeader_.iovNum = iovIdx_; + iovs_[0].length = sizeof(size_t) * (ucpHeader_.iovNum + 1); + tool::Logging(LOG_DEBUG, myName_, "recvFlag=%d, threadID=%d, reqNum=%d, iovNum=%d, iovIdx=%d\n", recvFlag_, ucpHeader_.threadID, ucpHeader_.reqNum, ucpHeader_.iovNum, iovIdx_); + } + + void PushSubRequestType(int reqType){ + Push(reqType); + ucpHeader_.reqNum++; + } + + inline void CheckFull() { + if (unlikely(iovIdx_ + 1 >= REG_PARAM_MAX_NUM)) { + tool::Logging(LOG_ERROR, myName_, "push failed: already full, iovIdx=%zu\n", iovIdx_); + exit(EXIT_FAILURE); + } + } + + inline void CheckPopEnd() { + if (unlikely(popIdx_ + 1 > iovIdx_)) { + tool::Logging(LOG_ERROR, myName_, "pop failed: already pop to the end\n"); + exit(EXIT_FAILURE); + } + } + + template + void Push(const T& item) { + CheckFull(); + iovIdx_++; + headers_[iovIdx_] = iovs_[iovIdx_].length = sizeof(T); + iovs_[iovIdx_].buffer = malloc(iovs_[iovIdx_].length); + memcpy(iovs_[iovIdx_].buffer, &item, iovs_[iovIdx_].length); + } + + template + void Push64BitPointer(T& item) { + Push((uint64_t)item); + } + + template + void Push(T* item, size_t num = 1) { + CheckFull(); + iovIdx_++; + if (item == NULL || num == 0) { + headers_[iovIdx_] = iovs_[iovIdx_].length = 0; + } else { + headers_[iovIdx_] = iovs_[iovIdx_].length = sizeof(T) * num; + iovs_[iovIdx_].buffer = malloc(iovs_[iovIdx_].length); + memcpy(iovs_[iovIdx_].buffer, item, iovs_[iovIdx_].length); + } + } + + template + void PushConst(const T* item, size_t num = 1) { + CheckFull(); + iovIdx_++; + if (item == NULL || num == 0) { + headers_[iovIdx_] = iovs_[iovIdx_].length = 0; + } else { + headers_[iovIdx_] = iovs_[iovIdx_].length = sizeof(T) * num; + iovs_[iovIdx_].buffer = malloc(iovs_[iovIdx_].length); + memcpy(iovs_[iovIdx_].buffer, item, iovs_[iovIdx_].length); + } + } + + void PushCString(const char *s) { + PushConst(s, strlen(s) + 1); //! including the null-terminator + } + + template + T Pop() { + CheckPopEnd(); + popIdx_++; + return *(T*)iovs_[popIdx_].buffer; + } + + template + T* AssignAddr() { + CheckPopEnd(); + popIdx_++; + if (iovs_[popIdx_].length == 0) { + return NULL; + } + else { + return (T*)iovs_[popIdx_].buffer; + } + } + + template + T* AssignAddrForAll() { + return AssignAddr(); + } + + char* AssignCString(){ + CheckPopEnd(); + popIdx_++; + if (iovs_[popIdx_].length == 1) { // including the null-terminator + return NULL; + } + else { + return (char*)iovs_[popIdx_].buffer; + } + } + + void Print() { + tool::Logging(myName_, "%s has %zu elements, popIdx=%d\n", myName_, iovIdx_ + 1, popIdx_); + for (size_t i = 0; i <= iovIdx_; i++) { + + if (iovs_[i].length == 4){ + tool::Logging(myName_, "\t[%zu]: len=%zu, buffer=%d\n", i, iovs_[i].length, *(int*)iovs_[i].buffer); + } + else if (iovs_[i].length == 8){ + tool::Logging(myName_, "\t[%zu]: len=%zu, buffer=%p\n", i, iovs_[i].length, *(void**)iovs_[i].buffer); + } + else { + tool::Logging(myName_, "\t[%zu]: len=%zu, buffer=%p\n", i, iovs_[i].length, iovs_[i].buffer); + } + + if (i == 0) { + for (size_t j = 0; j <= iovIdx_; j++) { + tool::Logging(myName_, "\t\t[%zu]: header=%zu, iov[0].buffer=%zu\n", j, headers_[j], ((size_t*)(iovs_[0].buffer))[j]); + } + } + + } + } +}; + +#endif \ No newline at end of file diff --git a/GPU-Virtual-Service/gpu-remoting/include/requestBuffer.h b/GPU-Virtual-Service/gpu-remoting/include/requestBuffer.h new file mode 100644 index 0000000..275555e --- /dev/null +++ b/GPU-Virtual-Service/gpu-remoting/include/requestBuffer.h @@ -0,0 +1,238 @@ +#ifndef REQUEST_BUFFER_H +#define REQUEST_BUFFER_H + +#include "constVar.h" +#include "configure.h" + +class RequestBuffer { + private: + const char* myName_ = "RequestBuffer"; + uint32_t popOffset_; // point to the first empty space of the buffer + uint32_t backOffset_; // [popOffset_, backOffset_) is the elements for the new request + + public: + uint8_t* _dataBuffer; + uint32_t _allocatedSize; + + RequestBuffer(uint32_t size, uint8_t* dataBuffer) : _allocatedSize(size), _dataBuffer(dataBuffer) { + popOffset_ = 0; + backOffset_ = size; + } + + void Reset(uint32_t size, uint8_t* dataBuffer) { + popOffset_ = 0; + backOffset_ = sizeof(int) + size; + _allocatedSize = sizeof(int) + size; + _dataBuffer = dataBuffer; //!: after reset, the dataBuffer should be freed by the caller + } + + void SetBackOffset(uint32_t offset) { + backOffset_ = offset; + } + + RequestBuffer(uint32_t size = REQUEST_BUFFER_SIZE) { + popOffset_ = 0; + backOffset_ = 0; + _allocatedSize = sizeof(int) + size; + _dataBuffer = (uint8_t*) malloc(_allocatedSize); + } + + RequestBuffer(const RequestBuffer* otherBuf){ + popOffset_ = 0; + backOffset_ = 0; + _allocatedSize = otherBuf->_allocatedSize; + _dataBuffer = (uint8_t*) malloc(_allocatedSize); + memcpy(_dataBuffer, otherBuf->_dataBuffer, _allocatedSize); + } + + ~RequestBuffer() { + if (_dataBuffer != NULL){ + //todo: free(_dataBuffer); + } + } + + // return the allocated size of the buffer + uint32_t GetSize() { + return _allocatedSize; + } + + void CheckFull(size_t size) { + if ((backOffset_ + size) > _allocatedSize) { + tool::Logging(myName_, "Already full: offset %zu, new element size %zu, total %zu\n", backOffset_, size, _allocatedSize); + exit(EXIT_FAILURE); + } + } + + void CheckPopEnd(size_t size) { + if ((popOffset_ + size) > backOffset_) { + tool::Logging(myName_, "Already pop to the end: offset %zu, pop element size %zu, total %zu\n", popOffset_, size, backOffset_); + exit(EXIT_FAILURE); + } + } + + size_t getRemainingSize() { + return backOffset_ - popOffset_; + } + + template + void Push(T item) { + CheckFull(sizeof(T)); + memcpy(_dataBuffer + backOffset_, (uint8_t*)&item, sizeof(T)); + backOffset_ += sizeof(T); + } + + template + void PushConst(const T item) { + CheckFull(sizeof(T)); + memcpy(_dataBuffer + backOffset_, (uint8_t*)&item, sizeof(T)); + backOffset_ += sizeof(T); + } + + template + void Push64BitPointer(T item) { + Push((uint64_t)item); //!: unified address is 64-bit + } + + void PushHostAddr(const void* ptr){ + char* addrChar = new char[HOST_POINTER_SIZE]; +#ifdef _WIN32 + sprintf_s(addrChar, 10, "%p", ptr); +#else + sprintf(addrChar, "%p", ptr); +#endif + PushCString(addrChar); + delete[] addrChar; + } + + void PushRequestType(int reqType){ + Push(reqType); + } + + // ! notice that the num is the number of elements + template + void Push(T* item, size_t num = 1) { + if (item == NULL) { + Push((size_t)0); + return; + } + size_t totalSize = sizeof(T) * num; + Push(totalSize); + memcpy(_dataBuffer + backOffset_, (uint8_t*)item, totalSize); + backOffset_ += totalSize; + } + + template + void PushConst(const T* item, size_t num = 1) { + if (item == NULL) { + Push((size_t)0); + return; + } + size_t totalSize = sizeof(T) * num; + Push(totalSize); //! notice that the size is storage space size, not the number of elements + memcpy(_dataBuffer + backOffset_, (uint8_t*)item, totalSize); + backOffset_ += totalSize; + } + + // push the len of char* , and then push all chars + void PushCString(const char *s){ + size_t len = strlen(s) + 1; //! including the null-terminator + PushConst(s, len); + } + + template + T Pop(){ + CheckPopEnd(sizeof(T)); + T res = *((T*)(_dataBuffer + popOffset_)); + popOffset_ += sizeof(T); + return res; + } + + // copy the array, and return the new begin address + template + T* Pop(size_t n) { + if(Pop() == 0) { // first, pop the size of the array + return NULL; + } + CheckPopEnd(sizeof(T) * n); + T* res = new T[n]; + memcpy((uint8_t*)res, _dataBuffer + popOffset_, sizeof(T) * n); + popOffset_ += sizeof(T) * n; + return res; + } + + // copy the array to the given address + template + void Pop(T* res, size_t n) { + if(Pop() == 0) { // first, pop the size of the array + return; + } + CheckPopEnd(sizeof(T) * n); + memcpy((uint8_t*)res, _dataBuffer + popOffset_, sizeof(T) * n); + popOffset_ += sizeof(T) * n; + } + + template + T* PopAll() { + size_t size = Pop(); // first, pop the size of the array + if(size == 0) { + return NULL; + } + CheckPopEnd(size); + T* res = (T*)malloc(size); + memcpy((uint8_t*)res, _dataBuffer + popOffset_, size); + popOffset_ += size; + return res; + } + + // pop the pointer with 64 bit address + template + T PopFromAddr() { + return (T)Pop(); + } + + // just assign the begin address, not copy the data + template + T* AssignAddr(size_t n = 1) { + if(Pop() == 0) { // first, pop the size of the array + return NULL; + } + CheckPopEnd(sizeof(T) * n); + T* res = (T*)(_dataBuffer + popOffset_); + popOffset_ += sizeof(T) * n; + return res; + } + + template + T* AssignAddrForAll() { + size_t size = Pop(); // first, pop the size of the array + if(size == 0) { + return NULL; + } + CheckPopEnd(size); + T* res = (T*)(_dataBuffer + popOffset_); + popOffset_ += size; + return res; + } + + template + T* AssignAddrForLast() { + T* res = (T*)(_dataBuffer + popOffset_); + popOffset_ = backOffset_; + return res; + } + + char* AssignCString(){ + return AssignAddrForAll(); + } + + void Print() { + tool::Logging(LOG_DEBUG, myName_, "RequestBuffer: popOffset_ = %u, backOffset_ = %u, _allocatedSize = %u\n", popOffset_, backOffset_, _allocatedSize); + size_t i = 0; + for (i = 0; i < backOffset_; i++) { + printf("%02x ", _dataBuffer[i]); + } + printf("\n"); + } +}; + +#endif \ No newline at end of file diff --git a/GPU-Virtual-Service/gpu-remoting/include/requestIOV.h b/GPU-Virtual-Service/gpu-remoting/include/requestIOV.h new file mode 100644 index 0000000..8d3afd6 --- /dev/null +++ b/GPU-Virtual-Service/gpu-remoting/include/requestIOV.h @@ -0,0 +1,307 @@ +#ifndef REQUEST_IOV_H +#define REQUEST_IOV_H + +#include "constVar.h" +#include "configure.h" +#include "ucpUtil.h" + +class RequestIOV : public boost::intrusive::list_base_hook<> { + private: + const char* myName_ = "RequestIOV"; + size_t headers_[PARAM_MAX_NUM]; + ucp_dt_iov_t iovs_[PARAM_MAX_NUM]; + int iovIdx_; + int requestType_; + int threadID_; + int popIdx_; + size_t paramTotalSize_; + + void* dataBuffer_ = NULL; + + public: + RequestIOV() : iovIdx_(-1), popIdx_(-1), threadID_(-1), paramTotalSize_(0) {} + + RequestIOV(int threadID) : RequestIOV() { + threadID_ = threadID; + } + + RequestIOV(const void *header, size_t header_length, void *data) : RequestIOV() { + int iovNum = (header_length - sizeof(size_t)) / sizeof(size_t); // not including the threadID + size_t offset = 0; + for (int i = 0; i < iovNum; i++) { + CheckFull(); + iovIdx_++; + iovs_[iovIdx_].length = headers_[iovIdx_] = ((size_t*)header)[i]; + iovs_[iovIdx_].buffer = UCS_PTR_BYTE_OFFSET(data, offset); + offset += iovs_[iovIdx_].length; + } + threadID_ = ((size_t*)header)[iovNum]; + } + + ~RequestIOV() { + iovIdx_ = -1; + if (dataBuffer_ != NULL) { // means that this object is cloned + void* realData = ((ucp_dt_iov_t*)dataBuffer_)->buffer; + // printf("Release RequestIOV for requestType=%d, realData=%p(size=%zu)\n", requestType_, realData, ((ucp_dt_iov_t*)dataBuffer_)->length); + if (requestType_ == CUDA_MEMCPY_ASYNC_H2D) { + // printf("Release RequestIOV for requestType=CUDA_MEMCPY_ASYNC_H2D, realData=%p(size=%zu), iovs[0]=%p(buffer=%p)\n", realData, ((ucp_dt_iov_t*)dataBuffer_)->length, iovs_, iovs_[0].buffer); + } + if (realData != NULL) { + free(realData); + // realData = NULL; + } + // dataBuffer_ = NULL; + } + } + + void Reset() { + iovIdx_ = -1; + } + + int GetNum() { + return iovIdx_ + 1; + } + + ucp_dt_iov_t* GetIOVs() { + return iovs_; + } + + size_t* GetHeaders() { + return headers_; + } + + size_t GetHeaderSize() { + return sizeof(size_t) * (iovIdx_ + 1 + (threadID_ == -1 ? 0 : 1)); + } + + size_t GetParamTotalSize() { + return paramTotalSize_; + } + + int GetRequestType() { + return requestType_; + } + + int GetThreadID() { + return threadID_; + } + + inline void CheckFull() { + if (unlikely(iovIdx_ + 1 >= PARAM_MAX_NUM)) { + tool::Logging(LOG_ERROR, myName_, "push failed: already full, iovIdx=%zu\n", iovIdx_); + exit(EXIT_FAILURE); + } + } + + inline void CheckPopEnd() { + if (unlikely(popIdx_ + 1 > iovIdx_)) { + tool::Logging(LOG_ERROR, myName_, "pop failed: already pop to the end\n"); + exit(EXIT_FAILURE); + } + } + + void PushThreadID(int threadID) { + CheckFull(); // check the headers is full or not + headers_[iovIdx_ + 1] = threadID_ = threadID; // threadID is the last element of the headers + } + + template + void Push(T& item) { + CheckFull(); + iovIdx_++; + headers_[iovIdx_] = iovs_[iovIdx_].length = sizeof(T); + iovs_[iovIdx_].buffer = &item; + paramTotalSize_ += iovs_[iovIdx_].length; + } + + template + void PushConst(const T& item) { + CheckFull(); + iovIdx_++; + headers_[iovIdx_] = iovs_[iovIdx_].length = sizeof(T); + iovs_[iovIdx_].buffer = (void*)&item; + paramTotalSize_ += iovs_[iovIdx_].length; + } + + template + void Push64BitPointer(T& item) { + CheckFull(); + iovIdx_++; + headers_[iovIdx_] = iovs_[iovIdx_].length = sizeof(uint64_t); + iovs_[iovIdx_].buffer = &item; + paramTotalSize_ += iovs_[iovIdx_].length; + } + + template + void Push64BitPointer(const T& item) { + CheckFull(); + iovIdx_++; + headers_[iovIdx_] = iovs_[iovIdx_].length = sizeof(uint64_t); + iovs_[iovIdx_].buffer = (void*)&item; + paramTotalSize_ += iovs_[iovIdx_].length; + } + + void PushVar(void* ptr, size_t size) { + CheckFull(); + iovIdx_++; + headers_[iovIdx_] = iovs_[iovIdx_].length = size; + iovs_[iovIdx_].buffer = ptr; + paramTotalSize_ += iovs_[iovIdx_].length; + } + + void PushRequestType(int reqType){ + requestType_ = reqType; + } + + template + void Push(T* item, size_t num = 1) { + CheckFull(); + iovIdx_++; + if (item == NULL || num == 0) { + headers_[iovIdx_] = iovs_[iovIdx_].length = 0; + } else { + headers_[iovIdx_] = iovs_[iovIdx_].length = sizeof(T) * num; + iovs_[iovIdx_].buffer = item; + } + paramTotalSize_ += iovs_[iovIdx_].length; + } + + template + void PushConst(const T* item, size_t num = 1) { + CheckFull(); + iovIdx_++; + if (item == NULL || num == 0) { + headers_[iovIdx_] = iovs_[iovIdx_].length = 0; + } else { + headers_[iovIdx_] = iovs_[iovIdx_].length = sizeof(T) * num; + iovs_[iovIdx_].buffer = (void*)item; + } + paramTotalSize_ += iovs_[iovIdx_].length; + } + + void PushCString(const char *s) { + CheckFull(); + iovIdx_++; + headers_[iovIdx_] = iovs_[iovIdx_].length = strlen(s) + 1; //! including the null-terminator + iovs_[iovIdx_].buffer = (void*)s; + paramTotalSize_ += iovs_[iovIdx_].length; + } + + template + T Pop() { + CheckPopEnd(); + popIdx_++; + return *(T*)iovs_[popIdx_].buffer; + } + + template + T* AssignAddr() { + CheckPopEnd(); + popIdx_++; + if (iovs_[popIdx_].length == 0) { + return NULL; + } + else { + return (T*)iovs_[popIdx_].buffer; + } + } + + template + T* AssignAddrForAll() { + return AssignAddr(); + } + + char* AssignCString(){ + CheckPopEnd(); + popIdx_++; + if (iovs_[popIdx_].length == 1) { // including the null-terminator + return NULL; + } + else { + return (char*)iovs_[popIdx_].buffer; + } + } + + void Print() { + tool::Logging(LOG_INFO, myName_, "RequestIOV has %zu elements, popIdx=%d\n", iovIdx_ + 1, popIdx_); + for (size_t i = 0; i <= iovIdx_; i++) { + tool::Logging(LOG_INFO, myName_, "\t[%zu]: len=%zu, buffer=%p\n", i, iovs_[i].length, *(void**)iovs_[i].buffer); + } + } + + RequestIOV* Clone(uint8_t* H2Dheaders = NULL, size_t headerSize = 0) { + RequestIOV* newReq = new RequestIOV(); + newReq->iovIdx_ = iovIdx_; + newReq->requestType_ = requestType_; + newReq->threadID_ = threadID_; + newReq->popIdx_ = popIdx_; + newReq->paramTotalSize_ = paramTotalSize_; + newReq->dataBuffer_ = newReq->iovs_; + + if (H2Dheaders) { + memcpy((uint8_t*)newReq->headers_, H2Dheaders, headerSize); + // newReq->iovs_[0].length = iovs_[0].length; + // newReq->iovs_[0].buffer = iovs_[0].buffer; + // } + // else if (requestType_ == CUDA_MEMCPY_ASYNC_H2D) { + // memcpy(newReq->headers_, headers_, 4+sizeof(size_t)+sizeof(uint64_t)+sizeof(uint8_t)+sizeof(uint64_t)); + void* dataBuffer = malloc(paramTotalSize_); + size_t offset = 0; + for (size_t i = 0; i <= iovIdx_; i++) { + newReq->iovs_[i].length = iovs_[i].length; + // newReq->iovs_[i].buffer = iovs_[i].buffer; + newReq->iovs_[i].buffer = UCS_PTR_BYTE_OFFSET(dataBuffer, offset); + memcpy(newReq->iovs_[i].buffer, iovs_[i].buffer, newReq->iovs_[i].length); + offset += newReq->iovs_[i].length; + } + } + else { + void* dataBuffer = malloc(paramTotalSize_); + size_t offset = 0; + for (size_t i = 0; i <= iovIdx_; i++) { + newReq->iovs_[i].length = newReq->headers_[i] = headers_[i]; + newReq->iovs_[i].buffer = UCS_PTR_BYTE_OFFSET(dataBuffer, offset); + memcpy(newReq->iovs_[i].buffer, iovs_[i].buffer, newReq->iovs_[i].length); + offset += newReq->iovs_[i].length; + } + newReq->headers_[iovIdx_ + 1] = threadID_; + } + + return newReq; + } + + bool ContainsHandle(uint64_t* handleList, size_t handleCnt) { + for (size_t i = 0; i <= iovIdx_; i++) { + if (iovs_[i].length != sizeof(uint64_t) || *(void**)iovs_[i].buffer == NULL) { + continue; + } + for (size_t j = 0; j < handleCnt; j++) { // todo: may be optimized + if (*(uint64_t*)iovs_[i].buffer == handleList[j]) { + return true; + } + } + } + return false; + } + + uint64_t GetHandleByIndex(size_t idx) { + if (idx > iovIdx_) { + return 0; + } + if (iovs_[idx].length != sizeof(uint64_t) || *(void**)iovs_[idx].buffer == NULL) { + return 0; + } + return *(uint64_t*)iovs_[idx].buffer; + } + + bool SetElement(size_t idx, void* buffer, size_t length) { + if (idx > iovIdx_) { + return false; + } + iovs_[idx].buffer = buffer; + headers_[idx] = iovs_[idx].length = length; + return true; + } +}; + +#endif \ No newline at end of file diff --git a/GPU-Virtual-Service/gpu-remoting/include/scheduler.h b/GPU-Virtual-Service/gpu-remoting/include/scheduler.h new file mode 100644 index 0000000..3759829 --- /dev/null +++ b/GPU-Virtual-Service/gpu-remoting/include/scheduler.h @@ -0,0 +1,544 @@ +#ifndef SCHEDULER_H +#define SCHEDULER_H + +#include "configure.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "constVar.h" +#include "define.h" +#include +#include +#define GPU_MAX_NUM 8 +#define THRESH_ITER 11 +// #define TIME_QUANTUM 600 +#define TIME_QUANTUM 360 +extern Configure config_; +#define GPUNum 2 + +using namespace std; + + +typedef struct JobInfo { + size_t used_mem; // memory + uint sm_util; //SM utilization + double served_time;//按秒算 + double remain_time; + double perIt_time; + int cur_It;// current iterations + int num_It;// the number of iterations + int device; + uint64_t client_id; + + bool operator<(const JobInfo& other) const { + return remain_time < other.remain_time; + } +} JobInfo_; + +class Scheduler { + private: + const char* myName_ = "Scheduler"; + static size_t free_mem[GPU_MAX_NUM]; + static size_t total_mem[GPU_MAX_NUM]; + static set jobs; + static queue unknown_queue[GPU_MAX_NUM]; // 未知任务信息的队列 + static vector known_queue[GPU_MAX_NUM]; // 已知任务信息队列 + static unordered_map known_map; // token=1表示当前已知信息的任务在运行 + // static int flag;//判断是否有已知信息的任务在运行,0表示没有 + static int flag[GPUNum];// + // static int queues;//0表示轮到unknown_queue,1表示轮到known_queue + static int queues[GPUNum];//不同GPU下的队列情况,0表示轮到unknown_queue,1表示轮到known_queue + // static chrono::steady_clock::time_point timer;//计时器,判断处理哪个队列 + static chrono::steady_clock::time_point timers[GPUNum];//计时器,判断处理哪个队列 + static set client_ids;//防止重放入 + + int dev_cnt; + JobInfo_* job; + pid_t pid; + size_t* shm_util = 0; + chrono::steady_clock::time_point start; + int cur_pid; + // int token = 0;//1表示当前已知信息的任务在运行 + + int _sock = -1; // 用于连接monitor的静态socket + + //建立与monitor连接,并确保连接仅建立一次 + void initialize_connection() { + + if (_sock == -1) { + int tmptid = static_cast(::syscall(SYS_gettid)); + std::cout << "clientId: " << job->client_id <<" tid:" << tmptid << " get in initalize_connection" << std::endl; + + struct sockaddr_in serv_addr; + + if ((_sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + tool::Logging(LOG_ERROR, "Scheduler", "Socket creation error.\n"); + return; + } + memset(&serv_addr, 0, sizeof(serv_addr)); + serv_addr.sin_family = AF_INET; + serv_addr.sin_port = htons(config_.GetMonPort()); + + + if (inet_pton(AF_INET, config_.GetMonIp().c_str(), &serv_addr.sin_addr) <= 0) { + tool::Logging(LOG_ERROR, "Scheduler", "Invalid address/ Address not supported.\n"); + return; + } + + if (connect(_sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) { + tool::Logging(LOG_ERROR, "Scheduler", "Connection Failed.\n"); + return; + } + + tool::Logging(LOG_INFO, "Scheduler", "Connected to Monitor.\n"); + + } + } + + void release_connection() { + // std::lock_guard lock(conn_mutex); + std::cout << "_sock:" << _sock << std::endl; + int tmptid = static_cast(::syscall(SYS_gettid)); + if (_sock != -1) { + std::cout << "clientId: " << job->client_id <<" tid:" << tmptid << " get in release_connection" << std::endl; + close(_sock); + tool::Logging(LOG_INFO, "Scheduler", "Disconnected from Monitor.\n"); + } + } + + void delayed_send(const std::string& message) { + std::this_thread::sleep_for(std::chrono::seconds(3)); // 延迟发送,dummy? 发现初始化时间大概十几秒往上,甚至几分钟都有 + send(_sock, message.c_str(), message.size(), 0); + } + + + static std::mutex mtx_used[GPU_MAX_NUM]; + static std::mutex mtx_sche[GPU_MAX_NUM]; + static std::mutex mtx_exc[GPU_MAX_NUM]; + + public: + Scheduler(uint64_t clientId) { + int tmptid = static_cast(::syscall(SYS_gettid)); + std::cout << "Scheduler::Scheduler() threadId = " << tmptid << "clientID: "<< clientId << std::endl; + cudaGetDeviceCount(&dev_cnt); + size_t free_byte, total_byte; + + for(int i = 0; i < dev_cnt; i++) { + cudaSetDevice(i); + cudaMemGetInfo(&free_byte, &total_byte); + free_mem[i] = free_byte; + total_mem[i] = total_byte; + } + std::set::iterator iter; + for(iter = jobs.begin(); iter != jobs.end(); ++iter) { + if((*iter)->client_id == clientId) { + job = *iter; + tool::Logging(LOG_INFO,myName_,"get cid = %d\n",job->client_id); + break; + } + } + if(iter == jobs.end() || jobs.empty()) { + job = new JobInfo_(); + job->client_id = clientId; + } + // job->client_id = clientId; + cur_pid = getpid(); + +#ifdef GV_Monitor + //建立与Monitor的tcp连接 + initialize_connection(); +#endif + + } + + ~Scheduler() { + // tool::Logging(LOG_INFO,myName_,"client %lu delete\n",job->client_id); + int tmptid = static_cast(::syscall(SYS_gettid)); + std::cout << "Scheduler::~Scheduler() threadId = " << tmptid << "clientID: "<< job->client_id << std::endl; + + delete job; + // release_connection(); +#ifdef GV_Monitor + close(_sock); +#endif + } + + void send_message(const std::string& message, int curDev_) { + // std::lock_guard lock(conn_mutex); + int tmptid = static_cast(::syscall(SYS_gettid)); + std::cout << "sock:" << _sock << " tid: " << tmptid << std::endl; + string msg = ""; + if (_sock != -1) { + if(message == "client_runjob:"){ + msg = message + to_string(curDev_); + // std::thread(&Scheduler::delayed_send, this, msg).detach(); + send(_sock, msg.c_str(), msg.size(), 0); + }else if(message == "client_stop:"){ + msg = message + to_string(curDev_); + send(_sock, msg.c_str(), msg.size(), 0); + } + // send(_sock, message.c_str(), message.size(), 0); + tool::Logging(LOG_INFO, "Scheduler", "Message sent: %s\n", msg.c_str()); + } else { + tool::Logging(LOG_ERROR, "Scheduler", "Socket is not connected.\n"); + } + } + + bool ready_to_del(){ + char buffer[256]; + size_t n = read(_sock, buffer, sizeof(buffer) - 1); + if(n > 0) { + buffer[n] = '\0'; + tool::Logging(LOG_INFO, "Scheduler", "Received message: %s\n", buffer); + return true; + } + tool::Logging(LOG_ERROR, "Scheduler", "Failed to receive message.\n"); + return false; + } + + + int get_free_gpu() { + int max_index = -1; + int max_free = 0; + + for(int i = 0; i < dev_cnt; i++) { + cudaSetDevice(i); + size_t free_byte, total_byte; + cudaMemGetInfo(&free_byte, &total_byte); + if(max_free < free_byte) { + max_free = free_byte; + max_index = i; + } + } + return max_index; + } + + void enqueue(uint64_t client_id, int device) { + //处理任务入队,可能是新任务,可能是已知信息任务。 + { + std::lock_guard lock(mtx_used[device]); + if(timers[device] == std::chrono::steady_clock::time_point()){ + tool::Logging(LOG_INFO, myName_, "device:%d , first time std::chrono::steady_clock::time_point\n", device); + timers[device] = chrono::steady_clock::now(); + } + else { + chrono::steady_clock::time_point now_time = chrono::steady_clock::now(); + int duration = (chrono::duration_cast(now_time - timers[device]).count() * 1.0 / 1000); + // if(duration % 100 == 0){ + // tool::Logging(LOG_INFO, myName_, "duration = %d\n",duration); + // } + if(duration >= TIME_QUANTUM) { + queues[device] = (queues[device] == 0) ? 1 : 0; + tool::Logging(LOG_INFO, myName_, "device:%d, change to handle queue %d\n",device,queues[device]); + // SendNewInfoToDispatcher(config_); + timers[device] = chrono::steady_clock::now(); + } + } + } + + if(jobs.count(job) == 0 && client_ids.count(job->client_id) == 0) { //新来的任务 + { + std::lock_guard lock(mtx_used[device]); + if(queues[device] == 1 && known_queue[device].empty()) { + queues[device] = 0; + tool::Logging(LOG_INFO, myName_, "device:%d,there is no job in queue1, change to handle queue %d\n",device, queues[device]); + } + } + + jobs.insert(job); + client_ids.insert(job->client_id); + unknown_queue[device].push(job); + tool::Logging(LOG_INFO,myName_,"device:%d, client %lu insert unknown_queue, device:%d, flag = %d, cid = %lu, unknown_queue_size:%d\n",device, client_id, device ,flag[device], unknown_queue[device].front()->client_id,unknown_queue[device].size()); + int threadId = static_cast(::syscall(SYS_gettid)); + tool::Logging(LOG_INFO,myName_,"client %lu threadId = %d\n",client_id,threadId); + + while (unknown_queue[device].front() != job || flag[device] != 0 || queues[device] != 0) { + tool::Logging(LOG_INFO,myName_,"get int unknown_queue[%d].front() != job || flag != 0 || queues != 0,front() == %lu, flag == %d, queues = %d,\n",device, unknown_queue[device].front()->client_id, flag[device], queues[device]); + sleep(1); + } + //独占GPU + if(unknown_queue[device].front() == job) { + // int device = get_free_gpu(); + tool::Logging(LOG_INFO,myName_,"client %lu starts on gpu %d\n",job->client_id, device); + // cudaSetDevice(device); + job->device = device; + int shmid = shmget(1234, sizeof(int), IPC_CREAT | 0666); + if (shmid < 0) { + perror("shmget"); + exit(EXIT_FAILURE); + } + + shm_util = (size_t *)shmat(shmid, NULL, 0); + if ((intptr_t)shm_util == -1) { + perror("shmat"); + exit(EXIT_FAILURE); + } + start = chrono::steady_clock::now(); + pid = fork(); + unknown_dequeue(pid); + } + } + else {//已知任务信息的任务 + if (job->cur_It < THRESH_ITER + 1) { + // std::cout << "clientID:" << client_id << " get in if " << device <cur_It == (THRESH_ITER + 1) && unknown_queue[device].front() == job) { + // if(pid > 0) { + tool::Logging(LOG_INFO,myName_,"device:%d, finishing profiling client %lu\n", device, job->client_id); + unknown_dequeue(pid); + unknown_queue[device].pop(); + // } + chrono::steady_clock::time_point now_time = chrono::steady_clock::now(); + job->served_time = (chrono::duration_cast(now_time - start).count() * 1.0 / 1000); + job->perIt_time = job->served_time / (THRESH_ITER - 1); + job->remain_time = (job->num_It - job->cur_It + 1) * job->perIt_time; + tool::Logging(LOG_INFO,myName_,"client %lu job Info: num_It=%d, cur_It=%d, sm_util=%d, mem=%zu, served_time = %lf,perIt_time = %lf,remain_time=%lf\n", + job->client_id, job->num_It, job->cur_It, job->sm_util, job->used_mem, job->served_time, job->perIt_time, job->remain_time); + known_queue[device].emplace_back(job); + tool::Logging(LOG_INFO, myName_, "device:%d, client %lu insert known_queue. unknown_queue size = %d\n",device, job->client_id, unknown_queue[device].size()); + known_dequeue(device); + start = chrono::steady_clock::now(); + tool::Logging(LOG_INFO, myName_, "client %lu restarts.\n", job->client_id); + // } + } + else {//选择剩余时间最小的任务 + known_dequeue(device); + } + } + } + + void unknown_dequeue(pid_t pid) { + if(pid == 0) {//child process + // int device; + // cudaGetDevice(&device); + nvmlReturn_t result; + result = nvmlInit(); + if (result != NVML_SUCCESS) { + tool::Logging(LOG_ERROR,myName_,"client %lu ERROR: Failed to initialize NVML %d\n",job->client_id, result); + return; + } + int index = 0; + cudaGetDevice(&index); + nvmlDevice_t dev; + result = nvmlDeviceGetHandleByIndex_v2(index, &dev); + if(result != NVML_SUCCESS) { + tool::Logging(LOG_ERROR,myName_,"client %lu ERROR: nvmlDeviceGetHandleByIndex %d\n",job->client_id, result); + return; + } + int cnt = 0; + while (true) { + nvmlUtilization_t util; + result = nvmlDeviceGetUtilizationRates(dev, &util); + if(result != NVML_SUCCESS) { + tool::Logging(LOG_ERROR,myName_,"client %lu ERROR: nvmlDeviceGetUtilizationRates\n",job->client_id); + return; + } + if(util.gpu > 0) { + job->sm_util += util.gpu; + // tool::Logging(LOG_INFO,myName_,"client %lu util = %d %d %d\n",job->client_id,util.gpu,util.gpu,job->sm_util); + ++cnt; + *shm_util = job->sm_util / cnt; + // *shm_util = max((uint)*shm_util, util.gpu); + } + + } + } + else if(pid > 0) { + if(job->cur_It == (THRESH_ITER + 1)) { + if (kill(pid, SIGKILL) == 0) { + tool::Logging(LOG_INFO,myName_,"client %lu Sent SIGKILL to child process\n",job->client_id); + } else { + tool::Logging(LOG_ERROR,myName_,"client %lu ERROR: Failed to send SIGINT to child process\n",job->client_id); + } + job->sm_util = *shm_util; + } + } + else { + tool::Logging(LOG_ERROR,myName_,"client %lu fork error\n",job->client_id); + } + } + + void known_allocate(int device) { + std::lock_guard lock(mtx_sche[job->device]); + // tool::Logging(LOG_INFO,myName_,"client %lu try to known_allocate\n",job->client_id); + if (!flag[device]) { + //按剩余时间从小到大排序 + sort(known_queue[device].begin(), known_queue[device].end()); + // tool::Logging(LOG_INFO,myName_,"client %lu finish sort\n",job->client_id); + + JobInfo_* shortest = known_queue[device][0]; + known_map[shortest] = 1; + ++flag[device]; + size_t mem = shortest->used_mem; + int util = shortest->sm_util; + tool::Logging(LOG_INFO,myName_,"choose shortest job: cid = %lu, mem = %lu, util = %u\n",shortest->client_id, mem, util); + + //选择资源互补任务 + for(auto it = known_queue[device].begin(); it != known_queue[device].end(); ++it) { + tool::Logging(LOG_INFO,myName_,"jobInfo: cid = %lu, mem = %lu/%lu, util = %u\n",(*it)->client_id, (*it)->used_mem, total_mem[shortest->device],(*it)->sm_util,(*it)->remain_time); + if(it == known_queue[device].begin()){ + continue; + } + JobInfo_* tmp = *it; + if(tmp->device != shortest->device) + continue; + if((tmp->used_mem + mem) <= total_mem[shortest->device] && (tmp->sm_util + util) <= 100 && known_map[tmp] == 0) { + known_map[tmp] = 1; + mem += tmp->used_mem; + util += tmp->sm_util; + ++flag[device]; + tool::Logging(LOG_INFO,myName_,"choose resource-inter job: cid = %lu, mem = %lu, util = %u\n",tmp->client_id, mem, util); + } + } + // tool::Logging(LOG_INFO,myName_,"client %lu finish known_allocate\n",job->client_id); + } + } + + void known_dequeue(int device) { + { + std::lock_guard lock(mtx_used[device]); + if(queues[device] == 0 && unknown_queue[device].empty()) { + queues[device] = 1; + tool::Logging(LOG_INFO, myName_, "device[%d]there is no job in queue0, change to handle queue %d\n",device,queues[device]); + } + } + + while (!unknown_queue[device].empty() && queues[device] == 0) {//等待新来任务完成独占profiling + if(known_map[job] == 1) { + --flag[device]; + known_map[job] = 0; + //TODO + chrono::steady_clock::time_point now_time = chrono::steady_clock::now(); + job->served_time += (chrono::duration_cast(now_time - start).count() * 1.0 / 1000); + job->remain_time = (job->num_It - job->cur_It + 1) * job->perIt_time; + tool::Logging(LOG_INFO, myName_, "client id %lu block, there is a new job, id = %lu\n", job->client_id, unknown_queue[device].front()->client_id); + } + sleep(1); + } + if(known_map[job]){ + // std::cout << "get_in_known_job" << std::endl; + return; + } + + schedule_shortest: + known_allocate(device); + //flag != 0 有任务正在运行 + while(known_map[job] == 0) { + sleep(1); + } + if(!flag[device]) + goto schedule_shortest; + tool::Logging(LOG_INFO, myName_, "client %lu restarts.\n", job->client_id); + } + + void cal_add_It(uint64_t client_id) { + /* + 根据zss所说,每个iteration开始会调用两次cudaMemcpyAsyncH2DHandle,所以本函数的大概意思是,如果job->cur_It ==2,开始计时; + THRESH_ITER+1的值为12,也就是结束了profiling,就会再次调用enqueue插入到known_queue中。 + */ + ++job->cur_It; + if(job->cur_It == 2) { + start = chrono::steady_clock::now(); + } + if(job->cur_It == (THRESH_ITER + 1)) { + int device; + cudaGetDevice(&device); + std::cout << "clientId:"<< client_id << " cal_add_It_device:" << device << std::endl; + // tool::Logging(LOG_INFO, myName_, "cal_add_It_device_!!client %lu, Device: %d .\n", client_id, device); + + enqueue(client_id,device); + } + } + + void get_Iteration(int num) { + job->num_It = int(num); + } + + void cal_job_mem(size_t size) { + job->used_mem += size; + } + + void free_jobs(uint64_t client_id, int device) { + if(unknown_queue[device].front() == job) { + unknown_queue[device].pop(); + { + std::lock_guard lock(mtx_used[device]); + if(queues[device] == 0 && unknown_queue[device].empty()) { + queues[device] = 1; + } + } + + } + else { + for(auto it = known_map.begin(); it != known_map.end(); ++it) { + if((*it).second == 1) { + (*it).second = 0; + --flag[device]; + chrono::steady_clock::time_point now_time = chrono::steady_clock::now(); + job->served_time += (chrono::duration_cast(now_time - start).count() * 1.0 / 1000); + job->remain_time = (job->num_It - job->cur_It + 1) * job->perIt_time; + tool::Logging(LOG_INFO, myName_, "client id %lu block, job %lu has finished\n", (*it).first->client_id,job->client_id); + } + } + auto it = known_map.find(job); + if(it != known_map.end()) + known_map.erase(it); + auto it1 = find(known_queue[device].begin(), known_queue[device].end(), job); + if(it1 != known_queue[device].end()) + known_queue[device].erase(it1); + std::lock_guard lock(mtx_used[device]); + if(queues[device] == 1 && known_queue[device].empty()) { + queues[device] = 0; + } + } + + auto it2 = jobs.find(job); + if(it2 != jobs.end()) + jobs.erase(it2); + // add_sub_unuse_men(device, total_m, 1); + if (kill(pid, SIGKILL) == 0) { + tool::Logging(LOG_INFO,myName_,"client %lu Sent SIGKILL to child process\n",job->client_id); + } else { + tool::Logging(LOG_ERROR,myName_,"client %lu ERROR: Failed to send SIGINT to child process\n",job->client_id); + } + } + + + +}; + +inline size_t Scheduler::free_mem[GPU_MAX_NUM]; +inline size_t Scheduler::total_mem[GPU_MAX_NUM]; +inline set Scheduler::jobs; +inline queue Scheduler::unknown_queue[GPU_MAX_NUM]; // 未知任务信息的队列 +inline vector Scheduler::known_queue[GPU_MAX_NUM]; // 已知任务信息队列 +inline int Scheduler::flag[GPUNum];//判断是否有已知信息的任务在运行,0表示没有 +inline unordered_map Scheduler::known_map; +inline int Scheduler::queues[GPUNum];//0表示轮到unknown_queue,1表示轮到known_queue +// inline chrono::steady_clock::time_point Scheduler::timer; +inline chrono::steady_clock::time_point Scheduler::timers[GPUNum]; +inline set Scheduler::client_ids; + +inline std::mutex Scheduler::mtx_used[GPU_MAX_NUM]; +inline std::mutex Scheduler::mtx_sche[GPU_MAX_NUM]; + +#endif \ No newline at end of file diff --git a/GPU-Virtual-Service/gpu-remoting/include/serverEndpoint.h b/GPU-Virtual-Service/gpu-remoting/include/serverEndpoint.h new file mode 100644 index 0000000..eab9ef6 --- /dev/null +++ b/GPU-Virtual-Service/gpu-remoting/include/serverEndpoint.h @@ -0,0 +1,422 @@ +#ifndef SERVER_ENDPOINT_H +#define SERVER_ENDPOINT_H +#include +#include "configure.h" +#include "requestBuffer.h" +#include "requestIOV.h" +#include "registerIOV.h" +#include "ucpConnection.h" +// #include "./tsl/bhopscotch_map.h" +#include "./hashing/robin_hood.h" +#include "./conqueue/readerwriterqueue.h" +#include "./shmqueue/shmUtil.h" +#include "ptxExtractor.h" +#include "mapper.h" +#include "asyncRequest.h" +#include +#include +#include +#include +#include +#include +// #include +#include "scheduler.h" +#include "elasticscheduler.h" +#ifdef GV_MSGHANDLER +#include "msghandler.h" +#endif + + +typedef struct { + DeviceBlockMapper* blockManager; + std::vector blockInfoList; + HandleMapper* handleManager; + // robin_hood::unordered_flat_map mapVirAddr2HandleInfo; + // robin_hood::unordered_flat_map mapRealAddr2handleListIdx; + // std::vector handleInfoList; + // std::queue> freeHandleQueue; + std::vector ncclRedOpList; + // std::vector ncclUidList; + + bool isFirstIter; + robin_hood::unordered_flat_map* mapFatBinHandle2CuModule; + robin_hood::unordered_flat_map* mapHost2CuFunc; //used for cudaLaunchKernel + robin_hood::unordered_flat_map* mapHostVar2CuDevPtr; //used for cudaMemcopyToSymbol + robin_hood::unordered_flat_map* mapDevName2DevPtr; //used for identifying the device pointer, shared by all devices + PTXExtractor* ptxExtractor; + moodycamel::BlockingReaderWriterQueue< std::pair >* imageQueue; +} CUinfoMap_t; + +#define DEFINE_SERVER_AM_CALLBACK(name) \ + ucs_status_t ServerEndpoint::name(void *arg, const void *header, size_t header_length, \ + void *data, size_t length, const ucp_am_recv_param_t *param) + + +extern Configure config; +class ServerEndpoint { + private: + const char* myShortName_ = "ServerEp"; + char* myName_ = NULL; + ConnStatus_t connStatus_ = {true, true}; + uint64_t clientID_; + char* clientIP_; + char* clientPort_; + int clientPID_ = 0; + size_t priority_ = 0; + +#ifdef GV_Scheduler + Scheduler* sche; +#endif + +#ifdef GV_MSGHANDLER + MsgHandler* msghandler; + boost::thread* MSGRecvThread_ = nullptr; + bool MSGStopFlag_ = false; + bool createThreadFlag = false; + int msgResetCnt = 0; + bool msgWaitFlag_ = false; +#endif + +#ifdef GV_eScheduler + eScheduler* esche; +#endif + + bool initFlag_ = false; // to initialize the shared memory (and imageQueue) only once + size_t curIter_ = 0; + size_t numIterations = 0; + int ckptCnt = 0; + + int curDev_ = 0; // the current device of this thread + bool recoveryFlag_ = false; + bool replayFlag_ = false; + int getDeviceFlag = 0; + int CliDev_ = -1; + + SharedMemoryOpt* shmQueSizes_[3] = {NULL, NULL, NULL}; + CMessageQueue* shmQueues_[3] = {NULL, NULL, NULL}; + size_t lastCopyLen_[3] = {0, 0, 0}; + uint8_t lastCopyType_ = MEMCPY_OTHER; + std::vector trainTensors_; + + ncclComm_t curComm = NULL; // todo: maybe not only one stream + cudaStream_t commStream_ = NULL; + cudaStream_t defaultStream_ = NULL; + + std::list watchedEventsList_; + std::list notCompleteEventsList_; + Sync_t eventWatchedSync_; + boost::thread* eventWatchedThread_ = nullptr; + + std::string backupFilePath_; + HostBuffer_t serverState_ = {NULL, 0}; + void* backupMemcpyBuffer_ = NULL; + std::vector streamList_; + Sync_t backupSync_; + AsyncRequest stateBackup_; + AsyncRequest trainTensorBackup_; + bool bufferReady_ = false; + bool bufferFinished_ = false; + bool bufferResized_ = false; + // boost::thread* backupStorageThread_ = nullptr; + std::unique_ptr fileMappingRegion_; + std::unique_ptr trainTensorsMappingRegion_ = nullptr; + + public: + ucp_conn_request_h _connectReq; + ucp_worker_h _dataWorker; + ucp_ep_h _clientEp; + + CUinfoMap_t _cuInfoMap; + boost::thread* _ptxThread = nullptr; + std::vector _fatbinList; + + static void CreateServerEp(ucp_conn_request_h conn_request, void *arg); + ServerEndpoint(ucp_worker_h dataWorker, UCPConnection_t conn); + ~ServerEndpoint(); + void CloseEp(uint64_t flags); + + void SendResponse(RequestIOV* reqBuffer, const ucp_ep_h* ep, ucs_memory_type_t memType = UCS_MEMORY_TYPE_HOST); + void SendStatus(int status); + + // Get the device pointer from userAddr(virtAddr) + inline void* GetDevPtr(uint64_t userAddr) { +#ifdef GV_MEMORY + uint64_t realAddr = 0; + if (_cuInfoMap.blockManager->FindByVirAddr(userAddr, realAddr) == -1) { + return NULL; + } + else { + return (void*)realAddr; + } +#elif defined(GV_MEMORY_PTX) + return (void*)(_cuInfoMap.blockInfoList[GET_BLOCK_ID(userAddr)].devPtr + GET_BLOCK_INTER_OFFSET(userAddr)); +#else + return (void*)userAddr; +#endif // GV_MEMORY + } + + // Get the block index of current device's blockList from userAddr(virtAddr) + inline size_t GetBlockIdx(uint64_t userAddr) { + return GET_BLOCK_ID(userAddr); + } + + // Find the virtual address of the device pointer from the real address + inline uint64_t FindDevPtrVirAddr(void* devPtr) { +#ifdef GV_MEMORY + uint64_t virtAddr = 0; + if (_cuInfoMap.blockManager->FindByRealAddr((const uint64_t)devPtr, virtAddr) == -1) { + tool::Logging(LOG_ERROR, myName_, "FindDevPtrVirAddr failed: devPtr(%p) not found\n", devPtr); + return 0; + } + else { + return virtAddr; + } +#else + return (uint64_t)devPtr; +#endif // GV_MEMORY + } + + // Get the real address of the handle from userAddr(virtAddr) + inline void* GetHandle(uint64_t userAddr, bool reset = false) { +#ifdef GV_HANDLE + return _cuInfoMap.handleManager->FindRealAddrByVirAddr(userAddr, reset); +#else + return (void*)(userAddr); +#endif // GV_HANDLE + } + + inline Handle_t* GetHandleInfo(uint64_t userAddr) { + return _cuInfoMap.handleManager->GetHandleInfoByVirAddr(userAddr); + } + + // Set the handle info + inline void SetHandleVirAddr(uint64_t userAddr, void* handlePtr, enum API_REQUEST_CODE_SET handleType = __CUDA_REGISTER) { + _cuInfoMap.handleManager->UpdateHandle(userAddr, handlePtr, handleType); + } + + // Add the handle info and return the virtual address of the handle + inline void* GetHandleVirAddr(void* handlePtr, enum API_REQUEST_CODE_SET handleType) { +#ifdef GV_HANDLE + return _cuInfoMap.handleManager->AddHandle(handlePtr, handleType); +#else + return handlePtr; +#endif // GV_HANDLE + } + + // Find the virtual address of the handle from the real address + inline uint64_t FindHandleVirAddr(void* handlePtr) { +#ifdef GV_HANDLE + return _cuInfoMap.handleManager->FindIdxByRealAddr(handlePtr); +#else + return (uint64_t)handlePtr; +#endif // GV_HANDLE + } + + inline ncclRedOp_t GetNcclRedOp(ncclRedOp_t userOp, bool reset = false) { +#ifdef GV_HANDLE + ncclRedOp_t redOp = _cuInfoMap.ncclRedOpList[userOp]; + if (reset) { + _cuInfoMap.ncclRedOpList[userOp] = static_cast(0); + } + return redOp; +#else + return userOp; +#endif // GV_HANDLE + } + + inline ncclRedOp_t GetNcclRedOpVirAddr(ncclRedOp_t redOp) { +#ifdef GV_HANDLE + size_t redOpIndex = ( + _cuInfoMap.ncclRedOpList.emplace_back(redOp), + _cuInfoMap.ncclRedOpList.size() - 1 + ); + return (ncclRedOp_t)redOpIndex; +#else + return redOp; +#endif // GV_HANDLE + } + + inline void SetNcclRedOpVirAddr(ncclRedOp_t redOp, ncclRedOp_t userOp) { +#ifdef GV_HANDLE + _cuInfoMap.ncclRedOpList[userOp] = redOp; +#endif // GV_HANDLE + } + + void Run(); + void NewRun(); + void UpdateStream(cudaStream_t stream, bool isCommStream = true); + void CommEventMonitor(); + void StopCommEventMonitor(); + void Backup2Memory(); + void Backup2Storage(); + void BackupTrainTensors2Storage(); + void Persist2File(const char* fileName, const uint8_t* data, size_t size); + void LoadFromStorage(); + void LoadFromStorage(size_t iter); + void StopBackup(); + + /* CUDA Runtime Internal */ + DECLARE_AM_CALLBACK(__cudaRegisterHandle); + DECLARE_AM_CALLBACK(__cudaRegisterFatBinaryHandle); + DECLARE_AM_CALLBACK(__cudaRegisterFunctionHandle); + DECLARE_AM_CALLBACK(__cudaRegisterVarHandle); + void __cudaUnregisterFatBinaryHandle(); + + /* CUDA Runtime Execution */ + DECLARE_AM_CALLBACK(cudaLaunchKernelHandle); + DECLARE_AM_CALLBACK(cudaFuncGetAttributesHandle); + + /* CUDA Runtime Device */ + DECLARE_AM_CALLBACK(cudaGetDeviceHandle); + DECLARE_AM_CALLBACK(cudaGetDeviceCountHandle); + DECLARE_AM_CALLBACK(cudaGetDevicePropertiesHandle); + DECLARE_AM_CALLBACK(cudaSetDeviceHandle); + DECLARE_AM_CALLBACK(cudaSetMainDeviceHandle); + DECLARE_AM_CALLBACK(cudaDeviceSynchronizeHandle); + DECLARE_AM_CALLBACK(cudaDeviceGetAttributeHandle); + + /* CUDA Runtime Memory */ + DECLARE_AM_CALLBACK(cudaMallocHandle); + DECLARE_AM_CALLBACK(cudaMemGetInfoHandle); + DECLARE_AM_CALLBACK(cudaMemsetHandle); + DECLARE_AM_CALLBACK(cudaMemsetAsyncHandle); + DECLARE_AM_CALLBACK(cudaFreeHandle); + DECLARE_AM_CALLBACK(cudaMemcpyH2DHandle); + DECLARE_AM_CALLBACK(cudaMemcpyD2HHandle); + DECLARE_AM_CALLBACK(cudaMemcpyD2DHandle); + DECLARE_AM_CALLBACK(NewIterHandle); + // DECLARE_AM_CALLBACK(cudaMemcpyHandle); + // DECLARE_AM_CALLBACK(cudaMemcpyAsyncHandle); + DECLARE_AM_CALLBACK(cudaMemcpyAsyncH2DHandle); + DECLARE_AM_CALLBACK(cudaMemcpyAsyncD2HHandle); + DECLARE_AM_CALLBACK(cudaMemcpyAsyncD2DHandle); + DECLARE_AM_CALLBACK(cudaMemcpyToSymbolHandle); + + /* CUDA Runtime Event */ + DECLARE_AM_CALLBACK(cudaEventCreateHandle); + DECLARE_AM_CALLBACK(cudaEventCreateWithFlagsHandle); + DECLARE_AM_CALLBACK(cudaEventRecordHandle); + DECLARE_AM_CALLBACK(cudaEventQueryHandle); + DECLARE_AM_CALLBACK(cudaEventDestroyHandle); + DECLARE_AM_CALLBACK(cudaEventElapsedTimeHandle); + + /* CUDA Runtime Stream */ + DECLARE_AM_CALLBACK(cudaStreamCreateHandle); + DECLARE_AM_CALLBACK(cudaStreamCreateWithFlagsHandle); + DECLARE_AM_CALLBACK(cudaStreamCreateWithPriorityHandle); + DECLARE_AM_CALLBACK(cudaStreamWaitEventHandle); + DECLARE_AM_CALLBACK(cudaStreamSynchronizeHandle); + DECLARE_AM_CALLBACK(cudaStreamDestroyHandle); + DECLARE_AM_CALLBACK(cudaStreamIsCapturingHandle); + DECLARE_AM_CALLBACK(cudaStreamGetCaptureInfoHandle); + + /* CUDA Runtime Other */ + DECLARE_AM_CALLBACK(cudaOccupancyMaxActiveBlocksPerMultiprocessorHandle); + DECLARE_AM_CALLBACK(cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlagsHandle); + + /* cuBLAS */ + DECLARE_AM_CALLBACK(cublasCreate_v2Handle); + DECLARE_AM_CALLBACK(cublasSgemm_v2Handle); + DECLARE_AM_CALLBACK(cublasSgemmStridedBatchedHandle); + DECLARE_AM_CALLBACK(cublasDestroy_v2Handle); + DECLARE_AM_CALLBACK(cublasSetStream_v2Handle); + DECLARE_AM_CALLBACK(cublasSetWorkspace_v2Handle); + DECLARE_AM_CALLBACK(cublasSetMathModeHandle); + DECLARE_AM_CALLBACK(cublasGetMathModeHandle); + + /* cuBLASLt */ + DECLARE_AM_CALLBACK(cublasLtCreateHandle); + DECLARE_AM_CALLBACK(cublasLtDestroyHandle); + DECLARE_AM_CALLBACK(cublasLtMatmulDescCreateHandle); + DECLARE_AM_CALLBACK(cublasLtMatmulDescDestroyHandle); + DECLARE_AM_CALLBACK(cublasLtMatmulDescSetAttributeHandle); + DECLARE_AM_CALLBACK(cublasLtMatrixLayoutCreateHandle); + DECLARE_AM_CALLBACK(cublasLtMatrixLayoutDestroyHandle); + DECLARE_AM_CALLBACK(cublasLtMatrixLayoutSetAttributeHandle); + DECLARE_AM_CALLBACK(cublasLtMatmulPreferenceCreateHandle); + DECLARE_AM_CALLBACK(cublasLtMatmulPreferenceDestroyHandle); + DECLARE_AM_CALLBACK(cublasLtMatmulPreferenceSetAttributeHandle); + DECLARE_AM_CALLBACK(cublasLtMatmulAlgoGetHeuristicHandle); + DECLARE_AM_CALLBACK(cublasLtMatmulHandle); + + /* cuDNN */ + DECLARE_AM_CALLBACK(cudnnCreateHandle); + DECLARE_AM_CALLBACK(cudnnDestroyHandle); + DECLARE_AM_CALLBACK(cudnnCreateTensorDescriptorHandle); + DECLARE_AM_CALLBACK(cudnnDestroyTensorDescriptorHandle); + DECLARE_AM_CALLBACK(cudnnGetTensorSizeInBytesHandle); + DECLARE_AM_CALLBACK(cudnnSetTensor4dDescriptorHandle); + DECLARE_AM_CALLBACK(cudnnSetTensorNdDescriptorHandle); + DECLARE_AM_CALLBACK(cudnnSetTensorNdDescriptorExHandle); + DECLARE_AM_CALLBACK(cudnnCreateTensorTransformDescriptorHandle); + DECLARE_AM_CALLBACK(cudnnSetTensorTransformDescriptorHandle); + DECLARE_AM_CALLBACK(cudnnDestroyTensorTransformDescriptorHandle); + DECLARE_AM_CALLBACK(cudnnInitTransformDestHandle); + DECLARE_AM_CALLBACK(cudnnTransformTensorExHandle); + DECLARE_AM_CALLBACK(cudnnTransformFilterHandle); + DECLARE_AM_CALLBACK(cudnnCreateFilterDescriptorHandle); + DECLARE_AM_CALLBACK(cudnnSetFilterNdDescriptorHandle); + DECLARE_AM_CALLBACK(cudnnDestroyFilterDescriptorHandle); + DECLARE_AM_CALLBACK(cudnnGetFilterSizeInBytesHandle); + DECLARE_AM_CALLBACK(cudnnGetFoldedConvBackwardDataDescriptorsHandle); + DECLARE_AM_CALLBACK(cudnnSetStreamHandle); + DECLARE_AM_CALLBACK(cudnnBatchNormalizationBackwardExHandle); + DECLARE_AM_CALLBACK(cudnnBatchNormalizationForwardTrainingExHandle); + DECLARE_AM_CALLBACK(cudnnBatchNormalizationForwardInferenceHandle); + DECLARE_AM_CALLBACK(cudnnBackendCreateDescriptorHandle); + DECLARE_AM_CALLBACK(cudnnBackendDestroyDescriptorHandle); + DECLARE_AM_CALLBACK(cudnnBackendSetAttributeHandle); + DECLARE_AM_CALLBACK(cudnnBackendGetAttributeHandle); + DECLARE_AM_CALLBACK(cudnnBackendExecuteHandle); + DECLARE_AM_CALLBACK(cudnnBackendFinalizeHandle); + DECLARE_AM_CALLBACK(cudnnGetBatchNormalizationBackwardExWorkspaceSizeHandle); + DECLARE_AM_CALLBACK(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSizeHandle); + DECLARE_AM_CALLBACK(cudnnGetBatchNormalizationTrainingExReserveSpaceSizeHandle); + DECLARE_AM_CALLBACK(cudnnCreateConvolutionDescriptorHandle); + DECLARE_AM_CALLBACK(cudnnDestroyConvolutionDescriptorHandle); + DECLARE_AM_CALLBACK(cudnnSetConvolutionGroupCountHandle); + DECLARE_AM_CALLBACK(cudnnSetConvolutionMathTypeHandle); + DECLARE_AM_CALLBACK(cudnnSetConvolutionNdDescriptorHandle); + DECLARE_AM_CALLBACK(cudnnSetConvolutionReorderTypeHandle); + DECLARE_AM_CALLBACK(cudnnGetConvolutionForwardAlgorithm_v7Handle); + DECLARE_AM_CALLBACK(cudnnGetConvolutionBackwardFilterAlgorithm_v7Handle); + DECLARE_AM_CALLBACK(cudnnGetConvolutionBackwardDataAlgorithm_v7Handle); + DECLARE_AM_CALLBACK(cudnnGetConvolutionForwardWorkspaceSizeHandle); + DECLARE_AM_CALLBACK(cudnnConvolutionForwardHandle); + DECLARE_AM_CALLBACK(cudnnGetConvolutionBackwardDataWorkspaceSizeHandle); + DECLARE_AM_CALLBACK(cudnnConvolutionBackwardFilterHandle); + DECLARE_AM_CALLBACK(cudnnGetConvolutionBackwardFilterWorkspaceSizeHandle); + DECLARE_AM_CALLBACK(cudnnConvolutionBackwardDataHandle); + + /* nccl */ + DECLARE_AM_CALLBACK(ncclGroupStartHandle); + DECLARE_AM_CALLBACK(ncclGroupEndHandle); + DECLARE_AM_CALLBACK(ncclCommInitRankHandle); + DECLARE_AM_CALLBACK(ncclCommDestroyHandle); + DECLARE_AM_CALLBACK(ncclCommGetAsyncErrorHandle); + DECLARE_AM_CALLBACK(ncclGetUniqueIdHandle); + DECLARE_AM_CALLBACK(ncclGetVersionHandle); + DECLARE_AM_CALLBACK(ncclAllReduceHandle); + DECLARE_AM_CALLBACK(ncclReduceHandle); + DECLARE_AM_CALLBACK(ncclReduceScatterHandle); + DECLARE_AM_CALLBACK(ncclAllGatherHandle); + DECLARE_AM_CALLBACK(ncclBroadcastHandle); + DECLARE_AM_CALLBACK(ncclSendHandle); + DECLARE_AM_CALLBACK(ncclRecvHandle); + DECLARE_AM_CALLBACK(ncclCommCountHandle); + DECLARE_AM_CALLBACK(ncclCommUserRankHandle); + DECLARE_AM_CALLBACK(ncclCommCuDeviceHandle); + DECLARE_AM_CALLBACK(ncclCommAbortHandle); + DECLARE_AM_CALLBACK(ncclCommInitAllHandle); + DECLARE_AM_CALLBACK(ncclCommInitRankConfigHandle); + DECLARE_AM_CALLBACK(ncclCommSplitHandle); + DECLARE_AM_CALLBACK(ncclCommFinalizeHandle); + DECLARE_AM_CALLBACK(ncclCommRegisterHandle); + DECLARE_AM_CALLBACK(ncclCommDeregisterHandle); + DECLARE_AM_CALLBACK(ncclMemAllocHandle); + DECLARE_AM_CALLBACK(ncclMemFreeHandle); + DECLARE_AM_CALLBACK(ncclRedOpCreatePreMulSumHandle); + DECLARE_AM_CALLBACK(ncclRedOpDestroyHandle); +}; + +#endif \ No newline at end of file diff --git a/GPU-Virtual-Service/gpu-remoting/src/client/CMakeLists.txt(clientApp) b/GPU-Virtual-Service/gpu-remoting/src/client/CMakeLists.txt(clientApp) new file mode 100644 index 0000000..85a0838 --- /dev/null +++ b/GPU-Virtual-Service/gpu-remoting/src/client/CMakeLists.txt(clientApp) @@ -0,0 +1,25 @@ +set(CUDA_ROOT_PATH "/usr/local/cuda") + +if(NOT CUDA_ROOT_PATH) + message(FATAL_ERROR "Please set the CUDA_PATH environment variable to the path of your desired CUDA installation.") +endif() + +list(APPEND CMAKE_PREFIX_PATH ${CUDA_ROOT_PATH}) +find_package(CUDA REQUIRED) +list(APPEND CUDA_NVCC_FLAGS "-lcudart") + +set(CUDA_USE_STATIC_CUDA_RUNTIME OFF) + +aux_source_directory(. CLIENT_SRC) +cuda_add_library(ClientCore ${CLIENT_SRC}) +target_link_libraries(ClientCore + CommCore + ${CUDA_LIBRARIES} + ${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libcuda.so + ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcufile.so +) + +target_include_directories(ClientCore PUBLIC ${CUDA_INCLUDE_DIRS}) + +add_executable(Client clientApp.cc) +target_link_libraries(Client ${CLIENT_FINAL_OBJ}) \ No newline at end of file diff --git a/GPU-Virtual-Service/gpu-remoting/src/client/clientApp.cc.bak b/GPU-Virtual-Service/gpu-remoting/src/client/clientApp.cc.bak new file mode 100644 index 0000000..69d2acc --- /dev/null +++ b/GPU-Virtual-Service/gpu-remoting/src/client/clientApp.cc.bak @@ -0,0 +1,278 @@ +#include +#include +#include "../../include/configure.h" +#include "../../include/ucpConnection.h" +#include "../../include/clientEndpoint.h" +using namespace std; + +Configure config_("config.json"); +const char* myName = "ClientApp"; +ClientEndpoint* clientEp; +UCPConnection* connectionObj; + +void cleanup() { + delete clientEp; + delete connectionObj; +} + +int main(int argc, char* argv[]) { + // connectionObj = new UCPConnection(config_.GetServerIp().c_str(), config_.GetServerPort()); + // clientEp = new ClientEndpoint(connectionObj); + // //clientEp->Run(); + // tool::Logging(myName, "start the client endpoint.\n"); + + // int reqNum = 2; + // int amID = 0; + // ucp_dt_iov_t *iov = (ucp_dt_iov_t *)alloca(reqNum * sizeof(ucp_dt_iov_t)); + // size_t cuda_size = 4 * 1024 * 1024; + + // size_t* header = (size_t*)malloc(sizeof(size_t) * reqNum); + + // /* batch1: cudaMalloc + cudaSetDevice */ + // // req1 + // RequestBuffer reqBuf = RequestBuffer(sizeof(int) + sizeof(size_t)); + // reqBuf.Push(CUDA_MALLOC); + // reqBuf.Push(cuda_size); + // iov[0].length = reqBuf.GetSize(); + // iov[0].buffer = reqBuf._dataBuffer; + // header[0] = iov[0].length; + // tool::Logging(myName, "iov[0].length = %zu\n", iov[0].length); + + // //req2 + // RequestBuffer reqBuf2 = RequestBuffer(sizeof(int) + sizeof(int)); + // reqBuf2.Push(CUDA_SET_DEVICE); + // reqBuf2.Push(0); + // // tool::Logging(myName, "reqBuf2.GetSize() = %zu\n", reqBuf2.GetSize()); + // // char* msg = (char*)malloc(8000); + // // reqBuf2.PushCString(msg); + // // tool::Logging(myName, "reqBuf2.GetSize() = %zu\n", reqBuf2.GetSize()); + // iov[1].length = reqBuf2.GetSize(); + // iov[1].buffer = reqBuf2._dataBuffer; + // header[1] = iov[1].length; + // tool::Logging(myName, "iov[1].length = %zu\n", iov[1].length); + + // SendData(iov, reqNum, header, sizeof(size_t) * reqNum, amID, &clientEp->_dataWorker, &clientEp->_clientEp); + + // ucp_dt_iov_t *recv_iov; + // uint64_t devPtr = 0; + // PrepareSingleIOV(&recv_iov, &devPtr, sizeof(uint64_t)); + // size_t tmp = 0; + + // ReceiveData(recv_iov, &tmp, &devPtr, 0, &clientEp->_dataWorker, UCS_MEMORY_TYPE_HOST, NULL); + // tool::Logging(myName, "devPtr = %p(%llu), size = %zu\n", devPtr, devPtr, tmp); + + + // // /* batch2: cudaFree + cudaSetDevice */ + // // // req1 + // // RequestBuffer reqBuf3 = RequestBuffer(sizeof(int) + sizeof(uint64_t)); + // // reqBuf3.Push(CUDA_FREE); + // // reqBuf3.Push64BitPointer(devPtr); + // // iov[0].length = reqBuf3.GetSize(); + // // iov[0].buffer = reqBuf3._dataBuffer; + // // header[0] = iov[0].length; + // // tool::Logging(myName, "iov[0].length = %zu\n", iov[0].length); + + // // //req2 + // // RequestBuffer reqBuf4 = RequestBuffer(sizeof(int) + sizeof(int)); + // // reqBuf4.Push(CUDA_SET_DEVICE); + // // reqBuf4.Push(0); + // // iov[1].length = reqBuf4.GetSize(); + // // iov[1].buffer = reqBuf4._dataBuffer; + // // header[1] = iov[1].length; + // // tool::Logging(myName, "iov[1].length = %zu\n", iov[1].length); + // // SendData(iov, 2, header, sizeof(size_t) * 2, amID, &clientEp->_dataWorker, &clientEp->_clientEp); + + + // /* batch3: cudaSetDevice + cudaMemset */ + // // req1 + // RequestBuffer reqBuf5 = RequestBuffer(sizeof(int) + sizeof(int)); + // reqBuf5.Push(CUDA_SET_DEVICE); + // reqBuf5.Push(0); + // iov[0].length = reqBuf5.GetSize(); + // iov[0].buffer = reqBuf5._dataBuffer; + // header[0] = iov[0].length; + // tool::Logging(myName, "iov[0].length = %zu\n", iov[0].length); + + // //req2 + // RequestBuffer reqBuf6 = RequestBuffer(sizeof(int) + sizeof(uint64_t) + sizeof(int) + sizeof(size_t) + 3 * 1024 * 1024); + // reqBuf6.Push(CUDA_MEMSET); + // reqBuf6.Push64BitPointer(devPtr); + // reqBuf6.Push(1); + // reqBuf6.Push(cuda_size); + // iov[1].length = reqBuf6.GetSize(); + // iov[1].buffer = reqBuf6._dataBuffer; + // header[1] = iov[1].length; + // tool::Logging(myName, "iov[1].length = %zu\n", iov[1].length); + // SendData(iov, 2, header, sizeof(size_t) * 2, amID, &clientEp->_dataWorker, &clientEp->_clientEp); + + // /* batch4: cudaMemcpy(host -> remote dev) */ + // //req1 + // RequestBuffer reqBuf7 = RequestBuffer(sizeof(int) + sizeof(cudaMemcpyKind) + sizeof(size_t) + sizeof(uint64_t)); + // reqBuf7.Push(CUDA_MEMCPY); + // reqBuf7.Push(cudaMemcpyHostToDevice); + // reqBuf7.Push(cuda_size); + // reqBuf7.Push64BitPointer(devPtr); + // iov[0].length = reqBuf7.GetSize(); + // iov[0].buffer = reqBuf7._dataBuffer; + // header[0] = iov[0].length; + // tool::Logging(myName, "cudaMemcpyHostToDevice, iov[0].length = %zu\n", iov[0].length); + // SendData(iov, 1, header, sizeof(size_t) * 1, amID, &clientEp->_dataWorker, &clientEp->_clientEp); + + // //req1data + // int* hostBuffer; + // hostBuffer = (int*)malloc(cuda_size); + // for (int i = 0; i < (cuda_size / sizeof(int)); i++) { + // hostBuffer[i] = i; + // } + // void* devBuffer; + // cudaMalloc(&devBuffer, cuda_size); + // cudaMemcpy(devBuffer, hostBuffer, cuda_size, cudaMemcpyHostToDevice); + + // void* dataBuffer = hostBuffer; + + // iov[0].length = cuda_size; + // iov[0].buffer = dataBuffer; + // header[0] = iov[0].length; + // tool::Logging(myName, "iov[0].length = %zu\n", iov[0].length); + // SendData(iov, 1, header, sizeof(size_t) * 0, amID, &clientEp->_dataWorker, &clientEp->_clientEp); + + // /* batch5: cudaMemcpy(remote dev -> host) */ + // //req1 + // RequestBuffer reqBuf8 = RequestBuffer(sizeof(int) + sizeof(cudaMemcpyKind) + sizeof(size_t) + sizeof(uint64_t)); + // reqBuf8.Push(CUDA_MEMCPY); + // reqBuf8.Push(cudaMemcpyDeviceToHost); + // reqBuf8.Push(cuda_size); + // reqBuf8.Push64BitPointer(devPtr); + // iov[0].length = reqBuf8.GetSize(); + // iov[0].buffer = reqBuf8._dataBuffer; + // header[0] = iov[0].length; + // tool::Logging(myName, "cudaMemcpyDeviceToHost, iov[0].length = %zu\n", iov[0].length); + // SendData(iov, 1, header, sizeof(size_t) * 1, amID, &clientEp->_dataWorker, &clientEp->_clientEp); + + // int* hostBuffer2; + // hostBuffer2 = (int*)malloc(cuda_size); + // memset(hostBuffer2, 0, cuda_size); + // ucp_dt_iov_t *recv_iov2; + // size_t iovNum = 0; + + // PrepareSingleIOV(&recv_iov2, hostBuffer2, cuda_size); + // ReceiveData(recv_iov2, &iovNum, hostBuffer2, 0, &clientEp->_dataWorker, UCS_MEMORY_TYPE_HOST, NULL); + // printf("iovNum = %zu, recv data: ", iovNum); + // for (int i = 0; i < 20; i++) { + // printf("%02x ", ((uint8_t*)hostBuffer2)[i]); + // } + // printf("\n"); + + + + // hyf: STREAM TEST + connectionObj = new UCPConnection(config_.GetServerIp().c_str(), config_.GetServerPort()); + clientEp = new ClientEndpoint(connectionObj); + tool::Logging(myName, "start the client endpoint.\n"); + int amID = 0; + int reqNum = 1; + ucp_dt_iov_t *iov = (ucp_dt_iov_t *)alloca(reqNum * sizeof(ucp_dt_iov_t)); + + size_t cuda_size = 4 * 1024 * 1024; + size_t* header = (size_t*)malloc(sizeof(size_t) * reqNum); + + // req1: cudaStreamCreateWithFlags + unsigned int flags = 1; // 0阻塞流,1非阻塞流 + RequestBuffer reqBuf_1 = RequestBuffer(sizeof(uint)); + reqBuf_1.Push(CUDA_STREAM_CREATE_WITH_FLAGS); + reqBuf_1.Push(flags); + iov[0].length = reqBuf_1.GetSize(); + iov[0].buffer = reqBuf_1._dataBuffer; + header[0] = iov[0].length; + tool::Logging(myName, "iov[0].length = %zu\n", iov[0].length); + + SendData(iov, reqNum, header, sizeof(size_t) * reqNum, amID, &clientEp->_dataWorker, &clientEp->_clientEp); + + ucp_dt_iov_t *recv_iov; + cudaStream_t stream = NULL; + PrepareSingleIOV(&recv_iov, &stream, sizeof(cudaStream_t)); + size_t tmp = 0; + // recv pStream + ReceiveData(recv_iov, &tmp, &stream, 0, &clientEp->_dataWorker, UCS_MEMORY_TYPE_HOST, NULL); + tool::Logging(myName, "pStream = %p, size = %zu\n", stream, tmp); + + // req2: cudaStreamSynchronize + RequestBuffer reqBuf_2 = RequestBuffer(sizeof(cudaStream_t)); + reqBuf_2.Push(CUDA_STREAM_SYNCHRONIZE); + reqBuf_2.Push64BitPointer(stream); // pStreamcudaStreamGetCaptureInfoHandle + iov[0].length = reqBuf_2.GetSize(); + iov[0].buffer = reqBuf_2._dataBuffer; + header[0] = iov[0].length; + + tool::Logging(myName, "iov[0].length = %zu\n", iov[0].length); + SendData(iov, reqNum, header, sizeof(size_t) * reqNum, amID, &clientEp->_dataWorker, &clientEp->_clientEp); + + // req3: cudaStreamIsCapturing + enum cudaStreamCaptureStatus pCaptureStatus; + RequestBuffer reqBuf_3 = RequestBuffer(sizeof(cudaStream_t) + sizeof(enum cudaStreamCaptureStatus *)); + reqBuf_3.Push(CUDA_STREAM_IS_CAPTURING); + reqBuf_3.Push64BitPointer(stream); + iov[0].length = reqBuf_3.GetSize(); + iov[0].buffer = reqBuf_3._dataBuffer; + tool::Logging(myName, "iov[1].length = %zu\n", iov[0].length); + SendData(iov, reqNum, header, sizeof(size_t) * reqNum, amID, &clientEp->_dataWorker, &clientEp->_clientEp); + ucp_dt_iov_t *recv_iov3; + PrepareSingleIOV(&recv_iov3, &pCaptureStatus, sizeof(enum cudaStreamCaptureStatus *)); + size_t tmp3 = 0; + // recv pCaptureStatus + ReceiveData(recv_iov3, &tmp3, &pCaptureStatus, 0, &clientEp->_dataWorker, UCS_MEMORY_TYPE_HOST, NULL); + tool::Logging(myName, "pCaptureStatus = %d, size = %zu\n", pCaptureStatus, tmp3); + + + // req4: cudaStreamGetCaptureInfo + enum cudaStreamCaptureStatus pCaptureStatusofStream; + unsigned long long pId; + RequestBuffer reqBuf_4 = RequestBuffer(sizeof(cudaStream_t) + sizeof(enum cudaStreamCaptureStatus *) + sizeof(unsigned long long *)); + reqBuf_4.Push(CUDA_STREAM_GET_CAPTURE_INFO); + reqBuf_4.Push64BitPointer(stream); + + iov[0].length = reqBuf_4.GetSize(); + iov[0].buffer = reqBuf_4._dataBuffer; + + tool::Logging(myName, "iov[2].length = %zu\n", iov[0].length); + SendData(iov, reqNum, header, sizeof(size_t) * reqNum, amID, &clientEp->_dataWorker, &clientEp->_clientEp); + // ucp_dt_iov_t *recv_iov4; + // // recv pCaptureStatus + // PrepareSingleIOV(&recv_iov4, &pCaptureStatusofStream, sizeof(enum cudaStreamCaptureStatus*)); + // size_t tmp4 = 0; + // ReceiveData(recv_iov4, &tmp4, &pCaptureStatusofStream, 0, &clientEp->_dataWorker, UCS_MEMORY_TYPE_HOST, NULL); + // tool::Logging(myName, "pCaptureStatusofStream = %d, size = %zu\n", pCaptureStatusofStream, tmp4); + // recv pId + + reqBuf_4.Reset(sizeof(enum cudaStreamCaptureStatus*)+sizeof(unsigned long long *), reqBuf_4._dataBuffer); + + ucp_dt_iov_t *recv_iov5; + PrepareSingleIOV(&recv_iov5, reqBuf_4._dataBuffer, sizeof(enum cudaStreamCaptureStatus*)+sizeof(unsigned long long *)); + size_t tmp5 = 0; + ReceiveData(recv_iov5, &tmp5, reqBuf_4._dataBuffer, 0, &clientEp->_dataWorker, UCS_MEMORY_TYPE_HOST, NULL); + pCaptureStatus = reqBuf_4.Pop(); + pId = reqBuf_4.Pop(); + printf("cudaStreamGetCaptureInfo success, status = %d, pId = %lld\n", pCaptureStatus, pId); + + + + // req5: cudaStreamDestroy + RequestBuffer reqBuf_5 = RequestBuffer(sizeof(cudaStream_t)); + reqBuf_5.Push(CUDA_STREAM_DESTROY); + reqBuf_5.Push64BitPointer(stream); + iov[0].length = reqBuf_5.GetSize(); + iov[0].buffer = reqBuf_5._dataBuffer; + tool::Logging(myName, "iov[0].length = %zu\n", iov[0].length); + SendData(iov, reqNum, header, sizeof(size_t) * reqNum, amID, &clientEp->_dataWorker, &clientEp->_clientEp); + + // end + + //free(iov); + + //sleep(5); + + cleanup(); + + return 0; +} + diff --git a/GPU-Virtual-Service/gpu-remoting/src/client/clientCkpt.cc b/GPU-Virtual-Service/gpu-remoting/src/client/clientCkpt.cc new file mode 100644 index 0000000..381b1f4 --- /dev/null +++ b/GPU-Virtual-Service/gpu-remoting/src/client/clientCkpt.cc @@ -0,0 +1,275 @@ +#include "../../include/clientEndpoint.h" +#include + +void ClientEndpoint::Checkpointing() { + // std::ofstream outFile("api_records.log", std::ios::out); + // for (auto it = reqIOVList.begin(); it != reqIOVList.end(); ++it) { + // RequestIOV* tmpReqIOV = &(*it); + // outFile << tmpReqIOV->GetRequestType() << std::endl; + // } + // outFile.close(); + + auto start = std::chrono::high_resolution_clock::now(); + ShrinkReqIOVList(); + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration = end - start; + + tool::Logging(LOG_INFO, myName_, "reqIOVList has %zu requests after shrinking(using %f seconds)\n", reqIOVList.size(), duration.count()); + isReConnected = false; // reset reconnection flag + + // std::ofstream outFile2("api_records_shrink.log", std::ios::out); + // for (auto it = reqIOVList.begin(); it != reqIOVList.end(); ++it) { + // RequestIOV* tmpReqIOV = &(*it); + // outFile2 << tmpReqIOV->GetRequestType(); + // if (tmpReqIOV->GetRequestType() == CUDA_EVENT_RECORD) { + // outFile2 << " " << tmpReqIOV->GetHandleByIndex(0) << ", " << tmpReqIOV->GetHandleByIndex(1); + // } + // outFile2 << std::endl; + // } + // outFile2.close(); + + // debug: + // if (ckptCnt >= 2) { + // exit(EXIT_FAILURE); + // } + // else { + // ckptCnt ++; + // } + + auto it = reqIOVList.end(); + --it; + recordedReq = &(*it); + + // ckptIter = 0; +} + +void ClientEndpoint::Replay() { + auto start_replay = std::chrono::high_resolution_clock::now(); + tool::Logging(LOG_INFO, myName_, "start replaying %zu requests from registerIOVList\n", regIOVList.size()); + SendRegisterRequest(this, true); + + // std::ofstream outFile2("api_records_replay.log", std::ios::out); + tool::Logging(LOG_INFO, myName_, "start replaying %zu requests from requestIOVList\n", reqIOVList.size()); + for (auto it = reqIOVList.begin(); it != reqIOVList.end(); ++it) { + RequestIOV* tmpReqIOV = &(*it); + // outFile2 << tmpReqIOV->GetRequestType() << std::endl; + + if (tmpReqIOV->GetRequestType() == NCCL_GET_UNIQUE_ID) { + ncclUniqueId uniqueId; + RequestIOV resBuf = RequestIOV(); + resBuf.Push(uniqueId); + SendRequestRecvResponse(tmpReqIOV, &resBuf, true, false); +#ifdef GV_GPUMAP + gpuIdMap->UpdateUniqueID((uint8_t*)&uniqueId, sizeof(ncclUniqueId)); + tool::Logging(LOG_INFO, myName_, "[pid:%d, tid:%d] updated uniqueId\n", _processID, _threadID); +#else + tool::Logging(LOG_ERROR, myName_, "Replaying for ncclGetUniqueId is not supported in non-GV_GPUMAP mode\n"); + exit(EXIT_FAILURE); +#endif // GV_GPUMAP + continue; + } + else if (tmpReqIOV->GetRequestType() == NCCL_COMM_INIT_RANK + || tmpReqIOV->GetRequestType() == NCCL_COMM_INIT_RANK_CONFIG) { + ncclUniqueId uniqueId; +#ifdef GV_GPUMAP + gpuIdMap->RequestUniqueID((uint8_t*)&uniqueId, sizeof(ncclUniqueId)); + tmpReqIOV->SetElement(2, &uniqueId, sizeof(ncclUniqueId)); + SendRequest(tmpReqIOV, true, false); + tool::Logging(LOG_INFO, myName_, "[pid:%d, tid:%d] requested uniqueId\n", _processID, _threadID); +#else + tool::Logging(LOG_ERROR, myName_, "Replaying for ncclCommInitRank is not supported in non-GV_GPUMAP mode\n"); + exit(EXIT_FAILURE); +#endif // GV_GPUMAP + } + else if (tmpReqIOV->GetRequestType() == CUDA_MEMCPY_ASYNC_H2D) { + SendRequestH2D(tmpReqIOV, (uint8_t*)tmpReqIOV->GetHeaders(), sizeof(cudaMemcpyKind)+sizeof(size_t)+sizeof(uint64_t)+sizeof(uint8_t)+sizeof(uint64_t), false, false); + } + else if (tmpReqIOV->GetRequestType() == CUDA_STREAM_SYNCHRONIZE || tmpReqIOV->GetRequestType() == NCCL_COMM_GET_ASYNC_ERROR) { + int result = 0; + RequestIOV resBuf = RequestIOV(); + resBuf.Push(&result); + SendRequestRecvResponse(tmpReqIOV, &resBuf, true, false); + } + else { + SendRequest(tmpReqIOV, true, false); + } + } + // outFile2.close(); + + // eval for replay time + size_t free = 0; + size_t total = 0; + int replayFinished = 1; + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDA_MEM_GET_INFO); + reqBuf.Push(replayFinished); // dummy + RequestIOV resBuf = RequestIOV(); + resBuf.Push(&free); + resBuf.Push(&total); + SendRequestRecvResponse(&reqBuf, &resBuf); + const char* recovery_path = std::getenv("FLEXGV_RECOVERY_CSV_PATH"); + if (recovery_path) { + auto end_replay = std::chrono::high_resolution_clock::now(); + tool::Logging(LOG_INFO, myName_, "replay finished with %f seconds\n", std::chrono::duration(end_replay - start_replay).count()); + + std::ofstream outFile; + outFile.open(recovery_path, std::ios::app); + if (outFile.tellp() == 0) { + outFile << "Time" << std::endl; + } + outFile << std::chrono::duration(end_replay - start_replay).count() << std::endl; + outFile.close(); + } + + tool::Logging(LOG_INFO, myName_, "replay finished\n"); +} + +void ClientEndpoint::UpdateReqIOVList(RequestIOV* reqBuffer) { + // if (recordedReq == NULL) { + // return; // not started yet + // } + + int reqType = reqBuffer->GetRequestType(); + + if (std::binary_search(std::begin(NotNeedRecordAPIs), std::end(NotNeedRecordAPIs), reqType)) { + return; // no need to record these apis + } else { + RequestIOV* bakReqBuffer = reqBuffer->Clone(); + listLock_.lock(); + reqIOVList.push_back(*bakReqBuffer); + listLock_.unlock(); + } +} + +void ClientEndpoint::UpdateReqIOVList(RequestIOV* reqBuffer, uint8_t* header, size_t headerSize) { + size_t memcpyCnt = *(size_t*)(header + sizeof(cudaMemcpyKind)); + cudaStream_t memcpyStream = *(cudaStream_t*)(header + sizeof(cudaMemcpyKind)+sizeof(size_t)); + void* memcpyDst = *(void**)(header + sizeof(cudaMemcpyKind)+sizeof(size_t)+sizeof(uint64_t)+sizeof(uint8_t)); + size_t memsetValue = 0; + bool isTrainTensor = false; + + for (const auto& tensor : tensorByteList) { + if (tensor.size == memcpyCnt) { + return; // means this memcpy request is for train dataloading + } + } + + // RequestIOV reqBuf = RequestIOV(); + // reqBuf.PushRequestType(CUDA_MEMSET_ASYNC); + // reqBuf.Push64BitPointer(memcpyDst); + // reqBuf.Push(memsetValue); + // reqBuf.Push(memcpyCnt); + // reqBuf.Push64BitPointer(memcpyStream); + // reqBuf.PushThreadID(ttID); + // RequestIOV* bakReqBuffer = reqBuf.Clone(); + + RequestIOV* bakReqBuffer = reqBuffer->Clone(header, headerSize); + + tool::Logging(LOG_DEBUG, myName_, "recorded cudaMemcpyAsync request: dst=%p, count=%zu\n", memcpyDst, memcpyCnt); + + listLock_.lock(); + reqIOVList.push_back(*bakReqBuffer); + listLock_.unlock(); +} + +void ClientEndpoint::ShrinkReqIOVList() { + robin_hood::unordered_flat_set destroyHandles; + robin_hood::unordered_flat_set setHandles; + + // bool shrinkWhole = reqIOVList.size() > BACKUP_API_MAX_NUM ? true : false; + bool shrinkWhole = true; + bool exitFlag = false; + bool lastNewIter = true; + + int tensorIdx = tensorByteList.size() - 1; + auto it = reqIOVList.end(); + --it; + while (!exitFlag) { + auto eraseIt = it; // avoid invalid iterator + + RequestIOV* tmpReqIOV = &(*eraseIt); + int reqType = tmpReqIOV->GetRequestType(); + + bool needErase = false; + if (tmpReqIOV == recordedReq) { + needErase = true; + exitFlag = shrinkWhole == false ? true : false; + } + else if (reqType == NEW_ITERATION_REQ) { + needErase = lastNewIter == false ? true : false; + lastNewIter = false; + } + // else if (tensorIdx >= 0) { + // needErase = false; // reserve the last memcpy request for each tensor + // if (reqType == CUDA_MEMCPY_ASYNC_H2D) { + // uint8_t* headers = (uint8_t*)tmpReqIOV->GetHeaders(); + // size_t memcpyCnt = *(size_t*)(headers + sizeof(cudaMemcpyKind)); + // void* memcpyDst = *(void**)(headers + sizeof(cudaMemcpyKind)+sizeof(size_t)+sizeof(uint64_t)+sizeof(uint8_t)); + // if (memcpyCnt == tensorByteList[tensorIdx].size && memcpyDst == tensorByteList[tensorIdx].devPtr) { + // tool::Logging(LOG_INFO, myName_, "reserved memcpy request for tensor#%d: dst=%p, count=%zu\n", tensorIdx, memcpyDst, memcpyCnt); + // tensorIdx--; + // } + // else { + // tool::Logging(LOG_INFO, myName_, "skipped memcpy request for tensor#%d: dst=%p, count=%zu\n", tensorIdx, memcpyDst, memcpyCnt); + // } + // } + // else { + // tool::Logging(LOG_INFO, myName_, "skipped request#%d\n", reqType); + // } + // } + else if (std::binary_search(std::begin(ComputeAPIs), std::end(ComputeAPIs), reqType) + || reqType == CUDA_MALLOC + || reqType == CUDA_MEMSET || reqType == CUDA_MEMSET_ASYNC + || reqType == CUDA_MEMCPY_H2D || reqType == CUDA_MEMCPY_ASYNC_H2D || reqType == CUDA_MEMCPY_ASYNC_D2D + ) { //todo: memory-related APIs + needErase = true; + } + else if (std::binary_search(std::begin(DestroyAPIs), std::end(DestroyAPIs), reqType)) { + if (reqType == NCCL_COMM_DEREGISTER) { + destroyHandles.insert(tmpReqIOV->GetHandleByIndex(1)); + } + else { + destroyHandles.insert(tmpReqIOV->GetHandleByIndex(0)); + } + needErase = true; + } + else if (reqType == CUDNN_SET_STREAM + || reqType == CUBLAS_SET_STREAM_V2 || reqType == CUBLAS_SET_MATH_MODE) { + std::string tmpKey = std::to_string(reqType) + "_" + std::to_string(tmpReqIOV->GetHandleByIndex(0)); + if (setHandles.contains(tmpKey)) { + needErase = true; + } + else { + setHandles.insert(tmpKey); + } + } + else { + ucp_dt_iov_t* iovs = tmpReqIOV->GetIOVs(); + size_t iovNum = tmpReqIOV->GetNum(); + for (size_t i = 0; i < iovNum; i++) { + if (iovs[i].length != sizeof(uint64_t) || *(void**)iovs[i].buffer == NULL) { + continue; + } + if (destroyHandles.contains(*(uint64_t*)iovs[i].buffer)) { + needErase = true; + break; + } + } + } + + if (eraseIt != reqIOVList.begin()) { + it--; + } + else { + exitFlag = true; + } + + if (needErase) { + reqIOVList.erase(eraseIt); + delete tmpReqIOV; + } + } + + destroyHandles.clear(); + recordedReq = NULL; +} \ No newline at end of file diff --git a/GPU-Virtual-Service/gpu-remoting/src/client/clientEndpoint.cc b/GPU-Virtual-Service/gpu-remoting/src/client/clientEndpoint.cc new file mode 100644 index 0000000..fb8c69a --- /dev/null +++ b/GPU-Virtual-Service/gpu-remoting/src/client/clientEndpoint.cc @@ -0,0 +1,345 @@ +#include "../../include/clientEndpoint.h" + +ClientEndpoint::ClientEndpoint(uint64_t clientID, size_t priority, ucp_worker_h clientWorker, int dev){ + _clientID = clientID; + priority_ = priority; + _dataWorker = clientWorker; + + clientIP_ = (char*)malloc(IP_STRING_LEN); + clientPort_ = (char*)malloc(PORT_STRING_LEN); + + _myDevIdx = dev; + + _threadID = static_cast(::syscall(SYS_gettid)); + _processID = getpid(); + Connect(); + + _shmOpt = shmOpt; + + tool::Logging(LOG_DEBUG, myName_, "[pid:%d, tid:%d] client endpoint object is created successfully.\n", _processID, _threadID); +} + +void ClientEndpoint::Connect(bool replay) { + uint16_t serverPort; +#ifdef GV_GPUMAP + GpuInfoEntry_t* ginfo; + +#ifdef GV_BACKUP + if(replay){ + std::cout << "replay" << std::endl; + gpuIdMap->ReallocGPU(); + } +#endif + + gpuIdMap->GetGPUinfo(_myDevIdx, &ginfo); + const std::string serverIP(ginfo->nodeIp); + serverPort = ginfo->nodePort; + shmOpt->WriteIpPort(ginfo->dataPort, ginfo->dataIp); +#else + const std::string& serverIP = config_->GetServerIp(); + serverPort = config_->GetServerPort(); +#endif + + ucp_ep_params_t ep_params; + ucs_status_t status; + struct sockaddr_storage serverAddr; + tool::SetSockAddr(serverIP.c_str(), serverPort, &serverAddr, AF_INET); + ep_params.field_mask = UCP_EP_PARAM_FIELD_FLAGS | + UCP_EP_PARAM_FIELD_SOCK_ADDR | + UCP_EP_PARAM_FIELD_ERR_HANDLER | + UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE; + + ep_params.err_mode = UCP_ERR_HANDLING_MODE_PEER; + ep_params.err_handler.cb = ClientErrorCallback; + ep_params.err_handler.arg = &connStatus_; + ep_params.flags = UCP_EP_PARAMS_FLAGS_CLIENT_SERVER | + UCP_EP_PARAMS_FLAGS_SEND_CLIENT_ID; + ep_params.sockaddr.addr = (const struct sockaddr*)&serverAddr; + ep_params.sockaddr.addrlen = sizeof(struct sockaddr_storage); + + if ((status = ucp_ep_create(_dataWorker, &ep_params, &_serverEp)) != UCS_OK) { + tool::Logging(LOG_ERROR, myName_, "failed to create an endpoint and connect to the server (%s)\n", ucs_status_string(status)); + ucp_worker_destroy(_dataWorker); + } + else { + ucp_am_handler_param_t param; + param.field_mask = UCP_AM_HANDLER_PARAM_FIELD_ID | + UCP_AM_HANDLER_PARAM_FIELD_CB | + UCP_AM_HANDLER_PARAM_FIELD_ARG; + param.id = SERVER_STATUS; + param.cb = ServerStatusCallback; + param.arg = this; + if ((status = ucp_worker_set_am_recv_handler(_dataWorker, ¶m)) != UCS_OK) { + tool::Logging(LOG_ERROR, myName_, "failed to set am handler (%s)\n", ucs_status_string(status)); + } + else { + SendMainDevice(replay); + } + } +} + +/** + * Close UCP endpoint. + */ +void ClientEndpoint::CloseEp(uint64_t flags) { + ucp_request_param_t param; + ucs_status_t status; + void *close_req; + + param.op_attr_mask = UCP_OP_ATTR_FIELD_FLAGS; + param.flags = flags; + close_req = ucp_ep_close_nbx(_serverEp, ¶m); + if (UCS_PTR_IS_PTR(close_req)) { + do { + ucp_worker_progress(_dataWorker); + status = ucp_request_check_status(close_req); + } while (status == UCS_INPROGRESS); + ucp_request_free(close_req); + } else { + status = UCS_PTR_STATUS(close_req); + } + + if (status != UCS_OK) { + fprintf(stderr, "failed to close ep %p: %s\n", (void*)_serverEp, + ucs_status_string(status)); + } +} + +ClientEndpoint::~ClientEndpoint() { + CloseEp(UCP_EP_CLOSE_MODE_FLUSH); + ucp_worker_destroy(_dataWorker); + + // if (_shmOpt != NULL) { //todo + // delete _shmOpt; + // } + tool::Logging(LOG_INFO, myName_, "close the client endpoint object(GPU #%d).\n", _myDevIdx); +} + +ucs_status_t ClientEndpoint::SendRequest(RequestIOV* reqBuffer, bool forcedEager, bool isCheckpoint) { + tool::Logging(LOG_COMM, "SendRequest", "ready prepare amID: %d, apiID: %d, ttID: %d\n", (ttID << RECV_AM_SHIFT_BIT) + reqBuffer->GetRequestType(), reqBuffer->GetRequestType(), ttID); + + reqBuffer->PushThreadID(ttID); + ucs_status_t status = SendData(reqBuffer->GetIOVs(), reqBuffer->GetNum(), + reqBuffer->GetHeaders(), reqBuffer->GetHeaderSize(), + reqBuffer->GetRequestType(), &_dataWorker, &_serverEp, + true, UCS_MEMORY_TYPE_HOST, forcedEager); +#ifdef GV_BACKUP + if (status == UCS_ERR_CONNECTION_RESET) { + { + std::unique_lock lock(reConnectMutex); + if (!isReConnected) { // only the first thread can reconnect + tool::Logging(LOG_INFO, "SendRequest", "ready to reconnect...\n"); + sleep(10); + Connect(true); + isReConnected = true; + } + } + + tool::Logging(LOG_INFO, "SendRequest", "[pid:%d, ttid:%d] ready to resend previous request (requestType: %d)\n", _processID, ttID, reqBuffer->GetRequestType()); + ucs_status_t status = SendData(reqBuffer->GetIOVs(), reqBuffer->GetNum(), + reqBuffer->GetHeaders(), reqBuffer->GetHeaderSize(), + reqBuffer->GetRequestType(), &_dataWorker, &_serverEp, + true, UCS_MEMORY_TYPE_HOST, forcedEager); + if (status != UCS_OK) { + tool::Logging(LOG_ERROR, "SendRequest", "[pid:%d, ttid:%d] failed to resend the request (requestType: %d)\n", _processID, ttID, reqBuffer->GetRequestType()); + exit(EXIT_FAILURE); + } + } + // isCheckpoint = false; + if (isCheckpoint) { + UpdateReqIOVList(reqBuffer); + } +#endif // GV_BACKUP + CheckTensors(reqBuffer->GetRequestType()); + return status; +} + +void ClientEndpoint::SendRegisterRequest(ClientEndpoint* curEp, bool forcedEager) { + // tool::Logging(LOG_COMM, "SendRequest", "ready prepare amID: %d, apiID: %d, ttID: %d\n", (ttID << RECV_AM_SHIFT_BIT) + reqBuffer->GetRequestType(), reqBuffer->GetRequestType(), ttID); + + // reqBuffer->PushThreadID(ttID); + // reqBuffer->Print(); + for (RegisterIOV* reqBuffer : regIOVList) { + SendData(reqBuffer->GetIOVs(), reqBuffer->GetNum(), + reqBuffer->GetUcpHeaders(), reqBuffer->GetUcpHeaderSize(), + reqBuffer->GetRequestType(), &curEp->_dataWorker, &curEp->_serverEp, + true, UCS_MEMORY_TYPE_HOST, forcedEager); + } +} + +void ClientEndpoint::SendRequestH2D(RequestIOV* reqBuffer, uint8_t* header, size_t headerSize, bool forcedEager, bool isCheckpoint) { + // no response required for RNDV request, so no need to push threadID + SendData(reqBuffer->GetIOVs(), reqBuffer->GetNum(), + (size_t*)header, headerSize, + reqBuffer->GetRequestType(), &_dataWorker, &_serverEp, + false, UCS_MEMORY_TYPE_HOST, forcedEager); + CheckTensors(reqBuffer->GetRequestType()); + +#ifdef GV_BACKUP + if (memcpyRecord_ && isCheckpoint) { + UpdateReqIOVList(reqBuffer, header, headerSize); + } +#endif // GV_BACKUP +} + +void ClientEndpoint::SendNewIterRequest(size_t iterNum) { + // Send a request to the server to notify the start of a new iteration + RequestIOV reqBuffer; + reqBuffer.PushRequestType(NEW_ITERATION_REQ); + reqBuffer.PushConst(tensorByteList.size()); + reqBuffer.Push(tensorByteList.data(), tensorByteList.size()); + SendRequest(&reqBuffer, true); + +#ifdef GV_BACKUP + if (iterNum + 1 == BACKUP_PERIOD) { + memcpyRecord_ = true; // prepare for recording the memcpy requests for the next iteration (before shrinking the list) + } + if (iterNum > 0 && iterNum % BACKUP_PERIOD == 0) { + Checkpointing(); + } +#endif // BACKUP + + // cudaStreamSynchronize(NULL); +} + +void ClientEndpoint::SendMainDevice(bool replay) { + bool loadData = replay; + int gpuIdInNode = _myDevIdx; +#ifdef GV_GPUMAP + gpuIdMap->GetGPUId(_myDevIdx, &gpuIdInNode); +#endif // GV_GPUMAP + + RequestIOV reqBuffer; + reqBuffer.PushRequestType(CUDA_SET_MAIN_DEVICE); + reqBuffer.Push(gpuIdInNode); + reqBuffer.Push(_processID); + reqBuffer.Push(priority_); + reqBuffer.Push(loadData); + reqBuffer.PushThreadID(ttID); + SendData(reqBuffer.GetIOVs(), reqBuffer.GetNum(), + reqBuffer.GetHeaders(), reqBuffer.GetHeaderSize(), + reqBuffer.GetRequestType(), &_dataWorker, &_serverEp, + false, UCS_MEMORY_TYPE_HOST, true); + + + ucp_ep_attr_t ep_attr; + ep_attr.field_mask = UCP_EP_ATTR_FIELD_LOCAL_SOCKADDR | + UCP_EP_ATTR_FIELD_REMOTE_SOCKADDR; + ucs_status_t status = ucp_ep_query(_serverEp, &ep_attr); + if (status != UCS_OK) { + tool::Logging(LOG_ERROR, myName_, "failed to query the endpoint: %s\n", ucs_status_string(status)); + return; + } + else { + tool::GetIpStrFromSockaddr(&ep_attr.local_sockaddr, clientIP_, IP_STRING_LEN); + tool::GetPortStrFromSockaddr(&ep_attr.local_sockaddr, clientPort_, PORT_STRING_LEN); + tool::Logging(LOG_INFO, myName_, "[pid:%d, tid:%d] client(%s:%s) has connected to server and will use GPU #%d(IDinNode:#%d)\n", _processID, _threadID, clientIP_, clientPort_, _myDevIdx, gpuIdInNode); + } + connStatus_.isClosed = false; + + if (replay) { + Replay(); + } +} + +ucs_status_t ClientEndpoint::SendRequestRecvResponse(RequestIOV* reqBuffer, RequestIOV* responseBuffer, bool forcedEager, bool isCheckpoint) { + const char* myName = "SendRequestRecvResponse"; + ucs_status_t status = UCS_OK; + // isCheckpoint = false; + + // prepare for receiving response first (avoid receiving message before setting the handler) + ucp_dt_iov_t *iov = responseBuffer->GetIOVs(); + size_t iovNum = responseBuffer->GetNum(); + NewActiveMessageDesc_t am_request_ctx = { .complete = 0, .is_rndv = 0, .mem_type = UCS_MEMORY_TYPE_HOST, .desc = NULL, + .iov = iov, .iov_num = iovNum}; + // .iov = iov, .iov_num = iovNum, .send_amID = ((ttID << RECV_AM_SHIFT_BIT) + reqBuffer->GetRequestType())}; + Request_t recv_request_ctx = {.type = 1, .complete = 0}; + ucp_am_handler_param_t param1; + param1.field_mask = UCP_AM_HANDLER_PARAM_FIELD_ID | + UCP_AM_HANDLER_PARAM_FIELD_CB | + UCP_AM_HANDLER_PARAM_FIELD_ARG; + param1.id = (ttID << RECV_AM_SHIFT_BIT) + reqBuffer->GetRequestType(); + param1.cb = RetrieveData; + param1.arg = &am_request_ctx; + tool::Logging(LOG_COMM, myName, "amID: %d, requestType: %d, ttID: %d\n", param1.id, reqBuffer->GetRequestType(), ttID); + status = ucp_worker_set_am_recv_handler(_dataWorker, ¶m1); + if (status != UCS_OK) { + tool::Logging(LOG_ERROR, myName, "failed to set am handler: %s\n", ucs_status_string(status)); + return status; + } + else { + tool::Logging(LOG_COMM, myName, "set am handler successfully for param1.id: %d\n", param1.id); + } + + // send request and wait for completion + SendRequest(reqBuffer, forcedEager, false); + + // wait for response + tool::Logging(LOG_COMM, myName, "waiting for server to send response.\n"); + while (!am_request_ctx.complete && !connStatus_.isClosed) { // waiting ActiveMessageRecvCallback() to be invoked + ucp_worker_progress(_dataWorker); + } + if (connStatus_.isClosed) { + tool::Logging(LOG_DEBUG, myName, "connection is closed, try to resend the request.\n"); + + // re-send request and wait for completion + SendRequest(reqBuffer, forcedEager, false); + ucp_worker_set_am_recv_handler(_dataWorker, ¶m1); // make sure the handler is set + tool::Logging(LOG_COMM, myName, "waiting for server to send response again.\n"); + while (!am_request_ctx.complete && !connStatus_.isClosed) { // waiting ActiveMessageRecvCallback() to be invoked + ucp_worker_progress(_dataWorker); + } + if (connStatus_.isClosed) { + tool::Logging(LOG_ERROR, myName, "connection is still closed, failed to receive response.\n"); + return UCS_ERR_CONNECTION_RESET; + } + tool::Logging(LOG_COMM, myName, "response has arrived after re-sending the request.\n"); + } + + if (!am_request_ctx.is_rndv) { + tool::Logging(LOG_COMM, myName, "Eager response has arrived.\n"); + } + else { + tool::Logging(LOG_COMM, myName, "Rendezvous response has arrived.\n"); + recv_request_ctx.complete = 0; + ucp_request_param_t param2; + param2.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | + UCP_OP_ATTR_FIELD_DATATYPE | + UCP_OP_ATTR_FIELD_USER_DATA| + UCP_OP_ATTR_FIELD_MEMORY_TYPE; + param2.op_attr_mask |= UCP_OP_ATTR_FLAG_NO_IMM_CMPL; + param2.datatype = ucp_dt_make_contig(1); + param2.user_data = &recv_request_ctx; + param2.cb.recv_am = (ucp_am_recv_data_nbx_callback_t)RecvCallBack; + param2.memory_type = UCS_MEMORY_TYPE_HOST; + Request_t* rndv_request = (Request_t*)ucp_am_recv_data_nbx(_dataWorker, + am_request_ctx.desc, + iov[0].buffer, iov[0].length, + ¶m2); + status = Wait(rndv_request, &recv_request_ctx, &_dataWorker); + if (status != UCS_OK) { + tool::Logging(LOG_ERROR, myName, "ucp_am_recv_data_nbx failed: %s\n", ucs_status_string(status)); + } + else { + tool::Logging(LOG_COMM, myName, "ucp_am_recv_data_nbx completed successfully.\n"); + } + ucp_request_free(rndv_request); + } + +#ifdef GV_BACKUP + if (isCheckpoint) { + int reqType = reqBuffer->GetRequestType(); + if (std::binary_search(std::begin(CreateAPIs), std::end(CreateAPIs), reqType)) { + UpdateReqIOVList(reqBuffer); + } + else if (reqType == CUDA_STREAM_SYNCHRONIZE) { + UpdateReqIOVList(reqBuffer); + } + else if (reqType == CUDA_MALLOC) { + UpdateReqIOVList(reqBuffer); + } + } +#endif // GV_BACKUP + + return status; +} \ No newline at end of file diff --git a/GPU-Virtual-Service/gpu-remoting/src/client/clientHook.cc b/GPU-Virtual-Service/gpu-remoting/src/client/clientHook.cc new file mode 100644 index 0000000..81273e8 --- /dev/null +++ b/GPU-Virtual-Service/gpu-remoting/src/client/clientHook.cc @@ -0,0 +1,224 @@ +#include "../../include/hook/hook.h" + + +int mainDevIdx; // different ranks(processes) may use different devices +std::vector clientEpList; +std::vector threadValidList; + +Configure* config_; +UCPConnection* connectionObj; +SharedMemoryOpt* shmOpt; +GPUidMap* gpuIdMap; +std::once_flag initFlag; +std::once_flag registerFlag; +std::vector regIOVList; +std::vector registeredKernels; +robin_hood::unordered_flat_map mapHost2KernelInfo; + +bool isReConnected = false; +std::mutex reConnectMutex; +std::shared_mutex threadSharedMutex; +int processID; // the process ID of the current process +int threadNum; // how many sub-threads have been created in this process +int commDevIdx; // the device index for the communicator +thread_local int threadID; // e.g, 1641432 +thread_local int ttID; // e.g, 1, 2, 3, ... +thread_local ClientEndpoint* clientEpObj; // the client endpoint object for the current thread +thread_local int myDevIdx; +thread_local int lastReqType = -1; +thread_local bool isTraining = false; +thread_local bool batchCollected = false; +thread_local int curTensorIdx = -1; +thread_local size_t curIter = 0; +thread_local std::vector tensorByteList; + +void Intialize() { + config_ = new Configure("config.json", true); + // regIOV = new RegisterIOV(); + regIOVList.reserve(240); + registeredKernels.reserve(1000); + clientEpList.reserve(config_->GetReqGPUnum()); + for (int i = 0; i < config_->GetReqGPUnum(); i++) { + clientEpList.push_back(nullptr); + } + + processID = getpid(); + std::string shmName = "/flexgv_shm_" + std::to_string(config_->GetClientID()) + "_" + std::to_string(processID) + "_datatype"; + // CurType + dataFeeder IP + dataFeeder Port + CurBatchSize + shmOpt = new SharedMemoryOpt(shmName, sizeof(BatchInfo_t) + sizeof(int) + IP_STRING_LEN); + + + connectionObj = new UCPConnection(true); +#ifdef GV_GPUMAP +#ifndef GV_MSGHANDLER + gpuIdMap = new GPUidMap(config_->GetReqGPUnum(), config_->GetClientID(), config_->GetProxyIp(), config_->GetProxyPort()); +#else + gpuIdMap = new GPUidMap(config_->GetReqGPUnum(), config_->GetClientID(), config_->GetModel(), config_->GetBatchSize(), config_->GetProxyIp(), config_->GetProxyPort()); +#endif +// #elif +// mainDevIdx = 0; // todo: retrieve the available device from shared memory +// SwitchClientEp(mainDevIdx); // switch to the main device as default +#endif + + // threadID = static_cast(::syscall(SYS_gettid)); + // threadNum++; + // ttID = threadNum; +} + +void SwitchClientEp(int dev, bool threadInit) { // used for cudaSetDevice or first-time initialization + ClientEndpoint* localClientEpObj = nullptr; + { + std::shared_lock readLock(threadSharedMutex); + localClientEpObj = clientEpList[dev]; // read from the list + } + if (localClientEpObj == nullptr // the clientEp for dev is not created yet + || threadInit) { // or the thread is newly created, so need to update the threadID + std::unique_lock writeLock(threadSharedMutex); + + // re-check: another thread may have created the object while waiting for the lock + if (clientEpList[dev] == nullptr) { // not found, create a new one for dev + uint64_t clientID = config_->GetClientID(); + size_t priority = config_->GetPriority(); + ClientEndpoint* newClientEpObj = new ClientEndpoint(clientID, priority, connectionObj->CreateWorker(true, clientID), dev); + clientEpList[dev] = newClientEpObj; + } + localClientEpObj = clientEpList[dev]; + + if (threadInit) { // used for sub-thread initialization + threadID = static_cast(::syscall(SYS_gettid)); + threadNum++; + ttID = threadNum; + threadValidList.push_back(true); + tool::Logging(LOG_DEBUG, "SetupClientEpIfNeeded", "new thread#%d(%d) in process(%d) has been created\n", ttID, threadID, processID); + } + else { + threadValidList[ttID - 1] = true; + } + } + clientEpObj = localClientEpObj; // switch to the target object +} + +void DestoryResources() { + if (config_ != nullptr) { + delete config_; + config_ = nullptr; + } + if (!mapHost2KernelInfo.empty()) { + for(auto it = mapHost2KernelInfo.begin(); it != mapHost2KernelInfo.end(); it++) { + free(it->second); + } + mapHost2KernelInfo.clear(); + } + if (!regIOVList.empty()) { + for(auto it = regIOVList.begin(); it != regIOVList.end(); it++) { + delete *it; + } + regIOVList.clear(); + } + if (!clientEpList.empty()) { + for(auto it = clientEpList.begin(); it != clientEpList.end(); it++) { + delete *it; + } + clientEpList.clear(); + delete connectionObj; + delete shmOpt; +#ifdef GV_GPUMAP + delete gpuIdMap; +#endif + } +} + +ucs_status_t ServerStatusCallback(void *arg, const void *header, size_t header_length, void *data, size_t length, const ucp_am_recv_param_t *param) { + const char* myName = "ServerStatusCallback"; + ClientEndpoint* curEp = (ClientEndpoint*)arg; + printf("%s: header_length: %zu, length: %zu\n", myName, header_length, length); + RequestIOV reqBuf = RequestIOV(header, header_length, data); + int serverStatus = reqBuf.Pop(); + if (serverStatus == 0) { + tool::Logging(LOG_INFO, myName, "sucess: the server has received the request.\n"); + } + else { + tool::Logging(LOG_ERROR, myName, "failed: the server's status is %d.\n", serverStatus); + } + return UCS_OK; +} + +void ClientErrorCallback(void *arg, ucp_ep_h ep, ucs_status_t status) { + const char* myName = "ClientErrorCallback"; + ConnStatus_t* connStatus = (ConnStatus_t*)arg; + if (status == UCS_ERR_CONNECTION_RESET) { + tool::Logging(LOG_INFO, myName, "connection failed: the server has shutdown the connection early.\n"); + connStatus->isClosed = true; +#ifndef GV_BACKUP + DestoryResources(); + exit(EXIT_FAILURE); +#endif + // exit(0); + // ReConnect(); + } + else if (status == UCS_ERR_ENDPOINT_TIMEOUT) { + tool::Logging(LOG_ERROR, myName, "connection failed: the connection is timed out.\n"); + DestoryResources(); + exit(EXIT_FAILURE); + } + else if (status == UCS_ERR_NOT_CONNECTED) { + tool::Logging(LOG_ERROR, myName, "connection failed: the server is not connected or the connection is closed.\n"); + DestoryResources(); + exit(EXIT_FAILURE); + } + else { + tool::Logging(LOG_ERROR, myName, "connection failed: %d(%s)\n", + status, ucs_status_string(status)); + connStatus->isClosed = true; + } +} + +void CheckTensors(int reqType) { + if (reqType != CUDA_MEMCPY_ASYNC_H2D && batchCollected == false && isTraining == true) { + if (lastReqType == CUDA_MEMCPY_ASYNC_H2D && reqType == CUDA_STREAM_SYNCHRONIZE) { + // maybe next request is cudaMemcpyAsync + tool::Logging(LOG_DEBUG, HOOK_LOG_TAG, "continue to collect next tensor size\n"); + } + else if (tensorByteList.size() >= 2) { // at least 2 tensors are collected + // now cudaMemcpyAsync is no longer called consecutively + batchCollected = true; + + // debug + for (size_t i = 0; i < tensorByteList.size(); i++) { + tool::Logging(LOG_INFO, HOOK_LOG_TAG, "batch[%zu]: %zu\n", i, tensorByteList[i].size); + } + } + else { + tool::Logging(LOG_DEBUG, HOOK_LOG_TAG, "only one tensor is collected, curReqType: %d\n", reqType); + tensorByteList.clear(); + } + } + lastReqType = reqType; +} + +bool CheckIteration(void* dst, size_t size) { + bool isNewIter = false; + if (isTraining == false) { + } + else if (batchCollected == false) { + tensorByteList.push_back({NULL, size}); + } + else if (curTensorIdx >= 0 && (lastReqType != CUDA_MEMCPY_ASYNC_H2D && lastReqType != CUDA_STREAM_SYNCHRONIZE)) { + curTensorIdx = -1; + } + else if (tensorByteList[curTensorIdx + 1].size == size) { + curTensorIdx ++; + tensorByteList[curTensorIdx].devPtr = dst; + if (curTensorIdx == tensorByteList.size() - 1) { + curTensorIdx = -1; + curIter ++; + isNewIter = true; + tool::Logging(HOOK_LOG_TAG, "[pid:%d, tid:%d] curIter: %zu (#%zu in period)\n", processID, threadID, curIter, curIter % BACKUP_PERIOD); + clientEpObj->SendNewIterRequest(curIter); + } + } + else { + curTensorIdx = -1; + } + return isNewIter; +} \ No newline at end of file