From fb0a45c4fd42114950c73ee71c59dd33af01689b Mon Sep 17 00:00:00 2001
From: yumemiso <mengjianzou@qq.com>
Date: Mon, 22 Jun 2026 21:57:06 +0800
Subject: [PATCH] add client endpoint, ckpt, hook methods for gpu
 virtualization in cloud

---
 .../gpu-remoting/include/asyncRequest.h       |  93 +++
 .../gpu-remoting/include/elasticscheduler.h   | 358 ++++++++++++
 .../gpu-remoting/include/gpuIdMap.h           | 239 ++++++++
 .../gpu-remoting/include/mapper.h             | 331 +++++++++++
 .../gpu-remoting/include/msghandler.h         | 184 ++++++
 .../gpu-remoting/include/ptxExtractor.h       | 411 +++++++++++++
 .../gpu-remoting/include/registerIOV.h        | 236 ++++++++
 .../gpu-remoting/include/requestBuffer.h      | 238 ++++++++
 .../gpu-remoting/include/requestIOV.h         | 307 ++++++++++
 .../gpu-remoting/include/scheduler.h          | 544 ++++++++++++++++++
 .../gpu-remoting/include/serverEndpoint.h     | 422 ++++++++++++++
 .../src/client/CMakeLists.txt(clientApp)      |  25 +
 .../gpu-remoting/src/client/clientApp.cc.bak  | 278 +++++++++
 .../gpu-remoting/src/client/clientCkpt.cc     | 275 +++++++++
 .../gpu-remoting/src/client/clientEndpoint.cc | 345 +++++++++++
 .../gpu-remoting/src/client/clientHook.cc     | 224 ++++++++
 16 files changed, 4510 insertions(+)
 create mode 100644 GPU-Virtual-Service/gpu-remoting/include/asyncRequest.h
 create mode 100644 GPU-Virtual-Service/gpu-remoting/include/elasticscheduler.h
 create mode 100644 GPU-Virtual-Service/gpu-remoting/include/gpuIdMap.h
 create mode 100644 GPU-Virtual-Service/gpu-remoting/include/mapper.h
 create mode 100644 GPU-Virtual-Service/gpu-remoting/include/msghandler.h
 create mode 100644 GPU-Virtual-Service/gpu-remoting/include/ptxExtractor.h
 create mode 100644 GPU-Virtual-Service/gpu-remoting/include/registerIOV.h
 create mode 100644 GPU-Virtual-Service/gpu-remoting/include/requestBuffer.h
 create mode 100644 GPU-Virtual-Service/gpu-remoting/include/requestIOV.h
 create mode 100644 GPU-Virtual-Service/gpu-remoting/include/scheduler.h
 create mode 100644 GPU-Virtual-Service/gpu-remoting/include/serverEndpoint.h
 create mode 100644 GPU-Virtual-Service/gpu-remoting/src/client/CMakeLists.txt(clientApp)
 create mode 100644 GPU-Virtual-Service/gpu-remoting/src/client/clientApp.cc.bak
 create mode 100644 GPU-Virtual-Service/gpu-remoting/src/client/clientCkpt.cc
 create mode 100644 GPU-Virtual-Service/gpu-remoting/src/client/clientEndpoint.cc
 create mode 100644 GPU-Virtual-Service/gpu-remoting/src/client/clientHook.cc

diff --git a/GPU-Virtual-Service/gpu-remoting/include/asyncRequest.h b/GPU-Virtual-Service/gpu-remoting/include/asyncRequest.h
new file mode 100644
index 0000000..e028a99
--- /dev/null
+++ b/GPU-Virtual-Service/gpu-remoting/include/asyncRequest.h
@@ -0,0 +1,93 @@
+#ifndef ASYNC_REQUEST_H
+#define ASYNC_REQUEST_H
+
+#include "configure.h"
+#include <boost/function.hpp>
+#include <boost/thread/mutex.hpp>
+#include <boost/thread/thread.hpp>
+#include <boost/thread/thread_functors.hpp>
+
+class AsyncRequest{
+    private:
+        const char* myName_ = "AsyncRequest";
+        boost::mutex _mutex;                 
+        boost::condition_variable _cv;       // wake up waiting thread
+        bool _stop;                          // stop the worker thread
+        bool _newReq;                        // whether there is a new request
+        boost::thread* _worker = nullptr;     // worker thread
+
+    public: 
+        AsyncRequest() : _stop(false), _newReq(false) {
+        }
+
+        void Start(boost::function<void()> func){
+            boost::thread_attributes attrs;
+            attrs.set_stack_size(THREAD_STACK_SIZE);
+            _worker = new boost::thread(attrs, func);
+        }
+
+        inline bool CheckStart() {
+            return _worker != nullptr;
+        }
+
+        void Wait() {
+            boost::unique_lock<boost::mutex> lock(_mutex);
+            _cv.wait(lock, [this] {
+                return !_newReq; // wait until there is a new request
+            });
+        }
+
+        void Notify() {
+            boost::unique_lock<boost::mutex> lock(_mutex);
+            _newReq = !_newReq; // set new request flag 
+            _cv.notify_one(); // notify the worker thread
+        }
+
+        bool Check() {
+            boost::unique_lock<boost::mutex> lock(_mutex);
+            _cv.wait(lock, [this] {
+                return _newReq || _stop;
+            });
+            if (_stop && !_newReq) { // receive stop signal and no new request
+                return false;
+            }
+            return true;
+        }
+
+        bool CheckStop(boost::chrono::seconds timeout) {
+            boost::unique_lock<boost::mutex> lock(_mutex);
+            bool res = _cv.wait_for(lock, timeout, [this] {
+                return _stop;
+            });
+            return res;
+        }
+
+        void Stop() {
+            boost::unique_lock<boost::mutex> lock(_mutex);
+            _stop = true;
+            _cv.notify_one();
+        }
+
+        void Lock() {
+            _mutex.lock();
+        }
+
+        void Unlock() {
+            _mutex.unlock();
+        }
+
+        ~AsyncRequest(){
+            Stop();
+            tool::Logging(LOG_DEBUG, myName_, "ready to destroy\n");
+            if (_worker != nullptr) {
+                if (_worker->joinable()) {
+                    _worker->join();
+                }
+                tool::Logging(LOG_DEBUG, myName_, "worker thread is joined\n");
+                delete _worker;
+            }
+            tool::Logging(LOG_DEBUG, myName_, "finish destroying\n");
+        }
+};
+
+#endif // ASYNC_REQUEST_H
\ No newline at end of file
diff --git a/GPU-Virtual-Service/gpu-remoting/include/elasticscheduler.h b/GPU-Virtual-Service/gpu-remoting/include/elasticscheduler.h
new file mode 100644
index 0000000..a7ca013
--- /dev/null
+++ b/GPU-Virtual-Service/gpu-remoting/include/elasticscheduler.h
@@ -0,0 +1,358 @@
+#include "configure.h"
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cuda_device_runtime_api.h>
+#include <cuda_runtime_api.h>
+#include <iostream>
+#include <netinet/in.h>
+#include <nvml.h>
+#include <driver_types.h>
+#include <queue>
+#include <set>
+#include <strings.h>
+#include <unistd.h>
+#include <unordered_map>
+#include <chrono>
+#include <utility>
+#include <cuda.h>
+#include <vector>
+#include <mutex>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include "constVar.h"
+#include "define.h"
+#include <sys/un.h>
+#include <unistd.h>
+using namespace std;
+
+#define GPU_MAX_NUMe 2
+//高低优先级宏定义，可修改成其他值
+#define HighPriority 0//高优先级
+#define LowPriority 1//低优先级
+#define ProfilingIter 20//Profiling的Iteration数
+
+typedef struct eJob_Info{
+    uint64_t client_id;
+    int device;
+    int priority;
+    int lastIt;//记录上一次迭代数
+    int curIt;//记录当前迭代数
+    int totalIt;//记录总迭代数
+    unordered_map<eJob_Info*, double> coloThrougput;//记录与当前高优先级作业的共置吞吐量
+    chrono::steady_clock::time_point job_start_time;//记录当前作业开始时间
+    int highColoIndex = 0;//高优先级作业用于记录当前profiling到第几个低优先级作业
+    int finishProFlag = 0;//低优先级作业用于记录当前作业是否已经profiling
+    //dummy
+    int tmpFlag;
+
+}eJob_Info_;
+
+
+class eScheduler{
+    private:
+        const char* myName_ = "eScheduler";
+        static set<eJob_Info_*> jobs;
+        static chrono::steady_clock::time_point timers[GPU_MAX_NUMe];//计时器，判断处理哪个队列
+        static unordered_map<eJob_Info*, int> JobIsRunning;//判断任务是否运行，JobIsRunning[job] = 1表示job正在运行
+        static std::mutex mtx_used[GPU_MAX_NUMe];//一堆锁
+        static std::mutex mtx_time[GPU_MAX_NUMe];
+        static std::mutex mtx_low[GPU_MAX_NUMe];
+        static std::mutex mtx_high[GPU_MAX_NUMe];
+        static std::mutex mtx_del[GPU_MAX_NUMe];
+        static vector<eJob_Info_*> HighPriorityQueue[GPU_MAX_NUMe];//高优先级队列，存放高优先级任务
+        static vector<eJob_Info_*> LowPriorityQueue[GPU_MAX_NUMe];//低优先级队列，存放低优先级任务
+        static eJob_Info_* nowHighjob[GPU_MAX_NUMe];//当前 <正在运行> 的 <高优先级任务>
+        static int HighPriorityRunningFlag[GPU_MAX_NUMe];//当前有几个高优先级任务在运行
+        static int LowPriorityRunningFlag[GPU_MAX_NUMe];//当前有几个低优先级任务在运行
+        static int ColoRunningFlag[GPU_MAX_NUMe];//当前是否有任务在 <共置> 运行
+        static int FinishProfiling[GPU_MAX_NUMe];//当前高优先级任务是否已经完成了所有的低优先级任务的profiling
+        static int OneLastLowJob[GPU_MAX_NUMe];//当前是否只有一个低优先级任务，用于最后一个低优先级任务唤醒自己
+
+        static vector<eJob_Info_*> LowPriorityWaitingQueue[GPU_MAX_NUMe];//低优先级队列，存放低优先级任务
+
+        eJob_Info_* job;
+
+    public:
+        eScheduler(uint64_t clientId){
+            tool::Logging(LOG_INFO,myName_,"eScheduler::eScheduler(): client_ID:%lu\n",clientId);
+            job = new eJob_Info_();
+            job->client_id = clientId;
+        }
+
+        ~eScheduler(){
+            tool::Logging(LOG_INFO,myName_,"eScheduler::~eScheduler(): client_ID:%lu\n",job->client_id);
+            delete job;
+        }
+
+        void enqueue(uint64_t client_id, int device, int priority){//每次接受一个劫持的API就调用一次
+            if(jobs.count(job) == 0){//作业第一次到达。
+                job->device = device;
+                job->priority = priority;
+                jobs.insert(job);//插入所有作业的队列
+                if(job->priority == HighPriority){//高优先级任务
+                    HighPriorityQueue[device].push_back(job);
+                    JobIsRunning[job] = 0;
+                    if(HighPriorityRunningFlag[device] == 0){//如果当前没有高优先级任务在运行,启动当前任务
+                        HighPriorityRunningFlag[device]++;
+                        JobIsRunning[job] = 1;
+                        nowHighjob[device] = job;
+                    }
+                    // nowHighjob[device] = job;
+                    tool::Logging(LOG_INFO,myName_,"device:%d, client %lu insert HighPriorityQueue, device:%d, HighPriorityQueue_size:%d\n",device, client_id, device ,HighPriorityQueue[device].size());
+                }else if(job->priority == LowPriority){//低优先级任务
+                    // LowPriorityQueue[device].push_back(job);
+                    LowPriorityWaitingQueue[device].push_back(job);//低优先级任务先放在等待队列中
+                    tool::Logging(LOG_INFO,myName_,"device:%d, client %lu insert LowPriorityWaitingQueue, device:%d, LowPriorityWaitingQueue_size:%d\n",device, client_id, device ,LowPriorityWaitingQueue[device].size());
+                    while(LowPriorityQueue[device].size() >= 3 && LowPriorityQueue[device].front() != job){//如果当前有低优先级任务在运行，直接返回
+                        sleep(2);
+                    }
+                    LowPriorityQueue[device].push_back(job);//低优先级任务放入低优先级队列中
+                    LowPriorityWaitingQueue[device].erase(LowPriorityWaitingQueue[device].begin());//从等待队列中删除
+                    tool::Logging(LOG_INFO,myName_,"device:%d, client %lu insert LowPriorityQueue, device:%d, LowPriorityQueue_size:%d, LowPriorityWaitingQueue_size:%d\n",device, client_id, device ,LowPriorityQueue[device].size(), LowPriorityWaitingQueue[device].size());
+                    if(FinishProfiling[device] == 1){
+                        FinishProfiling[device] = 0;
+                    }
+                    if(LowPriorityQueue[device].size() > 1){
+                        OneLastLowJob[device] = 0;
+                    }
+                }
+            }else{
+                while(job->priority == HighPriority && JobIsRunning[job] == 0){
+                    sleep(5);
+                }
+                if(job->curIt < 5){//前5个Iteration属于初始化阶段，不参与调度。
+                    return;
+                }
+                if(job->curIt == 5 && job->priority == LowPriority){
+                    job->lastIt = job->curIt;
+                }
+                while(job->priority == LowPriority && JobIsRunning[job] == 0 && HighPriorityRunningFlag[device] > 0){//高优先级任务突然到来的情况
+                    if(OneLastLowJob[device]){//最后一个低优先级任务唤醒自己
+                        JobIsRunning[job] = 1;
+                        return;
+                    }
+                    sleep(1);
+                }
+                if(job->priority == LowPriority && FinishProfiling[device] == 0 && job->finishProFlag == 1 && JobIsRunning[job] == 1){
+                    JobIsRunning[job] = 0;
+                    LowPriorityRunningFlag[device]--;
+                    tool::Logging(LOG_INFO,myName_,"new low priority job comming, stop 2 let new job profiling\n");
+                }
+                if(job->priority == LowPriority && HighPriorityRunningFlag[device] > 0 && LowPriorityRunningFlag[device] > 1 && JobIsRunning[job] == 1){//高优先级任务突然到来的情况，低优先级作业暂停自己
+                    job->lastIt = job->curIt;
+                    JobIsRunning[job] = 0;
+                    LowPriorityRunningFlag[device]--;
+                }
+                {
+                std::lock_guard<std::mutex> lock(mtx_used[device]);   
+                    if(job->priority == HighPriority && FinishProfiling[device] == 1 && LowPriorityRunningFlag[device] == 0){
+                        ToChooseBestColoJob(device);
+                    }
+                    if(job->priority == HighPriority && job->highColoIndex >= LowPriorityQueue[device].size() && LowPriorityRunningFlag[device] == 0 && FinishProfiling[device] == 0){
+                        tool::Logging(LOG_INFO,myName_,"HighPriority Job:%lu has new Job to profiling\n", nowHighjob[device]->client_id);
+                        // nowHighjob[device] = HighPriorityQueue[device][0];
+                        job->highColoIndex = LowPriorityQueue[device].size() - 1;
+                        auto lowJob = LowPriorityQueue[device][job->highColoIndex];
+                        tool::Logging(LOG_INFO,myName_,"choosing low priority job 2 profiling : %lu\n",lowJob->client_id);
+                        JobIsRunning[lowJob] = 1;
+                        LowPriorityRunningFlag[device]++;
+                        lowJob->lastIt = lowJob->curIt;//记录当前迭代数
+                    }
+                    if(job->priority == HighPriority && job->highColoIndex < LowPriorityQueue[device].size() && LowPriorityRunningFlag[device] == 0){
+                            nowHighjob[device] = HighPriorityQueue[device][0];
+                            auto lowJob = LowPriorityQueue[device][job->highColoIndex];
+                            tool::Logging(LOG_INFO,myName_,"choosing low priority job 2 profiling : %lu\n",lowJob->client_id);
+                            JobIsRunning[lowJob] = 1;
+                            LowPriorityRunningFlag[device]++;
+                            lowJob->lastIt = lowJob->curIt;//记录当前迭代数
+                    }
+                }
+                if(job->priority == LowPriority && job->curIt - job->lastIt == 0 && HighPriorityRunningFlag[device] != 0){ 
+                    if(job->job_start_time == chrono::steady_clock::time_point()){
+                        tool::Logging(LOG_INFO,myName_,"client id %lu start profiling\n",job->client_id);
+                        job->job_start_time = chrono::steady_clock::now();
+                        nowHighjob[device]->lastIt = nowHighjob[device]->curIt;   
+                    }
+                }else if(job->priority == LowPriority && job->curIt - job->lastIt == ProfilingIter && job->finishProFlag == 0 && HighPriorityRunningFlag[device] != 0){
+                    JobIsRunning[job] = 0;
+                    chrono::steady_clock::time_point now_time = chrono::steady_clock::now();
+                    // int duration = (chrono::duration_cast<chrono::milliseconds>(now_time - job->job_start_time).count() * 1.0 / 1000);
+                    double duration = (chrono::duration_cast<chrono::milliseconds>(now_time - job->job_start_time).count() * 1.0 / 1000);
+                    tool::Logging(LOG_INFO,myName_,"--------------------------------------------------------\n");
+                    tool::Logging(LOG_INFO,myName_,"duration:%f\n",duration);
+                    tool::Logging(LOG_INFO,myName_,"nowHighjob->curIt:%d\n",nowHighjob[device]->curIt - nowHighjob[device]->lastIt);
+                    job->coloThrougput[nowHighjob[device]] = 1.0 * 20 / duration;
+                    tool::Logging(LOG_INFO,myName_,"LowPriority Job:client id %lu coloThrougput:%f\n",job->client_id,job->coloThrougput[nowHighjob[device]]);
+                    nowHighjob[device]->coloThrougput[job] = 1.0 * (nowHighjob[device]->curIt - nowHighjob[device]->lastIt) / duration;
+                    tool::Logging(LOG_INFO,myName_,"HighPriority Job: client id %lu coloThrougput:%f\n",nowHighjob[device]->client_id,nowHighjob[device]->coloThrougput[job]);
+                    nowHighjob[device]->highColoIndex++;
+                    LowPriorityRunningFlag[device]--;
+                    job->job_start_time = chrono::steady_clock::time_point();
+                    job->finishProFlag = 1;
+                    if(nowHighjob[device]->highColoIndex == LowPriorityQueue[device].size()){
+                        tool::Logging(LOG_INFO,myName_,"HighPriority Job:%lu Finish all Profiling\n", nowHighjob[device]->client_id);
+                        FinishProfiling[device] = 1;
+                    }
+                    tool::Logging(LOG_INFO,myName_,"LowPriorityRunningFlag:%d\n",LowPriorityRunningFlag[device]);
+                    // std::cout << "finishflag:" << FinishProfiling[device] << "low runing num :"<< LowPriorityRunningFlag[device] << std::endl;
+                    tool::Logging(LOG_INFO,myName_,"--------------------------------------------------------\n");
+                }
+                {
+                    std::lock_guard<std::mutex> lock(mtx_low[device]);
+                    if(job->priority == LowPriority && JobIsRunning[job] == 1 && HighPriorityRunningFlag[device] == 0 && LowPriorityRunningFlag[device] == 1 && !OneLastLowJob[device]){
+                        for(auto it = LowPriorityQueue[device].begin(); it != LowPriorityQueue[device].end(); ++it){//启动所有低优先级作业
+                            if((*it)->client_id == job->client_id){
+                                continue;
+                            }
+                            auto tmplowjob = *it;
+                            JobIsRunning[tmplowjob] = 1;
+                            LowPriorityRunningFlag[device]++;
+                            tool::Logging(LOG_INFO,myName_,"starting low job : %lu , LowPriorityRunningFlag:%d\n",tmplowjob->client_id,LowPriorityRunningFlag[device]);
+                        }
+                        tool::Logging(LOG_INFO,myName_,"all low jobs are running\n");
+                    }
+                }
+                
+            }
+        }
+
+        void ToChooseBestColoJob(int device){    
+            std::lock_guard<std::mutex> lock(mtx_del[device]);
+            if(ColoRunningFlag[device] != 0 && LowPriorityRunningFlag[device] != 0){
+                return;
+            }
+
+            if(nowHighjob[device]->coloThrougput.size() == 0){
+                return;
+            }
+
+            auto max_element = std::max_element(
+                nowHighjob[device]->coloThrougput.begin(), nowHighjob[device]->coloThrougput.end(),
+                [](const std::pair<eJob_Info*, double>& a, const std::pair<eJob_Info*, double>& b) {
+                    return a.second < b.second;
+                }
+            );
+            auto nowLowJob = max_element->first;
+            tool::Logging(LOG_INFO,myName_,"choosing best colo job: %lu\n",nowLowJob->client_id);
+            JobIsRunning[nowLowJob] = 1;
+            LowPriorityRunningFlag[device]++;
+            ColoRunningFlag[device]++;
+        }
+
+        void cal_add_It(uint64_t client_id){
+            ++job->curIt;
+        }
+
+        void get_Iteration(int num){
+            job->totalIt = int(num);
+        }
+
+        void free_jobs(uint64_t client_id, int device){
+            if(job->priority == HighPriority){
+                std::lock_guard<std::mutex> lock(mtx_high[device]);
+                for(auto it = jobs.begin(); it != jobs.end(); ++it){
+                    auto tmp = (*it)->coloThrougput.find(job);
+                    if(tmp != (*it)->coloThrougput.end()){
+                        (*it)->coloThrougput.erase(tmp);
+                    }
+                }
+                HighPriorityRunningFlag[device]--;
+                for(auto it = HighPriorityQueue[device].begin(); it != HighPriorityQueue[device].end(); ++it){
+                    if((*it)->client_id == client_id && (*it)->device == device){
+                        HighPriorityQueue[device].erase(it);
+                        tool::Logging(LOG_INFO,myName_,"free clientId:%lu free job\n",client_id);
+                        break;
+                    }
+                }
+                if(HighPriorityQueue[device].size() == 0){
+                    nowHighjob[device] = NULL;
+                }else{
+                    nowHighjob[device] = HighPriorityQueue[device][0];
+                    JobIsRunning[nowHighjob[device]] = 1;
+                    HighPriorityRunningFlag[device]++;
+                    for(auto it = LowPriorityQueue[device].begin(); it != LowPriorityQueue[device].end(); ++it){//启动下一个高优先级作业，对所有低优先作业重新profiling
+                        tool::Logging(LOG_INFO,myName_,"Stopping Low job : %lu\n",(*it)->client_id);
+                        (*it)->finishProFlag = 0;
+                        if(JobIsRunning[(*it)] == 1){
+                            JobIsRunning[(*it)] = 0;
+                            LowPriorityRunningFlag[device]--;
+                        }
+                        (*it)->lastIt = (*it)->curIt;
+                    }
+                    FinishProfiling[device] = 0;
+                }
+                for(auto it = LowPriorityQueue[device].begin(); it != LowPriorityQueue[device].end(); ++it){
+                    (*it)->coloThrougput.erase(job);
+                }
+            }else if(job->priority == LowPriority){
+                for(auto it = LowPriorityQueue[device].begin(); it != LowPriorityQueue[device].end(); ++it){
+                    if((*it)->client_id == client_id && (*it)->device == device){
+                        LowPriorityQueue[device].erase(it);
+                        tool::Logging(LOG_INFO,myName_,"free clientId:%lu free job\n",client_id);
+                        break;
+                    }
+                }
+                {
+                std::lock_guard<std::mutex> lock(mtx_used[device]);    
+                    for(auto it = jobs.begin(); it != jobs.end(); ++it){
+                        auto tmp = (*it)->coloThrougput.find(job);
+                        if(tmp != (*it)->coloThrougput.end()){
+                            (*it)->coloThrougput.erase(tmp);
+                        }
+                    }
+                    if(LowPriorityRunningFlag[device] > 0){
+                        LowPriorityRunningFlag[device]--;
+                    }
+                    ColoRunningFlag[device]--;
+                    // if(FinishProfiling[device] == 0 && HighPriorityRunningFlag[device] == 1){//说明该低优先级任务在Profiling的时候就结束了执行
+                    //     nowHighjob[device]->highColoIndex--;
+                    // }
+                    if(LowPriorityQueue[device].size() == 1){
+                        OneLastLowJob[device] = 1;
+                    }
+                }
+            }
+
+            auto it = JobIsRunning.find(job);
+            if(it != JobIsRunning.end()){
+                JobIsRunning.erase(it);
+            }
+
+            for(auto it = jobs.begin(); it != jobs.end(); ++it){
+                auto tmp = (*it)->coloThrougput.find(job);
+                if(tmp != (*it)->coloThrougput.end()){
+                    (*it)->coloThrougput.erase(tmp);
+                }
+            }
+            for(auto it = jobs.begin(); it != jobs.end(); ++it){
+                if((*it)->client_id == client_id && (*it)->device == device){
+                    jobs.erase(it);
+                    tool::Logging(LOG_INFO,myName_,"free clientId:%lu free job\n",client_id);
+                    break;
+                }
+            }
+        }
+
+};
+
+
+
+
+inline set<eJob_Info_*> eScheduler::jobs;
+inline chrono::steady_clock::time_point eScheduler::timers[GPU_MAX_NUMe];
+inline unordered_map<eJob_Info*, int> eScheduler::JobIsRunning;
+inline std::mutex eScheduler::mtx_used[GPU_MAX_NUMe];
+inline std::mutex eScheduler::mtx_time[GPU_MAX_NUMe];
+inline std::mutex eScheduler::mtx_low[GPU_MAX_NUMe];
+inline std::mutex eScheduler::mtx_high[GPU_MAX_NUMe];
+inline std::mutex eScheduler::mtx_del[GPU_MAX_NUMe];
+inline vector<eJob_Info_*> eScheduler::HighPriorityQueue[GPU_MAX_NUMe];
+inline vector<eJob_Info_*> eScheduler::LowPriorityQueue[GPU_MAX_NUMe];
+inline eJob_Info_* eScheduler::nowHighjob[GPU_MAX_NUMe];
+inline int eScheduler::HighPriorityRunningFlag[GPU_MAX_NUMe];
+inline int eScheduler::LowPriorityRunningFlag[GPU_MAX_NUMe];
+inline int eScheduler::ColoRunningFlag[GPU_MAX_NUMe];
+inline int eScheduler::FinishProfiling[GPU_MAX_NUMe];
+inline int eScheduler::OneLastLowJob[GPU_MAX_NUMe];
+inline vector<eJob_Info_*> eScheduler::LowPriorityWaitingQueue[GPU_MAX_NUMe];
\ No newline at end of file
diff --git a/GPU-Virtual-Service/gpu-remoting/include/gpuIdMap.h b/GPU-Virtual-Service/gpu-remoting/include/gpuIdMap.h
new file mode 100644
index 0000000..62d922a
--- /dev/null
+++ b/GPU-Virtual-Service/gpu-remoting/include/gpuIdMap.h
@@ -0,0 +1,239 @@
+#ifndef GPU_ID_MAP_H
+#define GPU_ID_MAP_H
+
+#include <fcntl.h>
+#include <sys/mman.h>
+#include "chunkStructure.h"
+#include <boost/property_tree/ptree.hpp>
+#include <boost/property_tree/json_parser.hpp>
+#include <boost/iostreams/device/array.hpp>
+#include <boost/iostreams/stream.hpp>
+#include <sstream>
+#include "define.h"
+#include <cuda_runtime.h>
+
+struct GpuInfoEntry_t {
+    int nodeDevIdx;
+    uint16_t nodePort;
+    char nodeIp[IP_STRING_LEN];
+    uint16_t dataPort;
+    char dataIp[IP_STRING_LEN];
+    cudaDeviceProp devprop;
+};
+
+class GPUidMap{
+    private:
+        const char* myName_ = "GPUidMap";
+        int proxySock_ = 0;
+        std::vector<GpuInfoEntry_t*> gpuInfoList_;
+        uint64_t clientID_;
+        size_t devNum_;
+        size_t reqCommIDcnt_ = 1; // request NCCL unique ID count (avoid request for old ID, start from 1)
+#ifdef GV_MSGHANDLER
+        const char* model_;
+        size_t batch_Size_;
+#endif
+    public: 
+#ifndef GV_MSGHANDLER
+        GPUidMap(size_t devNum, uint64_t clientID, const std::string& proxyIP, uint16_t proxyPort) {
+            clientID_ = clientID;
+            devNum_ = devNum;
+            gpuInfoList_.reserve(devNum_);
+
+            struct sockaddr_storage serv_addr;
+            if ((proxySock_ = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+                tool::Logging(LOG_ERROR, myName_, "Socket creation failed\n");
+                exit(EXIT_FAILURE);
+            }
+
+            tool::SetSockAddr(proxyIP.c_str(), proxyPort, &serv_addr, AF_INET);
+            if (connect(proxySock_, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) {
+                tool::Logging(LOG_ERROR, myName_, "Connection to GPU Proxy(%s:%d) failed\n", proxyIP.c_str(), proxyPort);
+                exit(EXIT_FAILURE);
+            }
+            else {
+                tool::Logging(LOG_INFO, myName_, "Connection to GPU Proxy success\n");
+            }
+
+            RequestGPU();
+        }
+#else
+        GPUidMap(size_t devNum, uint64_t clientID, const std::string& model, size_t batch_Size, const std::string& proxyIP, uint16_t proxyPort) {
+            clientID_ = clientID;
+            devNum_ = devNum;
+            model_ = model.c_str();
+            batch_Size_ = batch_Size;
+            gpuInfoList_.reserve(devNum_);
+
+            struct sockaddr_storage serv_addr;
+            if ((proxySock_ = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+                tool::Logging(LOG_ERROR, myName_, "Socket creation failed\n");
+                exit(EXIT_FAILURE);
+            }
+
+            tool::SetSockAddr(proxyIP.c_str(), proxyPort, &serv_addr, AF_INET);
+            if (connect(proxySock_, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) {
+                tool::Logging(LOG_ERROR, myName_, "Connection to GPU Proxy(%s:%d) failed\n", proxyIP.c_str(), proxyPort);
+                exit(EXIT_FAILURE);
+            }
+            else {
+                tool::Logging(LOG_INFO, myName_, "Connection to GPU Proxy success\n");
+            }
+
+            RequestGPU();
+        }
+#endif
+        ~GPUidMap() {
+            tool::Logging(LOG_DEBUG, myName_, "Close socket to GPU Proxy\n");
+            close(proxySock_);
+        }
+
+        void RequestGPU() {
+#ifndef GV_MSGHANDLER
+            std::string sendData = "GPUQuery:" + std::to_string(clientID_) + "," + std::to_string(devNum_);
+#else
+            std::string sendData = "GPUQuery:" + std::to_string(clientID_) + "," + std::to_string(devNum_) + "," + model_ + "," + std::to_string(batch_Size_);
+#endif
+            // 
+            size_t sendDataLen = sendData.size();
+            send(proxySock_, &sendDataLen, sizeof(size_t), 0);
+            send(proxySock_, sendData.c_str(), sendDataLen, 0);
+            tool::Logging(LOG_DEBUG, myName_, "Send GPU allocation request(%zu bytes) to GPU Proxy\n", sendDataLen);
+
+            for (int i = 0; i < devNum_; i++) {
+                GpuInfoEntry_t* gpuInfo = (GpuInfoEntry_t*)malloc(sizeof(GpuInfoEntry_t));
+                if (!tool::ReadSocketMessage(proxySock_, (uint8_t*)gpuInfo, sizeof(GpuInfoEntry_t))) {
+                    // printf("Failed to read buffer length from GPU Proxy\n");
+                    tool::Logging(LOG_ERROR, myName_, "Failed to recv gpu info list from GPU Proxy\n");
+                    exit(EXIT_FAILURE);
+                }
+                else {
+                    tool::Logging(LOG_INFO, myName_, "GPU ID: %d, IP Address: %s, Port: %d, DataIP: %s, DataPort: %d, prop->name: %s\n", gpuInfo->nodeDevIdx, gpuInfo->nodeIp, gpuInfo->nodePort, gpuInfo->dataIp, gpuInfo->dataPort, gpuInfo->devprop.name);
+                }
+                gpuInfoList_.emplace_back(gpuInfo);
+            }
+        }
+
+        void ReallocGPU(){
+            tool::Logging(LOG_DEBUG, myName_, "ReallocGPU: ready to realloc GPU\n");
+            std::string sendData = "GPURealloc:" + std::to_string(clientID_) + "," + std::to_string(devNum_);
+            size_t sendDataLen = sendData.size();
+            send(proxySock_, &sendDataLen, sizeof(size_t), 0);
+            send(proxySock_, sendData.c_str(), sendDataLen, 0);
+            tool::Logging(LOG_DEBUG, myName_, "Send GPU Reallocation request(%zu bytes) to GPU Proxy\n", sendDataLen);
+            for(int i = 0; i < devNum_; ++i){
+                tool::Logging(LOG_INFO, myName_, "Waiting for GPU reallocation\n");
+                GpuInfoEntry_t* gpuInfo = (GpuInfoEntry_t*)malloc(sizeof(GpuInfoEntry_t));
+                if (!tool::ReadSocketMessage(proxySock_, (uint8_t*)gpuInfo, sizeof(GpuInfoEntry_t))) {//没得到GPU前阻塞
+                    tool::Logging(LOG_ERROR, myName_, "Failed to recv gpu info list from GPU Proxy\n");
+                    exit(EXIT_FAILURE);
+                }
+                else {
+                    tool::Logging(LOG_INFO, myName_, "Reallocate GPU ID: %d, IP Address: %s, Port: %d, DataIP: %s, DataPort: %d, prop->name: %s\n", gpuInfo->nodeDevIdx, gpuInfo->nodeIp, gpuInfo->nodePort, gpuInfo->dataIp, gpuInfo->dataPort, gpuInfo->devprop.name);
+                }
+                gpuInfoList_[i] = gpuInfo;
+            }
+            // gpuInfoList_[0]->nodeDevIdx = 1;
+
+        }   
+
+        void UpdateUniqueID(const uint8_t* uniqueID, const size_t len) {
+            std::string sendData = "CommUpdate:" + std::to_string(clientID_) + "," + std::to_string(len);
+            size_t sendDataLen = sendData.size();
+            send(proxySock_, &sendDataLen, sizeof(size_t), 0);
+            send(proxySock_, sendData.c_str(), sendDataLen, 0);
+            send(proxySock_, uniqueID, len, 0);
+            tool::Logging(LOG_DEBUG, myName_, "Send NCCL unique ID update request(%zu bytes) to GPU Proxy\n", sendDataLen);
+            // for (size_t i = 0; i < len; i++) {
+            //     printf("%02x", uniqueID[i]);
+            // }
+            // printf("\n");
+        }
+
+        void RequestUniqueID(uint8_t* uniqueID, const size_t len) {
+            reqCommIDcnt_++;
+            uint8_t isVaild = false;
+            std::string sendData = "CommQuery:" + std::to_string(clientID_) + "," + std::to_string(reqCommIDcnt_);
+            size_t sendDataLen = sendData.size();
+            while (!(bool)isVaild) {
+                send(proxySock_, &sendDataLen, sizeof(size_t), 0);
+                send(proxySock_, sendData.c_str(), sendDataLen, 0);
+                tool::Logging(LOG_DEBUG, myName_, "Send NCCL unique ID request to GPU Proxy, waiting for response...\n");
+                if (!tool::ReadSocketMessage(proxySock_, &isVaild, sizeof(uint8_t))) {
+                    tool::Logging(LOG_ERROR, myName_, "Failed to recv NCCL unique ID from GPU Proxy\n");
+                    exit(EXIT_FAILURE);
+                }
+            }
+
+            if (!tool::ReadSocketMessage(proxySock_, uniqueID, len)) {
+                tool::Logging(LOG_ERROR, myName_, "Failed to recv NCCL unique ID from GPU Proxy\n");
+                exit(EXIT_FAILURE);
+            }
+            else {
+                tool::Logging(LOG_DEBUG, myName_, "Received NCCL unique ID from GPU Proxy\n");
+            }
+            // for (size_t i = 0; i < len; i++) {
+            //     printf("%02x", uniqueID[i]);
+            // }
+            // printf("\n");
+        }
+
+        void Print() {
+            tool::Logging(LOG_INFO, myName_, "GPU ID Map:\n");
+            for (int i = 0; i < gpuInfoList_.size(); i++) {
+                tool::Logging(LOG_INFO, myName_, "[%d] GPU ID: %d, IP Address: %s, Port: %d\n", i, gpuInfoList_[i]->nodeDevIdx, gpuInfoList_[i]->nodeIp, gpuInfoList_[i]->nodePort);
+            }
+        }
+
+
+        inline bool Check(int virtDevIdx) {
+            if (virtDevIdx < 0 || virtDevIdx >= gpuInfoList_.size()) {
+                tool::Logging(LOG_ERROR, myName_, "Invalid virtual device index: %d (max: %d)\n", virtDevIdx, gpuInfoList_.size());
+                return false;
+            }
+            return true;
+        }
+
+        bool GetGPUKey(int gpuIdxInNode, int* virtDevIdx) {
+            tool::Logging(LOG_DEBUG, myName_, "GPUIdGetKey: ready to read the key(gpuIdx=%d) from GPU ID Map\n", gpuIdxInNode);
+            for (int i = 0; i < gpuInfoList_.size(); i++) {
+                if (gpuInfoList_[i]->nodeDevIdx == gpuIdxInNode) {
+                    *virtDevIdx = i;
+                    return true;
+                }
+            }
+            return false;
+        }
+
+        bool GetGPUId(int virtDevIdx, int* gpuIdxInNode) {
+            tool::Logging(LOG_DEBUG, myName_, "GPUIdGetValue: ready read the key(virDev=%d) from GPU ID Map\n", virtDevIdx);
+            Check(virtDevIdx);
+            *gpuIdxInNode = gpuInfoList_[virtDevIdx]->nodeDevIdx;
+            return true;
+        }
+
+        bool GetGPUprop(int virtDevIdx, cudaDeviceProp* prop) {
+            tool::Logging(LOG_DEBUG, myName_, "GetGpuIdMap: ready to read the key(virDev=%d) from GPU ID Map\n", virtDevIdx);
+            Check(virtDevIdx);
+            *prop = gpuInfoList_[virtDevIdx]->devprop;
+            return true;
+        }
+
+        bool GetGPUinfo(int virtDevIdx, GpuInfoEntry_t** gpuInfo) {
+            tool::Logging(LOG_DEBUG, myName_, "GetGpuIdMap: ready to read the key(virDev=%d) from GPU ID Map\n", virtDevIdx);
+            Check(virtDevIdx);
+            *gpuInfo = gpuInfoList_[virtDevIdx];
+            return true;
+        }
+
+        bool SetGPUinfo(int virtDevIdx, const GpuInfoEntry_t& gpuInfo) {
+            tool::Logging(LOG_DEBUG, myName_, "SetGpuIdMap: ready to write the key(virDev=%d) to GPU ID Map\n", virtDevIdx);
+            Check(virtDevIdx);
+            memcpy(gpuInfoList_[virtDevIdx], &gpuInfo, sizeof(GpuInfoEntry_t));
+            return true;
+        }
+
+};
+
+
+#endif
\ No newline at end of file
diff --git a/GPU-Virtual-Service/gpu-remoting/include/mapper.h b/GPU-Virtual-Service/gpu-remoting/include/mapper.h
new file mode 100644
index 0000000..9291077
--- /dev/null
+++ b/GPU-Virtual-Service/gpu-remoting/include/mapper.h
@@ -0,0 +1,331 @@
+#ifndef MAPPERS_H
+#define MAPPERS_H
+
+#include "configure.h"
+#include "./hashing/robin_hood.h"
+#include <cuda_runtime.h>
+#include <nccl.h>
+
+class DeviceBlockMapper {
+private:
+    const char* myName_ = "BlockMapper";
+    std::vector<Block_t> blocks_; // !: start from 0
+    uint64_t baseAddr_;
+
+public:
+    DeviceBlockMapper(const size_t reserveSize = 200, uint64_t base = 0x7f12e0a60000) {
+        blocks_.reserve(reserveSize);
+        baseAddr_ = base;
+    }
+
+    const std::vector<Block_t>& GetBlocks() const {
+        return blocks_; // only read
+    }
+
+    std::vector<Block_t>& GetBlocks() {
+        return blocks_; // read and write
+    }
+
+    void Resize(const size_t size) {
+        blocks_.resize(size);
+    }
+
+    uint64_t AddBlock(const uint64_t devPtr, const size_t size, bool essential = false) {
+        if (blocks_.empty()) {
+            blocks_.emplace_back(Block_t{.start = baseAddr_, .devPtr = devPtr, .size = size, .valid = true, .essential = essential});
+        }
+        else {
+            blocks_.emplace_back(Block_t{.start = blocks_.back().start + blocks_.back().size, .devPtr = devPtr, .size = size, .valid = true, .essential = essential});
+        }
+        // printf("AddBlock: devPtr = %p, blockIdx = %zu\n", (void*)devPtr, blocks_.size() - 1);
+        return blocks_.back().start;
+    }
+
+    int FindByVirAddr(const uint64_t userAddr, uint64_t& devPtr) {
+        auto it = std::lower_bound(blocks_.begin(), blocks_.end(), userAddr,
+            [](const Block_t& block, uint64_t addr) { return block.start + block.size <= addr; });
+
+        if (it != blocks_.end() && it->start <= userAddr && userAddr < it->start + it->size) {
+            if (it->valid == false) {
+                return -1;
+            }
+            else {
+                devPtr = it->devPtr + userAddr - it->start;
+                return it - blocks_.begin();
+            }
+
+        }
+        return -1;
+    }
+
+    int FindByRealAddr(const uint64_t devPtr, uint64_t& userAddr) {
+        for (size_t i = 0; i < blocks_.size(); i++) {
+            if (blocks_[i].devPtr <= devPtr && devPtr < blocks_[i].devPtr + blocks_[i].size) {
+                userAddr = blocks_[i].start + devPtr - blocks_[i].devPtr;
+                return i;
+            }
+        }
+        return -1;
+    }
+
+
+    bool ResetBlock(const uint64_t userAddr, int wantIdx = -1) {
+        uint64_t devPtr = 0;
+        int idx = (wantIdx == -1) ? FindByVirAddr(userAddr, devPtr) : wantIdx;
+        if (idx != -1) {
+            // printf("ResetBlock: devPtr = %p, blockIdx = %d\n", (void*)devPtr, idx);
+            blocks_[idx].valid = false;
+            blocks_[idx].devPtr = 0;
+            return true;
+        }
+        else {
+            return false;
+        }
+    }
+
+    void Print() {
+        tool::Logging(LOG_INFO, myName_, "BlockManager has %zu blocks\n", blocks_.size());
+        for (size_t i = 0; i < blocks_.size(); i++) {
+            tool::Logging(LOG_INFO, myName_, "Block[%zu]: start = %p, devPtr = %p, size = %zu, valid = %d, essential = %d\n", i, (void*)blocks_[i].start, (void*)blocks_[i].devPtr, blocks_[i].size, blocks_[i].valid, blocks_[i].essential);
+        }
+    }
+};
+
+class HandleMapper {
+private:
+    const char* myName_ = "HandleMapper";
+    std::vector<Handle_t> handleInfoList; // !: start from 1
+    robin_hood::unordered_flat_map<uint64_t, size_t> mapRealAddr2handleListIdx;
+    std::queue<pair<size_t, size_t>> freeHandleQueue;
+
+public:
+    HandleMapper(const size_t reserveSize = 1000) {
+        handleInfoList.reserve(reserveSize);
+        handleInfoList.emplace_back(Handle_t{.handlePtr = (uint64_t)NULL, .type = __CUDA_REGISTER, .valid = true, .stream = 0});  // let the handleNum start from 1
+    }
+
+    const std::vector<Handle_t>& GetHandleInfoList() const {
+        return handleInfoList; // only read
+    }
+
+    std::vector<Handle_t>& GetHandleInfoList() {
+        return handleInfoList; // read and write
+    }
+
+    void Resize(const size_t size) {
+        handleInfoList.resize(size);
+    }
+
+    void Reset(const std::vector<std::pair<size_t, Handle_t>>& handleList) {
+        handleInfoList.resize(handleList.back().first + 1); // handleIdxList is sorted and has at least one element
+        mapRealAddr2handleListIdx.clear();
+        while (!freeHandleQueue.empty()) {
+            freeHandleQueue.pop();
+        }
+        for (size_t i = 0; i < handleList.size(); i++) {
+            size_t handleIdx = handleList[i].first;
+            handleInfoList[handleIdx] = handleList[i].second;
+            if (i == 0) {
+                continue;
+            }
+            mapRealAddr2handleListIdx.insert({(uint64_t)handleInfoList[handleIdx].handlePtr, handleIdx});
+            // no need to push the free space into the queue since the replaying process has not been completed
+            // if (i == 1 && handleIdx != 1) {
+            //     freeHandleQueue.push({1, handleIdx - 1});
+            // }
+            // else if (i > 1 && handleIdx != handleList[i - 1].first + 1) {
+            //     freeHandleQueue.push({handleList[i - 1].first + 1, handleIdx - 1});
+            // }
+        }
+    }
+
+    void Shrink() {
+        tool::Logging(LOG_DEBUG, myName_, "Shrink: handleInfoList size: %zu (mapRealAddr2handleListIdx size: %zu)\n", handleInfoList.size(), mapRealAddr2handleListIdx.size());
+        mapRealAddr2handleListIdx.clear();
+        while (!freeHandleQueue.empty()) {
+            freeHandleQueue.pop();
+        }
+
+        for (size_t i = 1; i < handleInfoList.size(); i++) {
+            if (handleInfoList[i - 1].valid == false && handleInfoList[i].valid == false) {
+                freeHandleQueue.back().second = i;
+            }
+            else if (handleInfoList[i - 1].valid == true && handleInfoList[i].valid == false) {
+                freeHandleQueue.push({i, i});
+            }
+
+            if (handleInfoList[i].valid == true) {
+                mapRealAddr2handleListIdx.insert({(uint64_t)handleInfoList[i].handlePtr, i});
+            }
+        }
+        tool::Logging(LOG_DEBUG, myName_, "Shrink: mapRealAddr2handleListIdx (valid handles) is reorganized to %zu\n", mapRealAddr2handleListIdx.size());        
+    }
+
+    void Indexing() {
+        mapRealAddr2handleListIdx.clear();
+        for (size_t i = 1; i < handleInfoList.size(); i++) {
+            if (handleInfoList[i].valid == true) {
+                mapRealAddr2handleListIdx.insert({(uint64_t)handleInfoList[i].handlePtr, i});
+            }
+        }
+    }
+
+    void* AddHandle(void* realAddr, enum API_REQUEST_CODE_SET handleType) {
+        auto it = mapRealAddr2handleListIdx.find((uint64_t)realAddr);
+        size_t handleNum = 0;
+        if (it != mapRealAddr2handleListIdx.end()) {
+            handleNum = it->second; // find out the address previously created
+            handleInfoList[handleNum].valid = true;
+            handleInfoList[handleNum].type = handleType;
+        }
+        else {
+            if (freeHandleQueue.empty()) { // handleInfo List has no free space to add a new handle, so need to expand
+                handleNum = (
+                    handleInfoList.emplace_back(Handle_t{.handlePtr = (uint64_t)realAddr, .type = handleType, .valid = true, .stream = 0}),
+                    handleInfoList.size() - 1
+                );
+            }
+            else { // reuse the free space in handleInfo List
+                handleNum = freeHandleQueue.front().first;
+                freeHandleQueue.front().first++;
+                if (freeHandleQueue.front().first > freeHandleQueue.front().second) {
+                    freeHandleQueue.pop();
+                }
+                handleInfoList[handleNum].handlePtr = (uint64_t)realAddr;
+                handleInfoList[handleNum].type = handleType;
+                handleInfoList[handleNum].valid = true;
+                handleInfoList[handleNum].stream = 0;
+            }
+            mapRealAddr2handleListIdx.insert({(uint64_t)realAddr, handleNum});               
+
+            if (handleInfoList.size() >= HANDLE_MAX_NUM && freeHandleQueue.size() == 0) {
+                tool::Logging(LOG_INFO, myName_, "AddHandle: mapRealAddr2handleListIdx is too large(%zu), preparing to reorganize\n", mapRealAddr2handleListIdx.size());
+                Shrink();
+            }
+        }
+        // no need to check the handleNum, since ((1LL << 48) - 1) is enough large 
+        uint64_t handleVirAddr = HANDLE_PREFIX | handleNum;
+        return (void*)handleVirAddr;
+    }
+
+    void* FindRealAddrByVirAddr(uint64_t userAddr, bool reset) {
+        if (CHECK_HANDLE_PREFIX(userAddr) == 0) {
+            return NULL;
+        }
+        size_t handleID = GET_HANDLE_ID(userAddr);
+        if (handleID <= 0 || handleID >= handleInfoList.size() || handleInfoList[handleID].valid == false) {
+            return NULL;
+        }
+        void* realAddr = (void*)(handleInfoList[handleID].handlePtr);
+        if (reset) {
+            handleInfoList[handleID].valid = false;
+        }
+        return realAddr;
+    }
+
+    Handle_t* GetHandleInfoByRealAddr(void* realAddr) {
+        auto it = mapRealAddr2handleListIdx.find((uint64_t)realAddr);
+        if (it != mapRealAddr2handleListIdx.end()) {
+            return &handleInfoList[it->second];
+        }
+        else {
+            // for (size_t i = 1; i < handleInfoList.size(); i++) {
+            //     if (handleInfoList[i].handlePtr == (uint64_t)realAddr) {
+            //         return &handleInfoList[i];
+            //     }
+            // }
+            // tool::Logging(LOG_ERROR, myName_, "GetHandleInfoByRealAddr failed: realAddr(%p) is not in the map\n", realAddr);
+            return NULL;
+        }
+    }
+
+    uint64_t FindIdxByRealAddr(void* realAddr) {
+        auto it = mapRealAddr2handleListIdx.find((uint64_t)realAddr);
+        if (it != mapRealAddr2handleListIdx.end()) {
+            uint64_t virtAddr = HANDLE_PREFIX | it->second;
+            return virtAddr;
+        }
+        else {
+            for (size_t i = 1; i < handleInfoList.size(); i++) {
+                if (handleInfoList[i].handlePtr == (uint64_t)realAddr) {
+                    return HANDLE_PREFIX | i;
+                }
+            }
+            tool::Logging(LOG_ERROR, myName_, "FindIdxByRealAddr failed: realAddr(%p) is not in the map\n", realAddr);
+            return 0;
+        }            
+        return 0;
+    }
+
+    Handle_t* GetHandleInfoByVirAddr(uint64_t userAddr) {
+        if (CHECK_HANDLE_PREFIX(userAddr) == 0) {
+            return NULL;
+        }
+        size_t handleID = GET_HANDLE_ID(userAddr);
+        if (handleID <= 0 || handleID >= handleInfoList.size()) {
+            return NULL;
+        }
+        return &handleInfoList[handleID]; //! the pointer will be changed
+    }
+
+    void UpdateHandle(const uint64_t userAddr, void* realAddr, enum API_REQUEST_CODE_SET handleType) {
+        if (CHECK_HANDLE_PREFIX(userAddr) == 0) {
+            tool::Logging(LOG_ERROR, myName_, "UpdateHandle failed: userAddr is not a handle(%p)\n", (void*)userAddr);
+            return;
+        }
+        size_t handleID = GET_HANDLE_ID(userAddr);
+        if (handleID <= 0) {
+            tool::Logging(LOG_ERROR, myName_, "UpdateHandle failed: handleID(%zu) is out of range\n", handleID);
+            return;
+        }
+        if (handleID == handleInfoList.size()) {
+            handleInfoList.emplace_back(Handle_t{.handlePtr = (uint64_t)realAddr, .type = handleType, .valid = true, .stream = 0});
+        }
+        if (handleID > handleInfoList.size()) {
+            handleInfoList.resize(handleID + 1); // expand the boundary
+            handleInfoList[handleID] = Handle_t{.handlePtr = (uint64_t)realAddr, .type = handleType, .valid = true, .stream = 0};
+        } // todo: to be optimized
+
+        // No need to update the mapRealAddr2handleListIdx, since shrink() will be called after the replaying process
+        // auto it = mapRealAddr2handleListIdx.find((uint64_t)realAddr);
+        // if (it != mapRealAddr2handleListIdx.end()) {
+        //     it->second = handleID; // update the handleID of the realAddr
+        // }
+        // else {
+        //     mapRealAddr2handleListIdx.insert({(uint64_t)realAddr, handleID});
+        // } // this handleID was created after the checkpoint, maybe was not in the map
+        
+        if (handleID < handleInfoList.size()) { // handleID is located in the existing range (not include the boundary)
+            handleInfoList[handleID].handlePtr = (uint64_t)realAddr;
+            handleInfoList[handleID].valid = true;
+        } 
+
+        tool::Logging(LOG_DEBUG, myName_, "UpdateHandle: userAddr=%p, original realAddr=%p, new realAddr=%p\n", (void*)userAddr, (void*)handleInfoList[handleID].handlePtr, realAddr);
+    }
+
+    size_t GetValidHandleNum() {
+        size_t validHandleNum = 0;
+        for (size_t i = 1; i < handleInfoList.size(); i++) {
+            if (handleInfoList[i].valid == true) {
+                validHandleNum++;
+            }
+        }
+        return validHandleNum;
+    }
+
+    std::vector<std::pair<size_t, Handle_t>> GetValidHandles() {
+        std::vector<std::pair<size_t, Handle_t>> validHandles;
+        for (size_t i = 0; i < handleInfoList.size(); i++) {
+            if (handleInfoList[i].valid == true) {
+                validHandles.emplace_back(i, handleInfoList[i]);
+            }
+        }
+        return validHandles;
+    }
+
+    size_t GetCapacity() {
+        return handleInfoList.size();
+    }   
+};
+
+#endif // MAPPERS_H
\ No newline at end of file
diff --git a/GPU-Virtual-Service/gpu-remoting/include/msghandler.h b/GPU-Virtual-Service/gpu-remoting/include/msghandler.h
new file mode 100644
index 0000000..9c92fe5
--- /dev/null
+++ b/GPU-Virtual-Service/gpu-remoting/include/msghandler.h
@@ -0,0 +1,184 @@
+#ifndef MSG_HANDLER_H
+#define MSG_HANDLER_H
+
+#include "configure.h"
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cuda_device_runtime_api.h>
+#include <cuda_runtime_api.h>
+#include <netinet/in.h>
+#include <nvml.h>
+#include <driver_types.h>
+#include <queue>
+#include <set>
+#include <string>
+#include <strings.h>
+#include <unistd.h>
+#include <unordered_map>
+#include <chrono>
+#include <utility>
+#include <cuda.h>
+#include <vector>
+#include <mutex>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include "constVar.h"
+#include "define.h"
+#include <sys/un.h>
+#include <unistd.h>
+
+extern Configure config_;
+using namespace std;
+
+typedef struct Jobinfo {
+    size_t used_mem; // memory
+    uint sm_util; //SM utilization
+    double served_time;//按秒算
+    double remain_time;
+    double perIt_time;
+    int cur_It;// current iterations
+    int num_It;// the number of iterations
+    int device;
+    uint64_t client_id;
+    bool stop_listen_flag = false;
+    int priority_;
+    
+
+    bool operator<(const Jobinfo& other) const {
+        return remain_time < other.remain_time;
+    }
+} Jobinfo_;
+
+
+class MsgHandler {
+    private:
+        Jobinfo_* job;
+        const char *myName_ = "MsgHandler";
+        int dispatch_sock = -1;
+        int monitor_sock = -1;
+    
+    public:
+        bool replay_flag = false;
+        bool stop_flag = false;
+
+
+        MsgHandler(uint64_t clientId) {
+            tool::Logging(LOG_INFO, myName_, "MsgHandler() client %lu, MsgHandler is created.\n", clientId);
+            job = new Jobinfo_();
+            job->client_id = clientId;
+            initialize_connection(config_.GetDpcIp(), config_.GetDpcPort(), dispatch_sock);
+
+        }
+
+        ~MsgHandler() {
+            send_msg("TypeE:", 1);
+            tool::Logging(LOG_INFO, myName_, "~MSGHandler() client %lu, MsgHandler is destroyed.\n", job->client_id);
+            release_connection(dispatch_sock);
+            delete job;
+        }
+
+        void initialize_connection(const std::string& ip, int port, int& sock) {
+            if (sock == -1) {
+                int tmptid = static_cast<pid_t>(::syscall(SYS_gettid));
+                std::cout << "clientId: " << job->client_id << " tid:" << tmptid << " get in initialize_connection "  <<std::endl;
+        
+                struct sockaddr_in serv_addr;
+        
+                if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+                    std::cerr << "Socket creation error." << std::endl;
+                    return;
+                }
+                memset(&serv_addr, 0, sizeof(serv_addr));
+                serv_addr.sin_family = AF_INET;
+                serv_addr.sin_port = htons(port);
+        
+                if (inet_pton(AF_INET, ip.c_str(), &serv_addr.sin_addr) <= 0) {
+                    std::cerr << "Invalid address/ Address not supported." << std::endl;
+                    return;
+                }
+        
+                if (connect(sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) {
+                    std::cerr << "Connection Failed." << std::endl;
+                    return;
+                }
+                std::string msg = "TypeC:" + to_string(job->client_id);
+                send(sock, msg.c_str(), msg.size(), 0);
+                
+            }
+        }
+
+        void release_connection(int _sock) {
+            // std::lock_guard<std::mutex> lock(conn_mutex);
+            std::cout << "_sock:" << _sock << std::endl;
+            int tmptid = static_cast<pid_t>(::syscall(SYS_gettid));
+            if (_sock != -1) {
+                std::cout << "clientId: " << job->client_id <<" tid:" << tmptid << " get in release_connection" << std::endl;
+                close(_sock);
+                tool::Logging(LOG_INFO, "Scheduler", "Disconnected from Monitor.\n");
+            }
+        }
+
+        void send_msg(const std::string& message, int sock_flag){
+            int tmp_sock = -1;
+            if(sock_flag == 1){
+                tmp_sock = dispatch_sock;
+            }else if(sock_flag == 2){
+                tmp_sock = monitor_sock;
+            }   
+            string msg = "";
+            if (tmp_sock != -1) {
+                if(message == "TypeE:"){
+                    tool::Logging(LOG_INFO, myName_,"TypeE msg SendMeg::client %lu\n", job->client_id);
+                    // if(replay_flag){
+                    //     msg = message + to_string(job->client_id) + "," + to_string(1); 
+                    // }else{
+                    //     msg = message + to_string(job->client_id) + "," + to_string(0);
+                    // }
+                    msg = message + to_string(job->client_id) + "," + to_string(1); 
+                    send(tmp_sock, msg.c_str(), msg.size(), 0);
+                }
+            }
+        }
+
+        
+        bool receive_message(int sock, std::string& message) {
+            char buffer[1024] = {0};
+            int bytes_received = recv(sock, buffer, sizeof(buffer), 0);
+            if (bytes_received > 0) {
+                message = std::string(buffer, bytes_received);
+                if (message == "stop"){
+                    // std::cout << "Received stop message." << std::endl;
+                    tool::Logging(LOG_INFO,myName_,"client:%d, Received stop message.\n", job->client_id);
+                    stop_flag = true;
+                }
+                return true;
+            } else if (bytes_received == 0) {
+                std::cout << "Connection closed by peer." << std::endl;
+                return false;
+            } else {
+                // std::cerr << "Receive error." << std::endl;
+                tool::Logging(LOG_ERROR,myName_,"client:%d, Receive error.\n", job->client_id);
+                release_connection(sock);
+                return false;
+            }
+        }
+
+        void aysnc_receive_message() {
+            std::string message;
+            while (!job->stop_listen_flag) {
+                if (receive_message(dispatch_sock, message)) {
+                    tool::Logging(LOG_INFO,myName_,"client:%d, Received message: %s\n", job->client_id, message.c_str());
+                    if (message == "stop") {
+                        job->stop_listen_flag = true;
+                    }
+                }
+            }
+            tool::Logging(LOG_INFO,myName_,"client:%d, Stop listening.\n", job->client_id);
+        }
+
+};
+
+
+
+#endif
\ No newline at end of file
diff --git a/GPU-Virtual-Service/gpu-remoting/include/ptxExtractor.h b/GPU-Virtual-Service/gpu-remoting/include/ptxExtractor.h
new file mode 100644
index 0000000..28f6c9d
--- /dev/null
+++ b/GPU-Virtual-Service/gpu-remoting/include/ptxExtractor.h
@@ -0,0 +1,411 @@
+#ifndef PTX_EXTRACTOR_H
+#define PTX_EXTRACTOR_H
+
+#include "configure.h"
+#include "./hashing/robin_hood.h"
+#include "./conqueue/readerwriterqueue.h"
+#include <boost/asio/thread_pool.hpp>
+#include <boost/asio/post.hpp>
+#include <nv_decode.h>
+
+size_t inline SafeStr2Ull(const std::string& str) {
+    try {
+        return std::stoull(str);
+    } catch (...) {
+        return 0;
+    }
+}
+
+class PTXExtractor {
+private:
+    const char* myName_ = "PTXExtractor";
+    std::mutex mutex_;
+    boost::asio::thread_pool pool_;
+    // std::vector<void*> imageList_;
+
+public:
+    moodycamel::BlockingReaderWriterQueue< std::pair<void*, size_t> >* _imageQueue;
+    robin_hood::unordered_flat_map<std::string, uint64_t>* _kernelDevMap;
+    volatile bool _readyClosed = false;
+    volatile bool _finished = false;
+    
+
+    PTXExtractor(moodycamel::BlockingReaderWriterQueue< std::pair<void*, size_t> >* iq, 
+                 robin_hood::unordered_flat_map<std::string, uint64_t>* kdm)
+        : _imageQueue(iq), _kernelDevMap(kdm), pool_(std::thread::hardware_concurrency()) {
+        // imageList_.reserve(240);
+        tool::Logging(LOG_INFO, myName_, "PTXExtractor is ready\n");
+    }
+
+    ~PTXExtractor() {
+        tool::Logging(myName_, "ready to close PTXExtractor\n");
+        if (_finished == false) { // PTXExtractor is still working
+            if (_readyClosed == false) { // cudaLaunchKernel has not been called
+                _readyClosed = true;
+            }
+            while (_finished == false) { // waiting the Run() to finish
+                std::this_thread::sleep_for(std::chrono::milliseconds(100));
+            }
+        }
+        // for (auto& image : imageList_) {
+        //     free(image);
+        // }
+        delete _imageQueue;
+        delete _kernelDevMap;
+        tool::Logging(LOG_INFO, myName_, "close the PTXExtractor\n");
+    }
+
+    void Extract(void* image, size_t imageSize) {
+        // step1: writing the image to a file
+
+        // 创建一个临时文件来存储image内容
+        char temp_filename[] = "/tmp/cuda_image_XXXXXX";
+        int fd = mkstemp(temp_filename);
+        if (fd == -1) {
+            throw std::runtime_error("Failed to create temp file");
+        }
+        // 使用RAII来确保文件描述符被关闭
+        struct FDCloser {
+            void operator()(int *fd) const { if (*fd != -1) close(*fd); }
+        };
+        std::unique_ptr<int, FDCloser> fd_closer(&fd);
+        // 写入image内容到临时文件
+        if (write(fd, image, imageSize) != static_cast<ssize_t>(imageSize)) {
+            throw std::runtime_error("Failed to write image to temp file");
+        }
+
+        // step2: disassemble the image to PTX codes
+        
+        // 使用cuobjdump命令处理临时文件
+        std::string ptxCodes;
+        ptxCodes.reserve(1024 * 1024);
+        std::array<char, 4096> buffer;
+        std::string cmd = "cuobjdump -ptx " + std::string(temp_filename);
+        std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd.c_str(), "r"), pclose);
+        if (!pipe) {
+            throw std::runtime_error("popen() failed!");
+        }
+
+        // 读取PTX代码
+        while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) {
+            ptxCodes += buffer.data();
+        }   
+
+        // 删除临时文件
+        std::remove(temp_filename);
+
+        // step3: match kernel body and extract dev ptr from kernel params
+
+        // 提取PTX代码中的一系列Kernel
+        std::vector<KernelPtx_t> kernels;
+        kernels.reserve(150);
+        ExtractKernelsFromPtx(ptxCodes, kernels);
+
+        for (const auto& kernel : kernels) {
+            tool::Logging(myName_, "Kernel name: %s\n", kernel.name.c_str());
+
+            // // if (kernel.name == "_ZN2at6native43_GLOBAL__N__eaa73ff3_10_Dropout_cu_f338191124fused_dropout_kernel_vecIffjLi1ELi4EbEEvNS_4cuda6detail10TensorInfoIT_T1_EES8_NS5_IT4_S7_EES7_T0_NS_15PhiloxCudaStateE"
+            // if (kernel.name == "_ZN2at6native18elementwise_kernelILi128ELi2EZNS0_15gpu_kernel_implINS0_15CUDAFunctor_addIfEEEEvRNS_18TensorIteratorBaseERKT_EUliE_EEviT1_" 
+            // ) {
+            //     std::ofstream outFile("flexgv-log-complex-ptx", std::ios::out | std::ios::app); 
+            //     outFile << kernel.name << std::endl;
+            //     outFile << kernel.body << std::endl;
+            //     outFile.close();
+            // }
+
+        /*
+            // 提取 ld.param.u64 行的寄存器、参数序号和参数偏移量
+            robin_hood::unordered_flat_map<std::string, LdParamInfo_t> param_map;
+            ExtractParamsFromKernelPtx(kernel.body, param_map);
+
+            // 提取 cvta.to.global.u64 行的源寄存器
+            auto dev_ptr_list = new std::vector<std::pair<size_t, size_t>>();
+            dev_ptr_list->reserve(param_map.size());
+            MatchDevPtrsFromKernelPtx(kernel.body, param_map, dev_ptr_list);
+        */
+
+            // 直接提取 ld.param.u64 行的参数序号和参数偏移量
+            auto dev_ptr_list = new std::vector<std::pair<size_t, size_t>>();
+            dev_ptr_list->reserve(200);
+            ExtractDevptrFromKernelPtx(kernel.body, kernel.name, dev_ptr_list);
+
+            // 将Kernel名称和设备内存指针映射关系存入_kernelDevMap，共享给serverEp
+            mutex_.lock();
+            _kernelDevMap->insert({kernel.name, reinterpret_cast<uint64_t>(dev_ptr_list)});
+            mutex_.unlock();
+
+            // 打印设备指针列表
+            for (const auto& dev_ptr : *dev_ptr_list) {
+                tool::Logging(myName_, "\tParam #%zu, Offset: %zu\n", dev_ptr.first, dev_ptr.second);
+            }         
+        }
+        tool::Logging(myName_, "finished Extract().\n");
+        // free(image); 
+        // imageList_.push_back(image); // the image will be loaded by cuModuleLoadData
+    }
+    
+    void Run() {
+        std::pair<void*, size_t> image;
+        while(!_readyClosed || _imageQueue->size_approx() > 0) {
+            if (_imageQueue->wait_dequeue_timed(image, std::chrono::milliseconds(5))) {
+                tool::Logging(LOG_DEBUG, myName_, "pop image: %p, size: %zu\n", image.first, image.second);
+                boost::asio::post(pool_, 
+                    std::bind(&PTXExtractor::Extract, this, image.first, image.second));
+            }
+        }
+        pool_.join();
+        _finished = true;
+        tool::Logging(LOG_INFO, myName_, "finished PTXExtractor work\n");
+    }
+
+    // Extract kernel body from PTX codes by matching brackets
+    size_t ExtractKernelBody(const std::string& ptx_code, size_t start_pos) {
+        std::stack<char> brackets;
+        size_t end_pos = start_pos;
+        
+        for (; end_pos < ptx_code.length(); ++end_pos) {
+            if (ptx_code[end_pos] == '{') {
+                brackets.push('{');
+            } else if (ptx_code[end_pos] == '}') {
+                if (!brackets.empty()) {
+                    brackets.pop();
+                }
+                if (brackets.empty()) {
+                    ++end_pos; // include the last '}'
+                    break;
+                }
+            }
+        }
+        return end_pos;
+    }
+
+    // Extract kernel's PTX name and its PTX body from PTX codes
+    void ExtractKernelsFromPtx(const std::string& ptx_code, std::vector<KernelPtx_t>& kernels) {
+        static const boost::regex kernel_pattern(R"(\.entry\s+(\w+)\s*\()");
+        boost::sregex_iterator kernels_begin(ptx_code.begin(), ptx_code.end(), kernel_pattern);
+        boost::sregex_iterator kernels_end;
+
+        for (auto i = kernels_begin; i != kernels_end; ++i) {
+            const boost::smatch& match = *i;
+            const std::string& kernel_name = match[1].str();
+            size_t body_start_pos = ptx_code.find('{', match.position());
+            if (body_start_pos != std::string::npos) {
+                size_t body_end_pos = ExtractKernelBody(ptx_code, body_start_pos);
+                kernels.emplace_back(
+                    kernel_name.c_str(), kernel_name.length(),
+                    ptx_code.c_str() + body_start_pos, body_end_pos - body_start_pos
+                );
+            }
+        }
+    }
+
+    // Extract param index and param offset from ld.param.u64 line
+    static void ExtractDevptrFromKernelPtx(const std::string& kernel_body, const std::string& kernel_name,
+                    std::vector<std::pair<size_t, size_t>>* dev_ptr_list) {
+        // static const boost::regex ld_pattern(R"(ld\.param\.u64\s+(%\w+),\s+\[\w+_param_(\d+)(?:\+(\d+))?\];|ld\.param\.u64\s+(%\w+),\s+\[%\w+\+(\d+)\];)");
+        static const boost::regex ld_pattern(R"(ld\.param\.u64\s+(%\w+),\s+\[\w+_param_(\d+)(?:\+(\d+))?\];|ld\.param\.u64\s+(%\w+),\s+\[(%\w+)(?:\+(\d+))?\];)");
+        static const boost::regex mov_b64_pattern(R"(mov\.b64\s+(%\w+),\s+\w+_param_(\d+);)");
+        boost::sregex_iterator params_begin(kernel_body.begin(), kernel_body.end(), ld_pattern);
+        boost::sregex_iterator params_end;
+
+        for (auto i = params_begin; i != params_end; ++i) {
+            const boost::smatch& match = *i;
+
+            if (match[1].matched && match[2].matched) {
+                // First pattern: ld.param.u64 dst_register_name, [Kernel_param_0+offset];
+                std::string dst_register_name(match[1].first, match[1].second);
+                size_t param_index = SafeStr2Ull(match[2].str());
+                size_t param_offset = match[3].matched ? SafeStr2Ull(match[3].str()) : 0;
+
+                dev_ptr_list->emplace_back(param_index, param_offset);
+            } else if (match[4].matched && match[5].matched) {
+                // Second pattern: ld.param.u64 dst_register_name, [Register+Offset]; 
+                std::string dst_register_name(match[4].first, match[4].second);
+                size_t param_index = 0; // For this form, need to find the corresponding mov.u64 & mov.b64
+                size_t param_offset = match[6].matched ? SafeStr2Ull(match[6].str()) : 0;
+                std::string related_register(match[5].first, match[5].second);
+
+                // Step 1: Find pre_register based on related_register: mov.u64 pre_register, related_register;
+                const boost::regex mov_pattern(R"(mov\.u64\s+)" + related_register + R"(,\s+(%\w+);)");
+                boost::sregex_iterator mov_begin(kernel_body.begin(), kernel_body.end(), mov_pattern);
+                boost::sregex_iterator mov_end;
+                std::string pre_register;
+                for (auto mov_it = mov_begin; mov_it != mov_end; ++mov_it) {
+                    const boost::smatch& mov_match = *mov_it;
+                    pre_register = mov_match[1].str();
+                    break; // Assuming only one match needed
+                }
+
+                // Step 2: Find param_index based on pre_register: mov.b64 pre_register, Kernel_param_x;
+                if (!pre_register.empty()) {
+                    // static const boost::regex b64_pattern(R"(mov\.b64\s+)" + pre_register + R"(,\s+\w+_param_(\d+);)");
+                    const boost::regex b64_pattern(R"(mov\.b64\s+)" + pre_register + R"(,\s+\w+_param_(\d+);)");
+                    boost::sregex_iterator b64_begin(kernel_body.begin(), kernel_body.end(), b64_pattern);
+                    boost::sregex_iterator b64_end;
+
+                    for (auto b64_it = b64_begin; b64_it != b64_end; ++b64_it) {
+                        const boost::smatch& b64_match = *b64_it;
+                        param_index = SafeStr2Ull(b64_match[1].str());
+                        break; // Assuming only one match needed
+                    }
+                }
+
+                dev_ptr_list->emplace_back(param_index, param_offset);
+            }
+        }
+
+        // Handle mov.b64 pattern: mov.b64 dst_register_name, Kernel_param_x; (no offset)
+        boost::sregex_iterator mov_b64_begin(kernel_body.begin(), kernel_body.end(), mov_b64_pattern);
+        for (auto i = mov_b64_begin; i != params_end; ++i) {
+            const boost::smatch& match = *i;
+            if (match[1].matched && match[2].matched) {
+                std::string dst_register_name(match[1].first, match[1].second);
+                size_t param_index = SafeStr2Ull(match[2].str());
+
+                // special case: gpu_kernel_impl<at::native::CUDAFunctor_add<float> >(at::TensorIteratorBase&, at::native::CUDAFunctor_add<float> const&)
+                if (kernel_name.find("_ZN2at6native18elementwise_kernelILi128ELi2EZNS0_15gpu_kernel_implINS0_15CUDAFunctor_addIfEEEEvRNS_18TensorIteratorBaseERKT_EUliE_EEviT1_") != std::string::npos) {
+                    continue; // skipping the mov.b64 pattern (the memory contains some device pointers, so it's not a real device pointer)
+                }
+
+                dev_ptr_list->emplace_back(param_index, 0); // No offset in this case
+
+                // special case: CatArrInputTensorMetadata related kernel, https://github.com/pytorch/pytorch/blob/feef057691b357481981fd24483a817ce87c8517/aten/src/ATen/native/cuda/Shape.cu#L155
+                if (kernel_name.find("CatArrInputTensorMetadata") != std::string::npos) {
+                    size_t  size = 1000;
+                    char    realname[size];
+                    int     status;
+                    __cu_demangle(kernel_name.c_str(), realname, &size, &status);
+                    std::string realname_str(realname);
+                    boost::regex bracket_pattern("<[^<>]*>.*?<[^<>]*?,\\s*[^<>]*?,\\s*[^<>]*?,\\s*\\(int\\)(\\d+)");
+                    boost::smatch newmatch;
+                    if (boost::regex_search(realname_str, newmatch, bracket_pattern)) {
+                        size_t batchSize = SafeStr2Ull(newmatch[1].str());  // matching the Template parameter(batch size)
+                        for (size_t i = 0; i < batchSize; ++i) { // push all the device pointers from input buffer
+                            dev_ptr_list->emplace_back(param_index, i * 8); 
+                        }
+                    }
+                }
+            }
+        }
+        std::sort((*dev_ptr_list).begin(), (*dev_ptr_list).end());
+        (*dev_ptr_list).erase(std::unique((*dev_ptr_list).begin(), (*dev_ptr_list).end()), (*dev_ptr_list).end()); // avoid loading the same device pointer repeatedly
+    }
+
+    // [not used] Extract register, param index and param offset from ld.param.u64 line
+    void ExtractParamsFromKernelPtx(const std::string& kernel_body, 
+                    robin_hood::unordered_flat_map<std::string, LdParamInfo_t>& param_map) {
+        // static const boost::regex ld_pattern(R"(ld\.param\.u64\s+(%\w+),\s+\[\w+_param_(\d+)(?:\+(\d+))?\];|ld\.param\.u64\s+(%\w+),\s+\[%\w+\+(\d+)\];)");
+        static const boost::regex ld_pattern(R"(ld\.param\.u64\s+(%\w+),\s+\[\w+_param_(\d+)(?:\+(\d+))?\];|ld\.param\.u64\s+(%\w+),\s+\[(%\w+)(?:\+(\d+))?\];)");
+        boost::sregex_iterator params_begin(kernel_body.begin(), kernel_body.end(), ld_pattern);
+        boost::sregex_iterator params_end;
+
+        for (auto i = params_begin; i != params_end; ++i) {
+            const boost::smatch& match = *i;
+
+            if (match[1].matched && match[2].matched) {
+                // First pattern: ld.param.u64 dst_register_name, [Kernel_param_0+offset];
+                std::string dst_register_name(match[1].first, match[1].second);
+                size_t param_index = SafeStr2Ull(match[2].str());
+                size_t param_offset = match[3].matched ? SafeStr2Ull(match[3].str()) : 0;
+
+                param_map.insert({std::move(dst_register_name), LdParamInfo_t(param_index, param_offset)});
+            } else if (match[4].matched && match[5].matched) {
+                // Second pattern: ld.param.u64 dst_register_name, [Register+offset];
+                std::string dst_register_name(match[4].first, match[4].second);
+                size_t param_index = 0; // For this form, need to find the corresponding mov.u64 & mov.b64
+                size_t param_offset = match[6].matched ? SafeStr2Ull(match[6].str()) : 0;
+                std::string related_register(match[5].first, match[5].second);
+
+                // Step 1: Find pre_register based on related_register
+                const boost::regex mov_pattern(R"(mov\.u64\s+)" + related_register + R"(,\s+(%\w+);)");
+                boost::sregex_iterator mov_begin(kernel_body.begin(), kernel_body.end(), mov_pattern);
+                boost::sregex_iterator mov_end;
+                std::string pre_register;
+                for (auto mov_it = mov_begin; mov_it != mov_end; ++mov_it) {
+                    const boost::smatch& mov_match = *mov_it;
+                    pre_register = mov_match[1].str();
+                    break; // Assuming only one match needed
+                }
+
+                // Step 2: Find param_index based on pre_register
+                if (!pre_register.empty()) {
+                    // static const boost::regex b64_pattern(R"(mov\.b64\s+)" + pre_register + R"(,\s+\w+_param_(\d+);)");
+                    const boost::regex b64_pattern(R"(mov\.b64\s+)" + pre_register + R"(,\s+\w+_param_(\d+);)");
+                    boost::sregex_iterator b64_begin(kernel_body.begin(), kernel_body.end(), b64_pattern);
+                    boost::sregex_iterator b64_end;
+
+                    for (auto b64_it = b64_begin; b64_it != b64_end; ++b64_it) {
+                        const boost::smatch& b64_match = *b64_it;
+                        param_index = SafeStr2Ull(b64_match[1].str());
+                        break; // Assuming only one match needed
+                    }
+                }
+
+                param_map.insert({std::move(dst_register_name), LdParamInfo_t(param_index, param_offset)});
+            }
+        }
+    }
+
+    // [not used] Extract source register from cvta.to.global.u64 line and match with ld.param.u64, to extract the location of devPtr
+    void MatchDevPtrsFromKernelPtx(const std::string& kernel_body, 
+                    robin_hood::unordered_flat_map<std::string, LdParamInfo_t>& param_map,
+                    std::vector<std::pair<size_t, size_t>>* dev_ptr_list) {
+        static const boost::regex cvta_pattern(R"(cvta\.to\.global\.u64\s+\%\w+,\s+(%\w+);)");
+        boost::sregex_iterator cvta_begin(kernel_body.begin(), kernel_body.end(), cvta_pattern);
+        boost::sregex_iterator cvta_end;
+
+        for (auto i = cvta_begin; i != cvta_end; ++i) {
+            const boost::smatch& match = *i;
+            boost::ssub_match source_register_match = match[1];
+            
+            auto it = param_map.find(std::string(source_register_match.first, source_register_match.second));
+            if (it != param_map.end()) {
+                if (it->second.isUsed) {
+                    continue; // avoid extracting the same source register repeatedly
+                }
+                (*dev_ptr_list).emplace_back(it->second.index, it->second.offset);
+                it->second.isUsed = true;
+            }
+        }
+
+        // for (const auto& entry : param_map) {
+        //     (*dev_ptr_list).emplace_back(entry.second.index, entry.second.offset);
+        // }
+        
+        std::sort((*dev_ptr_list).begin(), (*dev_ptr_list).end());
+        (*dev_ptr_list).erase(std::unique((*dev_ptr_list).begin(), (*dev_ptr_list).end()), (*dev_ptr_list).end()); // avoid loading the same device pointer repeatedly
+    }
+
+    //     // Extract param index and param offset from ld.param.u64 line
+    // static void ExtractDevptrFromKernelPtx(const std::string& kernel_body, 
+    //                 std::vector<std::pair<size_t, size_t>>* dev_ptr_list) {
+    //     // static const boost::regex ld_pattern(R"(ld\.param\.u64\s+(%\w+),\s+\[\w+_param_(\d+)(?:\+(\d+))?\];|ld\.param\.u64\s+(%\w+),\s+\[%\w+\+(\d+)\];)");
+    //     static const boost::regex ld_pattern(R"(ld\.param\.u64\s+(%\w+),\s+\[\w+_param_(\d+)(?:\+(\d+))?\];|mov\.b64\s+%\w+,\s+\w+_param_(\d+);)");
+    //     boost::sregex_iterator params_begin(kernel_body.begin(), kernel_body.end(), ld_pattern);
+    //     boost::sregex_iterator params_end;
+
+    //     for (auto i = params_begin; i != params_end; ++i) {
+    //         const boost::smatch& match = *i;
+
+    //         if (match[1].matched && match[2].matched) {
+    //             // First pattern: ld.param.u64 dst_register_name, [Kernel_param_0+offset];
+    //             std::string dst_register_name(match[1].first, match[1].second);
+    //             size_t param_index = SafeStr2Ull(match[2].str());
+    //             size_t param_offset = match[3].matched ? SafeStr2Ull(match[3].str()) : 0;
+
+    //             dev_ptr_list->emplace_back(param_index, param_offset);
+    //         } else if (match[4].matched && match[5].matched) {
+    //             // Second pattern: mov.b64 dst_register_name, Kernel_param_0;
+    //             std::string dst_register_name(match[4].first, match[4].second);
+    //             size_t param_index = SafeStr2Ull(match[5].str());
+    //             size_t param_offset = match[6].matched ? SafeStr2Ull(match[6].str()) : 0;
+    //             dev_ptr_list->emplace_back(param_index, param_offset);
+    //         }
+    //     }
+
+    //     std::sort((*dev_ptr_list).begin(), (*dev_ptr_list).end());
+    //     (*dev_ptr_list).erase(std::unique((*dev_ptr_list).begin(), (*dev_ptr_list).end()), (*dev_ptr_list).end()); // avoid loading the same device pointer repeatedly
+    // }
+};
+
+#endif
diff --git a/GPU-Virtual-Service/gpu-remoting/include/registerIOV.h b/GPU-Virtual-Service/gpu-remoting/include/registerIOV.h
new file mode 100644
index 0000000..ab44e52
--- /dev/null
+++ b/GPU-Virtual-Service/gpu-remoting/include/registerIOV.h
@@ -0,0 +1,236 @@
+#ifndef REGISTER_IOV_H
+#define REGISTER_IOV_H
+
+#include "constVar.h"
+#include "configure.h"
+#include "ucpUtil.h"
+
+typedef struct {
+    size_t threadID;
+    size_t reqNum;
+    size_t iovNum;
+} RegisterHeader_t;
+
+class RegisterIOV {
+    private:
+        const char* myName_ = "RegisterIOV";
+        size_t headers_[REG_PARAM_MAX_NUM];
+        ucp_dt_iov_t iovs_[REG_PARAM_MAX_NUM];
+        int iovIdx_;
+        int requestType_;
+        int popIdx_;
+
+        bool recvFlag_;
+        RegisterHeader_t ucpHeader_;
+
+    public:
+        RegisterIOV() : iovIdx_(0), popIdx_(0), recvFlag_(false) {
+            PushRequestType(__CUDA_REGISTER);
+            ucpHeader_.reqNum = 0;
+            ucpHeader_.threadID = -1;
+            ucpHeader_.iovNum = 0;
+            iovs_[iovIdx_].length = 0;
+            iovs_[iovIdx_].buffer = headers_;
+        }
+
+        RegisterIOV(int threadID) : RegisterIOV() {
+            ucpHeader_.threadID = threadID;
+        }
+
+        RegisterIOV(const void *header, size_t header_length, void *data) : RegisterIOV() {
+            recvFlag_ = true;
+            ucpHeader_.threadID = ((size_t*)header)[0];
+            ucpHeader_.reqNum = ((size_t*)header)[1];
+            ucpHeader_.iovNum = ((size_t*)header)[2];
+            if (unlikely(ucpHeader_.iovNum >= REG_PARAM_MAX_NUM)) {
+                tool::Logging(LOG_ERROR, myName_, "push failed: already full, iovNum=%zu\n", ucpHeader_.iovNum);
+                exit(EXIT_FAILURE);
+            }
+
+            size_t offset = sizeof(size_t) * (ucpHeader_.iovNum + 1); // headers contain iovNum+1 elements (data starts from iovIdx_ = 1)
+            for (int i = 1; i <= ucpHeader_.iovNum; i++) {
+                iovIdx_++; // start from iovIdx_ = 1
+                headers_[iovIdx_] = *(size_t*)UCS_PTR_BYTE_OFFSET(data, sizeof(size_t) * iovIdx_);
+
+                iovs_[iovIdx_].length = headers_[iovIdx_];
+                iovs_[iovIdx_].buffer = UCS_PTR_BYTE_OFFSET(data, offset);
+                offset += iovs_[iovIdx_].length;
+            }
+            tool::Logging(LOG_REGS, myName_, "recvFlag=%d, threadID=%d, reqNum=%d, iovNum=%d, iovIdx=%d\n", recvFlag_, ucpHeader_.threadID, ucpHeader_.reqNum, ucpHeader_.iovNum, iovIdx_);
+        }
+
+        ~RegisterIOV() {
+            if (!recvFlag_) {
+                for (int i = 1; i <= ucpHeader_.iovNum; i++) { // not include the first iov 
+                    free(iovs_[i].buffer);
+                }
+            }
+            iovIdx_ = 0;
+        }
+
+        void Reset() {
+            iovIdx_ = 0;
+        }
+
+        int GetNum() {
+            return iovIdx_ + 1;
+        }
+
+        ucp_dt_iov_t* GetIOVs() {
+            return iovs_;
+        }
+
+        size_t* GetUcpHeaders() {
+            return (size_t*)&ucpHeader_;
+        }
+
+        size_t GetUcpHeaderSize() {
+            return sizeof(ucpHeader_);
+        }
+
+        void PushRequestType(int reqType){
+            requestType_ = reqType;
+        }
+
+        int GetRequestType() {
+            return requestType_;
+        }
+
+        int GetThreadID() {
+            return ucpHeader_.threadID;
+        }
+
+        int GetRequestNum() {
+            return ucpHeader_.reqNum;
+        }
+
+        void PushThreadID (int threadID) {
+            ucpHeader_.threadID = threadID;
+            ucpHeader_.iovNum = iovIdx_;
+            iovs_[0].length = sizeof(size_t) * (ucpHeader_.iovNum + 1);
+            tool::Logging(LOG_DEBUG, myName_, "recvFlag=%d, threadID=%d, reqNum=%d, iovNum=%d, iovIdx=%d\n", recvFlag_, ucpHeader_.threadID, ucpHeader_.reqNum, ucpHeader_.iovNum, iovIdx_);
+         }
+
+        void PushSubRequestType(int reqType){
+            Push(reqType);
+            ucpHeader_.reqNum++;
+        }
+
+        inline void CheckFull() {
+            if (unlikely(iovIdx_ + 1 >= REG_PARAM_MAX_NUM)) {
+                tool::Logging(LOG_ERROR, myName_, "push failed: already full, iovIdx=%zu\n", iovIdx_);
+                exit(EXIT_FAILURE);
+            }
+        }
+
+        inline void CheckPopEnd() {
+            if (unlikely(popIdx_ + 1 > iovIdx_)) {
+                tool::Logging(LOG_ERROR, myName_, "pop failed: already pop to the end\n");
+                exit(EXIT_FAILURE);
+            }
+        }
+
+        template <class T>
+        void Push(const T& item) {
+            CheckFull();
+            iovIdx_++;
+            headers_[iovIdx_] = iovs_[iovIdx_].length = sizeof(T);
+            iovs_[iovIdx_].buffer = malloc(iovs_[iovIdx_].length);
+            memcpy(iovs_[iovIdx_].buffer, &item, iovs_[iovIdx_].length);
+        }
+
+        template <class T>
+        void Push64BitPointer(T& item) {
+            Push((uint64_t)item);
+        }
+
+        template <class T>
+        void Push(T* item, size_t num = 1) {
+            CheckFull();
+            iovIdx_++;
+            if (item == NULL || num == 0) {
+                headers_[iovIdx_] = iovs_[iovIdx_].length = 0;
+            } else {
+                headers_[iovIdx_] = iovs_[iovIdx_].length = sizeof(T) * num;
+                iovs_[iovIdx_].buffer = malloc(iovs_[iovIdx_].length);
+                memcpy(iovs_[iovIdx_].buffer, item, iovs_[iovIdx_].length);
+            }
+        }
+
+        template <class T>
+        void PushConst(const T* item, size_t num = 1) {
+            CheckFull();
+            iovIdx_++;
+            if (item == NULL || num == 0) {
+                headers_[iovIdx_] = iovs_[iovIdx_].length = 0;
+            } else {
+                headers_[iovIdx_] = iovs_[iovIdx_].length = sizeof(T) * num;
+                iovs_[iovIdx_].buffer = malloc(iovs_[iovIdx_].length);
+                memcpy(iovs_[iovIdx_].buffer, item, iovs_[iovIdx_].length);
+            }
+        }
+
+        void PushCString(const char *s) {
+            PushConst(s, strlen(s) + 1); //! including the null-terminator
+        }
+
+        template <class T>
+        T Pop() {
+            CheckPopEnd();
+            popIdx_++;
+            return *(T*)iovs_[popIdx_].buffer;
+        }
+
+        template <class T>
+        T* AssignAddr() {
+            CheckPopEnd();
+            popIdx_++;
+            if (iovs_[popIdx_].length == 0) {
+                return NULL;
+            }
+            else {
+                return (T*)iovs_[popIdx_].buffer;
+            }
+        }
+
+        template <class T>
+        T* AssignAddrForAll() {
+            return AssignAddr<T>();
+        }
+
+        char* AssignCString(){
+            CheckPopEnd();
+            popIdx_++;
+            if (iovs_[popIdx_].length == 1) { // including the null-terminator
+                return NULL;
+            }
+            else {
+                return (char*)iovs_[popIdx_].buffer;
+            }
+        }
+
+        void Print() {
+            tool::Logging(myName_, "%s has %zu elements, popIdx=%d\n", myName_, iovIdx_ + 1, popIdx_);
+            for (size_t i = 0; i <= iovIdx_; i++) {
+
+                if (iovs_[i].length == 4){
+                    tool::Logging(myName_, "\t[%zu]: len=%zu, buffer=%d\n", i, iovs_[i].length, *(int*)iovs_[i].buffer);
+                }
+                else if (iovs_[i].length == 8){
+                    tool::Logging(myName_, "\t[%zu]: len=%zu, buffer=%p\n", i, iovs_[i].length, *(void**)iovs_[i].buffer);
+                }
+                else {
+                    tool::Logging(myName_, "\t[%zu]: len=%zu, buffer=%p\n", i, iovs_[i].length, iovs_[i].buffer);
+                }
+
+                if (i == 0) {
+                    for (size_t j = 0; j <= iovIdx_; j++) {
+                        tool::Logging(myName_, "\t\t[%zu]: header=%zu, iov[0].buffer=%zu\n", j, headers_[j], ((size_t*)(iovs_[0].buffer))[j]);
+                    }
+                }
+                
+            }
+        }
+};
+
+#endif
\ No newline at end of file
diff --git a/GPU-Virtual-Service/gpu-remoting/include/requestBuffer.h b/GPU-Virtual-Service/gpu-remoting/include/requestBuffer.h
new file mode 100644
index 0000000..275555e
--- /dev/null
+++ b/GPU-Virtual-Service/gpu-remoting/include/requestBuffer.h
@@ -0,0 +1,238 @@
+#ifndef REQUEST_BUFFER_H
+#define REQUEST_BUFFER_H
+
+#include "constVar.h"
+#include "configure.h"
+
+class RequestBuffer {
+    private:
+        const char* myName_ = "RequestBuffer";
+        uint32_t popOffset_;  // point to the first empty space of the buffer
+        uint32_t backOffset_; // [popOffset_, backOffset_) is the elements for the new request
+
+    public:
+        uint8_t* _dataBuffer;
+        uint32_t _allocatedSize;
+
+        RequestBuffer(uint32_t size, uint8_t* dataBuffer) : _allocatedSize(size), _dataBuffer(dataBuffer) {
+            popOffset_ = 0;
+            backOffset_ = size;
+        }
+
+        void Reset(uint32_t size, uint8_t* dataBuffer) {
+            popOffset_ = 0;
+            backOffset_ = sizeof(int) + size;
+            _allocatedSize = sizeof(int) + size;
+            _dataBuffer = dataBuffer; //!: after reset, the dataBuffer should be freed by the caller
+        }
+
+        void SetBackOffset(uint32_t offset) {
+            backOffset_ = offset;
+        }
+    
+        RequestBuffer(uint32_t size = REQUEST_BUFFER_SIZE) {
+            popOffset_ = 0;
+            backOffset_ = 0;
+            _allocatedSize = sizeof(int) + size;
+            _dataBuffer = (uint8_t*) malloc(_allocatedSize);
+        }
+
+        RequestBuffer(const RequestBuffer* otherBuf){
+            popOffset_ = 0;
+            backOffset_ = 0;
+            _allocatedSize = otherBuf->_allocatedSize;
+            _dataBuffer = (uint8_t*) malloc(_allocatedSize);
+            memcpy(_dataBuffer, otherBuf->_dataBuffer, _allocatedSize);
+        }
+
+        ~RequestBuffer() {
+            if (_dataBuffer != NULL){
+                //todo: free(_dataBuffer);
+            }
+        }
+
+        // return the allocated size of the buffer
+        uint32_t GetSize() {
+            return _allocatedSize;
+        }
+
+        void CheckFull(size_t size) {
+            if ((backOffset_ + size) > _allocatedSize) {
+                tool::Logging(myName_, "Already full: offset %zu, new element size %zu, total %zu\n", backOffset_, size, _allocatedSize);
+                exit(EXIT_FAILURE);
+            }
+        }
+
+        void CheckPopEnd(size_t size) {
+            if ((popOffset_ + size) > backOffset_) {
+                tool::Logging(myName_, "Already pop to the end: offset %zu, pop element size %zu, total %zu\n", popOffset_, size, backOffset_);
+                exit(EXIT_FAILURE);
+            }
+        }
+
+        size_t getRemainingSize() {
+            return backOffset_ - popOffset_;
+        }
+
+        template <class T>
+        void Push(T item) {
+            CheckFull(sizeof(T));
+            memcpy(_dataBuffer + backOffset_, (uint8_t*)&item, sizeof(T));
+            backOffset_ += sizeof(T);
+        }
+
+        template <class T>
+        void PushConst(const T item) {
+            CheckFull(sizeof(T));
+            memcpy(_dataBuffer + backOffset_, (uint8_t*)&item, sizeof(T));
+            backOffset_ += sizeof(T);
+        }
+
+        template <class T>
+        void Push64BitPointer(T item) {
+            Push((uint64_t)item); //!: unified address is 64-bit
+        }
+
+        void PushHostAddr(const void* ptr){
+            char* addrChar = new char[HOST_POINTER_SIZE];
+#ifdef _WIN32
+                sprintf_s(addrChar, 10, "%p", ptr);
+#else
+                sprintf(addrChar, "%p", ptr);
+#endif
+            PushCString(addrChar);
+            delete[] addrChar;
+        }
+
+        void PushRequestType(int reqType){
+            Push(reqType);
+        }
+
+        // ! notice that the num is the number of elements
+        template <class T>
+        void Push(T* item, size_t num = 1) {
+            if (item == NULL) {
+                Push((size_t)0);
+                return;
+            }
+            size_t totalSize = sizeof(T) * num;
+            Push(totalSize);
+            memcpy(_dataBuffer + backOffset_, (uint8_t*)item, totalSize);
+            backOffset_ += totalSize;
+        }
+
+        template <class T>
+        void PushConst(const T* item, size_t num = 1) {
+            if (item == NULL) {
+                Push((size_t)0);
+                return;
+            }
+            size_t totalSize = sizeof(T) * num;
+            Push(totalSize); //! notice that the size is storage space size, not the number of elements
+            memcpy(_dataBuffer + backOffset_, (uint8_t*)item, totalSize);
+            backOffset_ += totalSize;
+        }
+        
+        // push the len of char* , and then push all chars
+        void PushCString(const char *s){
+            size_t len = strlen(s) + 1; //! including the null-terminator
+            PushConst(s, len);
+        }
+
+        template <class T>
+        T Pop(){
+            CheckPopEnd(sizeof(T));
+            T res = *((T*)(_dataBuffer + popOffset_));
+            popOffset_ += sizeof(T);
+            return res;
+        }
+
+        // copy the array, and return the new begin address
+        template <class T>
+        T* Pop(size_t n) {
+            if(Pop<size_t>() == 0) { // first, pop the size of the array
+                return NULL;
+            }
+            CheckPopEnd(sizeof(T) * n);
+            T* res = new T[n];
+            memcpy((uint8_t*)res, _dataBuffer + popOffset_, sizeof(T) * n);
+            popOffset_ += sizeof(T) * n;
+            return res;
+        }
+
+        // copy the array to the given address
+        template <class T>
+        void Pop(T* res, size_t n) {
+            if(Pop<size_t>() == 0) { // first, pop the size of the array
+                return;
+            }
+            CheckPopEnd(sizeof(T) * n);
+            memcpy((uint8_t*)res, _dataBuffer + popOffset_, sizeof(T) * n);
+            popOffset_ += sizeof(T) * n;
+        }
+
+        template <class T>
+        T* PopAll() {
+            size_t size = Pop<size_t>(); // first, pop the size of the array
+            if(size == 0) { 
+                return NULL;
+            }
+            CheckPopEnd(size);
+            T* res = (T*)malloc(size);
+            memcpy((uint8_t*)res, _dataBuffer + popOffset_, size);
+            popOffset_ += size;
+            return res;
+        }
+
+        // pop the pointer with 64 bit address
+        template <class T>
+        T PopFromAddr() {
+            return (T)Pop<uint64_t>();
+        }
+
+        // just assign the begin address, not copy the data
+        template <class T>
+        T* AssignAddr(size_t n = 1) {
+            if(Pop<size_t>() == 0) { // first, pop the size of the array
+                return NULL;
+            }
+            CheckPopEnd(sizeof(T) * n);
+            T* res = (T*)(_dataBuffer + popOffset_);
+            popOffset_ += sizeof(T) * n;
+            return res;
+        }
+        
+        template <class T>
+        T* AssignAddrForAll() {
+            size_t size = Pop<size_t>(); // first, pop the size of the array
+            if(size == 0) { 
+                return NULL;
+            }
+            CheckPopEnd(size);
+            T* res = (T*)(_dataBuffer + popOffset_);
+            popOffset_ += size;
+            return res;
+        }
+
+        template <class T>
+        T* AssignAddrForLast() {
+            T* res = (T*)(_dataBuffer + popOffset_);
+            popOffset_ = backOffset_;
+            return res;
+        }
+
+        char* AssignCString(){
+            return AssignAddrForAll<char>();
+        }
+
+        void Print() {
+            tool::Logging(LOG_DEBUG, myName_, "RequestBuffer: popOffset_ = %u, backOffset_ = %u, _allocatedSize = %u\n", popOffset_, backOffset_, _allocatedSize);
+            size_t i = 0;
+            for (i = 0; i < backOffset_; i++) {
+                printf("%02x ", _dataBuffer[i]);
+            }
+            printf("\n");
+        }
+};
+
+#endif
\ No newline at end of file
diff --git a/GPU-Virtual-Service/gpu-remoting/include/requestIOV.h b/GPU-Virtual-Service/gpu-remoting/include/requestIOV.h
new file mode 100644
index 0000000..8d3afd6
--- /dev/null
+++ b/GPU-Virtual-Service/gpu-remoting/include/requestIOV.h
@@ -0,0 +1,307 @@
+#ifndef REQUEST_IOV_H
+#define REQUEST_IOV_H
+
+#include "constVar.h"
+#include "configure.h"
+#include "ucpUtil.h"
+
+class RequestIOV : public boost::intrusive::list_base_hook<> {
+    private:
+        const char* myName_ = "RequestIOV";
+        size_t headers_[PARAM_MAX_NUM];
+        ucp_dt_iov_t iovs_[PARAM_MAX_NUM];
+        int iovIdx_;
+        int requestType_;
+        int threadID_; 
+        int popIdx_;
+        size_t paramTotalSize_;
+
+        void* dataBuffer_ = NULL;
+
+    public:
+        RequestIOV() : iovIdx_(-1), popIdx_(-1), threadID_(-1), paramTotalSize_(0) {}
+
+        RequestIOV(int threadID) : RequestIOV() {
+            threadID_ = threadID;
+        }
+
+        RequestIOV(const void *header, size_t header_length, void *data) : RequestIOV() {
+            int iovNum = (header_length - sizeof(size_t)) / sizeof(size_t); // not including the threadID
+            size_t offset = 0;
+            for (int i = 0; i < iovNum; i++) {
+                CheckFull();
+                iovIdx_++;
+                iovs_[iovIdx_].length = headers_[iovIdx_] = ((size_t*)header)[i];
+                iovs_[iovIdx_].buffer = UCS_PTR_BYTE_OFFSET(data, offset);
+                offset += iovs_[iovIdx_].length;
+            }
+            threadID_ = ((size_t*)header)[iovNum];
+        }
+
+        ~RequestIOV() {
+            iovIdx_ = -1;
+            if (dataBuffer_ != NULL) { // means that this object is cloned
+                void* realData = ((ucp_dt_iov_t*)dataBuffer_)->buffer;
+                // printf("Release RequestIOV for requestType=%d, realData=%p(size=%zu)\n", requestType_, realData, ((ucp_dt_iov_t*)dataBuffer_)->length);
+                if (requestType_ == CUDA_MEMCPY_ASYNC_H2D) {
+                    // printf("Release RequestIOV for requestType=CUDA_MEMCPY_ASYNC_H2D, realData=%p(size=%zu), iovs[0]=%p(buffer=%p)\n", realData, ((ucp_dt_iov_t*)dataBuffer_)->length, iovs_, iovs_[0].buffer);
+                }
+                if (realData != NULL) {
+                    free(realData);
+                    // realData = NULL;
+                }
+                // dataBuffer_ = NULL;
+            }
+        }
+
+        void Reset() {
+            iovIdx_ = -1;
+        }
+
+        int GetNum() {
+            return iovIdx_ + 1;
+        }
+
+        ucp_dt_iov_t* GetIOVs() {
+            return iovs_;
+        }
+
+        size_t* GetHeaders() {
+            return headers_;
+        }
+
+        size_t GetHeaderSize() {
+            return sizeof(size_t) * (iovIdx_ + 1 + (threadID_ == -1 ? 0 : 1));
+        }
+
+        size_t GetParamTotalSize() {
+            return paramTotalSize_;
+        }
+
+        int GetRequestType() {
+            return requestType_;
+        }
+
+        int GetThreadID() {
+            return threadID_;
+        }
+
+        inline void CheckFull() {
+            if (unlikely(iovIdx_ + 1 >= PARAM_MAX_NUM)) {
+                tool::Logging(LOG_ERROR, myName_, "push failed: already full, iovIdx=%zu\n", iovIdx_);
+                exit(EXIT_FAILURE);
+            }
+        }
+
+        inline void CheckPopEnd() {
+            if (unlikely(popIdx_ + 1 > iovIdx_)) {
+                tool::Logging(LOG_ERROR, myName_, "pop failed: already pop to the end\n");
+                exit(EXIT_FAILURE);
+            }
+        }
+
+        void PushThreadID(int threadID) {
+            CheckFull(); // check the headers is full or not
+            headers_[iovIdx_ + 1] = threadID_ = threadID; // threadID is the last element of the headers
+        }
+
+        template <class T>
+        void Push(T& item) {
+            CheckFull();
+            iovIdx_++;
+            headers_[iovIdx_] = iovs_[iovIdx_].length = sizeof(T);
+            iovs_[iovIdx_].buffer = &item;
+            paramTotalSize_ += iovs_[iovIdx_].length;
+        }
+
+        template <class T>
+        void PushConst(const T& item) {
+            CheckFull();
+            iovIdx_++;
+            headers_[iovIdx_] = iovs_[iovIdx_].length = sizeof(T);
+            iovs_[iovIdx_].buffer = (void*)&item;
+            paramTotalSize_ += iovs_[iovIdx_].length;
+        }
+
+        template <class T>
+        void Push64BitPointer(T& item) {
+            CheckFull();
+            iovIdx_++;
+            headers_[iovIdx_] = iovs_[iovIdx_].length = sizeof(uint64_t);
+            iovs_[iovIdx_].buffer = &item;
+            paramTotalSize_ += iovs_[iovIdx_].length;
+        }
+
+        template <class T>
+        void Push64BitPointer(const T& item) {
+            CheckFull();
+            iovIdx_++;
+            headers_[iovIdx_] = iovs_[iovIdx_].length = sizeof(uint64_t);
+            iovs_[iovIdx_].buffer = (void*)&item;
+            paramTotalSize_ += iovs_[iovIdx_].length;
+        }
+
+        void PushVar(void* ptr, size_t size) {
+            CheckFull();
+            iovIdx_++;
+            headers_[iovIdx_] = iovs_[iovIdx_].length = size;
+            iovs_[iovIdx_].buffer = ptr;
+            paramTotalSize_ += iovs_[iovIdx_].length;
+        }
+
+        void PushRequestType(int reqType){
+            requestType_ = reqType;
+        }
+
+        template <class T>
+        void Push(T* item, size_t num = 1) {
+            CheckFull();
+            iovIdx_++;
+            if (item == NULL || num == 0) {
+                headers_[iovIdx_] = iovs_[iovIdx_].length = 0;
+            } else {
+                headers_[iovIdx_] = iovs_[iovIdx_].length = sizeof(T) * num;
+                iovs_[iovIdx_].buffer = item;
+            }
+            paramTotalSize_ += iovs_[iovIdx_].length;
+        }
+
+        template <class T>
+        void PushConst(const T* item, size_t num = 1) {
+            CheckFull();
+            iovIdx_++;
+            if (item == NULL || num == 0) {
+                headers_[iovIdx_] = iovs_[iovIdx_].length = 0;
+            } else {
+                headers_[iovIdx_] = iovs_[iovIdx_].length = sizeof(T) * num;
+                iovs_[iovIdx_].buffer = (void*)item;
+            }
+            paramTotalSize_ += iovs_[iovIdx_].length;
+        }
+
+        void PushCString(const char *s) {
+            CheckFull();
+            iovIdx_++;
+            headers_[iovIdx_] = iovs_[iovIdx_].length = strlen(s) + 1; //! including the null-terminator
+            iovs_[iovIdx_].buffer = (void*)s;
+            paramTotalSize_ += iovs_[iovIdx_].length;
+        }
+
+        template <class T>
+        T Pop() {
+            CheckPopEnd();
+            popIdx_++;
+            return *(T*)iovs_[popIdx_].buffer;
+        }
+
+        template <class T>
+        T* AssignAddr() {
+            CheckPopEnd();
+            popIdx_++;
+            if (iovs_[popIdx_].length == 0) {
+                return NULL;
+            }
+            else {
+                return (T*)iovs_[popIdx_].buffer;
+            }
+        }
+
+        template <class T>
+        T* AssignAddrForAll() {
+            return AssignAddr<T>();
+        }
+
+        char* AssignCString(){
+            CheckPopEnd();
+            popIdx_++;
+            if (iovs_[popIdx_].length == 1) { // including the null-terminator
+                return NULL;
+            }
+            else {
+                return (char*)iovs_[popIdx_].buffer;
+            }
+        }
+
+        void Print() {
+            tool::Logging(LOG_INFO, myName_, "RequestIOV has %zu elements, popIdx=%d\n", iovIdx_ + 1, popIdx_);
+            for (size_t i = 0; i <= iovIdx_; i++) {
+                tool::Logging(LOG_INFO, myName_, "\t[%zu]: len=%zu, buffer=%p\n", i, iovs_[i].length, *(void**)iovs_[i].buffer);
+            }
+        }
+
+        RequestIOV* Clone(uint8_t* H2Dheaders = NULL, size_t headerSize = 0) {
+            RequestIOV* newReq = new RequestIOV();
+            newReq->iovIdx_ = iovIdx_;
+            newReq->requestType_ = requestType_;
+            newReq->threadID_ = threadID_;
+            newReq->popIdx_ = popIdx_;
+            newReq->paramTotalSize_ = paramTotalSize_;
+            newReq->dataBuffer_ = newReq->iovs_;
+
+            if (H2Dheaders) {
+                memcpy((uint8_t*)newReq->headers_, H2Dheaders, headerSize);
+            //     newReq->iovs_[0].length = iovs_[0].length;
+            //     newReq->iovs_[0].buffer = iovs_[0].buffer;
+            // }
+            // else if (requestType_ == CUDA_MEMCPY_ASYNC_H2D) {
+            //     memcpy(newReq->headers_, headers_, 4+sizeof(size_t)+sizeof(uint64_t)+sizeof(uint8_t)+sizeof(uint64_t));
+                void* dataBuffer = malloc(paramTotalSize_);
+                size_t offset = 0;
+                for (size_t i = 0; i <= iovIdx_; i++) {
+                    newReq->iovs_[i].length = iovs_[i].length;
+                    // newReq->iovs_[i].buffer = iovs_[i].buffer;
+                    newReq->iovs_[i].buffer = UCS_PTR_BYTE_OFFSET(dataBuffer, offset);
+                    memcpy(newReq->iovs_[i].buffer, iovs_[i].buffer, newReq->iovs_[i].length);
+                    offset += newReq->iovs_[i].length;
+                }
+            }
+            else {
+                void* dataBuffer = malloc(paramTotalSize_);
+                size_t offset = 0;
+                for (size_t i = 0; i <= iovIdx_; i++) {
+                    newReq->iovs_[i].length = newReq->headers_[i] = headers_[i];
+                    newReq->iovs_[i].buffer = UCS_PTR_BYTE_OFFSET(dataBuffer, offset);
+                    memcpy(newReq->iovs_[i].buffer, iovs_[i].buffer, newReq->iovs_[i].length);
+                    offset += newReq->iovs_[i].length;
+                }
+                newReq->headers_[iovIdx_ + 1] = threadID_;
+            }
+            
+            return newReq;
+        }
+
+        bool ContainsHandle(uint64_t* handleList, size_t handleCnt) {
+            for (size_t i = 0; i <= iovIdx_; i++) {
+                if (iovs_[i].length != sizeof(uint64_t) || *(void**)iovs_[i].buffer == NULL) {
+                    continue;
+                }
+                for (size_t j = 0; j < handleCnt; j++) { // todo: may be optimized
+                    if (*(uint64_t*)iovs_[i].buffer == handleList[j]) {
+                        return true;
+                    }
+                }
+            }
+            return false;
+        }
+
+        uint64_t GetHandleByIndex(size_t idx) {
+            if (idx > iovIdx_) {
+                return 0;
+            }
+            if (iovs_[idx].length != sizeof(uint64_t) || *(void**)iovs_[idx].buffer == NULL) {
+                return 0;
+            }
+            return *(uint64_t*)iovs_[idx].buffer;
+        }
+
+        bool SetElement(size_t idx, void* buffer, size_t length) {
+            if (idx > iovIdx_) {
+                return false;
+            }
+            iovs_[idx].buffer = buffer;
+            headers_[idx] = iovs_[idx].length = length;
+            return true;
+        }
+};
+
+#endif
\ No newline at end of file
diff --git a/GPU-Virtual-Service/gpu-remoting/include/scheduler.h b/GPU-Virtual-Service/gpu-remoting/include/scheduler.h
new file mode 100644
index 0000000..3759829
--- /dev/null
+++ b/GPU-Virtual-Service/gpu-remoting/include/scheduler.h
@@ -0,0 +1,544 @@
+#ifndef SCHEDULER_H
+#define SCHEDULER_H
+
+#include "configure.h"
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cuda_device_runtime_api.h>
+#include <cuda_runtime_api.h>
+#include <netinet/in.h>
+#include <nvml.h>
+#include <driver_types.h>
+#include <queue>
+#include <set>
+#include <strings.h>
+#include <unistd.h>
+#include <unordered_map>
+#include <chrono>
+#include <utility>
+#include <cuda.h>
+#include <vector>
+#include <mutex>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include "constVar.h"
+#include "define.h"
+#include <sys/un.h>
+#include <unistd.h>
+#define  GPU_MAX_NUM 8
+#define  THRESH_ITER 11
+// #define  TIME_QUANTUM 600
+#define TIME_QUANTUM 360
+extern Configure config_;
+#define GPUNum 2
+
+using namespace std;
+
+
+typedef struct JobInfo {
+    size_t used_mem; // memory
+    uint sm_util; //SM utilization
+    double served_time;//按秒算
+    double remain_time;
+    double perIt_time;
+    int cur_It;// current iterations
+    int num_It;// the number of iterations
+    int device;
+    uint64_t client_id;
+
+    bool operator<(const JobInfo& other) const {
+        return remain_time < other.remain_time;
+    }
+} JobInfo_;
+
+class Scheduler {
+    private:
+        const char* myName_ = "Scheduler";
+        static size_t free_mem[GPU_MAX_NUM];
+        static size_t total_mem[GPU_MAX_NUM];
+        static set<JobInfo_*> jobs;
+        static queue<JobInfo_*> unknown_queue[GPU_MAX_NUM]; // 未知任务信息的队列 
+        static vector<JobInfo_*> known_queue[GPU_MAX_NUM]; // 已知任务信息队列
+        static unordered_map<JobInfo_*, int> known_map; // <job,token> token=1表示当前已知信息的任务在运行
+        // static int flag;//判断是否有已知信息的任务在运行,0表示没有
+        static int flag[GPUNum];//
+        // static int queues;//0表示轮到unknown_queue，1表示轮到known_queue
+        static int queues[GPUNum];//不同GPU下的队列情况,0表示轮到unknown_queue，1表示轮到known_queue
+        // static chrono::steady_clock::time_point timer;//计时器，判断处理哪个队列
+        static chrono::steady_clock::time_point timers[GPUNum];//计时器，判断处理哪个队列
+        static set<uint64_t> client_ids;//防止重放入
+        
+        int dev_cnt;
+        JobInfo_* job;
+        pid_t pid;
+        size_t* shm_util = 0;
+        chrono::steady_clock::time_point start;
+        int cur_pid;
+        // int token = 0;//1表示当前已知信息的任务在运行
+
+        int _sock = -1; // 用于连接monitor的静态socket
+        
+        //建立与monitor连接，并确保连接仅建立一次
+        void initialize_connection() {
+
+            if (_sock == -1) {
+                int tmptid = static_cast<pid_t>(::syscall(SYS_gettid));
+                std::cout << "clientId: " << job->client_id <<" tid:" << tmptid << " get in initalize_connection" << std::endl;
+
+                struct sockaddr_in serv_addr;
+
+                if ((_sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+                    tool::Logging(LOG_ERROR, "Scheduler", "Socket creation error.\n");
+                    return;
+                }
+                memset(&serv_addr, 0, sizeof(serv_addr));
+                serv_addr.sin_family = AF_INET;
+                serv_addr.sin_port = htons(config_.GetMonPort());
+
+
+                if (inet_pton(AF_INET, config_.GetMonIp().c_str(), &serv_addr.sin_addr) <= 0) {
+                    tool::Logging(LOG_ERROR, "Scheduler", "Invalid address/ Address not supported.\n");
+                    return;
+                }
+
+                if (connect(_sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) {
+                    tool::Logging(LOG_ERROR, "Scheduler", "Connection Failed.\n");
+                    return;
+                }
+
+                tool::Logging(LOG_INFO, "Scheduler", "Connected to Monitor.\n");
+
+            }
+        }
+
+        void release_connection() {
+            // std::lock_guard<std::mutex> lock(conn_mutex);
+            std::cout << "_sock:" << _sock << std::endl;
+            int tmptid = static_cast<pid_t>(::syscall(SYS_gettid));
+            if (_sock != -1) {
+                std::cout << "clientId: " << job->client_id <<" tid:" << tmptid << " get in release_connection" << std::endl;
+                close(_sock);
+                tool::Logging(LOG_INFO, "Scheduler", "Disconnected from Monitor.\n");
+            }
+        }
+
+        void delayed_send(const std::string& message) {
+            std::this_thread::sleep_for(std::chrono::seconds(3));  // 延迟发送，dummy? 发现初始化时间大概十几秒往上，甚至几分钟都有
+            send(_sock, message.c_str(), message.size(), 0);
+        }
+        
+
+        static std::mutex mtx_used[GPU_MAX_NUM];
+        static std::mutex mtx_sche[GPU_MAX_NUM];
+        static std::mutex mtx_exc[GPU_MAX_NUM];
+
+    public:
+        Scheduler(uint64_t clientId) {
+            int tmptid = static_cast<pid_t>(::syscall(SYS_gettid));
+            std::cout << "Scheduler::Scheduler() threadId = " << tmptid << "clientID: "<< clientId << std::endl;
+            cudaGetDeviceCount(&dev_cnt);
+            size_t free_byte, total_byte;
+            
+            for(int i = 0; i < dev_cnt; i++) {
+                cudaSetDevice(i);
+                cudaMemGetInfo(&free_byte, &total_byte);
+                free_mem[i] = free_byte;
+                total_mem[i] = total_byte;
+            }
+            std::set<JobInfo_*>::iterator iter;
+            for(iter = jobs.begin(); iter != jobs.end(); ++iter) {
+                if((*iter)->client_id == clientId) {
+                    job = *iter;
+                    tool::Logging(LOG_INFO,myName_,"get cid = %d\n",job->client_id);
+                    break;
+                }
+            }
+            if(iter == jobs.end() || jobs.empty()) {
+                job = new JobInfo_();
+                job->client_id = clientId;
+            }
+            // job->client_id = clientId;
+            cur_pid = getpid();
+
+#ifdef GV_Monitor
+            //建立与Monitor的tcp连接
+            initialize_connection();
+#endif
+
+        }
+
+        ~Scheduler() {
+            // tool::Logging(LOG_INFO,myName_,"client %lu delete\n",job->client_id);
+            int tmptid = static_cast<pid_t>(::syscall(SYS_gettid));
+            std::cout << "Scheduler::~Scheduler() threadId = " << tmptid << "clientID: "<< job->client_id << std::endl;
+
+            delete job;
+            // release_connection();
+#ifdef GV_Monitor
+            close(_sock);
+#endif
+        }
+        
+        void send_message(const std::string& message, int curDev_) {
+            // std::lock_guard<std::mutex> lock(conn_mutex);
+            int tmptid = static_cast<pid_t>(::syscall(SYS_gettid));
+            std::cout << "sock:" << _sock << " tid: " << tmptid << std::endl;
+            string msg = "";
+            if (_sock != -1) {
+                if(message == "client_runjob:"){
+                    msg = message + to_string(curDev_);
+                    // std::thread(&Scheduler::delayed_send, this, msg).detach();
+                    send(_sock, msg.c_str(), msg.size(), 0);
+                }else if(message == "client_stop:"){
+                    msg = message + to_string(curDev_);
+                    send(_sock, msg.c_str(), msg.size(), 0);
+                }
+                // send(_sock, message.c_str(), message.size(), 0);
+                tool::Logging(LOG_INFO, "Scheduler", "Message sent: %s\n", msg.c_str());
+            } else {
+                tool::Logging(LOG_ERROR, "Scheduler", "Socket is not connected.\n");
+            }
+        }
+
+        bool ready_to_del(){
+            char buffer[256];
+            size_t n = read(_sock, buffer, sizeof(buffer) - 1);
+            if(n > 0) {
+                buffer[n] = '\0';
+                tool::Logging(LOG_INFO, "Scheduler", "Received message: %s\n", buffer);
+                return true;
+            }
+            tool::Logging(LOG_ERROR, "Scheduler", "Failed to receive message.\n");
+            return false;
+        }
+        
+
+        int get_free_gpu() {
+            int max_index = -1;
+            int max_free = 0;
+            
+            for(int i = 0; i < dev_cnt; i++) {
+                cudaSetDevice(i);
+                size_t free_byte, total_byte;
+                cudaMemGetInfo(&free_byte, &total_byte);
+                if(max_free < free_byte) {
+                    max_free = free_byte;
+                    max_index = i;
+                }
+            }
+            return max_index;
+        }
+
+        void enqueue(uint64_t client_id, int device) {
+            //处理任务入队，可能是新任务，可能是已知信息任务。
+            {
+                std::lock_guard<std::mutex> lock(mtx_used[device]);
+                if(timers[device] == std::chrono::steady_clock::time_point()){
+                    tool::Logging(LOG_INFO, myName_, "device:%d , first time std::chrono::steady_clock::time_point\n", device);
+                    timers[device] = chrono::steady_clock::now();
+                }
+                else {
+                    chrono::steady_clock::time_point now_time = chrono::steady_clock::now();
+                    int duration = (chrono::duration_cast<chrono::milliseconds>(now_time - timers[device]).count() * 1.0 / 1000);
+                    // if(duration % 100 == 0){
+                    //     tool::Logging(LOG_INFO, myName_, "duration = %d\n",duration);
+                    // }
+                    if(duration >= TIME_QUANTUM) {
+                        queues[device] = (queues[device] == 0) ? 1 : 0;
+                        tool::Logging(LOG_INFO, myName_, "device:%d, change to handle queue %d\n",device,queues[device]);
+                        // SendNewInfoToDispatcher(config_);
+                        timers[device] = chrono::steady_clock::now();
+                    }
+                }
+            }
+            
+            if(jobs.count(job) == 0 && client_ids.count(job->client_id) == 0) { //新来的任务
+                {
+                    std::lock_guard<std::mutex> lock(mtx_used[device]);
+                    if(queues[device] == 1 && known_queue[device].empty()) {
+                        queues[device] = 0;
+                        tool::Logging(LOG_INFO, myName_, "device:%d,there is no job in queue1, change to handle queue %d\n",device, queues[device]);
+                    }
+                }
+                
+                jobs.insert(job);
+                client_ids.insert(job->client_id);
+                unknown_queue[device].push(job);
+                tool::Logging(LOG_INFO,myName_,"device:%d, client %lu insert unknown_queue, device:%d, flag = %d, cid = %lu, unknown_queue_size:%d\n",device, client_id, device ,flag[device], unknown_queue[device].front()->client_id,unknown_queue[device].size());
+                int threadId = static_cast<pid_t>(::syscall(SYS_gettid));
+                tool::Logging(LOG_INFO,myName_,"client %lu threadId = %d\n",client_id,threadId);
+
+                while (unknown_queue[device].front() != job || flag[device] != 0 || queues[device] != 0) {
+                    tool::Logging(LOG_INFO,myName_,"get int unknown_queue[%d].front() != job || flag != 0 || queues != 0,front() == %lu, flag == %d, queues = %d,\n",device, unknown_queue[device].front()->client_id, flag[device], queues[device]);
+                    sleep(1);
+                }
+                //独占GPU
+                if(unknown_queue[device].front() == job) {
+                    // int device = get_free_gpu();
+                    tool::Logging(LOG_INFO,myName_,"client %lu starts on gpu %d\n",job->client_id, device);
+                    // cudaSetDevice(device);
+                    job->device = device;
+                    int shmid = shmget(1234, sizeof(int), IPC_CREAT | 0666);
+                    if (shmid < 0) {
+                        perror("shmget");
+                        exit(EXIT_FAILURE);
+                    }
+
+                    shm_util = (size_t *)shmat(shmid, NULL, 0);
+                    if ((intptr_t)shm_util == -1) {
+                        perror("shmat");
+                        exit(EXIT_FAILURE);
+                    }
+                    start = chrono::steady_clock::now();
+                    pid = fork();
+                    unknown_dequeue(pid);
+                }
+            }
+            else {//已知任务信息的任务
+                if (job->cur_It < THRESH_ITER + 1) {
+                    // std::cout << "clientID:" << client_id << " get in if " << device <<std::endl;
+                    return;
+                }
+                //cal_add_It过来，也就是任务已经完成了profiling，这时候需要将任务先从unknow_queue里出队，再插入到known_queue中
+                else if (job->cur_It == (THRESH_ITER + 1) && unknown_queue[device].front() == job) {
+                    // if(pid > 0) {
+                    tool::Logging(LOG_INFO,myName_,"device:%d, finishing profiling client %lu\n", device, job->client_id);
+                    unknown_dequeue(pid);
+                    unknown_queue[device].pop();
+                    // }
+                    chrono::steady_clock::time_point now_time = chrono::steady_clock::now();
+                    job->served_time = (chrono::duration_cast<chrono::milliseconds>(now_time - start).count() * 1.0 / 1000);
+                    job->perIt_time = job->served_time / (THRESH_ITER - 1);
+                    job->remain_time = (job->num_It - job->cur_It + 1) * job->perIt_time;
+                    tool::Logging(LOG_INFO,myName_,"client %lu job Info: num_It=%d, cur_It=%d, sm_util=%d, mem=%zu, served_time = %lf,perIt_time = %lf,remain_time=%lf\n",
+                    job->client_id, job->num_It, job->cur_It, job->sm_util, job->used_mem, job->served_time, job->perIt_time, job->remain_time);
+                    known_queue[device].emplace_back(job);
+                    tool::Logging(LOG_INFO, myName_, "device:%d, client %lu insert known_queue. unknown_queue size = %d\n",device, job->client_id, unknown_queue[device].size());
+                    known_dequeue(device);
+                    start = chrono::steady_clock::now();
+                    tool::Logging(LOG_INFO, myName_, "client %lu restarts.\n", job->client_id);
+                    // }
+                }
+                else {//选择剩余时间最小的任务
+                    known_dequeue(device);
+                }
+            }
+        }
+
+        void unknown_dequeue(pid_t pid) {
+            if(pid == 0) {//child process
+                // int device;
+                // cudaGetDevice(&device);
+                nvmlReturn_t result;
+                result = nvmlInit();
+                if (result != NVML_SUCCESS) {
+                    tool::Logging(LOG_ERROR,myName_,"client %lu ERROR: Failed to initialize NVML %d\n",job->client_id, result);
+                    return;
+                }
+                int index = 0;
+                cudaGetDevice(&index);
+                nvmlDevice_t dev;
+                result = nvmlDeviceGetHandleByIndex_v2(index, &dev);
+                if(result != NVML_SUCCESS) {
+                    tool::Logging(LOG_ERROR,myName_,"client %lu ERROR: nvmlDeviceGetHandleByIndex %d\n",job->client_id, result);
+                    return;
+                }
+                int cnt = 0;
+                while (true) {
+                    nvmlUtilization_t util;
+                    result = nvmlDeviceGetUtilizationRates(dev, &util);
+                    if(result != NVML_SUCCESS) {
+                        tool::Logging(LOG_ERROR,myName_,"client %lu ERROR: nvmlDeviceGetUtilizationRates\n",job->client_id);
+                        return;
+                    }
+                    if(util.gpu > 0) {
+                        job->sm_util += util.gpu;
+                        // tool::Logging(LOG_INFO,myName_,"client %lu util = %d %d %d\n",job->client_id,util.gpu,util.gpu,job->sm_util);
+                        ++cnt;
+                        *shm_util = job->sm_util / cnt;
+                        // *shm_util = max((uint)*shm_util, util.gpu);
+                    }
+ 
+                }
+            }
+            else if(pid > 0) {
+                if(job->cur_It == (THRESH_ITER + 1)) {
+                    if (kill(pid, SIGKILL) == 0) {
+                        tool::Logging(LOG_INFO,myName_,"client %lu Sent SIGKILL to child process\n",job->client_id);
+                    } else {
+                        tool::Logging(LOG_ERROR,myName_,"client %lu ERROR: Failed to send SIGINT to child process\n",job->client_id);
+                    }
+                    job->sm_util = *shm_util;
+                }
+            }
+            else {
+                tool::Logging(LOG_ERROR,myName_,"client %lu fork error\n",job->client_id);
+            }
+        }
+
+        void known_allocate(int device) {
+            std::lock_guard<std::mutex> lock(mtx_sche[job->device]);
+            // tool::Logging(LOG_INFO,myName_,"client %lu try to known_allocate\n",job->client_id);
+            if (!flag[device]) {
+                //按剩余时间从小到大排序
+                sort(known_queue[device].begin(), known_queue[device].end());
+                // tool::Logging(LOG_INFO,myName_,"client %lu finish sort\n",job->client_id);
+
+                JobInfo_* shortest = known_queue[device][0];
+                known_map[shortest] = 1;
+                ++flag[device];
+                size_t mem = shortest->used_mem;
+                int util = shortest->sm_util;
+                tool::Logging(LOG_INFO,myName_,"choose shortest job: cid = %lu, mem = %lu, util = %u\n",shortest->client_id, mem, util);
+
+                //选择资源互补任务
+                for(auto it = known_queue[device].begin(); it != known_queue[device].end(); ++it) {
+                    tool::Logging(LOG_INFO,myName_,"jobInfo: cid = %lu, mem = %lu/%lu, util = %u\n",(*it)->client_id, (*it)->used_mem, total_mem[shortest->device],(*it)->sm_util,(*it)->remain_time);
+                    if(it == known_queue[device].begin()){
+                        continue;
+                    }
+                    JobInfo_* tmp = *it;
+                    if(tmp->device != shortest->device)
+                        continue;
+                    if((tmp->used_mem + mem) <= total_mem[shortest->device] && (tmp->sm_util + util) <= 100 && known_map[tmp] == 0) {
+                        known_map[tmp] = 1;
+                        mem += tmp->used_mem;
+                        util += tmp->sm_util;
+                        ++flag[device];
+                        tool::Logging(LOG_INFO,myName_,"choose resource-inter job: cid = %lu, mem = %lu, util = %u\n",tmp->client_id, mem, util);
+                    }
+                }
+                // tool::Logging(LOG_INFO,myName_,"client %lu finish known_allocate\n",job->client_id);
+            }
+        }
+
+        void known_dequeue(int device) {
+            {
+                std::lock_guard<std::mutex> lock(mtx_used[device]);
+                if(queues[device] == 0 && unknown_queue[device].empty()) {
+                    queues[device] = 1;
+                    tool::Logging(LOG_INFO, myName_, "device[%d]there is no job in queue0, change to handle queue %d\n",device,queues[device]);
+                }
+            }
+            
+            while (!unknown_queue[device].empty() && queues[device] == 0) {//等待新来任务完成独占profiling
+                if(known_map[job] == 1) {
+                    --flag[device];
+                    known_map[job] = 0;
+                    //TODO
+                    chrono::steady_clock::time_point now_time = chrono::steady_clock::now();
+                    job->served_time += (chrono::duration_cast<chrono::milliseconds>(now_time - start).count() * 1.0 / 1000);
+                    job->remain_time = (job->num_It - job->cur_It + 1) * job->perIt_time;
+                    tool::Logging(LOG_INFO, myName_, "client id %lu block, there is a new job, id = %lu\n", job->client_id, unknown_queue[device].front()->client_id);
+                }
+                sleep(1);
+            }
+            if(known_map[job]){
+                // std::cout << "get_in_known_job" << std::endl;
+                return;
+            }
+
+        schedule_shortest:
+            known_allocate(device);
+            //flag != 0 有任务正在运行
+            while(known_map[job] == 0) {
+                sleep(1);
+            }
+            if(!flag[device])
+                goto schedule_shortest;
+            tool::Logging(LOG_INFO, myName_, "client %lu restarts.\n", job->client_id);
+        }
+
+        void cal_add_It(uint64_t client_id) {
+            /*
+            根据zss所说，每个iteration开始会调用两次cudaMemcpyAsyncH2DHandle，所以本函数的大概意思是，如果job->cur_It ==2，开始计时；
+            THRESH_ITER+1的值为12，也就是结束了profiling，就会再次调用enqueue插入到known_queue中。
+            */
+            ++job->cur_It;
+            if(job->cur_It == 2) {
+                start = chrono::steady_clock::now();
+            }
+            if(job->cur_It == (THRESH_ITER + 1)) {
+                int device;
+                cudaGetDevice(&device);
+                std::cout << "clientId:"<< client_id << " cal_add_It_device:" << device << std::endl;
+                // tool::Logging(LOG_INFO, myName_, "cal_add_It_device_!!client %lu, Device: %d .\n", client_id, device);
+
+                enqueue(client_id,device);
+            }
+        }
+
+        void get_Iteration(int num) {
+            job->num_It = int(num);
+        }
+
+        void cal_job_mem(size_t size) {
+            job->used_mem += size;
+        }
+
+        void free_jobs(uint64_t client_id, int device) {
+            if(unknown_queue[device].front() == job) {
+                unknown_queue[device].pop();
+                {
+                    std::lock_guard<std::mutex> lock(mtx_used[device]);
+                    if(queues[device] == 0 && unknown_queue[device].empty()) {
+                        queues[device] = 1;
+                    }
+                }
+
+            }
+            else {
+                for(auto it = known_map.begin(); it != known_map.end(); ++it) {
+                    if((*it).second == 1) {
+                        (*it).second = 0;
+                        --flag[device];
+                        chrono::steady_clock::time_point now_time = chrono::steady_clock::now();
+                        job->served_time += (chrono::duration_cast<chrono::milliseconds>(now_time - start).count() * 1.0 / 1000);
+                        job->remain_time = (job->num_It - job->cur_It + 1) * job->perIt_time;
+                        tool::Logging(LOG_INFO, myName_, "client id %lu block, job %lu has finished\n", (*it).first->client_id,job->client_id);
+                    }
+                }
+                auto it = known_map.find(job);
+                if(it != known_map.end())
+                    known_map.erase(it);
+                auto it1 = find(known_queue[device].begin(), known_queue[device].end(), job);
+                if(it1 != known_queue[device].end())
+                    known_queue[device].erase(it1);
+                std::lock_guard<std::mutex> lock(mtx_used[device]);
+                if(queues[device] == 1 && known_queue[device].empty()) {
+                    queues[device] = 0;
+                }
+            }
+
+            auto it2 = jobs.find(job);
+            if(it2 != jobs.end())
+                jobs.erase(it2);
+            // add_sub_unuse_men(device, total_m, 1);
+            if (kill(pid, SIGKILL) == 0) {
+                tool::Logging(LOG_INFO,myName_,"client %lu Sent SIGKILL to child process\n",job->client_id);
+            } else {
+                tool::Logging(LOG_ERROR,myName_,"client %lu ERROR: Failed to send SIGINT to child process\n",job->client_id);
+            }
+        }
+
+
+
+};
+
+inline size_t Scheduler::free_mem[GPU_MAX_NUM];
+inline size_t Scheduler::total_mem[GPU_MAX_NUM];
+inline set<JobInfo_*> Scheduler::jobs;
+inline queue<JobInfo_*> Scheduler::unknown_queue[GPU_MAX_NUM]; // 未知任务信息的队列
+inline vector<JobInfo_*> Scheduler::known_queue[GPU_MAX_NUM]; // 已知任务信息队列
+inline int Scheduler::flag[GPUNum];//判断是否有已知信息的任务在运行,0表示没有
+inline unordered_map<JobInfo_*, int> Scheduler::known_map;
+inline int Scheduler::queues[GPUNum];//0表示轮到unknown_queue，1表示轮到known_queue
+// inline chrono::steady_clock::time_point Scheduler::timer;
+inline chrono::steady_clock::time_point Scheduler::timers[GPUNum];
+inline set<uint64_t> Scheduler::client_ids;
+
+inline std::mutex Scheduler::mtx_used[GPU_MAX_NUM];
+inline std::mutex Scheduler::mtx_sche[GPU_MAX_NUM];
+
+#endif
\ No newline at end of file
diff --git a/GPU-Virtual-Service/gpu-remoting/include/serverEndpoint.h b/GPU-Virtual-Service/gpu-remoting/include/serverEndpoint.h
new file mode 100644
index 0000000..eab9ef6
--- /dev/null
+++ b/GPU-Virtual-Service/gpu-remoting/include/serverEndpoint.h
@@ -0,0 +1,422 @@
+#ifndef SERVER_ENDPOINT_H
+#define SERVER_ENDPOINT_H
+#include <alloca.h>
+#include "configure.h"
+#include "requestBuffer.h"
+#include "requestIOV.h"
+#include "registerIOV.h"
+#include "ucpConnection.h"
+// #include "./tsl/bhopscotch_map.h"
+#include "./hashing/robin_hood.h"
+#include "./conqueue/readerwriterqueue.h"
+#include "./shmqueue/shmUtil.h"
+#include "ptxExtractor.h"
+#include "mapper.h"
+#include "asyncRequest.h"
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <cublasLt.h>
+#include <cudnn.h>
+#include <nccl.h>
+// #include <cufile.h>
+#include "scheduler.h"
+#include "elasticscheduler.h"
+#ifdef GV_MSGHANDLER
+#include "msghandler.h"
+#endif
+
+
+typedef struct {
+    DeviceBlockMapper* blockManager;
+    std::vector<Block_t> blockInfoList;
+    HandleMapper* handleManager;
+    // robin_hood::unordered_flat_map<uint64_t, Handle_t*> mapVirAddr2HandleInfo;
+    // robin_hood::unordered_flat_map<uint64_t, size_t> mapRealAddr2handleListIdx;
+    // std::vector<Handle_t> handleInfoList;
+    // std::queue<pair<size_t, size_t>> freeHandleQueue;
+    std::vector<ncclRedOp_t> ncclRedOpList;
+    // std::vector<ncclUniqueId> ncclUidList;
+
+    bool isFirstIter;
+    robin_hood::unordered_flat_map<uint64_t, CUmodule>* mapFatBinHandle2CuModule;
+    robin_hood::unordered_flat_map<uint64_t, CUfunction>* mapHost2CuFunc;          //used for cudaLaunchKernel
+    robin_hood::unordered_flat_map<uint64_t, CUdeviceptr>* mapHostVar2CuDevPtr;    //used for cudaMemcopyToSymbol
+    robin_hood::unordered_flat_map<std::string, uint64_t>* mapDevName2DevPtr;    //used for identifying the device pointer, shared by all devices
+    PTXExtractor* ptxExtractor;
+    moodycamel::BlockingReaderWriterQueue< std::pair<void*, size_t> >* imageQueue;
+} CUinfoMap_t;
+
+#define DEFINE_SERVER_AM_CALLBACK(name) \
+    ucs_status_t ServerEndpoint::name(void *arg, const void *header, size_t header_length, \
+                             void *data, size_t length, const ucp_am_recv_param_t *param)
+
+
+extern Configure config;
+class ServerEndpoint {
+    private:
+        const char* myShortName_ = "ServerEp";
+        char* myName_ = NULL;
+        ConnStatus_t connStatus_ = {true, true};
+        uint64_t clientID_;
+        char* clientIP_;
+        char* clientPort_;
+        int clientPID_ = 0;
+        size_t priority_ = 0;
+
+#ifdef GV_Scheduler
+        Scheduler* sche;
+#endif
+
+#ifdef GV_MSGHANDLER
+        MsgHandler* msghandler;
+        boost::thread* MSGRecvThread_ = nullptr;
+        bool MSGStopFlag_ = false;
+        bool createThreadFlag = false;
+        int msgResetCnt = 0;
+        bool msgWaitFlag_ = false;
+#endif
+
+#ifdef GV_eScheduler
+        eScheduler* esche;
+#endif
+
+        bool initFlag_ = false; // to initialize the shared memory (and imageQueue) only once
+        size_t curIter_ = 0;
+        size_t numIterations = 0; 
+        int ckptCnt = 0;
+
+        int curDev_ = 0; // the current device of this thread
+        bool recoveryFlag_ = false;
+        bool replayFlag_ = false;
+        int getDeviceFlag = 0;
+        int CliDev_ = -1;
+
+        SharedMemoryOpt* shmQueSizes_[3] = {NULL, NULL, NULL};
+        CMessageQueue* shmQueues_[3] = {NULL, NULL, NULL};
+        size_t lastCopyLen_[3] = {0, 0, 0};
+        uint8_t lastCopyType_ = MEMCPY_OTHER;
+        std::vector<TensorInfo_t> trainTensors_;
+
+        ncclComm_t curComm = NULL; // todo: maybe not only one stream
+        cudaStream_t commStream_ = NULL; 
+        cudaStream_t defaultStream_ = NULL;
+
+        std::list<cudaEvent_t> watchedEventsList_;
+        std::list<cudaEvent_t> notCompleteEventsList_;
+        Sync_t eventWatchedSync_;
+        boost::thread* eventWatchedThread_ = nullptr;
+        
+        std::string backupFilePath_;
+        HostBuffer_t serverState_ = {NULL, 0};
+        void* backupMemcpyBuffer_ = NULL;
+        std::vector<cudaStream_t> streamList_;
+        Sync_t backupSync_;
+        AsyncRequest stateBackup_;
+        AsyncRequest trainTensorBackup_;
+        bool bufferReady_ = false;
+        bool bufferFinished_ = false;
+        bool bufferResized_ = false;
+        // boost::thread* backupStorageThread_ = nullptr;
+        std::unique_ptr<boost::interprocess::mapped_region> fileMappingRegion_;
+        std::unique_ptr<boost::interprocess::mapped_region> trainTensorsMappingRegion_ = nullptr;
+
+    public:
+        ucp_conn_request_h _connectReq;
+        ucp_worker_h _dataWorker;
+        ucp_ep_h _clientEp;
+
+        CUinfoMap_t _cuInfoMap; 
+        boost::thread* _ptxThread = nullptr;
+        std::vector<void*> _fatbinList;
+
+        static void CreateServerEp(ucp_conn_request_h conn_request, void *arg);
+        ServerEndpoint(ucp_worker_h dataWorker, UCPConnection_t conn);
+        ~ServerEndpoint();
+        void CloseEp(uint64_t flags);
+
+        void SendResponse(RequestIOV* reqBuffer, const ucp_ep_h* ep, ucs_memory_type_t memType = UCS_MEMORY_TYPE_HOST);
+        void SendStatus(int status);
+
+        // Get the device pointer from userAddr(virtAddr)
+        inline void* GetDevPtr(uint64_t userAddr) {
+#ifdef GV_MEMORY
+            uint64_t realAddr = 0;
+            if (_cuInfoMap.blockManager->FindByVirAddr(userAddr, realAddr) == -1) {
+                return NULL;
+            }
+            else {
+                return (void*)realAddr;
+            }
+#elif defined(GV_MEMORY_PTX)
+            return (void*)(_cuInfoMap.blockInfoList[GET_BLOCK_ID(userAddr)].devPtr + GET_BLOCK_INTER_OFFSET(userAddr));
+#else
+            return (void*)userAddr;
+#endif // GV_MEMORY
+        }
+
+        // Get the block index of current device's blockList from userAddr(virtAddr)
+        inline size_t GetBlockIdx(uint64_t userAddr) {
+            return GET_BLOCK_ID(userAddr);
+        }
+
+        // Find the virtual address of the device pointer from the real address
+        inline uint64_t FindDevPtrVirAddr(void* devPtr) {
+#ifdef GV_MEMORY
+            uint64_t virtAddr = 0;
+            if (_cuInfoMap.blockManager->FindByRealAddr((const uint64_t)devPtr, virtAddr) == -1) {
+                tool::Logging(LOG_ERROR, myName_, "FindDevPtrVirAddr failed: devPtr(%p) not found\n", devPtr);
+                return 0;
+            }
+            else {
+                return virtAddr;
+            }
+#else
+            return (uint64_t)devPtr;
+#endif // GV_MEMORY
+        }
+
+        // Get the real address of the handle from userAddr(virtAddr)
+        inline void* GetHandle(uint64_t userAddr, bool reset = false) {
+#ifdef GV_HANDLE
+            return _cuInfoMap.handleManager->FindRealAddrByVirAddr(userAddr, reset);
+#else
+            return (void*)(userAddr);
+#endif // GV_HANDLE
+        }
+
+        inline Handle_t* GetHandleInfo(uint64_t userAddr) {
+            return _cuInfoMap.handleManager->GetHandleInfoByVirAddr(userAddr);
+        }
+
+        // Set the handle info 
+        inline void SetHandleVirAddr(uint64_t userAddr, void* handlePtr, enum API_REQUEST_CODE_SET handleType = __CUDA_REGISTER) {
+            _cuInfoMap.handleManager->UpdateHandle(userAddr, handlePtr, handleType);
+        }
+
+        // Add the handle info and return the virtual address of the handle
+        inline void* GetHandleVirAddr(void* handlePtr, enum API_REQUEST_CODE_SET handleType) {
+#ifdef GV_HANDLE
+            return _cuInfoMap.handleManager->AddHandle(handlePtr, handleType);
+#else
+            return handlePtr;
+#endif // GV_HANDLE
+        }
+
+        // Find the virtual address of the handle from the real address
+        inline uint64_t FindHandleVirAddr(void* handlePtr) {
+#ifdef GV_HANDLE
+            return _cuInfoMap.handleManager->FindIdxByRealAddr(handlePtr);
+#else
+            return (uint64_t)handlePtr;
+#endif // GV_HANDLE
+        }
+
+        inline ncclRedOp_t GetNcclRedOp(ncclRedOp_t userOp, bool reset = false) {
+#ifdef GV_HANDLE
+            ncclRedOp_t redOp = _cuInfoMap.ncclRedOpList[userOp];
+            if (reset) {
+                _cuInfoMap.ncclRedOpList[userOp] = static_cast<ncclRedOp_t>(0);
+            }
+            return redOp;
+#else
+            return userOp;
+#endif // GV_HANDLE
+        }
+
+        inline ncclRedOp_t GetNcclRedOpVirAddr(ncclRedOp_t redOp) {
+#ifdef GV_HANDLE
+            size_t redOpIndex = (
+                _cuInfoMap.ncclRedOpList.emplace_back(redOp),
+                _cuInfoMap.ncclRedOpList.size() - 1
+            );
+            return (ncclRedOp_t)redOpIndex;
+#else
+            return redOp;
+#endif // GV_HANDLE
+        }
+
+        inline void SetNcclRedOpVirAddr(ncclRedOp_t redOp, ncclRedOp_t userOp) {
+#ifdef GV_HANDLE
+            _cuInfoMap.ncclRedOpList[userOp] = redOp;
+#endif // GV_HANDLE
+        }
+
+        void Run();
+        void NewRun();
+        void UpdateStream(cudaStream_t stream, bool isCommStream = true);
+        void CommEventMonitor();
+        void StopCommEventMonitor();
+        void Backup2Memory();
+        void Backup2Storage();
+        void BackupTrainTensors2Storage();
+        void Persist2File(const char* fileName, const uint8_t* data, size_t size);
+        void LoadFromStorage();
+        void LoadFromStorage(size_t iter);
+        void StopBackup();
+
+        /* CUDA Runtime Internal */
+        DECLARE_AM_CALLBACK(__cudaRegisterHandle);
+        DECLARE_AM_CALLBACK(__cudaRegisterFatBinaryHandle);
+        DECLARE_AM_CALLBACK(__cudaRegisterFunctionHandle);
+        DECLARE_AM_CALLBACK(__cudaRegisterVarHandle);
+        void __cudaUnregisterFatBinaryHandle();
+
+        /* CUDA Runtime Execution */
+        DECLARE_AM_CALLBACK(cudaLaunchKernelHandle);
+        DECLARE_AM_CALLBACK(cudaFuncGetAttributesHandle);
+
+        /* CUDA Runtime Device */
+        DECLARE_AM_CALLBACK(cudaGetDeviceHandle);
+        DECLARE_AM_CALLBACK(cudaGetDeviceCountHandle);
+        DECLARE_AM_CALLBACK(cudaGetDevicePropertiesHandle);
+        DECLARE_AM_CALLBACK(cudaSetDeviceHandle);
+        DECLARE_AM_CALLBACK(cudaSetMainDeviceHandle);
+        DECLARE_AM_CALLBACK(cudaDeviceSynchronizeHandle);
+        DECLARE_AM_CALLBACK(cudaDeviceGetAttributeHandle);
+
+        /* CUDA Runtime Memory */
+        DECLARE_AM_CALLBACK(cudaMallocHandle);
+        DECLARE_AM_CALLBACK(cudaMemGetInfoHandle);
+        DECLARE_AM_CALLBACK(cudaMemsetHandle);
+        DECLARE_AM_CALLBACK(cudaMemsetAsyncHandle);
+        DECLARE_AM_CALLBACK(cudaFreeHandle);
+        DECLARE_AM_CALLBACK(cudaMemcpyH2DHandle);
+        DECLARE_AM_CALLBACK(cudaMemcpyD2HHandle);
+        DECLARE_AM_CALLBACK(cudaMemcpyD2DHandle);
+        DECLARE_AM_CALLBACK(NewIterHandle);
+        // DECLARE_AM_CALLBACK(cudaMemcpyHandle);
+        // DECLARE_AM_CALLBACK(cudaMemcpyAsyncHandle); 
+        DECLARE_AM_CALLBACK(cudaMemcpyAsyncH2DHandle);
+        DECLARE_AM_CALLBACK(cudaMemcpyAsyncD2HHandle);
+        DECLARE_AM_CALLBACK(cudaMemcpyAsyncD2DHandle);
+        DECLARE_AM_CALLBACK(cudaMemcpyToSymbolHandle);
+
+        /* CUDA Runtime Event */
+        DECLARE_AM_CALLBACK(cudaEventCreateHandle);
+        DECLARE_AM_CALLBACK(cudaEventCreateWithFlagsHandle);
+        DECLARE_AM_CALLBACK(cudaEventRecordHandle);
+        DECLARE_AM_CALLBACK(cudaEventQueryHandle);
+        DECLARE_AM_CALLBACK(cudaEventDestroyHandle);
+        DECLARE_AM_CALLBACK(cudaEventElapsedTimeHandle);
+
+        /* CUDA Runtime Stream */
+        DECLARE_AM_CALLBACK(cudaStreamCreateHandle);
+        DECLARE_AM_CALLBACK(cudaStreamCreateWithFlagsHandle);
+        DECLARE_AM_CALLBACK(cudaStreamCreateWithPriorityHandle);
+        DECLARE_AM_CALLBACK(cudaStreamWaitEventHandle);
+        DECLARE_AM_CALLBACK(cudaStreamSynchronizeHandle);
+        DECLARE_AM_CALLBACK(cudaStreamDestroyHandle);
+        DECLARE_AM_CALLBACK(cudaStreamIsCapturingHandle);
+        DECLARE_AM_CALLBACK(cudaStreamGetCaptureInfoHandle);
+
+        /* CUDA Runtime Other */
+        DECLARE_AM_CALLBACK(cudaOccupancyMaxActiveBlocksPerMultiprocessorHandle);
+        DECLARE_AM_CALLBACK(cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlagsHandle);
+
+        /* cuBLAS */
+        DECLARE_AM_CALLBACK(cublasCreate_v2Handle);
+        DECLARE_AM_CALLBACK(cublasSgemm_v2Handle);
+        DECLARE_AM_CALLBACK(cublasSgemmStridedBatchedHandle);
+        DECLARE_AM_CALLBACK(cublasDestroy_v2Handle);
+        DECLARE_AM_CALLBACK(cublasSetStream_v2Handle);
+        DECLARE_AM_CALLBACK(cublasSetWorkspace_v2Handle);
+        DECLARE_AM_CALLBACK(cublasSetMathModeHandle);
+        DECLARE_AM_CALLBACK(cublasGetMathModeHandle);
+
+        /* cuBLASLt */
+        DECLARE_AM_CALLBACK(cublasLtCreateHandle);
+        DECLARE_AM_CALLBACK(cublasLtDestroyHandle);
+        DECLARE_AM_CALLBACK(cublasLtMatmulDescCreateHandle);
+        DECLARE_AM_CALLBACK(cublasLtMatmulDescDestroyHandle);
+        DECLARE_AM_CALLBACK(cublasLtMatmulDescSetAttributeHandle);
+        DECLARE_AM_CALLBACK(cublasLtMatrixLayoutCreateHandle);
+        DECLARE_AM_CALLBACK(cublasLtMatrixLayoutDestroyHandle);
+        DECLARE_AM_CALLBACK(cublasLtMatrixLayoutSetAttributeHandle);
+        DECLARE_AM_CALLBACK(cublasLtMatmulPreferenceCreateHandle);
+        DECLARE_AM_CALLBACK(cublasLtMatmulPreferenceDestroyHandle);
+        DECLARE_AM_CALLBACK(cublasLtMatmulPreferenceSetAttributeHandle);
+        DECLARE_AM_CALLBACK(cublasLtMatmulAlgoGetHeuristicHandle);
+        DECLARE_AM_CALLBACK(cublasLtMatmulHandle);
+
+        /* cuDNN */
+        DECLARE_AM_CALLBACK(cudnnCreateHandle);
+        DECLARE_AM_CALLBACK(cudnnDestroyHandle);
+        DECLARE_AM_CALLBACK(cudnnCreateTensorDescriptorHandle);
+        DECLARE_AM_CALLBACK(cudnnDestroyTensorDescriptorHandle);
+        DECLARE_AM_CALLBACK(cudnnGetTensorSizeInBytesHandle);
+        DECLARE_AM_CALLBACK(cudnnSetTensor4dDescriptorHandle);
+        DECLARE_AM_CALLBACK(cudnnSetTensorNdDescriptorHandle);
+        DECLARE_AM_CALLBACK(cudnnSetTensorNdDescriptorExHandle);
+        DECLARE_AM_CALLBACK(cudnnCreateTensorTransformDescriptorHandle);
+        DECLARE_AM_CALLBACK(cudnnSetTensorTransformDescriptorHandle);
+        DECLARE_AM_CALLBACK(cudnnDestroyTensorTransformDescriptorHandle);
+        DECLARE_AM_CALLBACK(cudnnInitTransformDestHandle);
+        DECLARE_AM_CALLBACK(cudnnTransformTensorExHandle);
+        DECLARE_AM_CALLBACK(cudnnTransformFilterHandle);
+        DECLARE_AM_CALLBACK(cudnnCreateFilterDescriptorHandle);
+        DECLARE_AM_CALLBACK(cudnnSetFilterNdDescriptorHandle);
+        DECLARE_AM_CALLBACK(cudnnDestroyFilterDescriptorHandle);
+        DECLARE_AM_CALLBACK(cudnnGetFilterSizeInBytesHandle);
+        DECLARE_AM_CALLBACK(cudnnGetFoldedConvBackwardDataDescriptorsHandle);
+        DECLARE_AM_CALLBACK(cudnnSetStreamHandle);
+        DECLARE_AM_CALLBACK(cudnnBatchNormalizationBackwardExHandle);
+        DECLARE_AM_CALLBACK(cudnnBatchNormalizationForwardTrainingExHandle);
+        DECLARE_AM_CALLBACK(cudnnBatchNormalizationForwardInferenceHandle);
+        DECLARE_AM_CALLBACK(cudnnBackendCreateDescriptorHandle);
+        DECLARE_AM_CALLBACK(cudnnBackendDestroyDescriptorHandle);
+        DECLARE_AM_CALLBACK(cudnnBackendSetAttributeHandle);
+        DECLARE_AM_CALLBACK(cudnnBackendGetAttributeHandle);
+        DECLARE_AM_CALLBACK(cudnnBackendExecuteHandle);
+        DECLARE_AM_CALLBACK(cudnnBackendFinalizeHandle);
+        DECLARE_AM_CALLBACK(cudnnGetBatchNormalizationBackwardExWorkspaceSizeHandle);
+        DECLARE_AM_CALLBACK(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSizeHandle);
+        DECLARE_AM_CALLBACK(cudnnGetBatchNormalizationTrainingExReserveSpaceSizeHandle);
+        DECLARE_AM_CALLBACK(cudnnCreateConvolutionDescriptorHandle);
+        DECLARE_AM_CALLBACK(cudnnDestroyConvolutionDescriptorHandle);
+        DECLARE_AM_CALLBACK(cudnnSetConvolutionGroupCountHandle);
+        DECLARE_AM_CALLBACK(cudnnSetConvolutionMathTypeHandle);
+        DECLARE_AM_CALLBACK(cudnnSetConvolutionNdDescriptorHandle);
+        DECLARE_AM_CALLBACK(cudnnSetConvolutionReorderTypeHandle);
+        DECLARE_AM_CALLBACK(cudnnGetConvolutionForwardAlgorithm_v7Handle);
+        DECLARE_AM_CALLBACK(cudnnGetConvolutionBackwardFilterAlgorithm_v7Handle);
+        DECLARE_AM_CALLBACK(cudnnGetConvolutionBackwardDataAlgorithm_v7Handle);
+        DECLARE_AM_CALLBACK(cudnnGetConvolutionForwardWorkspaceSizeHandle);
+        DECLARE_AM_CALLBACK(cudnnConvolutionForwardHandle);
+        DECLARE_AM_CALLBACK(cudnnGetConvolutionBackwardDataWorkspaceSizeHandle);
+        DECLARE_AM_CALLBACK(cudnnConvolutionBackwardFilterHandle);
+        DECLARE_AM_CALLBACK(cudnnGetConvolutionBackwardFilterWorkspaceSizeHandle);
+        DECLARE_AM_CALLBACK(cudnnConvolutionBackwardDataHandle);
+
+        /* nccl */
+        DECLARE_AM_CALLBACK(ncclGroupStartHandle);
+        DECLARE_AM_CALLBACK(ncclGroupEndHandle);
+        DECLARE_AM_CALLBACK(ncclCommInitRankHandle);
+        DECLARE_AM_CALLBACK(ncclCommDestroyHandle);
+        DECLARE_AM_CALLBACK(ncclCommGetAsyncErrorHandle);
+        DECLARE_AM_CALLBACK(ncclGetUniqueIdHandle);
+        DECLARE_AM_CALLBACK(ncclGetVersionHandle);
+        DECLARE_AM_CALLBACK(ncclAllReduceHandle);
+        DECLARE_AM_CALLBACK(ncclReduceHandle);
+        DECLARE_AM_CALLBACK(ncclReduceScatterHandle);
+        DECLARE_AM_CALLBACK(ncclAllGatherHandle);
+        DECLARE_AM_CALLBACK(ncclBroadcastHandle);
+        DECLARE_AM_CALLBACK(ncclSendHandle);
+        DECLARE_AM_CALLBACK(ncclRecvHandle);
+        DECLARE_AM_CALLBACK(ncclCommCountHandle);
+        DECLARE_AM_CALLBACK(ncclCommUserRankHandle);
+        DECLARE_AM_CALLBACK(ncclCommCuDeviceHandle);
+        DECLARE_AM_CALLBACK(ncclCommAbortHandle);
+        DECLARE_AM_CALLBACK(ncclCommInitAllHandle);
+        DECLARE_AM_CALLBACK(ncclCommInitRankConfigHandle);
+        DECLARE_AM_CALLBACK(ncclCommSplitHandle);
+        DECLARE_AM_CALLBACK(ncclCommFinalizeHandle);
+        DECLARE_AM_CALLBACK(ncclCommRegisterHandle);
+        DECLARE_AM_CALLBACK(ncclCommDeregisterHandle);
+        DECLARE_AM_CALLBACK(ncclMemAllocHandle);
+        DECLARE_AM_CALLBACK(ncclMemFreeHandle);
+        DECLARE_AM_CALLBACK(ncclRedOpCreatePreMulSumHandle);
+        DECLARE_AM_CALLBACK(ncclRedOpDestroyHandle);
+};
+
+#endif
\ No newline at end of file
diff --git a/GPU-Virtual-Service/gpu-remoting/src/client/CMakeLists.txt(clientApp) b/GPU-Virtual-Service/gpu-remoting/src/client/CMakeLists.txt(clientApp)
new file mode 100644
index 0000000..85a0838
--- /dev/null
+++ b/GPU-Virtual-Service/gpu-remoting/src/client/CMakeLists.txt(clientApp)
@@ -0,0 +1,25 @@
+set(CUDA_ROOT_PATH "/usr/local/cuda")
+
+if(NOT CUDA_ROOT_PATH)
+    message(FATAL_ERROR "Please set the CUDA_PATH environment variable to the path of your desired CUDA installation.")
+endif()
+
+list(APPEND CMAKE_PREFIX_PATH ${CUDA_ROOT_PATH})
+find_package(CUDA REQUIRED)
+list(APPEND CUDA_NVCC_FLAGS "-lcudart")
+
+set(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
+
+aux_source_directory(. CLIENT_SRC)
+cuda_add_library(ClientCore ${CLIENT_SRC})
+target_link_libraries(ClientCore 
+    CommCore 
+    ${CUDA_LIBRARIES}
+    ${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libcuda.so
+    ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcufile.so
+)
+
+target_include_directories(ClientCore PUBLIC ${CUDA_INCLUDE_DIRS})
+
+add_executable(Client clientApp.cc)
+target_link_libraries(Client ${CLIENT_FINAL_OBJ})
\ No newline at end of file
diff --git a/GPU-Virtual-Service/gpu-remoting/src/client/clientApp.cc.bak b/GPU-Virtual-Service/gpu-remoting/src/client/clientApp.cc.bak
new file mode 100644
index 0000000..69d2acc
--- /dev/null
+++ b/GPU-Virtual-Service/gpu-remoting/src/client/clientApp.cc.bak
@@ -0,0 +1,278 @@
+#include <iostream>
+#include <cuda_runtime.h>
+#include "../../include/configure.h"
+#include "../../include/ucpConnection.h"
+#include "../../include/clientEndpoint.h"
+using namespace std;
+
+Configure config_("config.json");
+const char* myName = "ClientApp";
+ClientEndpoint* clientEp;
+UCPConnection* connectionObj;
+
+void cleanup() {
+    delete clientEp;
+    delete connectionObj;
+}
+
+int main(int argc, char* argv[]) {
+    // connectionObj = new UCPConnection(config_.GetServerIp().c_str(), config_.GetServerPort());
+    // clientEp = new ClientEndpoint(connectionObj);
+    // //clientEp->Run();
+    // tool::Logging(myName, "start the client endpoint.\n");
+
+    // int reqNum = 2;
+    // int amID = 0;
+    // ucp_dt_iov_t *iov = (ucp_dt_iov_t *)alloca(reqNum * sizeof(ucp_dt_iov_t));
+    // size_t cuda_size = 4 * 1024 * 1024;
+
+    // size_t* header = (size_t*)malloc(sizeof(size_t) * reqNum);
+
+    // /* batch1: cudaMalloc + cudaSetDevice */
+    // // req1
+    // RequestBuffer reqBuf = RequestBuffer(sizeof(int) + sizeof(size_t));
+    // reqBuf.Push<int>(CUDA_MALLOC);
+    // reqBuf.Push<size_t>(cuda_size);
+    // iov[0].length = reqBuf.GetSize();
+    // iov[0].buffer = reqBuf._dataBuffer;
+    // header[0] = iov[0].length;
+    // tool::Logging(myName, "iov[0].length = %zu\n", iov[0].length);
+
+    // //req2
+    // RequestBuffer reqBuf2 = RequestBuffer(sizeof(int) + sizeof(int));
+    // reqBuf2.Push<int>(CUDA_SET_DEVICE);
+    // reqBuf2.Push<int>(0);
+    // // tool::Logging(myName, "reqBuf2.GetSize() = %zu\n", reqBuf2.GetSize());
+    // // char* msg = (char*)malloc(8000);
+    // // reqBuf2.PushCString(msg);
+    // // tool::Logging(myName, "reqBuf2.GetSize() = %zu\n", reqBuf2.GetSize());
+    // iov[1].length = reqBuf2.GetSize();
+    // iov[1].buffer = reqBuf2._dataBuffer;
+    // header[1] = iov[1].length;
+    // tool::Logging(myName, "iov[1].length = %zu\n", iov[1].length);
+
+    // SendData(iov, reqNum, header, sizeof(size_t) * reqNum, amID, &clientEp->_dataWorker, &clientEp->_clientEp);
+
+    // ucp_dt_iov_t *recv_iov;
+    // uint64_t devPtr = 0;
+    // PrepareSingleIOV(&recv_iov, &devPtr, sizeof(uint64_t));
+    // size_t tmp = 0;
+
+    // ReceiveData(recv_iov, &tmp, &devPtr, 0, &clientEp->_dataWorker, UCS_MEMORY_TYPE_HOST, NULL);
+    // tool::Logging(myName, "devPtr = %p(%llu), size = %zu\n", devPtr, devPtr, tmp);
+
+
+    // // /* batch2: cudaFree + cudaSetDevice */
+    // // // req1
+    // // RequestBuffer reqBuf3 = RequestBuffer(sizeof(int) + sizeof(uint64_t));
+    // // reqBuf3.Push<int>(CUDA_FREE);
+    // // reqBuf3.Push64BitPointer(devPtr);
+    // // iov[0].length = reqBuf3.GetSize();
+    // // iov[0].buffer = reqBuf3._dataBuffer;
+    // // header[0] = iov[0].length;
+    // // tool::Logging(myName, "iov[0].length = %zu\n", iov[0].length); 
+
+    // // //req2
+    // // RequestBuffer reqBuf4 = RequestBuffer(sizeof(int) + sizeof(int));
+    // // reqBuf4.Push<int>(CUDA_SET_DEVICE);
+    // // reqBuf4.Push<int>(0);
+    // // iov[1].length = reqBuf4.GetSize();
+    // // iov[1].buffer = reqBuf4._dataBuffer;
+    // // header[1] = iov[1].length;
+    // // tool::Logging(myName, "iov[1].length = %zu\n", iov[1].length);
+    // // SendData(iov, 2, header, sizeof(size_t) * 2, amID, &clientEp->_dataWorker, &clientEp->_clientEp);
+
+
+    // /* batch3: cudaSetDevice + cudaMemset  */
+    // // req1
+    // RequestBuffer reqBuf5 = RequestBuffer(sizeof(int) + sizeof(int));
+    // reqBuf5.Push<int>(CUDA_SET_DEVICE);
+    // reqBuf5.Push<int>(0);
+    // iov[0].length = reqBuf5.GetSize();
+    // iov[0].buffer = reqBuf5._dataBuffer;
+    // header[0] = iov[0].length;
+    // tool::Logging(myName, "iov[0].length = %zu\n", iov[0].length); 
+
+    // //req2
+    // RequestBuffer reqBuf6 = RequestBuffer(sizeof(int) + sizeof(uint64_t) + sizeof(int) + sizeof(size_t) + 3 * 1024 * 1024);
+    // reqBuf6.Push<int>(CUDA_MEMSET);
+    // reqBuf6.Push64BitPointer(devPtr);
+    // reqBuf6.Push<int>(1);
+    // reqBuf6.Push<size_t>(cuda_size);
+    // iov[1].length = reqBuf6.GetSize();
+    // iov[1].buffer = reqBuf6._dataBuffer;
+    // header[1] = iov[1].length;
+    // tool::Logging(myName, "iov[1].length = %zu\n", iov[1].length); 
+    // SendData(iov, 2, header, sizeof(size_t) * 2, amID, &clientEp->_dataWorker, &clientEp->_clientEp);
+
+    // /* batch4: cudaMemcpy(host -> remote dev) */
+    // //req1
+    // RequestBuffer reqBuf7 = RequestBuffer(sizeof(int) + sizeof(cudaMemcpyKind) + sizeof(size_t) + sizeof(uint64_t));
+    // reqBuf7.Push<int>(CUDA_MEMCPY);
+    // reqBuf7.Push(cudaMemcpyHostToDevice);
+    // reqBuf7.Push<size_t>(cuda_size);
+    // reqBuf7.Push64BitPointer(devPtr);
+    // iov[0].length = reqBuf7.GetSize();
+    // iov[0].buffer = reqBuf7._dataBuffer;
+    // header[0] = iov[0].length;
+    // tool::Logging(myName, "cudaMemcpyHostToDevice, iov[0].length = %zu\n", iov[0].length); 
+    // SendData(iov, 1, header, sizeof(size_t) * 1, amID, &clientEp->_dataWorker, &clientEp->_clientEp);
+
+    // //req1data
+    // int* hostBuffer;
+    // hostBuffer = (int*)malloc(cuda_size);
+    // for (int i = 0; i < (cuda_size / sizeof(int)); i++) {
+    //     hostBuffer[i] = i;
+    // }
+    // void* devBuffer;
+    // cudaMalloc(&devBuffer, cuda_size);
+    // cudaMemcpy(devBuffer, hostBuffer, cuda_size, cudaMemcpyHostToDevice);
+    
+    // void* dataBuffer = hostBuffer;
+
+    // iov[0].length = cuda_size;
+    // iov[0].buffer = dataBuffer;
+    // header[0] = iov[0].length;
+    // tool::Logging(myName, "iov[0].length = %zu\n", iov[0].length);
+    // SendData(iov, 1, header, sizeof(size_t) * 0, amID, &clientEp->_dataWorker, &clientEp->_clientEp);
+
+    // /* batch5: cudaMemcpy(remote dev -> host) */
+    // //req1
+    // RequestBuffer reqBuf8 = RequestBuffer(sizeof(int) + sizeof(cudaMemcpyKind) + sizeof(size_t) + sizeof(uint64_t));
+    // reqBuf8.Push<int>(CUDA_MEMCPY);
+    // reqBuf8.Push(cudaMemcpyDeviceToHost);
+    // reqBuf8.Push<size_t>(cuda_size);
+    // reqBuf8.Push64BitPointer(devPtr);
+    // iov[0].length = reqBuf8.GetSize();
+    // iov[0].buffer = reqBuf8._dataBuffer;
+    // header[0] = iov[0].length;
+    // tool::Logging(myName, "cudaMemcpyDeviceToHost, iov[0].length = %zu\n", iov[0].length); 
+    // SendData(iov, 1, header, sizeof(size_t) * 1, amID, &clientEp->_dataWorker, &clientEp->_clientEp);
+
+    // int* hostBuffer2;
+    // hostBuffer2 = (int*)malloc(cuda_size);
+    // memset(hostBuffer2, 0, cuda_size);
+    // ucp_dt_iov_t *recv_iov2;
+    // size_t iovNum = 0;
+
+    // PrepareSingleIOV(&recv_iov2, hostBuffer2, cuda_size);
+    // ReceiveData(recv_iov2, &iovNum, hostBuffer2, 0, &clientEp->_dataWorker, UCS_MEMORY_TYPE_HOST, NULL);
+    // printf("iovNum = %zu, recv data: ", iovNum);
+    // for (int i = 0; i < 20; i++) {
+    //     printf("%02x ", ((uint8_t*)hostBuffer2)[i]);
+    // }
+    // printf("\n");
+
+
+    
+    // hyf: STREAM TEST
+    connectionObj = new UCPConnection(config_.GetServerIp().c_str(), config_.GetServerPort());
+    clientEp = new ClientEndpoint(connectionObj);
+    tool::Logging(myName, "start the client endpoint.\n");
+    int amID = 0;
+    int reqNum = 1;
+    ucp_dt_iov_t *iov = (ucp_dt_iov_t *)alloca(reqNum * sizeof(ucp_dt_iov_t));
+
+    size_t cuda_size = 4 * 1024 * 1024;
+    size_t* header = (size_t*)malloc(sizeof(size_t) * reqNum);
+
+    // req1: cudaStreamCreateWithFlags
+    unsigned int flags = 1; // 0阻塞流，1非阻塞流
+    RequestBuffer reqBuf_1 = RequestBuffer(sizeof(uint));
+    reqBuf_1.Push<int>(CUDA_STREAM_CREATE_WITH_FLAGS);
+    reqBuf_1.Push<uint>(flags);
+    iov[0].length = reqBuf_1.GetSize();
+    iov[0].buffer = reqBuf_1._dataBuffer;
+    header[0] = iov[0].length;
+    tool::Logging(myName, "iov[0].length = %zu\n", iov[0].length);
+
+    SendData(iov, reqNum, header, sizeof(size_t) * reqNum, amID, &clientEp->_dataWorker, &clientEp->_clientEp);
+    
+    ucp_dt_iov_t *recv_iov;
+    cudaStream_t stream = NULL;
+    PrepareSingleIOV(&recv_iov, &stream, sizeof(cudaStream_t));
+    size_t tmp = 0;
+    // recv pStream
+    ReceiveData(recv_iov, &tmp, &stream, 0, &clientEp->_dataWorker, UCS_MEMORY_TYPE_HOST, NULL);
+    tool::Logging(myName, "pStream = %p, size = %zu\n", stream, tmp);
+
+    // req2: cudaStreamSynchronize
+    RequestBuffer reqBuf_2 = RequestBuffer(sizeof(cudaStream_t));
+    reqBuf_2.Push<int>(CUDA_STREAM_SYNCHRONIZE);
+    reqBuf_2.Push64BitPointer(stream);    // pStreamcudaStreamGetCaptureInfoHandle
+    iov[0].length = reqBuf_2.GetSize();
+    iov[0].buffer = reqBuf_2._dataBuffer;
+    header[0] = iov[0].length;
+
+    tool::Logging(myName, "iov[0].length = %zu\n", iov[0].length);
+    SendData(iov, reqNum, header, sizeof(size_t) * reqNum, amID, &clientEp->_dataWorker, &clientEp->_clientEp);
+    
+    // req3: cudaStreamIsCapturing
+    enum cudaStreamCaptureStatus pCaptureStatus;
+    RequestBuffer reqBuf_3 = RequestBuffer(sizeof(cudaStream_t) + sizeof(enum cudaStreamCaptureStatus *));
+    reqBuf_3.Push<int>(CUDA_STREAM_IS_CAPTURING);
+    reqBuf_3.Push64BitPointer(stream);
+    iov[0].length = reqBuf_3.GetSize();
+    iov[0].buffer = reqBuf_3._dataBuffer;
+    tool::Logging(myName, "iov[1].length = %zu\n", iov[0].length);
+    SendData(iov, reqNum, header, sizeof(size_t) * reqNum, amID, &clientEp->_dataWorker, &clientEp->_clientEp);
+    ucp_dt_iov_t *recv_iov3;
+    PrepareSingleIOV(&recv_iov3, &pCaptureStatus, sizeof(enum cudaStreamCaptureStatus *));
+    size_t tmp3 = 0;
+    // recv pCaptureStatus
+    ReceiveData(recv_iov3, &tmp3, &pCaptureStatus, 0, &clientEp->_dataWorker, UCS_MEMORY_TYPE_HOST, NULL);
+    tool::Logging(myName, "pCaptureStatus = %d, size = %zu\n", pCaptureStatus, tmp3);
+
+ 
+    // req4: cudaStreamGetCaptureInfo
+    enum cudaStreamCaptureStatus pCaptureStatusofStream;
+    unsigned long long pId;
+    RequestBuffer reqBuf_4 = RequestBuffer(sizeof(cudaStream_t) + sizeof(enum cudaStreamCaptureStatus *) + sizeof(unsigned long long *));
+    reqBuf_4.Push<int>(CUDA_STREAM_GET_CAPTURE_INFO);
+    reqBuf_4.Push64BitPointer(stream);
+
+    iov[0].length = reqBuf_4.GetSize();
+    iov[0].buffer = reqBuf_4._dataBuffer;
+
+    tool::Logging(myName, "iov[2].length = %zu\n", iov[0].length);
+    SendData(iov, reqNum, header, sizeof(size_t) * reqNum, amID, &clientEp->_dataWorker, &clientEp->_clientEp);
+    // ucp_dt_iov_t *recv_iov4;
+    // // recv pCaptureStatus
+    // PrepareSingleIOV(&recv_iov4, &pCaptureStatusofStream, sizeof(enum cudaStreamCaptureStatus*));
+    // size_t tmp4 = 0;
+    // ReceiveData(recv_iov4, &tmp4, &pCaptureStatusofStream, 0, &clientEp->_dataWorker, UCS_MEMORY_TYPE_HOST, NULL);
+    // tool::Logging(myName, "pCaptureStatusofStream = %d, size = %zu\n", pCaptureStatusofStream, tmp4);
+    // recv pId
+
+    reqBuf_4.Reset(sizeof(enum cudaStreamCaptureStatus*)+sizeof(unsigned long long *), reqBuf_4._dataBuffer);
+
+    ucp_dt_iov_t *recv_iov5;
+    PrepareSingleIOV(&recv_iov5, reqBuf_4._dataBuffer, sizeof(enum cudaStreamCaptureStatus*)+sizeof(unsigned long long *));
+    size_t tmp5 = 0;
+    ReceiveData(recv_iov5, &tmp5, reqBuf_4._dataBuffer, 0, &clientEp->_dataWorker, UCS_MEMORY_TYPE_HOST, NULL);
+    pCaptureStatus = reqBuf_4.Pop<enum cudaStreamCaptureStatus>();
+    pId = reqBuf_4.Pop<unsigned long long>();
+    printf("cudaStreamGetCaptureInfo success, status = %d, pId = %lld\n", pCaptureStatus, pId);
+    
+
+
+    // req5: cudaStreamDestroy
+    RequestBuffer reqBuf_5 = RequestBuffer(sizeof(cudaStream_t));
+    reqBuf_5.Push<int>(CUDA_STREAM_DESTROY);
+    reqBuf_5.Push64BitPointer(stream);    
+    iov[0].length = reqBuf_5.GetSize();
+    iov[0].buffer = reqBuf_5._dataBuffer;
+    tool::Logging(myName, "iov[0].length = %zu\n", iov[0].length);
+    SendData(iov, reqNum, header, sizeof(size_t) * reqNum, amID, &clientEp->_dataWorker, &clientEp->_clientEp);
+
+    // end 
+
+    //free(iov);
+
+    //sleep(5);
+
+    cleanup();
+    
+    return 0;
+}
+
diff --git a/GPU-Virtual-Service/gpu-remoting/src/client/clientCkpt.cc b/GPU-Virtual-Service/gpu-remoting/src/client/clientCkpt.cc
new file mode 100644
index 0000000..381b1f4
--- /dev/null
+++ b/GPU-Virtual-Service/gpu-remoting/src/client/clientCkpt.cc
@@ -0,0 +1,275 @@
+#include "../../include/clientEndpoint.h"
+#include <nccl.h>
+
+void ClientEndpoint::Checkpointing() {
+    // std::ofstream outFile("api_records.log", std::ios::out); 
+    // for (auto it = reqIOVList.begin(); it != reqIOVList.end(); ++it) {
+    //     RequestIOV* tmpReqIOV = &(*it);
+    //     outFile << tmpReqIOV->GetRequestType() << std::endl;
+    // }
+    // outFile.close();
+
+    auto start = std::chrono::high_resolution_clock::now();
+    ShrinkReqIOVList();
+    auto end = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> duration = end - start;
+
+    tool::Logging(LOG_INFO, myName_, "reqIOVList has %zu requests after shrinking(using %f seconds)\n", reqIOVList.size(), duration.count());
+    isReConnected = false; // reset reconnection flag
+
+    // std::ofstream outFile2("api_records_shrink.log", std::ios::out); 
+    // for (auto it = reqIOVList.begin(); it != reqIOVList.end(); ++it) {
+    //     RequestIOV* tmpReqIOV = &(*it);
+    //     outFile2 << tmpReqIOV->GetRequestType();
+    //     if (tmpReqIOV->GetRequestType() == CUDA_EVENT_RECORD) {
+    //         outFile2 << " " << tmpReqIOV->GetHandleByIndex(0) << ", " << tmpReqIOV->GetHandleByIndex(1);
+    //     }
+    //     outFile2 << std::endl;
+    // }
+    // outFile2.close();
+
+    // debug:
+    // if (ckptCnt >= 2) {
+    //     exit(EXIT_FAILURE);
+    // }
+    // else {
+    //     ckptCnt ++;
+    // }
+
+    auto it = reqIOVList.end();
+    --it;
+    recordedReq = &(*it);
+
+    // ckptIter = 0;
+}
+
+void ClientEndpoint::Replay() {
+    auto start_replay = std::chrono::high_resolution_clock::now();
+    tool::Logging(LOG_INFO, myName_, "start replaying %zu requests from registerIOVList\n", regIOVList.size());
+    SendRegisterRequest(this, true);
+
+    // std::ofstream outFile2("api_records_replay.log", std::ios::out);     
+    tool::Logging(LOG_INFO, myName_, "start replaying %zu requests from requestIOVList\n", reqIOVList.size());
+    for (auto it = reqIOVList.begin(); it != reqIOVList.end(); ++it) {
+        RequestIOV* tmpReqIOV = &(*it);
+        // outFile2 << tmpReqIOV->GetRequestType() << std::endl;
+
+        if (tmpReqIOV->GetRequestType() == NCCL_GET_UNIQUE_ID) {
+            ncclUniqueId uniqueId;
+            RequestIOV resBuf = RequestIOV();
+            resBuf.Push(uniqueId);
+            SendRequestRecvResponse(tmpReqIOV, &resBuf, true, false);
+#ifdef GV_GPUMAP
+            gpuIdMap->UpdateUniqueID((uint8_t*)&uniqueId, sizeof(ncclUniqueId));
+            tool::Logging(LOG_INFO, myName_, "[pid:%d, tid:%d] updated uniqueId\n", _processID, _threadID);
+#else
+            tool::Logging(LOG_ERROR, myName_, "Replaying for ncclGetUniqueId is not supported in non-GV_GPUMAP mode\n");
+            exit(EXIT_FAILURE);
+#endif // GV_GPUMAP
+            continue;
+        } 
+        else if (tmpReqIOV->GetRequestType() == NCCL_COMM_INIT_RANK
+              || tmpReqIOV->GetRequestType() == NCCL_COMM_INIT_RANK_CONFIG) {
+            ncclUniqueId uniqueId;
+#ifdef GV_GPUMAP
+            gpuIdMap->RequestUniqueID((uint8_t*)&uniqueId, sizeof(ncclUniqueId));
+            tmpReqIOV->SetElement(2, &uniqueId, sizeof(ncclUniqueId));
+            SendRequest(tmpReqIOV, true, false);
+            tool::Logging(LOG_INFO, myName_, "[pid:%d, tid:%d] requested uniqueId\n", _processID, _threadID);
+#else
+            tool::Logging(LOG_ERROR, myName_, "Replaying for ncclCommInitRank is not supported in non-GV_GPUMAP mode\n");
+            exit(EXIT_FAILURE);
+#endif // GV_GPUMAP
+        }
+        else if (tmpReqIOV->GetRequestType() == CUDA_MEMCPY_ASYNC_H2D) {
+            SendRequestH2D(tmpReqIOV, (uint8_t*)tmpReqIOV->GetHeaders(), sizeof(cudaMemcpyKind)+sizeof(size_t)+sizeof(uint64_t)+sizeof(uint8_t)+sizeof(uint64_t), false, false);
+        }
+        else if (tmpReqIOV->GetRequestType() == CUDA_STREAM_SYNCHRONIZE || tmpReqIOV->GetRequestType() == NCCL_COMM_GET_ASYNC_ERROR) {
+            int result = 0;
+            RequestIOV resBuf = RequestIOV();
+            resBuf.Push(&result);
+            SendRequestRecvResponse(tmpReqIOV, &resBuf, true, false);
+        }
+        else {
+            SendRequest(tmpReqIOV, true, false);
+        }
+    }
+    // outFile2.close();
+
+    // eval for replay time
+    size_t free = 0;
+    size_t total = 0;
+    int replayFinished = 1;
+    RequestIOV reqBuf = RequestIOV();
+    reqBuf.PushRequestType(CUDA_MEM_GET_INFO);
+    reqBuf.Push(replayFinished); // dummy 
+    RequestIOV resBuf = RequestIOV();
+    resBuf.Push(&free);
+    resBuf.Push(&total);
+    SendRequestRecvResponse(&reqBuf, &resBuf);
+    const char* recovery_path = std::getenv("FLEXGV_RECOVERY_CSV_PATH");
+    if (recovery_path) {
+        auto end_replay = std::chrono::high_resolution_clock::now();
+        tool::Logging(LOG_INFO, myName_, "replay finished with %f seconds\n", std::chrono::duration<double>(end_replay - start_replay).count());
+
+        std::ofstream outFile;
+        outFile.open(recovery_path, std::ios::app);
+        if (outFile.tellp() == 0) {
+            outFile << "Time" << std::endl;
+        }
+        outFile << std::chrono::duration<double>(end_replay - start_replay).count() << std::endl;
+        outFile.close();
+    }
+
+    tool::Logging(LOG_INFO, myName_, "replay finished\n");
+}
+
+void ClientEndpoint::UpdateReqIOVList(RequestIOV* reqBuffer) {
+    // if (recordedReq == NULL) {
+    //     return; // not started yet
+    // }
+
+    int reqType = reqBuffer->GetRequestType();
+
+    if (std::binary_search(std::begin(NotNeedRecordAPIs), std::end(NotNeedRecordAPIs), reqType)) {
+        return; // no need to record these apis
+    } else {
+        RequestIOV* bakReqBuffer = reqBuffer->Clone();
+        listLock_.lock();
+        reqIOVList.push_back(*bakReqBuffer);
+        listLock_.unlock();
+    }
+}
+
+void ClientEndpoint::UpdateReqIOVList(RequestIOV* reqBuffer, uint8_t* header, size_t headerSize) {
+    size_t          memcpyCnt       = *(size_t*)(header + sizeof(cudaMemcpyKind));
+    cudaStream_t    memcpyStream    = *(cudaStream_t*)(header + sizeof(cudaMemcpyKind)+sizeof(size_t));
+    void*           memcpyDst       = *(void**)(header + sizeof(cudaMemcpyKind)+sizeof(size_t)+sizeof(uint64_t)+sizeof(uint8_t));
+    size_t          memsetValue     = 0;
+    bool            isTrainTensor   = false;
+
+    for (const auto& tensor : tensorByteList) {
+        if (tensor.size == memcpyCnt) {
+            return; // means this memcpy request is for train dataloading
+        }
+    }
+
+    // RequestIOV reqBuf = RequestIOV();
+    // reqBuf.PushRequestType(CUDA_MEMSET_ASYNC);
+    // reqBuf.Push64BitPointer(memcpyDst);
+    // reqBuf.Push(memsetValue);
+    // reqBuf.Push(memcpyCnt);
+    // reqBuf.Push64BitPointer(memcpyStream);
+    // reqBuf.PushThreadID(ttID);
+    // RequestIOV* bakReqBuffer = reqBuf.Clone();
+
+    RequestIOV* bakReqBuffer = reqBuffer->Clone(header, headerSize);
+
+    tool::Logging(LOG_DEBUG, myName_, "recorded cudaMemcpyAsync request: dst=%p, count=%zu\n", memcpyDst, memcpyCnt);
+
+    listLock_.lock();
+    reqIOVList.push_back(*bakReqBuffer);
+    listLock_.unlock();
+}
+
+void ClientEndpoint::ShrinkReqIOVList() {
+    robin_hood::unordered_flat_set<uint64_t> destroyHandles;
+    robin_hood::unordered_flat_set<std::string> setHandles;
+
+    // bool shrinkWhole = reqIOVList.size() > BACKUP_API_MAX_NUM ? true : false;
+    bool shrinkWhole = true;
+    bool exitFlag = false;
+    bool lastNewIter = true;
+
+    int tensorIdx = tensorByteList.size() - 1;
+    auto it = reqIOVList.end();
+    --it;
+    while (!exitFlag) {
+        auto eraseIt = it; // avoid invalid iterator
+        
+        RequestIOV* tmpReqIOV = &(*eraseIt);
+        int reqType = tmpReqIOV->GetRequestType();
+
+        bool needErase = false;
+        if (tmpReqIOV == recordedReq) {
+            needErase = true;
+            exitFlag = shrinkWhole == false ? true : false;
+        }
+        else if (reqType == NEW_ITERATION_REQ) {
+            needErase = lastNewIter == false ? true : false;
+            lastNewIter = false;
+        }
+        // else if (tensorIdx >= 0) { 
+        //     needErase = false; // reserve the last memcpy request for each tensor
+        //     if (reqType == CUDA_MEMCPY_ASYNC_H2D) {
+        //         uint8_t* headers = (uint8_t*)tmpReqIOV->GetHeaders();
+        //         size_t memcpyCnt = *(size_t*)(headers + sizeof(cudaMemcpyKind));
+        //         void* memcpyDst = *(void**)(headers + sizeof(cudaMemcpyKind)+sizeof(size_t)+sizeof(uint64_t)+sizeof(uint8_t));
+        //         if (memcpyCnt == tensorByteList[tensorIdx].size && memcpyDst == tensorByteList[tensorIdx].devPtr) {
+        //             tool::Logging(LOG_INFO, myName_, "reserved memcpy request for tensor#%d: dst=%p, count=%zu\n", tensorIdx, memcpyDst, memcpyCnt);
+        //             tensorIdx--;
+        //         }
+        //         else {
+        //             tool::Logging(LOG_INFO, myName_, "skipped memcpy request for tensor#%d: dst=%p, count=%zu\n", tensorIdx, memcpyDst, memcpyCnt);
+        //         }
+        //     }
+        //     else {
+        //         tool::Logging(LOG_INFO, myName_, "skipped request#%d\n", reqType);
+        //     }
+        // }
+        else if (std::binary_search(std::begin(ComputeAPIs), std::end(ComputeAPIs), reqType)
+              || reqType == CUDA_MALLOC
+              || reqType == CUDA_MEMSET || reqType == CUDA_MEMSET_ASYNC 
+              || reqType == CUDA_MEMCPY_H2D || reqType == CUDA_MEMCPY_ASYNC_H2D || reqType == CUDA_MEMCPY_ASYNC_D2D
+            ) { //todo: memory-related APIs
+            needErase = true;
+        }
+        else if (std::binary_search(std::begin(DestroyAPIs), std::end(DestroyAPIs), reqType)) {
+            if (reqType == NCCL_COMM_DEREGISTER) {
+                destroyHandles.insert(tmpReqIOV->GetHandleByIndex(1));
+            }
+            else {
+                destroyHandles.insert(tmpReqIOV->GetHandleByIndex(0));
+            }
+            needErase = true;
+        }
+        else if (reqType == CUDNN_SET_STREAM 
+              || reqType == CUBLAS_SET_STREAM_V2 || reqType == CUBLAS_SET_MATH_MODE) {
+            std::string tmpKey = std::to_string(reqType) + "_" + std::to_string(tmpReqIOV->GetHandleByIndex(0));
+            if (setHandles.contains(tmpKey)) {
+                needErase = true;
+            }
+            else {
+                setHandles.insert(tmpKey);
+            }
+        }
+        else {
+            ucp_dt_iov_t* iovs = tmpReqIOV->GetIOVs();
+            size_t iovNum = tmpReqIOV->GetNum();
+            for (size_t i = 0; i < iovNum; i++) {
+                if (iovs[i].length != sizeof(uint64_t) || *(void**)iovs[i].buffer == NULL) {
+                    continue;
+                }
+                if (destroyHandles.contains(*(uint64_t*)iovs[i].buffer)) {
+                    needErase = true;
+                    break;
+                }
+            }
+        }
+
+        if (eraseIt != reqIOVList.begin()) {
+            it--;
+        }
+        else {
+            exitFlag = true;
+        }
+
+        if (needErase) {
+            reqIOVList.erase(eraseIt);
+            delete tmpReqIOV;
+        }
+    }
+
+    destroyHandles.clear();
+    recordedReq = NULL;
+}
\ No newline at end of file
diff --git a/GPU-Virtual-Service/gpu-remoting/src/client/clientEndpoint.cc b/GPU-Virtual-Service/gpu-remoting/src/client/clientEndpoint.cc
new file mode 100644
index 0000000..fb8c69a
--- /dev/null
+++ b/GPU-Virtual-Service/gpu-remoting/src/client/clientEndpoint.cc
@@ -0,0 +1,345 @@
+#include "../../include/clientEndpoint.h"
+
+ClientEndpoint::ClientEndpoint(uint64_t clientID, size_t priority, ucp_worker_h clientWorker, int dev){
+    _clientID = clientID;
+    priority_ = priority;
+    _dataWorker = clientWorker;
+
+    clientIP_ = (char*)malloc(IP_STRING_LEN);
+    clientPort_ = (char*)malloc(PORT_STRING_LEN);
+
+    _myDevIdx = dev;
+
+    _threadID = static_cast<pid_t>(::syscall(SYS_gettid));
+    _processID = getpid();
+    Connect();
+
+    _shmOpt = shmOpt;
+
+    tool::Logging(LOG_DEBUG, myName_, "[pid:%d, tid:%d] client endpoint object is created successfully.\n", _processID, _threadID);
+}
+
+void ClientEndpoint::Connect(bool replay) {
+    uint16_t serverPort;
+#ifdef GV_GPUMAP
+    GpuInfoEntry_t* ginfo;
+
+#ifdef GV_BACKUP
+    if(replay){
+        std::cout << "replay" << std::endl;
+        gpuIdMap->ReallocGPU();
+    }
+#endif
+
+    gpuIdMap->GetGPUinfo(_myDevIdx, &ginfo);
+    const std::string serverIP(ginfo->nodeIp); 
+    serverPort = ginfo->nodePort;
+    shmOpt->WriteIpPort(ginfo->dataPort, ginfo->dataIp);
+#else
+    const std::string& serverIP = config_->GetServerIp();
+    serverPort = config_->GetServerPort();
+#endif
+
+    ucp_ep_params_t ep_params;
+    ucs_status_t    status;
+    struct sockaddr_storage serverAddr;
+    tool::SetSockAddr(serverIP.c_str(), serverPort, &serverAddr, AF_INET);
+    ep_params.field_mask                = UCP_EP_PARAM_FIELD_FLAGS       |
+                                          UCP_EP_PARAM_FIELD_SOCK_ADDR   |
+                                          UCP_EP_PARAM_FIELD_ERR_HANDLER |
+                                          UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE;                                          
+
+    ep_params.err_mode                  = UCP_ERR_HANDLING_MODE_PEER;
+    ep_params.err_handler.cb            = ClientErrorCallback;
+    ep_params.err_handler.arg           = &connStatus_;
+    ep_params.flags                     = UCP_EP_PARAMS_FLAGS_CLIENT_SERVER |
+                                          UCP_EP_PARAMS_FLAGS_SEND_CLIENT_ID;
+    ep_params.sockaddr.addr             = (const struct sockaddr*)&serverAddr;
+    ep_params.sockaddr.addrlen          = sizeof(struct sockaddr_storage);
+
+    if ((status = ucp_ep_create(_dataWorker, &ep_params, &_serverEp)) != UCS_OK) {
+        tool::Logging(LOG_ERROR, myName_, "failed to create an endpoint and connect to the server (%s)\n", ucs_status_string(status));
+        ucp_worker_destroy(_dataWorker);
+    }
+    else {
+        ucp_am_handler_param_t param;
+        param.field_mask = UCP_AM_HANDLER_PARAM_FIELD_ID |
+                           UCP_AM_HANDLER_PARAM_FIELD_CB |
+                           UCP_AM_HANDLER_PARAM_FIELD_ARG;
+        param.id         = SERVER_STATUS;
+        param.cb         = ServerStatusCallback;
+        param.arg        = this;
+        if ((status = ucp_worker_set_am_recv_handler(_dataWorker, &param)) != UCS_OK) {
+            tool::Logging(LOG_ERROR, myName_, "failed to set am handler (%s)\n", ucs_status_string(status));
+        }
+        else {
+            SendMainDevice(replay);
+        }
+    }
+}
+
+/**
+ * Close UCP endpoint.
+ */
+void ClientEndpoint::CloseEp(uint64_t flags) {
+    ucp_request_param_t param;
+    ucs_status_t status;
+    void *close_req;
+
+    param.op_attr_mask = UCP_OP_ATTR_FIELD_FLAGS;
+    param.flags        = flags;
+    close_req          = ucp_ep_close_nbx(_serverEp, &param);
+    if (UCS_PTR_IS_PTR(close_req)) {
+        do {
+            ucp_worker_progress(_dataWorker);
+            status = ucp_request_check_status(close_req);
+        } while (status == UCS_INPROGRESS);
+        ucp_request_free(close_req);
+    } else {
+        status = UCS_PTR_STATUS(close_req);
+    }
+
+    if (status != UCS_OK) {
+        fprintf(stderr, "failed to close ep %p: %s\n", (void*)_serverEp,
+                ucs_status_string(status));
+    }
+}
+
+ClientEndpoint::~ClientEndpoint() {
+    CloseEp(UCP_EP_CLOSE_MODE_FLUSH);
+    ucp_worker_destroy(_dataWorker);
+
+    // if (_shmOpt != NULL) { //todo
+    //     delete _shmOpt;
+    // } 
+    tool::Logging(LOG_INFO, myName_, "close the client endpoint object(GPU #%d).\n", _myDevIdx);
+}
+
+ucs_status_t ClientEndpoint::SendRequest(RequestIOV* reqBuffer, bool forcedEager, bool isCheckpoint) {
+    tool::Logging(LOG_COMM, "SendRequest", "ready prepare amID: %d, apiID: %d, ttID: %d\n", (ttID << RECV_AM_SHIFT_BIT) + reqBuffer->GetRequestType(), reqBuffer->GetRequestType(), ttID);
+
+    reqBuffer->PushThreadID(ttID);
+    ucs_status_t status = SendData(reqBuffer->GetIOVs(), reqBuffer->GetNum(),
+         reqBuffer->GetHeaders(), reqBuffer->GetHeaderSize(), 
+         reqBuffer->GetRequestType(), &_dataWorker, &_serverEp,
+          true, UCS_MEMORY_TYPE_HOST, forcedEager);
+#ifdef GV_BACKUP
+    if (status == UCS_ERR_CONNECTION_RESET) {
+        {
+            std::unique_lock<std::mutex> lock(reConnectMutex);
+            if (!isReConnected) { // only the first thread can reconnect
+                tool::Logging(LOG_INFO, "SendRequest", "ready to reconnect...\n");
+                sleep(10);
+                Connect(true);
+                isReConnected = true;
+            }
+        }
+
+        tool::Logging(LOG_INFO, "SendRequest", "[pid:%d, ttid:%d] ready to resend previous request (requestType: %d)\n", _processID, ttID, reqBuffer->GetRequestType());
+        ucs_status_t status = SendData(reqBuffer->GetIOVs(), reqBuffer->GetNum(),
+         reqBuffer->GetHeaders(), reqBuffer->GetHeaderSize(), 
+         reqBuffer->GetRequestType(), &_dataWorker, &_serverEp,
+          true, UCS_MEMORY_TYPE_HOST, forcedEager);
+        if (status != UCS_OK) {
+            tool::Logging(LOG_ERROR, "SendRequest", "[pid:%d, ttid:%d] failed to resend the request (requestType: %d)\n", _processID, ttID, reqBuffer->GetRequestType());
+            exit(EXIT_FAILURE);
+        }
+    }
+    // isCheckpoint = false;
+    if (isCheckpoint) {
+        UpdateReqIOVList(reqBuffer);
+    }
+#endif // GV_BACKUP
+    CheckTensors(reqBuffer->GetRequestType());
+    return status;
+}
+
+void ClientEndpoint::SendRegisterRequest(ClientEndpoint* curEp, bool forcedEager) {
+    // tool::Logging(LOG_COMM, "SendRequest", "ready prepare amID: %d, apiID: %d, ttID: %d\n", (ttID << RECV_AM_SHIFT_BIT) + reqBuffer->GetRequestType(), reqBuffer->GetRequestType(), ttID);
+
+    // reqBuffer->PushThreadID(ttID);
+    // reqBuffer->Print();
+    for (RegisterIOV* reqBuffer : regIOVList) {
+        SendData(reqBuffer->GetIOVs(), reqBuffer->GetNum(),
+                reqBuffer->GetUcpHeaders(), reqBuffer->GetUcpHeaderSize(), 
+                reqBuffer->GetRequestType(), &curEp->_dataWorker, &curEp->_serverEp,
+                true, UCS_MEMORY_TYPE_HOST, forcedEager);
+    }
+}
+
+void ClientEndpoint::SendRequestH2D(RequestIOV* reqBuffer, uint8_t* header, size_t headerSize, bool forcedEager, bool isCheckpoint) {
+    // no response required for RNDV request, so no need to push threadID
+    SendData(reqBuffer->GetIOVs(), reqBuffer->GetNum(),
+         (size_t*)header, headerSize, 
+         reqBuffer->GetRequestType(), &_dataWorker, &_serverEp,
+          false, UCS_MEMORY_TYPE_HOST, forcedEager);
+    CheckTensors(reqBuffer->GetRequestType());
+
+#ifdef GV_BACKUP
+    if (memcpyRecord_ && isCheckpoint) {
+        UpdateReqIOVList(reqBuffer, header, headerSize);
+    }
+#endif // GV_BACKUP
+}
+
+void ClientEndpoint::SendNewIterRequest(size_t iterNum) {
+    // Send a request to the server to notify the start of a new iteration
+    RequestIOV reqBuffer;
+    reqBuffer.PushRequestType(NEW_ITERATION_REQ);
+    reqBuffer.PushConst(tensorByteList.size());
+    reqBuffer.Push(tensorByteList.data(), tensorByteList.size());
+    SendRequest(&reqBuffer, true);
+
+#ifdef GV_BACKUP
+    if (iterNum + 1 == BACKUP_PERIOD) {
+        memcpyRecord_ = true; // prepare for recording the memcpy requests for the next iteration (before shrinking the list)
+    }
+    if (iterNum > 0 && iterNum % BACKUP_PERIOD == 0) {
+        Checkpointing();
+    }
+#endif // BACKUP
+
+    // cudaStreamSynchronize(NULL);
+}
+
+void ClientEndpoint::SendMainDevice(bool replay) {
+    bool loadData = replay;
+    int gpuIdInNode = _myDevIdx;
+#ifdef GV_GPUMAP
+    gpuIdMap->GetGPUId(_myDevIdx, &gpuIdInNode);
+#endif // GV_GPUMAP
+
+    RequestIOV reqBuffer;
+    reqBuffer.PushRequestType(CUDA_SET_MAIN_DEVICE);
+    reqBuffer.Push(gpuIdInNode);
+    reqBuffer.Push(_processID);
+    reqBuffer.Push(priority_);
+    reqBuffer.Push(loadData);
+    reqBuffer.PushThreadID(ttID);
+    SendData(reqBuffer.GetIOVs(), reqBuffer.GetNum(),
+         reqBuffer.GetHeaders(), reqBuffer.GetHeaderSize(), 
+         reqBuffer.GetRequestType(), &_dataWorker, &_serverEp,
+          false, UCS_MEMORY_TYPE_HOST, true);
+    
+
+    ucp_ep_attr_t ep_attr;
+    ep_attr.field_mask = UCP_EP_ATTR_FIELD_LOCAL_SOCKADDR |
+                            UCP_EP_ATTR_FIELD_REMOTE_SOCKADDR;
+    ucs_status_t status = ucp_ep_query(_serverEp, &ep_attr);
+    if (status != UCS_OK) {
+        tool::Logging(LOG_ERROR, myName_, "failed to query the endpoint: %s\n", ucs_status_string(status));
+        return;
+    }
+    else {
+        tool::GetIpStrFromSockaddr(&ep_attr.local_sockaddr, clientIP_, IP_STRING_LEN);
+        tool::GetPortStrFromSockaddr(&ep_attr.local_sockaddr, clientPort_, PORT_STRING_LEN);
+        tool::Logging(LOG_INFO, myName_, "[pid:%d, tid:%d] client(%s:%s) has connected to server and will use GPU #%d(IDinNode:#%d)\n", _processID, _threadID, clientIP_, clientPort_, _myDevIdx, gpuIdInNode);
+    }
+    connStatus_.isClosed = false;
+
+    if (replay) {
+        Replay();
+    }
+}
+
+ucs_status_t ClientEndpoint::SendRequestRecvResponse(RequestIOV* reqBuffer, RequestIOV* responseBuffer, bool forcedEager, bool isCheckpoint) {
+    const char* myName = "SendRequestRecvResponse";
+    ucs_status_t status = UCS_OK;
+    // isCheckpoint = false;
+
+    // prepare for receiving response first (avoid receiving message before setting the handler)
+    ucp_dt_iov_t *iov   = responseBuffer->GetIOVs();
+    size_t       iovNum = responseBuffer->GetNum();
+    NewActiveMessageDesc_t am_request_ctx = { .complete = 0, .is_rndv = 0, .mem_type = UCS_MEMORY_TYPE_HOST, .desc = NULL, 
+                                           .iov = iov, .iov_num = iovNum};
+                                        //    .iov = iov, .iov_num = iovNum, .send_amID = ((ttID << RECV_AM_SHIFT_BIT) + reqBuffer->GetRequestType())};
+    Request_t recv_request_ctx = {.type = 1, .complete = 0};
+    ucp_am_handler_param_t param1;
+    param1.field_mask   = UCP_AM_HANDLER_PARAM_FIELD_ID |
+                          UCP_AM_HANDLER_PARAM_FIELD_CB |
+                          UCP_AM_HANDLER_PARAM_FIELD_ARG;
+    param1.id           = (ttID << RECV_AM_SHIFT_BIT) + reqBuffer->GetRequestType(); 
+    param1.cb           = RetrieveData;
+    param1.arg          = &am_request_ctx;
+    tool::Logging(LOG_COMM, myName, "amID: %d, requestType: %d, ttID: %d\n", param1.id, reqBuffer->GetRequestType(), ttID);
+    status = ucp_worker_set_am_recv_handler(_dataWorker, &param1);
+    if (status != UCS_OK) {
+        tool::Logging(LOG_ERROR, myName, "failed to set am handler: %s\n", ucs_status_string(status));
+        return status;
+    }
+    else {
+        tool::Logging(LOG_COMM, myName, "set am handler successfully for param1.id: %d\n", param1.id);
+    }
+
+    // send request and wait for completion
+    SendRequest(reqBuffer, forcedEager, false);
+
+    // wait for response
+    tool::Logging(LOG_COMM, myName, "waiting for server to send response.\n");
+    while (!am_request_ctx.complete && !connStatus_.isClosed) { // waiting ActiveMessageRecvCallback() to be invoked
+        ucp_worker_progress(_dataWorker);
+    }
+    if (connStatus_.isClosed) {
+        tool::Logging(LOG_DEBUG, myName, "connection is closed, try to resend the request.\n");
+        
+        // re-send request and wait for completion
+        SendRequest(reqBuffer, forcedEager, false);
+        ucp_worker_set_am_recv_handler(_dataWorker, &param1); // make sure the handler is set
+        tool::Logging(LOG_COMM, myName, "waiting for server to send response again.\n");
+        while (!am_request_ctx.complete && !connStatus_.isClosed) { // waiting ActiveMessageRecvCallback() to be invoked
+            ucp_worker_progress(_dataWorker);
+        }
+        if (connStatus_.isClosed) {
+            tool::Logging(LOG_ERROR, myName, "connection is still closed, failed to receive response.\n");
+            return UCS_ERR_CONNECTION_RESET;
+        }
+        tool::Logging(LOG_COMM, myName, "response has arrived after re-sending the request.\n");
+    }
+
+    if (!am_request_ctx.is_rndv) {
+        tool::Logging(LOG_COMM, myName, "Eager response has arrived.\n");
+    }
+    else {
+        tool::Logging(LOG_COMM, myName, "Rendezvous response has arrived.\n");
+        recv_request_ctx.complete = 0;
+        ucp_request_param_t param2;
+        param2.op_attr_mask     = UCP_OP_ATTR_FIELD_CALLBACK |
+                                  UCP_OP_ATTR_FIELD_DATATYPE |
+                                  UCP_OP_ATTR_FIELD_USER_DATA|
+                                  UCP_OP_ATTR_FIELD_MEMORY_TYPE;
+        param2.op_attr_mask    |= UCP_OP_ATTR_FLAG_NO_IMM_CMPL;
+        param2.datatype         = ucp_dt_make_contig(1);
+        param2.user_data        = &recv_request_ctx;
+        param2.cb.recv_am       = (ucp_am_recv_data_nbx_callback_t)RecvCallBack;
+        param2.memory_type      = UCS_MEMORY_TYPE_HOST;
+        Request_t* rndv_request = (Request_t*)ucp_am_recv_data_nbx(_dataWorker,
+                                              am_request_ctx.desc,
+                                              iov[0].buffer, iov[0].length, 
+                                             &param2);
+        status = Wait(rndv_request, &recv_request_ctx, &_dataWorker); 
+        if (status != UCS_OK) {
+            tool::Logging(LOG_ERROR, myName, "ucp_am_recv_data_nbx failed: %s\n", ucs_status_string(status));
+        }
+        else {
+            tool::Logging(LOG_COMM, myName, "ucp_am_recv_data_nbx completed successfully.\n");
+        }
+        ucp_request_free(rndv_request);
+    }
+
+#ifdef GV_BACKUP
+    if (isCheckpoint) {
+        int reqType = reqBuffer->GetRequestType();
+        if (std::binary_search(std::begin(CreateAPIs), std::end(CreateAPIs), reqType)) {
+            UpdateReqIOVList(reqBuffer);
+        }
+        else if (reqType == CUDA_STREAM_SYNCHRONIZE) {
+            UpdateReqIOVList(reqBuffer);
+        }
+        else if (reqType == CUDA_MALLOC) {
+            UpdateReqIOVList(reqBuffer);
+        }
+    }
+#endif // GV_BACKUP
+
+    return status;
+}
\ No newline at end of file
diff --git a/GPU-Virtual-Service/gpu-remoting/src/client/clientHook.cc b/GPU-Virtual-Service/gpu-remoting/src/client/clientHook.cc
new file mode 100644
index 0000000..81273e8
--- /dev/null
+++ b/GPU-Virtual-Service/gpu-remoting/src/client/clientHook.cc
@@ -0,0 +1,224 @@
+#include "../../include/hook/hook.h"
+
+
+int mainDevIdx; // different ranks(processes) may use different devices
+std::vector<ClientEndpoint*> clientEpList;
+std::vector<bool> threadValidList;
+
+Configure* config_;
+UCPConnection* connectionObj;
+SharedMemoryOpt* shmOpt;
+GPUidMap* gpuIdMap;
+std::once_flag initFlag;
+std::once_flag registerFlag;
+std::vector<RegisterIOV*> regIOVList;
+std::vector<KernelInfo_t*> registeredKernels;
+robin_hood::unordered_flat_map<uint64_t, KernelInfo_t*> mapHost2KernelInfo;
+
+bool isReConnected = false;
+std::mutex reConnectMutex;
+std::shared_mutex threadSharedMutex;
+int processID; // the process ID of the current process
+int threadNum; // how many sub-threads have been created in this process
+int commDevIdx; // the device index for the communicator
+thread_local int threadID; // e.g, 1641432
+thread_local int ttID; // e.g, 1, 2, 3, ...
+thread_local ClientEndpoint* clientEpObj; // the client endpoint object for the current thread
+thread_local int myDevIdx;
+thread_local int lastReqType = -1;
+thread_local bool isTraining = false;
+thread_local bool batchCollected = false;
+thread_local int curTensorIdx = -1;
+thread_local size_t curIter = 0;
+thread_local std::vector<TensorInfo_t> tensorByteList;
+
+void Intialize() {
+    config_ = new Configure("config.json", true);
+    // regIOV = new RegisterIOV();
+    regIOVList.reserve(240);
+    registeredKernels.reserve(1000);
+    clientEpList.reserve(config_->GetReqGPUnum());
+    for (int i = 0; i < config_->GetReqGPUnum(); i++) {
+        clientEpList.push_back(nullptr);
+    }
+
+    processID = getpid();
+    std::string shmName = "/flexgv_shm_" + std::to_string(config_->GetClientID()) + "_" + std::to_string(processID) + "_datatype";
+    // CurType + dataFeeder IP + dataFeeder Port + CurBatchSize
+    shmOpt = new SharedMemoryOpt(shmName, sizeof(BatchInfo_t) + sizeof(int) + IP_STRING_LEN); 
+    
+
+    connectionObj = new UCPConnection(true);
+#ifdef GV_GPUMAP
+#ifndef GV_MSGHANDLER
+    gpuIdMap = new GPUidMap(config_->GetReqGPUnum(), config_->GetClientID(), config_->GetProxyIp(), config_->GetProxyPort());
+#else
+    gpuIdMap = new GPUidMap(config_->GetReqGPUnum(), config_->GetClientID(), config_->GetModel(), config_->GetBatchSize(), config_->GetProxyIp(), config_->GetProxyPort());
+#endif
+// #elif 
+//     mainDevIdx = 0;  // todo: retrieve the available device from shared memory
+//     SwitchClientEp(mainDevIdx); // switch to the main device as default
+#endif 
+
+    // threadID = static_cast<pid_t>(::syscall(SYS_gettid));
+    // threadNum++;
+    // ttID = threadNum;
+}
+
+void SwitchClientEp(int dev, bool threadInit) { // used for cudaSetDevice or first-time initialization
+    ClientEndpoint* localClientEpObj = nullptr;
+    {
+        std::shared_lock<std::shared_mutex> readLock(threadSharedMutex);
+        localClientEpObj = clientEpList[dev];   // read from the list
+    }
+    if (localClientEpObj == nullptr             // the clientEp for dev is not created yet
+        || threadInit) {                        // or the thread is newly created, so need to update the threadID
+        std::unique_lock<std::shared_mutex> writeLock(threadSharedMutex);
+
+        // re-check: another thread may have created the object while waiting for the lock
+        if (clientEpList[dev] == nullptr) { // not found, create a new one for dev
+            uint64_t clientID = config_->GetClientID();
+            size_t priority = config_->GetPriority();
+            ClientEndpoint* newClientEpObj = new ClientEndpoint(clientID, priority, connectionObj->CreateWorker(true, clientID), dev); 
+            clientEpList[dev] = newClientEpObj;
+        }
+        localClientEpObj = clientEpList[dev];
+
+        if (threadInit) { // used for sub-thread initialization
+            threadID = static_cast<pid_t>(::syscall(SYS_gettid));
+            threadNum++;
+            ttID = threadNum;
+            threadValidList.push_back(true);
+            tool::Logging(LOG_DEBUG, "SetupClientEpIfNeeded", "new thread#%d(%d) in process(%d) has been created\n", ttID, threadID, processID);
+        }
+        else {
+            threadValidList[ttID - 1] = true;
+        }
+    }
+    clientEpObj = localClientEpObj; // switch to the target object
+}
+
+void DestoryResources() {
+    if (config_ != nullptr) {
+        delete config_;
+        config_ = nullptr;
+    }
+    if (!mapHost2KernelInfo.empty()) {
+        for(auto it = mapHost2KernelInfo.begin(); it != mapHost2KernelInfo.end(); it++) {
+            free(it->second);
+        }
+        mapHost2KernelInfo.clear();
+    }
+    if (!regIOVList.empty()) {
+        for(auto it = regIOVList.begin(); it != regIOVList.end(); it++) {
+            delete *it;
+        }
+        regIOVList.clear();
+    }
+    if (!clientEpList.empty()) {
+        for(auto it = clientEpList.begin(); it != clientEpList.end(); it++) {
+            delete *it;
+        }
+        clientEpList.clear();
+        delete connectionObj;
+        delete shmOpt;
+#ifdef GV_GPUMAP
+        delete gpuIdMap;
+#endif
+    }
+}
+
+ucs_status_t ServerStatusCallback(void *arg, const void *header, size_t header_length, void *data, size_t length, const ucp_am_recv_param_t *param) {
+    const char* myName = "ServerStatusCallback";
+    ClientEndpoint* curEp = (ClientEndpoint*)arg;
+    printf("%s: header_length: %zu, length: %zu\n", myName, header_length, length);
+    RequestIOV reqBuf = RequestIOV(header, header_length, data);
+    int serverStatus = reqBuf.Pop<int>();
+    if (serverStatus == 0) {
+        tool::Logging(LOG_INFO, myName, "sucess: the server has received the request.\n");
+    }
+    else {
+        tool::Logging(LOG_ERROR, myName, "failed: the server's status is %d.\n", serverStatus);
+    }
+    return UCS_OK;
+}
+
+void ClientErrorCallback(void *arg, ucp_ep_h ep, ucs_status_t status) {
+    const char* myName = "ClientErrorCallback";
+    ConnStatus_t* connStatus = (ConnStatus_t*)arg;
+    if (status == UCS_ERR_CONNECTION_RESET) {
+        tool::Logging(LOG_INFO, myName, "connection failed: the server has shutdown the connection early.\n");
+        connStatus->isClosed = true;
+#ifndef GV_BACKUP
+        DestoryResources();
+        exit(EXIT_FAILURE);
+#endif
+        // exit(0);
+        // ReConnect();
+    }
+    else if (status == UCS_ERR_ENDPOINT_TIMEOUT) {
+        tool::Logging(LOG_ERROR, myName, "connection failed: the connection is timed out.\n");
+        DestoryResources();
+        exit(EXIT_FAILURE);
+    }
+    else if (status == UCS_ERR_NOT_CONNECTED) {
+        tool::Logging(LOG_ERROR, myName, "connection failed: the server is not connected or the connection is closed.\n");
+        DestoryResources();
+        exit(EXIT_FAILURE);
+    } 
+    else {
+        tool::Logging(LOG_ERROR, myName, "connection failed: %d(%s)\n",
+            status, ucs_status_string(status));
+        connStatus->isClosed = true;
+    }
+}
+
+void CheckTensors(int reqType) {
+    if (reqType != CUDA_MEMCPY_ASYNC_H2D && batchCollected == false && isTraining == true) {
+        if (lastReqType == CUDA_MEMCPY_ASYNC_H2D && reqType == CUDA_STREAM_SYNCHRONIZE) {
+            // maybe next request is cudaMemcpyAsync
+            tool::Logging(LOG_DEBUG, HOOK_LOG_TAG, "continue to collect next tensor size\n");
+        }
+        else if (tensorByteList.size() >= 2) { // at least 2 tensors are collected
+            // now cudaMemcpyAsync is no longer called consecutively
+            batchCollected = true;
+            
+            // debug
+            for (size_t i = 0; i < tensorByteList.size(); i++) {
+                tool::Logging(LOG_INFO, HOOK_LOG_TAG, "batch[%zu]: %zu\n", i, tensorByteList[i].size);
+            }
+        }
+        else {
+            tool::Logging(LOG_DEBUG, HOOK_LOG_TAG, "only one tensor is collected, curReqType: %d\n", reqType);
+            tensorByteList.clear();
+        }
+    }
+    lastReqType = reqType;
+}
+
+bool CheckIteration(void* dst, size_t size) {
+    bool isNewIter = false;
+    if (isTraining == false) {
+    }
+    else if (batchCollected == false) {
+        tensorByteList.push_back({NULL, size});
+    }
+    else if (curTensorIdx >= 0 && (lastReqType != CUDA_MEMCPY_ASYNC_H2D && lastReqType != CUDA_STREAM_SYNCHRONIZE)) {
+        curTensorIdx = -1;
+    }
+    else if (tensorByteList[curTensorIdx + 1].size == size) {
+        curTensorIdx ++;
+        tensorByteList[curTensorIdx].devPtr = dst;
+        if (curTensorIdx == tensorByteList.size() - 1) {
+            curTensorIdx = -1;
+            curIter ++;
+            isNewIter = true;
+            tool::Logging(HOOK_LOG_TAG, "[pid:%d, tid:%d] curIter: %zu (#%zu in period)\n", processID, threadID, curIter, curIter % BACKUP_PERIOD);
+            clientEpObj->SendNewIterRequest(curIter);
+        }
+    }
+    else {
+        curTensorIdx = -1;
+    }
+    return isNewIter;
+}
\ No newline at end of file