-
Notifications
You must be signed in to change notification settings - Fork 744
[Scheduler] Defer block recycling to accelerate LRU node freeing #7885
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1340,20 +1340,24 @@ def free_nodes_directly(self, node): | |
| logger.error(f"free_nodes_directly: error: {type(e)} {e}, {traceback.format_exc()}") | ||
| raise e | ||
|
|
||
| def _handle_free_gpu_node_without_cpu(self, node): | ||
| def _handle_free_gpu_node_without_cpu(self, node, defer_recycle=False): | ||
| """ | ||
| GPU node eviction | ||
| """ | ||
| node.cache_status = CacheStatus.CPU | ||
|
|
||
| self.node_id_pool.append(node.node_id) | ||
| if node.node_id in self.node_map: | ||
| del self.node_map[node.node_id] | ||
| logger.info(f"free_block_ids_async: free node {node}") | ||
| logger.info(f"_handle_free_gpu_node_without_cpu: free node {node.node_id}") | ||
|
|
||
This comment was marked as outdated.
Sorry, something went wrong.
This comment was marked as outdated.
Sorry, something went wrong. |
||
| self.recycle_gpu_blocks(node.reverved_dec_block_ids) | ||
| node.reverved_dec_block_ids = [] | ||
| self.recycle_gpu_blocks(node.block_id) | ||
| blocks_to_recycle = list(node.reverved_dec_block_ids) + [node.block_id] | ||
| if not defer_recycle: | ||
| self.recycle_gpu_blocks(blocks_to_recycle) | ||
| logger.info( | ||
| f"_handle_free_gpu_node_without_cpu: recycle blocks for node {node.node_id}, {blocks_to_recycle}" | ||
| ) | ||
| return [] | ||
| else: | ||
| return blocks_to_recycle | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🔴 Bug 原代码在 建议修复:在 blocks_to_recycle = list(node.reverved_dec_block_ids) + [node.block_id]
node.reverved_dec_block_ids = [] # 立即清空,防止双重回收
if not defer_recycle:
...
else:
return blocks_to_recycleThere was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🔴 Bug 原代码在函数入口处设置 建议修复:在函数开头恢复状态更新(或调整为更合适的驱逐态枚举值): node.cache_status = CacheStatus.CPU # 或新增 EVICTED 状态 |
||
|
|
||
| def _handle_free_gpu_node_with_cpu( | ||
| self, | ||
|
|
@@ -1449,6 +1453,9 @@ def free_block_ids_async(self, need_block_num): | |
| hash_value_flush_info = {} # {input_hash_value: (token_ids, min_depth)} | ||
| total_gpu_free_count = 0 | ||
|
|
||
| # Defer block recycling to avoid busy heap operations in loop | ||
| blocks_deferred_to_recycle = [] | ||
|
|
||
| while True: | ||
| if len(self.gpu_lru_leaf_heap) == 0: | ||
| logger.info("free_block_ids_async: no more gpu leaf node available.") | ||
|
|
@@ -1463,24 +1470,34 @@ def free_block_ids_async(self, need_block_num): | |
| key = node.input_hash_value | ||
| if key not in hash_value_flush_info or node.depth < hash_value_flush_info[key][1]: | ||
| hash_value_flush_info[key] = (node.input_ids, node.depth) | ||
| self._handle_free_gpu_node_without_cpu(node) | ||
| blocks_deferred_to_recycle.extend( | ||
| self._handle_free_gpu_node_without_cpu(node, defer_recycle=True) | ||
| ) | ||
| total_gpu_free_count += 1 | ||
| cur_node = node | ||
| node = node.parent | ||
| if cur_node.hash_value in node.children: | ||
| del node.children[cur_node.hash_value] | ||
| if not node.children: | ||
| if node in self.gpu_lru_leaf_set: | ||
|
|
||
| # Disconnect node from its parent node | ||
| parent = node.parent | ||
| if node.hash_value in parent.children: | ||
| del parent.children[node.hash_value] | ||
|
|
||
| if not parent.children: | ||
| if parent in self.gpu_lru_leaf_set: | ||
| logger.warning( | ||
| f"Node {parent.node_id} is already in gpu lru leaf heap, duplicated node free may occured!" | ||
| ) | ||
| continue | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ❓ 疑问 当 建议确认:父节点被加入 gpu_lru_leaf_heap 前,是否应先检查其是否已被从 node_map 删除(即是否已经历过 |
||
| if ( | ||
| node != self.radix_tree_root | ||
| and node.shared_count == 0 | ||
| and node.is_gpu_leaf_node | ||
| and node.is_persistent is False | ||
| parent != self.radix_tree_root | ||
| and parent.shared_count == 0 | ||
| and parent.is_gpu_leaf_node | ||
| and parent.is_persistent is False | ||
| ): | ||
| heapq.heappush(self.gpu_lru_leaf_heap, node) | ||
| self.gpu_lru_leaf_set.add(node) | ||
| heapq.heappush(self.gpu_lru_leaf_heap, parent) | ||
| self.gpu_lru_leaf_set.add(parent) | ||
| else: | ||
| logger.warning( | ||
| f"Node {node.node_id} popped out of gpu lru leaf heap, but its shared count is not zero." | ||
| ) | ||
| continue | ||
| else: | ||
| if node.shared_count == 0 and node.is_gpu_leaf_node: | ||
|
|
@@ -1512,6 +1529,12 @@ def free_block_ids_async(self, need_block_num): | |
| f"free_block_ids_async: need_block_num {need_block_num}, free_block_num {total_gpu_free_count}." | ||
| ) | ||
|
|
||
| if blocks_deferred_to_recycle: | ||
| self.recycle_gpu_blocks(blocks_deferred_to_recycle) | ||
| logger.info( | ||
| f"free_block_ids_async: deferred recycling {len(blocks_deferred_to_recycle)} blocks, {blocks_deferred_to_recycle}" | ||
| ) | ||
|
|
||
| if ( | ||
| envs.FD_AS_ONLY_FLUSH | ||
| and self.kvcache_storage_backend == "attention_store" | ||
|
|
||
This comment was marked as outdated.
Sorry, something went wrong.
Uh oh!
There was an error while loading. Please reload this page.