-
Notifications
You must be signed in to change notification settings - Fork 20
git push --set-upstream origin add-openclaw-training #14
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
9bea880
208b0ba
504c5e4
fe7f637
db4890a
81dd586
ce15691
ea15e7e
64322e7
6768f47
40b3c73
1f3f3ef
c523a40
f906d2e
33f8177
e5e6730
38eee49
45d8177
0b8b500
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -65,8 +65,6 @@ def __init__( | |
| token_arr=[], | ||
| token_begin_index=-1, | ||
| token_end_index=-1, | ||
| clip=False, | ||
| clip_token_limit=8192, | ||
| tokenizer: PreTrainedTokenizer = None, # type: ignore | ||
| token_generator="manual", | ||
| build_from_uuid="", | ||
|
|
@@ -85,9 +83,8 @@ def __init__( | |
| self.token_begin_index = token_begin_index | ||
| self.token_end_index = token_end_index | ||
| self.invalid_log_prob_value = INVALID_LOG_PROB_VALUE | ||
| self._content_for_future = "" | ||
| self._content_for_compare = "" | ||
| self._info = "" | ||
|
Comment on lines
+86
to
87
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The renaming of |
||
| self.clip = clip | ||
| self.tools = tools | ||
| self.tool_calls = tool_calls | ||
| self.tool_call_id = tool_call_id | ||
|
|
@@ -101,14 +98,8 @@ def __init__( | |
| self.manual_loss_mask_override = [] | ||
| self.lack_normal_eos = False | ||
|
|
||
| if not clip: | ||
| self.generate_content_for_future(tokenizer=None, clip=False) | ||
| else: | ||
| self.generate_content_for_future( | ||
| tokenizer=tokenizer, | ||
| clip=True, | ||
| clip_token_limit=clip_token_limit, | ||
| ) | ||
| self.generate_content_for_compare(tokenizer=None) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The replacement of conditional |
||
|
|
||
| self.eos_token_id = tokenizer.eos_token_id | ||
|
|
||
| if token_generator == "auto": | ||
|
|
@@ -127,9 +118,9 @@ def auto_tokenize(self, tokenizer, tools): | |
| if not self.first_message: | ||
| self.token_arr = self.auto_tokenize_non_first_message(tokenizer=tokenizer, tools=tools) | ||
| else: | ||
| auto_tokenize_target = { | ||
| auto_tokenize_target:dict = { | ||
| "role": self.role, | ||
| "content": self.content_for_future, | ||
| "content": self.content_for_compare, | ||
| } | ||
| if self.tool_calls: | ||
| auto_tokenize_target.update({"tool_calls": self.tool_calls}) | ||
|
|
@@ -144,9 +135,9 @@ def auto_tokenize(self, tokenizer, tools): | |
| def auto_tokenize_non_first_message(self, tokenizer, tools): | ||
| try: | ||
| # completion_token_arr will contain generation_prompt header | ||
| auto_tokenize_target = { | ||
| auto_tokenize_target:dict = { | ||
| "role": self.role, | ||
| "content": self.content_for_future, | ||
| "content": self.content_for_compare, | ||
| } | ||
| if self.tool_calls: | ||
| auto_tokenize_target.update({"tool_calls": self.tool_calls}) | ||
|
|
@@ -160,7 +151,7 @@ def auto_tokenize_non_first_message(self, tokenizer, tools): | |
| ) | ||
| except Exception as e: | ||
| raise ValueError( | ||
| f"Cannot tokenize {self.role} --- {self.content_for_future}, \n\n Error: {e}" | ||
| f"Cannot tokenize {self.role} --- {self.content_for_compare}, \n\n Error: {e}" | ||
| ) | ||
| self.token_arr, _ = self.get_inc_simple( | ||
| text_frag_from=ajet_apply_chat_template( | ||
|
|
@@ -175,12 +166,12 @@ def auto_tokenize_non_first_message(self, tokenizer, tools): | |
| return self.token_arr | ||
|
|
||
| @property | ||
| def content_for_future(self): | ||
| if self._content_for_future == "": | ||
| def content_for_compare(self): | ||
| if self._content_for_compare == "": | ||
| if not self.tool_calls: | ||
| logger.exception("content_for_future is not set, or previous llm output is empty!") | ||
| self._content_for_future | ||
| return self._content_for_future | ||
| logger.exception("content_for_compare is not set, or previous llm output is empty!") | ||
| self._content_for_compare | ||
| return self._content_for_compare | ||
|
|
||
| @property | ||
| def need_training(self): | ||
|
|
@@ -191,19 +182,9 @@ def need_training(self): | |
| ), f"author {self.author} is not identified" | ||
| return self.author in NEED_TRAIN_AUTHORS | ||
|
|
||
| def generate_content_for_future(self, tokenizer, clip, clip_token_limit=-1): | ||
| def generate_content_for_compare(self, tokenizer): | ||
| _content: str = self.content | ||
| if clip: | ||
| assert clip_token_limit > 0, "clip_token_limit must be set when clip is True" | ||
| n_token = len(tokenizer(_content, return_tensors="pt", padding=False)["input_ids"][0]) | ||
| if n_token > clip_token_limit: | ||
| # 8000 > 4000 | ||
| n_char = len(_content) # 10,000 | ||
| eps = 100 # token | ||
| preserve_percent = (clip_token_limit - eps) / n_token # 3900 / 8000 | ||
| n_char_to_preserve = int(n_char * preserve_percent) | ||
| _content = _content[:n_char_to_preserve] + "... truncate ..." | ||
| self._content_for_future = _content | ||
| self._content_for_compare = _content | ||
|
|
||
| def get_loss_mask(self, blackout_token_combo): | ||
| if self.need_training: | ||
|
|
@@ -315,7 +296,7 @@ def merge_tool_group(group, tokenizer): | |
| ) | ||
| # re-compute token_arr | ||
| auto_tokenize_targets = [ | ||
| {"role": msg.role, "content": msg.content_for_future} for msg in group | ||
| {"role": msg.role, "content": msg.content_for_compare} for msg in group | ||
| ] | ||
| merged.token_arr, _ = merged.get_inc_simple( | ||
| text_frag_from=ajet_apply_chat_template( | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The removal of the
`clip` and `clip_token_limit` parameters from the `__init__` method, along with the associated clipping logic, represents a functional change. If this clipping functionality was previously used or intended for future use, its removal should be clearly documented or justified to ensure no unintended side effects or loss of features.