Skip to content

Commit 0434324

Browse files
hwu71 and mahaloz authored
fix: handle adjacent @@ variable tokens in split_words() (#15)
* fix: handle adjacent @@ variable tokens in split_words()

  When variables appear adjacent without spaces in decompiled code
  (e.g., func(a,b,c)), the @@ placeholder tokens merge into one word.
  re.search() only matched the first pattern, silently losing the rest
  and causing a holder/mask count mismatch that discards all predictions.
  Replace re.search() with re.finditer() to extract all @@ patterns.

* fix broken tests with pin

---------

Co-authored-by: mahaloz <zion@zionbasque.com>
1 parent 1e6a85b commit 0434324

3 files changed

Lines changed: 16 additions & 14 deletions

File tree

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ long_description_content_type = text/markdown
1515
[options]
1616
install_requires =
1717
torch
18-
transformers
18+
transformers>=5.2.0
1919
tqdm
2020
dailalib
2121
libbs>=1.18.1

varbert/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "2.3.0"
1+
__version__ = "2.3.1"
22

33
import importlib.resources
44
import tarfile

varbert/model.py

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ def varec_init(self):
107107
str(self.model_base_dir),
108108
avar_vocab_size = self.vocab_size,
109109
from_tf=False,
110-
config=config
110+
config=config
111111
)
112112

113113
model.to(device)
@@ -116,21 +116,24 @@ def varec_init(self):
116116
@staticmethod
117117
def create_inputs_for_model(code_txt, tokenizer):
118118
input_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(code_txt))
119-
input_ids = tokenizer.build_inputs_with_special_tokens(input_ids)
119+
input_ids = [tokenizer.bos_token_id] + input_ids + [tokenizer.eos_token_id]
120120
return torch.tensor(input_ids, dtype=torch.long)
121121

122122
@staticmethod
123123
def split_words(text: str):
124124
words = text.replace("\n", " ").split(" ")
125125
r = []
126126
for w in words:
127-
m = re.search(r"@@[^\s@]+@@[^\s@]+@@", w)
128-
if m is not None:
129-
if m.start() > 0:
130-
r.append(w[: m.start()])
131-
r.append(w[m.start(): m.end()])
132-
if m.end() < len(w):
133-
r.append(w[m.end():])
127+
matches = list(re.finditer(r"@@[^\s@]+@@[^\s@]+@@", w))
128+
if matches:
129+
pos = 0
130+
for m in matches:
131+
if m.start() > pos:
132+
r.append(w[pos: m.start()])
133+
r.append(w[m.start(): m.end()])
134+
pos = m.end()
135+
if pos < len(w):
136+
r.append(w[pos:])
134137
else:
135138
r.append(w)
136139
r = [w for w in r if len(w) > 0]
@@ -206,7 +209,7 @@ def preprocess_word_mask(self, ftext, tokenizer):
206209
tpwords.append(vocab[t])
207210
towords.append(vocab[t])
208211
pos += 1
209-
212+
210213
assert len(tpwords) == len(towords)
211214
assert None not in tpwords
212215
assert None not in towords
@@ -280,7 +283,7 @@ def process(self, code: str):
280283
# _code = "\n".join(_code_lines)
281284

282285
input_ids = self.preprocess_word_mask(_code, tokenizer)[0]
283-
input_ids_with_special_tokens = tokenizer.build_inputs_with_special_tokens(input_ids)
286+
input_ids_with_special_tokens = [tokenizer.bos_token_id] + input_ids + [tokenizer.eos_token_id]
284287
if len(input_ids_with_special_tokens) < 800:
285288
# padding
286289
padded_input_ids = input_ids_with_special_tokens[:-1] + [1] * 800 + [2]
@@ -411,4 +414,3 @@ def forward(
411414
"distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
412415
"camembert": (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer),
413416
}
414-

0 commit comments

Comments (0)