secure-software-engineering · bgoconnor · May 27, 2026 · May 27, 2026 · May 27, 2026 · May 27, 2026
diff --git a/src/target_tools/headergen/Dockerfile b/src/target_tools/headergen/Dockerfile
@@ -10,7 +10,7 @@ WORKDIR /app
 
 # Install dependencies
 RUN apt-get update \
-    && apt-get -y install git gcc
+    && apt-get -y install git gcc g++
 
 
 COPY requirements.txt /app/requirements.txt

diff --git a/src/target_tools/headergen/src/runner.py b/src/target_tools/headergen/src/runner.py
@@ -62,6 +62,7 @@ def main_runner(args):
             logger.info(file)
 
             inferred = process_file(file)
+            inferred = translator.enrich_with_col_offsets(file, inferred)
 
             json_file_path = str(file).replace(".py", "_result.json")
 

diff --git a/src/target_tools/headergen/src/translator.py b/src/target_tools/headergen/src/translator.py
@@ -1,6 +1,8 @@
 import argparse
+import ast
 import json
 import os
+from collections import defaultdict
 from pathlib import Path
 
 
@@ -9,6 +11,75 @@ def list_json_files(folder_path):
     return python_files
 
 
+def build_position_map(source_path):
+    """Map (name, line_number) -> [1-indexed col_offsets] for every name
+    occurrence in the source. HeaderGen's server doesn't emit col_offset, but
+    for any (name, line) it gives us, the column is determined by the source.
+    Every candidate is kept; an unreadable or unparsable file maps nothing."""
+    positions = defaultdict(list)
+    try:
+        # Read bytes so ast honours the BOM/coding cookie, not the locale.
+        with open(source_path, "rb") as f:
+            tree = ast.parse(f.read())
+    except Exception:
+        return positions
+
+    for node in ast.walk(tree):
+        if isinstance(node, ast.Name):
+            name, skip = node.id, 0
+        elif isinstance(node, ast.arg):
+            name, skip = node.arg, 0
+        elif isinstance(node, ast.AsyncFunctionDef):
+            name, skip = node.name, len("async def ")
+        elif isinstance(node, ast.FunctionDef):
+            name, skip = node.name, len("def ")
+        elif isinstance(node, ast.ClassDef):
+            name, skip = node.name, len("class ")
+        else:
+            continue
+        # ast offsets are 0-based UTF-8 byte counts; skip the keyword, add 1.
+        positions[(name, node.lineno)].append(node.col_offset + skip + 1)
+    return positions
+
+
+def _lookup_name(entry):
+    """Return the source-level name to look up for this entry's position,
+    or None when it has no "variable", "parameter" or "function" key."""
+    if "variable" in entry:
+        # Subscript/attribute accesses like 'h[0]' or 'self.child' are
+        # reported as the full expression; the col_offset GT expects is
+        # where the base name begins. Both separators are cut in turn so a
+        # mixed form such as 'self.child[0]' also reduces to 'self'.
+        name = entry["variable"]
+        for sep in ("[", "."):
+            name = name.split(sep, 1)[0]
+        return name
+    if "parameter" in entry:
+        return entry["parameter"]
+    if "function" in entry:
+        # Nested functions are reported as 'outer.inner'; the position
+        # we want is the inner name's own column.
+        return entry["function"].rsplit(".", 1)[-1]
+    return None
+
+
+def enrich_with_col_offsets(source_path, entries):
+    """Add a col_offset to each HeaderGen entry that lacks one, using the
+    columns build_position_map finds in source_path for the entry's name and
+    line_number. Entries are updated in place and `entries` is returned."""
+    column_map = build_position_map(source_path)
+    for item in entries:
+        # Entries that already carry a column are not looked up at all.
+        ident = None if "col_offset" in item else _lookup_name(item)
+        if ident is not None:
+            found = set(column_map.get((ident, item["line_number"]), ()))
+            # Several distinct columns means ambiguity: leave the entry alone
+            # rather than guess.
+            if len(found) == 1:
+                (item["col_offset"],) = found
+    return entries
+
+
 def translate_content(file_path):
     with open(file_path) as f:
         data = json.load(f)

diff --git a/src/target_tools/jedi/src/jedi_type_inference.py b/src/target_tools/jedi/src/jedi_type_inference.py
@@ -100,17 +100,24 @@ def find_types_by_execute(self, jedi_obj):
         return _type
 
     def get_function_name(self, jedi_obj):
+        """Return the dotted name of jedi_obj relative to its module, built by
+        walking up parent scopes (e.g. 'outer.inner'); lambdas are 'lambda'."""
         try:
             if jedi_obj.name == "<lambda>":
-                func_name = "lambda"
-            else:
-                parts = jedi_obj.full_name.split(".", 1)
-                func_name = parts[-1] if len(parts) > 1 else jedi_obj.full_name
-        except Exception as e:
+                return "lambda"
+            parts = []
+            current = jedi_obj
+            while current is not None and current.type != "module":
+                name = "lambda" if current.name == "<lambda>" else current.name
+                parts.append(name)
+                try:
+                    current = current.parent()
+                except Exception:
+                    break
+            return ".".join(reversed(parts)) if parts else jedi_obj.name
+        except Exception:
             print("full_name not found in jedi_obj?")
-            func_name = jedi_obj.name
-
-        return func_name
+            return jedi_obj.name
 
     def infer_types(self):
         """
@@ -143,24 +150,46 @@ def infer_types(self):
                 if _infer:
                     for inferred in _infer:
                         if inferred.type == "function":
-                            # _type = self.parse_type_hint(inferred.get_type_hint())
-                            # if not _type:
-                            #     self.find_types_by_execute(inferred)
-
-                            _type = self.find_types_by_execute(inferred)
-
-                            _info = {
-                                "file": node.name,
-                                "line_number": pos["line"],
-                            }
-                            if inferred.name != "<lambda>":
-                                _info["function"] = self.get_function_name(inferred)
-                            _info["type"] = _type if _type else {"any"}
-
-                            variable_name = var.split(":")[0].strip()
-                            if variable_name != self.get_function_name(inferred):
-                                _info["variable"] = variable_name
-                            if _type:
+                            # Distinguish between the function's own definition
+                            # site (return-type is what's wanted, e.g. for `def
+                            # func1():`) and a reference to it (callable is
+                            # what's wanted, e.g. for `a = func1`).
+                            at_def_site = (
+                                pos["line"] == inferred.line
+                                and pos["column"] == inferred.column
+                            )
+
+                            if at_def_site:
+                                _type = self.find_types_by_execute(inferred)
+
+                                _info = {
+                                    "file": node.name,
+                                    "line_number": pos["line"],
+                                    "col_offset": pos["column"] + 1,
+                                }
+                                if inferred.name != "<lambda>":
+                                    _info["function"] = self.get_function_name(inferred)
+                                _info["type"] = _type if _type else {"any"}
+
+                                variable_name = var.split(":")[0].strip()
+                                if variable_name != self.get_function_name(inferred):
+                                    _info["variable"] = variable_name
+                                if _type:
+                                    output_inferred.append(_info)
+                            else:
+                                variable_name = var.split(":")[0].strip()
+                                _info = {
+                                    "file": node.name,
+                                    "line_number": pos["line"],
+                                    "col_offset": pos["column"] + 1,
+                                    "variable": variable_name,
+                                    "type": {"callable"},
+                                }
+                                parent = pos["jedi_obj"].parent()
+                                if parent and parent.type != "module":
+                                    parent_func = self.get_function_name(parent)
+                                    if parent_func:
+                                        _info["function"] = parent_func
                                 output_inferred.append(_info)
 
                         elif inferred.type == "instance":
@@ -187,17 +216,15 @@ def infer_types(self):
                             _info = {
                                 "file": node.name,
                                 "line_number": pos["line"],
+                                "col_offset": pos["column"] + 1,
                                 "variable": var.split(":")[0],
                                 "type": {_type},
                             }
-                            if (
-                                not pos["jedi_obj"].parent().name
-                                == pos["jedi_obj"].parent().module_name
-                            ):
-                                if self.get_function_name(pos["jedi_obj"].parent()):
-                                    _info["function"] = self.get_function_name(
-                                        pos["jedi_obj"].parent()
-                                    )
+                            parent = pos["jedi_obj"].parent()
+                            if parent and parent.type != "module":
+                                parent_func = self.get_function_name(parent)
+                                if parent_func:
+                                    _info["function"] = parent_func
                             if _type:
                                 output_inferred.append(_info)
 
@@ -206,6 +233,7 @@ def infer_types(self):
                             _info = {
                                 "file": node.name,
                                 "line_number": pos["line"],
+                                "col_offset": pos["column"] + 1,
                                 "variable": var.split(":")[0],
                                 "function": self.get_function_name(
                                     pos["jedi_obj"].parent()
@@ -225,6 +253,7 @@ def infer_types(self):
                         _info = {
                             "file": node.name,
                             "line_number": pos["line"],
+                            "col_offset": pos["column"] + 1,
                             "parameter": var.split(":")[0],
                             "function": self.get_function_name(
                                 pos["jedi_obj"].parent()

diff --git a/src/target_tools/scalpel/src/runner.py b/src/target_tools/scalpel/src/runner.py
@@ -4,6 +4,7 @@
 import os
 from pathlib import Path
 
+import translator
 import utils
 from scalpel.typeinfer.typeinfer import TypeInference
 
@@ -42,6 +43,7 @@ def main_runner(args):
         try:
             # logger.debug(file)
             inferred = process_file(file)
+            inferred = translator.enrich_with_col_offsets(file, inferred)
             json_file_path = str(file).replace(".py", "_result.json")
 
             with open(json_file_path, "w") as json_file:

diff --git a/src/target_tools/scalpel/src/translator.py b/src/target_tools/scalpel/src/translator.py
@@ -1,6 +1,8 @@
 import argparse
+import ast
 import json
 import os
+from collections import defaultdict
 from pathlib import Path
 
 
@@ -9,6 +11,70 @@ def list_json_files(folder_path):
     return python_files
 
 
+def build_position_map(source_path):
+    """Map (name, line_number) -> [1-indexed col_offsets] for every name
+    occurrence in the source. Scalpel's runner doesn't emit col_offset, but
+    for any (name, line) it gives us, the column is determined by the source.
+    Every candidate is kept; an unreadable or unparsable file maps nothing."""
+    positions = defaultdict(list)
+    try:
+        # Read bytes so ast honours the BOM/coding cookie, not the locale.
+        with open(source_path, "rb") as f:
+            tree = ast.parse(f.read())
+    except Exception:
+        return positions
+
+    for node in ast.walk(tree):
+        if isinstance(node, ast.Name):
+            name, skip = node.id, 0
+        elif isinstance(node, ast.arg):
+            name, skip = node.arg, 0
+        elif isinstance(node, ast.AsyncFunctionDef):
+            name, skip = node.name, len("async def ")
+        elif isinstance(node, ast.FunctionDef):
+            name, skip = node.name, len("def ")
+        elif isinstance(node, ast.ClassDef):
+            name, skip = node.name, len("class ")
+        else:
+            continue
+        # ast offsets are 0-based UTF-8 byte counts; skip the keyword, add 1.
+        positions[(name, node.lineno)].append(node.col_offset + skip + 1)
+    return positions
+
+
+def _lookup_name(entry):
+    """Return the source-level name to look up for this entry's position,
+    or None when it has no "variable", "parameter" or "function" key."""
+    if "variable" in entry:
+        # Keep only the base name: cut at the first '[' and the first '.',
+        # so 'h[0]', 'self.child' and 'self.child[0]' all reduce correctly.
+        name = entry["variable"]
+        return name.split("[", 1)[0].split(".", 1)[0]
+    if "parameter" in entry:
+        return entry["parameter"]
+    if "function" in entry:
+        # 'outer.inner' -> 'inner': only the innermost name is looked up.
+        return entry["function"].rsplit(".", 1)[-1]
+    return None
+
+
+def enrich_with_col_offsets(source_path, entries):
+    """Add a col_offset to each entry that lacks one, using the columns
+    build_position_map finds in source_path for the entry's name and
+    line_number. Entries are updated in place and `entries` is returned."""
+    column_map = build_position_map(source_path)
+    for item in entries:
+        # Entries that already carry a column are not looked up at all.
+        ident = None if "col_offset" in item else _lookup_name(item)
+        if ident is not None:
+            found = set(column_map.get((ident, item["line_number"]), ()))
+            # Several distinct columns means ambiguity: leave the entry alone
+            # rather than guess.
+            if len(found) == 1:
+                (item["col_offset"],) = found
+    return entries
+
+
 def main_translator(args):
     json_files = list_json_files(args.bechmark_path)
     error_count = 0