Skip to content

Commit ccfaa6e

Browse files
yoffCopilot
andcommitted
Python: migrate dataflow library to new CFG + shared SSA
Switches the trunk dataflow library and all in-tree consumers (frameworks, ApiGraphs, Concepts, regexp, security customisations, test harness) from the legacy Flow.qll/ESSA stack to the new shared-CFG facade (Cfg.qll) and the ESSA-shaped adapter on the shared-SSA library (SsaImpl.qll). Highlights: * DataFlowPublic/Private/Dispatch, Attributes, VariableCapture, IterableUnpacking, ImportResolution, ImportStar, LocalSources, TaintTrackingPrivate, MatchUnpacking, TypeTrackingImpl, SsaImpl, Builtins all now qualify CFG/SSA references with Cfg:: / SsaImpl:: and stop pulling in semmle.python.essa.*. * AstNodeImpl.qll/Cfg.qll: ImportMember exposes its inner ImportExpr, DefinitionNode.getValue covers Alias / AnnAssign / AugAssign / AssignExpr / For-target / Parameter-default, ForNode is treated as an expression node, AnnotatedExitNode is canonical, and BoolExprNode.getAnOperand drops the dominance constraint that did not hold for short-circuit BBs. * SsaImpl.qll: parameters always get a ParameterDefinition (so unused parameters still have SSA defs), scope-entry defs for module globals require an actual store somewhere, scope-exit has a synthetic use so reaching-defs survives to module boundary, and the legacy SsaSourceVariable / EssaVariable surface (getName, getScope, getAUse, getASourceUse, getAnImplicitUse) is reinstated for downstream queries. * DataFlowPublic.qll: GuardNode redesigned around the new structural outcome nodes (isAfterTrue / isAfterFalse). The legacy ConditionBlock + flipped indirection is gone; controlsBlock walks UP through 'not' / '==True' / 'is False' etc. via outcomeOfGuard, accumulating polarity cleanly. Only BarrierGuard<...> is preserved as public API. * ModuleVariableNode.getAWrite and LocalFlow::definitionFlowStep bypass SSA and consult Cfg::NameNode.defines / Cfg::DefinitionNode.getValue directly, so that write defs pruned by shared SSA (because the variable has no in-scope read) still produce dataflow steps. * Frameworks + downstream consumers: replace EssaVariable.hasDefiningNode, getAReturnValueFlowNode, Parameter.getDefault, Scope.getEntryNode / getANormalExit etc. with CFG-side bridges through Cfg::ControlFlowNode. The legacy Flow.qll / Essa.qll stack is untouched and remains available for queries that import it directly. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent cccd207 commit ccfaa6e

49 files changed

Lines changed: 821 additions & 468 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

python/ql/lib/semmle/python/ApiGraphs.qll

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,9 @@
66
* directed and labeled; they specify how the components represented by nodes relate to each other.
77
*/
88

9-
// Importing python under the `py` namespace to avoid importing `CallNode` from `Flow.qll` and thereby having a naming conflict with `API::CallNode`.
9+
// Importing python under the `py` namespace to avoid importing `Cfg::CallNode` from `Flow.qll` and thereby having a naming conflict with `API::CallNode`.
1010
private import python as PY
11+
private import semmle.python.controlflow.internal.Cfg as Cfg
1112
import semmle.python.dataflow.new.DataFlow
1213
private import semmle.python.internal.CachedStages
1314

@@ -282,15 +283,15 @@ module API {
282283
index = this.getIndex() and
283284
(
284285
// subscripting
285-
exists(PY::SubscriptNode subscript |
286+
exists(Cfg::SubscriptNode subscript |
286287
subscript.getObject() = this.getAValueReachableFromSource().asCfgNode() and
287288
subscript.getIndex() = index.asSink().asCfgNode()
288289
|
289290
// reading
290291
subscript = result.asSource().asCfgNode()
291292
or
292293
// writing
293-
subscript.(PY::DefinitionNode).getValue() = result.asSink().asCfgNode()
294+
subscript.(Cfg::DefinitionNode).getValue() = result.asSink().asCfgNode()
294295
)
295296
or
296297
// dictionary literals
@@ -684,7 +685,7 @@ module API {
684685
* Ignores relative imports, such as `from ..foo.bar import baz`.
685686
*/
686687
private predicate imports(DataFlow::CfgNode imp, string name) {
687-
exists(PY::ImportExprNode iexpr |
688+
exists(Cfg::ImportExprNode iexpr |
688689
imp.getNode() = iexpr and
689690
not iexpr.getNode().isRelative() and
690691
name = iexpr.getNode().getImportedModuleName()
@@ -775,7 +776,7 @@ module API {
775776
// list literals, from `x` to `[x]`
776777
// TODO: once convenient, this should be done at a higher level than the AST,
777778
// at least at the CFG layer, to take splitting into account.
778-
// Also consider `SequenceNode for generality.
779+
// Also consider `Cfg::SequenceNode for generality.
779780
exists(PY::List list | list = pred.(DataFlow::ExprNode).getNode().getNode() |
780781
rhs.(DataFlow::ExprNode).getNode().getNode() = list.getAnElt() and
781782
lbl = Label::subscript()
@@ -805,7 +806,7 @@ module API {
805806
subscript = trackUseNode(src).getSubscript(index)
806807
|
807808
// from `x` to a definition of `x[...]`
808-
rhs.asCfgNode() = subscript.asCfgNode().(PY::DefinitionNode).getValue() and
809+
rhs.asCfgNode() = subscript.asCfgNode().(Cfg::DefinitionNode).getValue() and
809810
lbl = Label::subscript()
810811
or
811812
// from `x` to `"key"` in `x["key"]`

python/ql/lib/semmle/python/Concepts.qll

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
*/
66

77
private import python
8+
private import semmle.python.controlflow.internal.Cfg as Cfg
89
private import semmle.python.dataflow.new.DataFlow
910
private import semmle.python.dataflow.new.internal.DataFlowImplSpecific
1011
private import semmle.python.dataflow.new.RemoteFlowSources
@@ -214,7 +215,7 @@ module Path {
214215
SafeAccessCheck() { this = DataFlow::BarrierGuard<safeAccessCheck/3>::getABarrierNode() }
215216
}
216217

217-
private predicate safeAccessCheck(DataFlow::GuardNode g, ControlFlowNode node, boolean branch) {
218+
private predicate safeAccessCheck(DataFlow::GuardNode g, Cfg::ControlFlowNode node, boolean branch) {
218219
g.(SafeAccessCheck::Range).checks(node, branch)
219220
}
220221

@@ -223,7 +224,7 @@ module Path {
223224
/** A data-flow node that checks that a path is safe to access in some way, for example by having a controlled prefix. */
224225
abstract class Range extends DataFlow::GuardNode {
225226
/** Holds if this guard validates `node` upon evaluating to `branch`. */
226-
abstract predicate checks(ControlFlowNode node, boolean branch);
227+
abstract predicate checks(Cfg::ControlFlowNode node, boolean branch);
227228
}
228229
}
229230
}

python/ql/lib/semmle/python/controlflow/internal/AstNodeImpl.qll

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
* - Intermediate nodes for multi-operand boolean expressions.
1111
*/
1212

13+
overlay[local?]
14+
module;
15+
1316
private import python as Py
1417
private import codeql.controlflow.ControlFlowGraph
1518
private import codeql.controlflow.SuccessorType
@@ -1193,6 +1196,30 @@ module Ast implements AstSig<Py::Location> {
11931196
override AstNode getChild(int index) { index = 0 and result = this.getObject() }
11941197
}
11951198

1199+
/**
1200+
* An `import x.y` module expression. Modelled as a leaf — the dotted
1201+
* name is just a string.
1202+
*/
1203+
additional class ImportExpression extends Expr {
1204+
ImportExpression() { this.asExpr() instanceof Py::ImportExpr }
1205+
}
1206+
1207+
/**
1208+
* A `from m import x` member access. The module sub-expression is a
1209+
* child so that the CFG visits both the module load and this
1210+
* attribute selection.
1211+
*/
1212+
additional class ImportMemberExpr extends Expr {
1213+
private Py::ImportMember im;
1214+
1215+
ImportMemberExpr() { this = TPyExpr(im) }
1216+
1217+
/** Gets the module expression `m` in `from m import x`. */
1218+
Expr getModule() { result.asExpr() = im.getModule() }
1219+
1220+
override AstNode getChild(int index) { index = 0 and result = this.getModule() }
1221+
}
1222+
11961223
/** A tuple literal. */
11971224
additional class TupleExpr extends Expr {
11981225
private Py::Tuple tuple;
@@ -1581,4 +1608,6 @@ Py::AstNode astNodeToPyNode(Ast::AstNode n) {
15811608
result = n.asStmt()
15821609
or
15831610
result = n.asScope()
1611+
or
1612+
result = n.asPattern()
15841613
}

python/ql/lib/semmle/python/controlflow/internal/Cfg.qll

Lines changed: 131 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,24 @@ module;
2020
private import python as Py
2121
private import semmle.python.controlflow.internal.AstNodeImpl as CfgImpl
2222
private import codeql.controlflow.SuccessorType
23+
private import codeql.controlflow.BasicBlock as BB
24+
25+
/**
26+
* A nested sub-module that explicitly implements `BB::CfgSig`, so this
27+
* `Cfg` facade can be passed to parameterised shared modules such as
28+
* `codeql.dataflow.VariableCapture::Flow<L, Cfg, ...>`. The sub-module
29+
* exposes the *raw* shared-CFG types from `AstNodeImpl.qll` (where the
30+
* signature is satisfied natively), not the facade's wrapped types.
31+
*/
32+
module CfgSigImpl implements BB::CfgSig<Py::Location> {
33+
class ControlFlowNode = CfgImpl::ControlFlowNode;
34+
35+
class BasicBlock = CfgImpl::BasicBlock;
36+
37+
class EntryBasicBlock = CfgImpl::Cfg::EntryBasicBlock;
38+
39+
predicate dominatingEdge = CfgImpl::Cfg::dominatingEdge/2;
40+
}
2341

2442
/**
2543
* Gets the Python AST node corresponding to CFG node `n`, if any.
@@ -51,6 +69,11 @@ private predicate isCanonical(CfgImpl::ControlFlowNode n) {
5169
n instanceof CfgImpl::ControlFlow::EntryNode
5270
or
5371
n instanceof CfgImpl::ControlFlow::ExitNode
72+
or
73+
// Annotated exit nodes (normal + abnormal) — needed so that dataflow
74+
// consumers can ask "is this the normal-exit of a scope?" and also
75+
// so that scope-exit synthetic uses in SsaImpl can attach here.
76+
n instanceof CfgImpl::ControlFlow::AnnotatedExitNode
5477
}
5578

5679
/**
@@ -369,6 +392,35 @@ class BasicBlock extends CfgImpl::BasicBlock {
369392
or
370393
forex(BasicBlock immsucc | immsucc = super.getASuccessor() | immsucc.alwaysReaches(succ))
371394
}
395+
396+
/**
397+
* Holds if this basic block ends in a node that branches on a boolean
398+
* outcome, and `other` is dominated by the corresponding successor
399+
* for `branch` while not being reachable from the other branch
400+
* without going through this BB.
401+
*
402+
* In other words: any execution that reaches `other` must have just
403+
* evaluated the last node of this BB and taken the `branch` outcome.
404+
* This mirrors the legacy `ConditionBlock.controls(BB, branch)`.
405+
*/
406+
predicate controls(BasicBlock other, boolean branch) {
407+
exists(BasicBlock succ |
408+
branch = true and succ = this.getATrueSuccessor()
409+
or
410+
branch = false and succ = this.getAFalseSuccessor()
411+
|
412+
succ.dominates(other) and
413+
// The other branch must not also reach `other` — otherwise
414+
// `other` is not actually controlled by `branch`.
415+
not exists(BasicBlock otherSucc |
416+
branch = true and otherSucc = this.getAFalseSuccessor()
417+
or
418+
branch = false and otherSucc = this.getATrueSuccessor()
419+
|
420+
otherSucc.reaches(other)
421+
)
422+
)
423+
}
372424
}
373425

374426
// ===========================================================================
@@ -734,8 +786,7 @@ class BoolExprNode extends ControlFlowNode {
734786
ControlFlowNode getAnOperand() {
735787
exists(Py::BoolExpr be |
736788
be = toAst(this) and
737-
be.getAValue() = toAst(result) and
738-
result.getBasicBlock().dominates(this.getBasicBlock())
789+
be.getAValue() = toAst(result)
739790
)
740791
}
741792
}
@@ -766,20 +817,79 @@ class DefinitionNode extends ControlFlowNode {
766817

767818
/** Gets the value assigned, if any. */
768819
ControlFlowNode getValue() {
769-
exists(Py::Expr target, Py::Expr value |
770-
target = toAst(this) and
771-
value = toAst(result) and
772-
result.getBasicBlock().dominates(this.getBasicBlock())
773-
|
774-
// x = value
775-
exists(Py::Assign a | a.getATarget() = target and a.getValue() = value)
776-
or
777-
// x = y = value (nested chained-assign target)
778-
exists(Py::Assign a | a.getATarget().(Py::Tuple).getElt(_) = target and a.getValue() = value)
820+
// For-target: the value is the for-loop's iter expression (which
821+
// is also where `Cfg::ForNode` lives — its `getNode()` returns the
822+
// enclosing `Py::For` statement). Treated specially because there
823+
// is no AST node holding the result of `iter(next(seq))`; we use
824+
// the iter expression's CFG node as the stand-in.
825+
exists(Py::For f |
826+
f.getTarget() = toAst(this) and
827+
toAst(result) = f.getIter()
828+
)
829+
or
830+
exists(Py::AstNode value | value = assignedValue(toAst(this)) |
831+
toAst(result) = value and
832+
(
833+
result.getBasicBlock().dominates(this.getBasicBlock())
834+
or
835+
result.isImport()
836+
or
837+
// The default value for a parameter is evaluated in the same basic block as
838+
// the function definition, but the parameter belongs to the basic block of the
839+
// function, so there is no dominance relationship between the two.
840+
exists(Py::Parameter param | toAst(this) = param.asName())
841+
)
779842
)
780843
}
781844
}
782845

846+
/**
847+
* Gets the AST node that holds the value assigned to `lhs` in a binding
848+
* context. Mirrors `Flow.qll::assigned_value`.
849+
*/
850+
private Py::AstNode assignedValue(Py::Expr lhs) {
851+
// lhs = result
852+
exists(Py::Assign a | a.getATarget() = lhs and result = a.getValue())
853+
or
854+
// lhs := result
855+
exists(Py::AssignExpr a | a.getTarget() = lhs and result = a.getValue())
856+
or
857+
// lhs: annotation = result
858+
exists(Py::AnnAssign a | a.getTarget() = lhs and result = a.getValue())
859+
or
860+
// import result as lhs (also covers plain `import lhs`, where alias.getAsname() = lhs)
861+
exists(Py::Alias a | a.getAsname() = lhs and result = a.getValue())
862+
or
863+
// lhs += x -> result is the (lhs + x) binary expression
864+
exists(Py::AugAssign a, Py::BinaryExpr b |
865+
b = a.getOperation() and result = b and lhs = b.getLeft()
866+
)
867+
or
868+
// Nested sequence assign: ..., lhs, ... = ..., result, ...
869+
exists(Py::Assign a | nestedSequenceAssign(a.getATarget(), a.getValue(), lhs, result))
870+
or
871+
// Parameter default
872+
exists(Py::Parameter param | lhs = param.asName() and result = param.getDefault())
873+
}
874+
875+
/**
876+
* Helper for nested sequence assignments such as `(a, b), c = (1, 2), 3`.
877+
*/
878+
private predicate nestedSequenceAssign(
879+
Py::Expr leftParent, Py::Expr rightParent, Py::Expr left, Py::Expr right
880+
) {
881+
exists(int i |
882+
leftParent.(Py::Tuple).getElt(i) = left and rightParent.(Py::Tuple).getElt(i) = right
883+
or
884+
leftParent.(Py::List).getElt(i) = left and rightParent.(Py::List).getElt(i) = right
885+
)
886+
or
887+
exists(Py::Expr leftMid, Py::Expr rightMid |
888+
nestedSequenceAssign(leftParent, rightParent, leftMid, rightMid) and
889+
nestedSequenceAssign(leftMid, rightMid, left, right)
890+
)
891+
}
892+
783893
/** A control flow node corresponding to a deletion (`del x`). */
784894
class DeletionNode extends ControlFlowNode {
785895
DeletionNode() { this.isDelete() }
@@ -789,8 +899,6 @@ class DeletionNode extends ControlFlowNode {
789899
class ForNode extends ControlFlowNode {
790900
ForNode() { exists(Py::For f | toAst(this) = f.getIter()) }
791901

792-
override Py::For getNode() { exists(Py::For f | toAst(this) = f.getIter() | result = f) }
793-
794902
/** Gets the iterable expression. */
795903
ControlFlowNode getIter() {
796904
result = this and result = result // canonical "after" of the iterable
@@ -943,8 +1051,15 @@ class DictNode extends ControlFlowNode {
9431051
/** A control flow node corresponding to an iterable in a `for` loop. */
9441052
class IterableNode extends ControlFlowNode {
9451053
IterableNode() {
946-
exists(Py::For f | toAst(this) = f.getIter())
1054+
this instanceof SequenceNode
1055+
or
1056+
this instanceof SetNode
1057+
}
1058+
1059+
/** Gets the control flow node for an element of this iterable. */
1060+
ControlFlowNode getAnElement() {
1061+
result = this.(SequenceNode).getAnElement()
9471062
or
948-
exists(Py::Comp c | toAst(this) = c.getIterable())
1063+
result = this.(SetNode).getAnElement()
9491064
}
9501065
}

python/ql/lib/semmle/python/dataflow/new/BarrierGuards.qll

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
/** Provides commonly used BarrierGuards. */
22

33
private import python
4+
private import semmle.python.controlflow.internal.Cfg as Cfg
45
private import semmle.python.dataflow.new.DataFlow
56

6-
private predicate constCompare(DataFlow::GuardNode g, ControlFlowNode node, boolean branch) {
7-
exists(CompareNode cn | cn = g |
8-
exists(ImmutableLiteral const, Cmpop op, ControlFlowNode c |
7+
private predicate constCompare(DataFlow::GuardNode g, Cfg::ControlFlowNode node, boolean branch) {
8+
exists(Cfg::CompareNode cn | cn = g |
9+
exists(ImmutableLiteral const, Cmpop op, Cfg::ControlFlowNode c |
910
c.getNode() = const and
1011
(
1112
op = any(Eq eq) and branch = true
@@ -18,7 +19,7 @@ private predicate constCompare(DataFlow::GuardNode g, ControlFlowNode node, bool
1819
cn.operands(node, op, c)
1920
)
2021
or
21-
exists(NameConstant const, Cmpop op, ControlFlowNode c |
22+
exists(NameConstant const, Cmpop op, Cfg::ControlFlowNode c |
2223
c.getNode() = const and
2324
(
2425
op = any(Is is_) and branch = true
@@ -31,12 +32,12 @@ private predicate constCompare(DataFlow::GuardNode g, ControlFlowNode node, bool
3132
cn.operands(node, op, c)
3233
)
3334
or
34-
exists(IterableNode const_iterable, Cmpop op |
35+
exists(Cfg::IterableNode const_iterable, Cmpop op |
3536
op = any(In in_) and branch = true
3637
or
3738
op = any(NotIn ni) and branch = false
3839
|
39-
forall(ControlFlowNode elem | elem = const_iterable.getAnElement() |
40+
forall(Cfg::ControlFlowNode elem | elem = const_iterable.getAnElement() |
4041
elem.getNode() instanceof ImmutableLiteral
4142
) and
4243
cn.operands(node, op, const_iterable)

0 commit comments

Comments
 (0)