Skip to content

Commit 20fadc8

Browse files
committed
Proper ContentSet
1 parent e877929 commit 20fadc8

10 files changed

Lines changed: 176 additions & 97 deletions

File tree

python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll

Lines changed: 58 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -753,7 +753,7 @@ predicate jumpStepNotSharedWithTypeTracker(Node nodeFrom, Node nodeTo) {
753753
* As of 2024-04-02 the type-tracking library only supports precise content, so there is
754754
* no reason to include steps for list content right now.
755755
*/
756-
predicate storeStepCommon(Node nodeFrom, ContentSet c, Node nodeTo) {
756+
predicate storeStepCommon(Node nodeFrom, Content c, Node nodeTo) {
757757
tupleStoreStep(nodeFrom, c, nodeTo)
758758
or
759759
dictStoreStep(nodeFrom, c, nodeTo)
@@ -767,29 +767,31 @@ predicate storeStepCommon(Node nodeFrom, ContentSet c, Node nodeTo) {
767767
* Holds if data can flow from `nodeFrom` to `nodeTo` via an assignment to
768768
* a content in the content set `cs`.
769769
*/
770-
predicate storeStep(Node nodeFrom, ContentSet c, Node nodeTo) {
771-
storeStepCommon(nodeFrom, c, nodeTo)
772-
or
773-
listStoreStep(nodeFrom, c, nodeTo)
774-
or
775-
setStoreStep(nodeFrom, c, nodeTo)
776-
or
777-
attributeStoreStep(nodeFrom, c, nodeTo)
778-
or
779-
matchStoreStep(nodeFrom, c, nodeTo)
780-
or
781-
any(Orm::AdditionalOrmSteps es).storeStep(nodeFrom, c, nodeTo)
770+
predicate storeStep(Node nodeFrom, ContentSet cs, Node nodeTo) {
771+
exists(Content c | cs = singleton(c) |
772+
storeStepCommon(nodeFrom, c, nodeTo)
773+
or
774+
listStoreStep(nodeFrom, c, nodeTo)
775+
or
776+
setStoreStep(nodeFrom, c, nodeTo)
777+
or
778+
attributeStoreStep(nodeFrom, c, nodeTo)
779+
or
780+
matchStoreStep(nodeFrom, c, nodeTo)
781+
or
782+
any(Orm::AdditionalOrmSteps es).storeStep(nodeFrom, c, nodeTo)
783+
or
784+
synthStarArgsElementParameterNodeStoreStep(nodeFrom, c, nodeTo)
785+
or
786+
synthDictSplatArgumentNodeStoreStep(nodeFrom, c, nodeTo)
787+
or
788+
yieldStoreStep(nodeFrom, c, nodeTo)
789+
or
790+
VariableCapture::storeStep(nodeFrom, c, nodeTo)
791+
)
782792
or
783-
FlowSummaryImpl::Private::Steps::summaryStoreStep(nodeFrom.(FlowSummaryNode).getSummaryNode(), c,
793+
FlowSummaryImpl::Private::Steps::summaryStoreStep(nodeFrom.(FlowSummaryNode).getSummaryNode(), cs,
784794
nodeTo.(FlowSummaryNode).getSummaryNode())
785-
or
786-
synthStarArgsElementParameterNodeStoreStep(nodeFrom, c, nodeTo)
787-
or
788-
synthDictSplatArgumentNodeStoreStep(nodeFrom, c, nodeTo)
789-
or
790-
yieldStoreStep(nodeFrom, c, nodeTo)
791-
or
792-
VariableCapture::storeStep(nodeFrom, c, nodeTo)
793795
}
794796

795797
/**
@@ -985,7 +987,7 @@ predicate attributeStoreStep(Node nodeFrom, AttributeContent c, Node nodeTo) {
985987
/**
986988
* Subset of `readStep` that should be shared with type-tracking.
987989
*/
988-
predicate readStepCommon(Node nodeFrom, ContentSet c, Node nodeTo) {
990+
predicate readStepCommon(Node nodeFrom, Content c, Node nodeTo) {
989991
subscriptReadStep(nodeFrom, c, nodeTo)
990992
or
991993
iterableUnpackingReadStep(nodeFrom, c, nodeTo)
@@ -994,23 +996,25 @@ predicate readStepCommon(Node nodeFrom, ContentSet c, Node nodeTo) {
994996
/**
995997
* Holds if data can flow from `nodeFrom` to `nodeTo` via a read of a content in the content set `cs`.
996998
*/
997-
predicate readStep(Node nodeFrom, ContentSet c, Node nodeTo) {
998-
readStepCommon(nodeFrom, c, nodeTo)
999-
or
1000-
matchReadStep(nodeFrom, c, nodeTo)
1001-
or
1002-
forReadStep(nodeFrom, c, nodeTo)
1003-
or
1004-
attributeReadStep(nodeFrom, c, nodeTo)
999+
predicate readStep(Node nodeFrom, ContentSet cs, Node nodeTo) {
1000+
exists(Content c | cs = singleton(c) |
1001+
readStepCommon(nodeFrom, c, nodeTo)
1002+
or
1003+
matchReadStep(nodeFrom, c, nodeTo)
1004+
or
1005+
forReadStep(nodeFrom, c, nodeTo)
1006+
or
1007+
attributeReadStep(nodeFrom, c, nodeTo)
1008+
or
1009+
synthDictSplatParameterNodeReadStep(nodeFrom, c, nodeTo)
1010+
or
1011+
VariableCapture::readStep(nodeFrom, c, nodeTo)
1012+
)
10051013
or
1006-
FlowSummaryImpl::Private::Steps::summaryReadStep(nodeFrom.(FlowSummaryNode).getSummaryNode(), c,
1014+
FlowSummaryImpl::Private::Steps::summaryReadStep(nodeFrom.(FlowSummaryNode).getSummaryNode(), cs,
10071015
nodeTo.(FlowSummaryNode).getSummaryNode())
10081016
or
1009-
synthDictSplatParameterNodeReadStep(nodeFrom, c, nodeTo)
1010-
or
1011-
VariableCapture::readStep(nodeFrom, c, nodeTo)
1012-
or
1013-
Conversions::readStep(nodeFrom, c, nodeTo)
1017+
Conversions::readStep(nodeFrom, cs, nodeTo)
10141018
}
10151019

10161020
/** Data flows from a sequence to a subscript of the sequence. */
@@ -1074,23 +1078,15 @@ module Conversions {
10741078
nodeFrom = decoding.getAnInput() and
10751079
nodeTo = decoding.getOutput()
10761080
) and
1077-
(
1078-
c instanceof TupleElementContent
1079-
or
1080-
c instanceof DictionaryElementContent
1081-
)
1081+
(c.isAnyTupleElement() or c.isAnyDictionaryElement())
10821082
}
10831083

10841084
predicate encoderReadStep(Node nodeFrom, ContentSet c, Node nodeTo) {
10851085
exists(Encoding encoding |
10861086
nodeFrom = encoding.getAnInput() and
10871087
nodeTo = encoding.getOutput()
10881088
) and
1089-
(
1090-
c instanceof TupleElementContent
1091-
or
1092-
c instanceof DictionaryElementContent
1093-
)
1089+
(c.isAnyTupleElement() or c.isAnyDictionaryElement())
10941090
}
10951091

10961092
predicate formatReadStep(Node nodeFrom, ContentSet c, Node nodeTo) {
@@ -1099,13 +1095,13 @@ module Conversions {
10991095
fmt.getOp() instanceof Mod and
11001096
fmt.getRight() = nodeFrom.asCfgNode()
11011097
) and
1102-
c instanceof TupleElementContent
1098+
c.isAnyTupleElement()
11031099
or
11041100
// format_map
11051101
// see https://docs.python.org/3/library/stdtypes.html#str.format_map
11061102
nodeTo.(MethodCallNode).calls(_, "format_map") and
11071103
nodeTo.(MethodCallNode).getArg(0) = nodeFrom and
1108-
c instanceof DictionaryElementContent
1104+
c.isAnyDictionaryElement()
11091105
}
11101106

11111107
predicate readStep(Node nodeFrom, ContentSet c, Node nodeTo) {
@@ -1122,18 +1118,20 @@ module Conversions {
11221118
* any value stored inside `f` is cleared at the pre-update node associated with `x`
11231119
* in `x.f = newValue`.
11241120
*/
1125-
predicate clearsContent(Node n, ContentSet c) {
1126-
matchClearStep(n, c)
1127-
or
1128-
attributeClearStep(n, c)
1129-
or
1130-
dictClearStep(n, c)
1131-
or
1132-
FlowSummaryImpl::Private::Steps::summaryClearsContent(n.(FlowSummaryNode).getSummaryNode(), c)
1133-
or
1134-
dictSplatParameterNodeClearStep(n, c)
1121+
predicate clearsContent(Node n, ContentSet cs) {
1122+
exists(Content c | cs = singleton(c) |
1123+
matchClearStep(n, c)
1124+
or
1125+
attributeClearStep(n, c)
1126+
or
1127+
dictClearStep(n, c)
1128+
or
1129+
dictSplatParameterNodeClearStep(n, c)
1130+
or
1131+
VariableCapture::clearsContent(n, c)
1132+
)
11351133
or
1136-
VariableCapture::clearsContent(n, c)
1134+
FlowSummaryImpl::Private::Steps::summaryClearsContent(n.(FlowSummaryNode).getSummaryNode(), cs)
11371135
}
11381136

11391137
/**

python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll

Lines changed: 59 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -758,6 +758,8 @@ newtype TContent =
758758
// data-flow-private)
759759
index in [0 .. 7]
760760
} or
761+
/** An element of a tuple at any index. */
762+
TTupleElementAnyContent() or
761763
/** An element of a dictionary under a specific key. */
762764
TDictionaryElementContent(string key) {
763765
// {"key": ...}
@@ -870,6 +872,13 @@ class DictionaryElementAnyContent extends TDictionaryElementAnyContent, Content
870872
override string getMaDRepresentation() { result = "DictionaryElementAny" }
871873
}
872874

875+
/** An element of a tuple at any index. */
876+
class TupleElementAnyContent extends TTupleElementAnyContent, Content {
877+
override string toString() { result = "Any tuple element" }
878+
879+
override string getMaDRepresentation() { result = "TupleElementAny" }
880+
}
881+
873882
/** An object attribute. */
874883
class AttributeContent extends TAttributeContent, Content {
875884
private string attr;
@@ -898,19 +907,65 @@ class CapturedVariableContent extends Content, TCapturedVariableContent {
898907
override string getMaDRepresentation() { none() }
899908
}
900909

910+
/**
911+
* An entity that represents a set of `Content`s.
912+
*
913+
* Most `ContentSet`s are singletons (i.e. they consist of a single `Content`),
914+
* but `TAnyDictionaryElement` and `TAnyTupleElement` act as wildcards on the
915+
* read side: a read at such a `ContentSet` matches any specific dictionary
916+
* key / tuple index store, as well as the "unknown-bucket" Content
917+
* (`DictionaryElementAnyContent` / `TupleElementAnyContent`).
918+
*
919+
* Keeping these as wildcard `ContentSet`s (rather than enumerating one
920+
* `ContentSet` per key/index) keeps the dataflow `readSetEx` relation small
921+
* when implicit reads are used (e.g. at sinks via `defaultImplicitTaintRead`).
922+
*/
923+
private newtype TContentSet =
924+
TSingletonContent(Content c) or
925+
TAnyTupleElement() or
926+
TAnyDictionaryElement()
927+
901928
/**
902929
* An entity that represents a set of `Content`s.
903930
*
904931
* The set may be interpreted differently depending on whether it is
905932
* stored into (`getAStoreContent`) or read from (`getAReadContent`).
906933
*/
907-
class ContentSet instanceof Content {
934+
class ContentSet extends TContentSet {
935+
/** Holds if this content set is the singleton `{c}`. */
936+
predicate isSingleton(Content c) { this = TSingletonContent(c) }
937+
938+
/** Holds if this content set is the wildcard for all tuple elements. */
939+
predicate isAnyTupleElement() { this = TAnyTupleElement() }
940+
941+
/** Holds if this content set is the wildcard for all dictionary elements. */
942+
predicate isAnyDictionaryElement() { this = TAnyDictionaryElement() }
943+
908944
/** Gets a content that may be stored into when storing into this set. */
909-
Content getAStoreContent() { result = this }
945+
Content getAStoreContent() { this = TSingletonContent(result) }
910946

911947
/** Gets a content that may be read from when reading from this set. */
912-
Content getAReadContent() { result = this }
948+
Content getAReadContent() {
949+
this = TSingletonContent(result)
950+
or
951+
// Wildcard expansion: a read at "any tuple element" matches a store at any
952+
// specific tuple index, as well as the unknown-index bucket.
953+
this = TAnyTupleElement() and
954+
(result instanceof TupleElementContent or result instanceof TupleElementAnyContent)
955+
or
956+
this = TAnyDictionaryElement() and
957+
(result instanceof DictionaryElementContent or result instanceof DictionaryElementAnyContent)
958+
}
913959

914960
/** Gets a textual representation of this content set. */
915-
string toString() { result = super.toString() }
961+
string toString() {
962+
exists(Content c | this = TSingletonContent(c) | result = c.toString())
963+
or
964+
this = TAnyTupleElement() and result = "Any tuple element"
965+
or
966+
this = TAnyDictionaryElement() and result = "Any dictionary element"
967+
}
916968
}
969+
970+
/** Gets the singleton `ContentSet` wrapping the `Content` `c`. */
971+
ContentSet singleton(Content c) { result = TSingletonContent(c) }

python/ql/lib/semmle/python/dataflow/new/internal/FlowSummaryImpl.qll

Lines changed: 33 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -66,21 +66,29 @@ module Input implements InputSig<Location, DataFlowImplSpecific::PythonDataFlow>
6666
}
6767

6868
string encodeContent(ContentSet cs, string arg) {
69-
cs = TListElementContent() and result = "ListElement" and arg = ""
70-
or
71-
cs = TSetElementContent() and result = "SetElement" and arg = ""
72-
or
73-
exists(int index |
74-
cs = TTupleElementContent(index) and result = "TupleElement" and arg = index.toString()
75-
)
76-
or
77-
exists(string key |
78-
cs = TDictionaryElementContent(key) and result = "DictionaryElement" and arg = key
69+
exists(Content c | cs.isSingleton(c) |
70+
c = TListElementContent() and result = "ListElement" and arg = ""
71+
or
72+
c = TSetElementContent() and result = "SetElement" and arg = ""
73+
or
74+
exists(int index |
75+
c = TTupleElementContent(index) and result = "TupleElement" and arg = index.toString()
76+
)
77+
or
78+
c = TTupleElementAnyContent() and result = "TupleElementAny" and arg = ""
79+
or
80+
exists(string key |
81+
c = TDictionaryElementContent(key) and result = "DictionaryElement" and arg = key
82+
)
83+
or
84+
c = TDictionaryElementAnyContent() and result = "DictionaryElementAny" and arg = ""
85+
or
86+
exists(string attr | c = TAttributeContent(attr) and result = "Attribute" and arg = attr)
7987
)
8088
or
81-
cs = TDictionaryElementAnyContent() and result = "DictionaryElementAny" and arg = ""
89+
cs.isAnyTupleElement() and result = "AnyTupleElement" and arg = ""
8290
or
83-
exists(string attr | cs = TAttributeContent(attr) and result = "Attribute" and arg = attr)
91+
cs.isAnyDictionaryElement() and result = "AnyDictionaryElement" and arg = ""
8492
}
8593

8694
bindingset[token]
@@ -139,27 +147,34 @@ module Private {
139147
predicate withContent = SC::withContent/1;
140148

141149
/** Gets a summary component that represents a list element. */
142-
SummaryComponent listElement() { result = content(any(ListElementContent c)) }
150+
SummaryComponent listElement() { result = content(singleton(any(ListElementContent c))) }
143151

144152
/** Gets a summary component that represents a set element. */
145-
SummaryComponent setElement() { result = content(any(SetElementContent c)) }
153+
SummaryComponent setElement() { result = content(singleton(any(SetElementContent c))) }
146154

147155
/** Gets a summary component that represents a tuple element. */
148156
SummaryComponent tupleElement(int index) {
149-
exists(TupleElementContent c | c.getIndex() = index and result = content(c))
157+
exists(TupleElementContent c | c.getIndex() = index and result = content(singleton(c)))
158+
}
159+
160+
/** Gets a summary component that represents a tuple element at any index. */
161+
SummaryComponent tupleElementAny() {
162+
result = content(singleton(any(TupleElementAnyContent c)))
150163
}
151164

152165
/** Gets a summary component that represents a dictionary element. */
153166
SummaryComponent dictionaryElement(string key) {
154-
exists(DictionaryElementContent c | c.getKey() = key and result = content(c))
167+
exists(DictionaryElementContent c | c.getKey() = key and result = content(singleton(c)))
155168
}
156169

157170
/** Gets a summary component that represents a dictionary element at any key. */
158-
SummaryComponent dictionaryElementAny() { result = content(any(DictionaryElementAnyContent c)) }
171+
SummaryComponent dictionaryElementAny() {
172+
result = content(singleton(any(DictionaryElementAnyContent c)))
173+
}
159174

160175
/** Gets a summary component that represents an attribute element. */
161176
SummaryComponent attribute(string attr) {
162-
exists(AttributeContent c | c.getAttribute() = attr and result = content(c))
177+
exists(AttributeContent c | c.getAttribute() = attr and result = content(singleton(c)))
163178
}
164179

165180
/** Gets a summary component that represents the return value of a call. */

python/ql/lib/semmle/python/dataflow/new/internal/TaintTrackingPrivate.qll

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,15 @@ predicate defaultTaintSanitizer(DataFlow::Node node) { none() }
1717
*/
1818
bindingset[node]
1919
predicate defaultImplicitTaintRead(DataFlow::Node node, DataFlow::ContentSet c) {
20-
// We allow implicit reads of precise content
21-
// imprecise content has already bubled up.
20+
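// We intend to allow implicit reads of precise content; imprecise content has already
21+
// bubbled up. (Note: the wildcards below also match the imprecise "any element" contents, see `ContentSet.getAReadContent`.) We use the wildcard content sets here rather than the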
22+
// per-key/per-index ones to avoid blowing up the size of `Stage1::readSetEx`
23+
// (otherwise this predicate would expand to one row per (node, distinct key
24+
// or index) and the framework's read-set relation grows quadratically).
25+
// `ContentSet.getAReadContent` expands these wildcards back to the specific
26+
// contents when matching against stores.
2227
exists(node) and
23-
(
24-
c instanceof DataFlow::TupleElementContent
25-
or
26-
c instanceof DataFlow::DictionaryElementContent
27-
)
28+
(c.isAnyTupleElement() or c.isAnyDictionaryElement())
2829
}
2930

3031
private module Cached {

0 commit comments

Comments
 (0)