Skip to content

Commit 3d7595a

Browse files
author
Chris Rossi
authored
perf: improve count query performance for simple queries (#516)
For simple queries (queries that map directly to single Datastore queries, ie not multiqueries, or queries with post filters), `Query.count` can be otimized by setting a high offset and counting the number of entities skipped. Note that this still requires Datastore to assemble and iterate over a result set and Datastore will still only "skip" a certain number of entities at a time. (1000 at the time of this writing.) So this doesn't dramatically impact the amount of work that has to be done on the Datastore side, nor does it reduce the number gRPC calls necessary to count a large result set. It does reduce significantly, however, the amount of I/O required, which has sped up some large counts in testing by a factor of around 4x.
1 parent 6a12de2 commit 3d7595a

File tree

4 files changed

+292
-87
lines changed

4 files changed

+292
-87
lines changed

packages/google-cloud-ndb/google/cloud/ndb/_datastore_query.py

Lines changed: 83 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@
3535
log = logging.getLogger(__name__)
3636

3737
MoreResultsType = query_pb2.QueryResultBatch.MoreResultsType
38-
MORE_RESULTS_TYPE_NOT_FINISHED = MoreResultsType.Value("NOT_FINISHED")
38+
NO_MORE_RESULTS = MoreResultsType.Value("NO_MORE_RESULTS")
39+
NOT_FINISHED = MoreResultsType.Value("NOT_FINISHED")
3940
MORE_RESULTS_AFTER_LIMIT = MoreResultsType.Value("MORE_RESULTS_AFTER_LIMIT")
4041

4142
ResultType = query_pb2.EntityResult.ResultType
@@ -112,6 +113,70 @@ def fetch(query):
112113
raise tasklets.Return(entities)
113114

114115

116+
def count(query):
117+
"""Count query results.
118+
119+
Args:
120+
query (query.QueryOptions): The query spec.
121+
122+
Returns:
123+
tasklets.Future: Results is int: Number of results that would be
124+
returned by the query.
125+
"""
126+
filters = query.filters
127+
if filters:
128+
if filters._multiquery or filters._post_filters():
129+
return _count_brute_force(query)
130+
131+
return _count_by_skipping(query)
132+
133+
134+
@tasklets.tasklet
135+
def _count_brute_force(query):
136+
query = query.copy(projection=["__key__"], order_by=None)
137+
results = iterate(query, raw=True)
138+
count = 0
139+
limit = query.limit
140+
while (yield results.has_next_async()):
141+
count += 1
142+
if limit and count == limit:
143+
break
144+
145+
results.next()
146+
147+
raise tasklets.Return(count)
148+
149+
150+
@tasklets.tasklet
151+
def _count_by_skipping(query):
152+
limit = query.limit
153+
query = query.copy(projection=["__key__"], order_by=None, limit=1)
154+
count = 0
155+
more_results = NOT_FINISHED
156+
cursor = None
157+
158+
while more_results != NO_MORE_RESULTS:
159+
if limit:
160+
offset = limit - count - 1
161+
else:
162+
offset = 10000
163+
164+
query = query.copy(offset=offset, start_cursor=cursor)
165+
response = yield _datastore_run_query(query)
166+
batch = response.batch
167+
168+
more_results = batch.more_results
169+
count += batch.skipped_results
170+
count += len(batch.entity_results)
171+
172+
if limit and count >= limit:
173+
break
174+
175+
cursor = Cursor(batch.end_cursor)
176+
177+
raise tasklets.Return(count)
178+
179+
115180
def iterate(query, raw=False):
116181
"""Get iterator for query results.
117182
@@ -307,9 +372,7 @@ def _next_batch(self):
307372
for result_pb in response.batch.entity_results
308373
]
309374

310-
self._has_next_batch = more_results = (
311-
batch.more_results == MORE_RESULTS_TYPE_NOT_FINISHED
312-
)
375+
self._has_next_batch = more_results = batch.more_results == NOT_FINISHED
313376

314377
self._more_results_after_limit = batch.more_results == MORE_RESULTS_AFTER_LIMIT
315378

@@ -935,3 +998,19 @@ def to_websafe_string(self):
935998
def urlsafe(self):
936999
# Documented in official Legacy NDB docs
9371000
return base64.urlsafe_b64encode(self.cursor)
1001+
1002+
def __eq__(self, other):
1003+
if isinstance(other, Cursor):
1004+
return self.cursor == other.cursor
1005+
1006+
return NotImplemented
1007+
1008+
def __ne__(self, other):
1009+
# required for Python 2.7 compatibility
1010+
result = self.__eq__(other)
1011+
if result is NotImplemented:
1012+
result = False
1013+
return not result
1014+
1015+
def __hash__(self):
1016+
return hash(self.cursor)

packages/google-cloud-ndb/google/cloud/ndb/query.py

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2172,7 +2172,6 @@ def count(self, limit=None, **kwargs):
21722172
"""
21732173
return self.count_async(_options=kwargs["_options"]).result()
21742174

2175-
@tasklets.tasklet
21762175
@_query_options
21772176
@utils.keyword_only(
21782177
offset=None,
@@ -2201,19 +2200,7 @@ def count_async(self, limit=None, **kwargs):
22012200
# Avoid circular import in Python 2.7
22022201
from google.cloud.ndb import _datastore_query
22032202

2204-
_options = kwargs["_options"]
2205-
options = _options.copy(projection=["__key__"], order_by=None)
2206-
results = _datastore_query.iterate(options, raw=True)
2207-
count = 0
2208-
limit = options.limit
2209-
while (yield results.has_next_async()):
2210-
count += 1
2211-
if limit and count == limit:
2212-
break
2213-
2214-
results.next()
2215-
2216-
raise tasklets.Return(count)
2203+
return _datastore_query.count(kwargs["_options"])
22172204

22182205
@_query_options
22192206
@utils.keyword_only(

packages/google-cloud-ndb/tests/unit/test__datastore_query.py

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,172 @@ def test_fetch(iterate):
8181
iterate.assert_called_once_with("foo")
8282

8383

84+
class Test_count:
85+
@staticmethod
86+
@pytest.mark.usefixtures("in_context")
87+
@mock.patch("google.cloud.ndb._datastore_query.iterate")
88+
def test_count_brute_force(iterate):
89+
class DummyQueryIterator:
90+
def __init__(self, items):
91+
self.items = list(items)
92+
93+
def has_next_async(self):
94+
return utils.future_result(bool(self.items))
95+
96+
def next(self):
97+
return self.items.pop()
98+
99+
iterate.return_value = DummyQueryIterator(range(5))
100+
query = query_module.QueryOptions(
101+
filters=mock.Mock(_multiquery=True, spec=("_multiquery",))
102+
)
103+
104+
future = _datastore_query.count(query)
105+
assert future.result() == 5
106+
iterate.assert_called_once_with(
107+
query_module.QueryOptions(filters=query.filters, projection=["__key__"]),
108+
raw=True,
109+
)
110+
111+
@staticmethod
112+
@pytest.mark.usefixtures("in_context")
113+
@mock.patch("google.cloud.ndb._datastore_query.iterate")
114+
def test_count_brute_force_with_limit(iterate):
115+
class DummyQueryIterator:
116+
def __init__(self, items):
117+
self.items = list(items)
118+
119+
def has_next_async(self):
120+
return utils.future_result(bool(self.items))
121+
122+
def next(self):
123+
return self.items.pop()
124+
125+
iterate.return_value = DummyQueryIterator(range(5))
126+
query = query_module.QueryOptions(
127+
filters=mock.Mock(
128+
_multiquery=False,
129+
_post_filters=mock.Mock(return_value=True),
130+
spec=("_multiquery", "_post_filters"),
131+
),
132+
limit=3,
133+
)
134+
135+
future = _datastore_query.count(query)
136+
assert future.result() == 3
137+
iterate.assert_called_once_with(
138+
query_module.QueryOptions(
139+
filters=query.filters, projection=["__key__"], limit=3
140+
),
141+
raw=True,
142+
)
143+
144+
@staticmethod
145+
@pytest.mark.usefixtures("in_context")
146+
@mock.patch("google.cloud.ndb._datastore_query._datastore_run_query")
147+
def test_count_by_skipping(run_query):
148+
run_query.side_effect = utils.future_results(
149+
mock.Mock(
150+
batch=mock.Mock(
151+
more_results=_datastore_query.NOT_FINISHED,
152+
skipped_results=1000,
153+
entity_results=[],
154+
end_cursor=b"himom",
155+
spec=(
156+
"more_results",
157+
"skipped_results",
158+
"entity_results",
159+
"end_cursor",
160+
),
161+
),
162+
spec=("batch",),
163+
),
164+
mock.Mock(
165+
batch=mock.Mock(
166+
more_results=_datastore_query.NO_MORE_RESULTS,
167+
skipped_results=100,
168+
entity_results=[],
169+
end_cursor=b"hellodad",
170+
spec=(
171+
"more_results",
172+
"skipped_results",
173+
"entity_results",
174+
"end_cursor",
175+
),
176+
),
177+
spec=("batch",),
178+
),
179+
)
180+
181+
query = query_module.QueryOptions()
182+
future = _datastore_query.count(query)
183+
assert future.result() == 1100
184+
185+
expected = [
186+
mock.call(
187+
query_module.QueryOptions(
188+
limit=1,
189+
offset=10000,
190+
projection=["__key__"],
191+
)
192+
),
193+
(
194+
(
195+
query_module.QueryOptions(
196+
limit=1,
197+
offset=10000,
198+
projection=["__key__"],
199+
start_cursor=_datastore_query.Cursor(b"himom"),
200+
),
201+
),
202+
{},
203+
),
204+
]
205+
assert run_query.call_args_list == expected
206+
207+
@staticmethod
208+
@pytest.mark.usefixtures("in_context")
209+
@mock.patch("google.cloud.ndb._datastore_query._datastore_run_query")
210+
def test_count_by_skipping_with_limit(run_query):
211+
run_query.return_value = utils.future_result(
212+
mock.Mock(
213+
batch=mock.Mock(
214+
more_results=_datastore_query.MORE_RESULTS_AFTER_LIMIT,
215+
skipped_results=99,
216+
entity_results=[object()],
217+
end_cursor=b"himom",
218+
spec=(
219+
"more_results",
220+
"skipped_results",
221+
"entity_results",
222+
"end_cursor",
223+
),
224+
),
225+
spec=("batch",),
226+
)
227+
)
228+
229+
query = query_module.QueryOptions(
230+
filters=mock.Mock(
231+
_multiquery=False,
232+
_post_filters=mock.Mock(return_value=None),
233+
spec=("_multiquery", "_post_filters"),
234+
),
235+
limit=100,
236+
)
237+
future = _datastore_query.count(query)
238+
assert future.result() == 100
239+
240+
run_query.assert_called_once_with(
241+
query_module.QueryOptions(
242+
limit=1,
243+
offset=99,
244+
projection=["__key__"],
245+
filters=query.filters,
246+
)
247+
)
248+
249+
84250
class Test_iterate:
85251
@staticmethod
86252
@mock.patch("google.cloud.ndb._datastore_query._QueryIteratorImpl")
@@ -1448,3 +1614,22 @@ def test_urlsafe():
14481614
urlsafe = base64.urlsafe_b64encode(b"123")
14491615
cursor = _datastore_query.Cursor(b"123")
14501616
assert cursor.urlsafe() == urlsafe
1617+
1618+
@staticmethod
1619+
def test__eq__same():
1620+
assert _datastore_query.Cursor(b"123") == _datastore_query.Cursor(b"123")
1621+
assert not _datastore_query.Cursor(b"123") != _datastore_query.Cursor(b"123")
1622+
1623+
@staticmethod
1624+
def test__eq__different():
1625+
assert _datastore_query.Cursor(b"123") != _datastore_query.Cursor(b"234")
1626+
assert not _datastore_query.Cursor(b"123") == _datastore_query.Cursor(b"234")
1627+
1628+
@staticmethod
1629+
def test__eq__different_type():
1630+
assert _datastore_query.Cursor(b"123") != b"234"
1631+
assert not _datastore_query.Cursor(b"123") == b"234"
1632+
1633+
@staticmethod
1634+
def test__hash__():
1635+
assert hash(_datastore_query.Cursor(b"123")) == hash(b"123")

0 commit comments

Comments
 (0)