Merged
89 commits
18650af
added function for populating shard
daniel-sanche Jul 24, 2023
df05171
Merge branch 'v3' into benchmarks_prod
daniel-sanche Jul 24, 2023
4db25b3
got table population working
daniel-sanche Jul 24, 2023
28cc1f4
finished table population
daniel-sanche Jul 24, 2023
9e97ac0
pulled out table setup fixtures from system tests
daniel-sanche Jul 25, 2023
52550cd
get system tests working
daniel-sanche Jul 25, 2023
a191236
added some basic throughput code
daniel-sanche Jul 25, 2023
31dbbd9
count operations
daniel-sanche Jul 25, 2023
eac206a
added splits to table
daniel-sanche Jul 25, 2023
ea771de
extracted cluster config
daniel-sanche Jul 25, 2023
2eda0ac
changed test name
daniel-sanche Jul 25, 2023
f0649fd
improved benchmark output
daniel-sanche Jul 25, 2023
fbf2636
fixed population to use new format
daniel-sanche Jul 25, 2023
21df0c2
added sharded scan benchmark
daniel-sanche Jul 25, 2023
91ef008
more accurate time tracking
daniel-sanche Jul 25, 2023
f91750f
added script to deploy to GCE
daniel-sanche Jul 26, 2023
0236f85
limit to one qualifier
daniel-sanche Jul 26, 2023
fd94350
added separate test for point reads
daniel-sanche Jul 26, 2023
82ecaf2
made row size configurable
daniel-sanche Jul 26, 2023
a35f985
don't timeout benchmarks at deadline
daniel-sanche Jul 26, 2023
a440d76
added fastapi benchmark
daniel-sanche Jul 26, 2023
d0fcdb9
added driver for fastapi class
daniel-sanche Jul 26, 2023
0bb70ef
delete VM on complete; improved formatting
daniel-sanche Jul 27, 2023
3f3f59a
improved fastapi benchmark
daniel-sanche Jul 27, 2023
5a6ffad
run tests concurrently
daniel-sanche Jul 27, 2023
604071f
saved test_system changes
daniel-sanche Jul 27, 2023
3686ef6
added profiling to point reads
daniel-sanche Jul 27, 2023
c42f596
add duration to run script
daniel-sanche Jul 27, 2023
e2ac258
build async generator manually
daniel-sanche Jul 28, 2023
1228c8a
lazy parse cell data
daniel-sanche Jul 28, 2023
1a3a8e7
added profiling to scans
daniel-sanche Jul 28, 2023
0659161
removed iscoroutine check
daniel-sanche Jul 28, 2023
ce2b733
added legacy benchmark
daniel-sanche Jul 28, 2023
4714e8a
added thread to legacy tests
daniel-sanche Jul 28, 2023
3111543
cut out row adapter
daniel-sanche Jul 28, 2023
c717a0b
lazy load rows
daniel-sanche Jul 28, 2023
b5cb253
simplify generator layers for non-streaming
daniel-sanche Jul 29, 2023
aa4bed4
change pool size
daniel-sanche Jul 29, 2023
613b032
disabled lazy loading (75k rps)
daniel-sanche Jul 29, 2023
8e37967
only check for last_scanned when there are no chunks (81k rps)
daniel-sanche Jul 30, 2023
6775e2b
refactored chunk parsing (80k rps)
daniel-sanche Jul 31, 2023
537f4ff
removed subclass for row
daniel-sanche Jul 31, 2023
58b3fbb
added igor's row merger
daniel-sanche Jul 31, 2023
a7478ab
use stream by default
daniel-sanche Aug 7, 2023
689f072
use our models
daniel-sanche Aug 7, 2023
107b8c5
wrapped in object
daniel-sanche Aug 7, 2023
d76a7dc
updated row merger for acceptance tests
daniel-sanche Aug 8, 2023
1d9a475
improve performance
daniel-sanche Aug 8, 2023
37bc3a0
removed HasField from family
daniel-sanche Aug 8, 2023
ea95dcc
added request revision logic
daniel-sanche Aug 8, 2023
9b60693
added back split cell verifications
daniel-sanche Aug 8, 2023
24de29e
got all acceptance tests passing
daniel-sanche Aug 8, 2023
9c9ba5e
track last yielded key
daniel-sanche Aug 8, 2023
5d49ad8
track remaining count
daniel-sanche Aug 8, 2023
5ecfa3e
added retries
daniel-sanche Aug 8, 2023
13603ea
added exception conversion
daniel-sanche Aug 8, 2023
b40e1cb
moved into single function
daniel-sanche Aug 8, 2023
0f58066
Fixed style issues
daniel-sanche Aug 9, 2023
07341da
Merge branch 'v3' into optimize_read_rows_2
daniel-sanche Aug 9, 2023
90b85b3
fixed row and helpers tests
daniel-sanche Aug 9, 2023
4298e38
fixed some read rows tests
daniel-sanche Aug 9, 2023
62f1bf7
comments and cleanup
daniel-sanche Aug 9, 2023
98510c3
fixed issues with conformance tests
daniel-sanche Aug 9, 2023
4cf4aab
improved proto conversion
daniel-sanche Aug 9, 2023
0f0ccee
fixed read_rows tests
daniel-sanche Aug 10, 2023
42f44da
fixed tests
daniel-sanche Aug 10, 2023
3d1804c
optimizing query class
daniel-sanche Aug 10, 2023
b0882af
keep filter and limit out of proto storage
daniel-sanche Aug 10, 2023
2f2286d
fixed tests
daniel-sanche Aug 10, 2023
fbe298e
skip check when not necessary
daniel-sanche Aug 10, 2023
2a82343
experiment: yield in row batches
daniel-sanche Aug 10, 2023
4e6dd6f
Revert "experiment: yield in row batches"
daniel-sanche Aug 10, 2023
daa7a59
removed benchmarks dir
daniel-sanche Aug 10, 2023
e91d693
fixed type annotation
daniel-sanche Aug 10, 2023
cb28451
added slots to query
daniel-sanche Aug 10, 2023
1dd4d0c
use separate instances for each run
daniel-sanche Aug 14, 2023
6d2dab6
made _ReadRowOperation into slots
daniel-sanche Aug 14, 2023
309405a
clean up style issues
daniel-sanche Aug 14, 2023
e36d70b
removed commented test
daniel-sanche Aug 14, 2023
eb4bcfa
use optimizations from retries
daniel-sanche Aug 15, 2023
f6e8cd2
removed layer of wrapping
daniel-sanche Aug 15, 2023
5037891
fixed tests
daniel-sanche Aug 15, 2023
ea5b4f9
updated retryable stream submodule
daniel-sanche Aug 15, 2023
9456196
updated dependency version
daniel-sanche Aug 16, 2023
b9f8cdf
added todo
daniel-sanche Aug 16, 2023
fa0734c
Merge branch 'v3' into optimize_retries
daniel-sanche Aug 17, 2023
3323a66
moved exception builder into own method
daniel-sanche Aug 17, 2023
76d6df7
updated constraints
daniel-sanche Aug 17, 2023
bf669ad
fixed lint issues
daniel-sanche Aug 17, 2023
optimizing query class
daniel-sanche committed Aug 10, 2023
commit 3d1804c063e8d2cef6ca168b531cb0f16f7cb4f9
169 changes: 78 additions & 91 deletions google/cloud/bigtable/data/read_rows_query.py
@@ -17,7 +17,6 @@
from bisect import bisect_left
from bisect import bisect_right
from collections import defaultdict
from dataclasses import dataclass
from google.cloud.bigtable.data.row_filters import RowFilter

from google.cloud.bigtable_v2.types import RowRange as RowRangePB
@@ -29,27 +28,13 @@
from google.cloud.bigtable.data import ShardedQuery


@dataclass
class _RangePoint:
"""Model class for a point in a row range"""

key: bytes
is_inclusive: bool

def __hash__(self) -> int:
return hash((self.key, self.is_inclusive))

def __eq__(self, other: Any) -> bool:
if not isinstance(other, _RangePoint):
return NotImplemented
return self.key == other.key and self.is_inclusive == other.is_inclusive


class RowRange:
"""
Represents a range of keys in a ReadRowsQuery
"""

__slots__ = ("_pb",)

def __init__(
self,
start_key: str | bytes | None = None,
@@ -95,58 +80,63 @@ def __init__(
if start_key is not None and end_key is not None and start_key > end_key:
raise ValueError("start_key must be less than or equal to end_key")

self._start: _RangePoint | None = (
_RangePoint(start_key, start_is_inclusive)
if start_key is not None
else None
)
self._end: _RangePoint | None = (
_RangePoint(end_key, end_is_inclusive) if end_key is not None else None
)
init_dict = {}
if start_key is not None:
if start_is_inclusive:
init_dict["start_key_closed"] = start_key
else:
init_dict["start_key_open"] = start_key
if end_key is not None:
if end_is_inclusive:
init_dict["end_key_closed"] = end_key
else:
init_dict["end_key_open"] = end_key
self._pb = RowRangePB(**init_dict)

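The rewritten `__init__` above replaces the `_RangePoint` pair with a single `RowRangePB`, mapping inclusive bounds to the `*_key_closed` fields and exclusive bounds to `*_key_open`. A stdlib-only sketch of that mapping (`range_fields` is a hypothetical helper for illustration, not part of the library):

```python
from __future__ import annotations


def range_fields(
    start_key: bytes | None = None,
    end_key: bytes | None = None,
    start_is_inclusive: bool = True,
    end_is_inclusive: bool = False,
) -> dict[str, bytes]:
    """Mirror of the init_dict construction: choose the closed/open
    proto field name for each bound, omitting unbounded sides."""
    fields: dict[str, bytes] = {}
    if start_key is not None:
        key = "start_key_closed" if start_is_inclusive else "start_key_open"
        fields[key] = start_key
    if end_key is not None:
        key = "end_key_closed" if end_is_inclusive else "end_key_open"
        fields[key] = end_key
    return fields
```

Leaving a side out of the dict entirely (rather than storing an empty value) is what lets the later properties treat "field unset" as "unbounded".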
@property
def start_key(self) -> bytes | None:
"""
Returns the start key of the range. If None, the range is unbounded on the left.
"""
return self._start.key if self._start is not None else None
return self._pb.start_key_closed or self._pb.start_key_open or None

@property
def end_key(self) -> bytes | None:
"""
Returns the end key of the range. If None, the range is unbounded on the right.
"""
return self._end.key if self._end is not None else None
return self._pb.end_key_closed or self._pb.end_key_open or None

@property
def start_is_inclusive(self) -> bool:
"""
Returns whether the range is inclusive of the start key.
Returns True if the range is unbounded on the left.
"""
return self._start.is_inclusive if self._start is not None else True
return bool(self._pb.start_key_closed)

@property
def end_is_inclusive(self) -> bool:
"""
Returns whether the range is inclusive of the end key.
Returns True if the range is unbounded on the right.
"""
return self._end.is_inclusive if self._end is not None else True
return bool(self._pb.end_key_closed)

def _to_dict(self) -> dict[str, bytes]:
"""Converts this object to a dictionary"""
output = {}
if self._start is not None:
if self.start_key is not None:
key = "start_key_closed" if self.start_is_inclusive else "start_key_open"
output[key] = self._start.key
if self._end is not None:
output[key] = self.start_key
if self.end_key is not None:
key = "end_key_closed" if self.end_is_inclusive else "end_key_open"
output[key] = self._end.key
output[key] = self.end_key
return output

def __hash__(self) -> int:
return hash((self._start, self._end))
def _to_pb(self) -> RowRangePB:
"""Converts this object to a protobuf"""
return self._pb

@classmethod
def _from_dict(cls, data: dict[str, bytes]) -> RowRange:
@@ -163,30 +153,23 @@ def _from_dict(cls, data: dict[str, bytes]) -> RowRange:
)

@classmethod
def _from_points(
cls, start: _RangePoint | None, end: _RangePoint | None
) -> RowRange:
"""Creates a RowRange from two RangePoints"""
kwargs: dict[str, Any] = {}
if start is not None:
kwargs["start_key"] = start.key
kwargs["start_is_inclusive"] = start.is_inclusive
if end is not None:
kwargs["end_key"] = end.key
kwargs["end_is_inclusive"] = end.is_inclusive
return cls(**kwargs)
def _from_pb(cls, data: RowRangePB) -> RowRange:
"""Creates a RowRange from a protobuf"""
instance = cls()
instance._pb = data
return instance

def __bool__(self) -> bool:
"""
Empty RowRanges (representing a full table scan) are falsy, because
they can be substituted with None. Non-empty RowRanges are truthy.
"""
return self._start is not None or self._end is not None
return bool(self._pb.start_key_closed or self._pb.start_key_open or self._pb.end_key_closed or self._pb.end_key_open)

def __eq__(self, other: Any) -> bool:
if not isinstance(other, RowRange):
return NotImplemented
return self._start == other._start and self._end == other._end
return self._pb == other._pb

def __str__(self) -> str:
"""
@@ -206,7 +189,7 @@ def __repr__(self) -> str:
if self.start_is_inclusive is False:
# only show start_is_inclusive if it is different from the default
args_list.append(f"start_is_inclusive={self.start_is_inclusive}")
if self.end_is_inclusive is True and self._end is not None:
if self.end_is_inclusive is True and self.end_key is not None:
# only show end_is_inclusive if it is different from the default
args_list.append(f"end_is_inclusive={self.end_is_inclusive}")
return f"RowRange({', '.join(args_list)})"
@@ -235,24 +218,30 @@ def __init__(
default: None (no limit)
- row_filter: a RowFilter to apply to the query
"""
self.row_keys: set[bytes] = set()
self.row_ranges: set[RowRange] = set()
if row_ranges is not None:
if isinstance(row_ranges, RowRange):
row_ranges = [row_ranges]
for r in row_ranges:
self.add_range(r)
if row_keys is not None:
if not isinstance(row_keys, list):
row_keys = [row_keys]
for k in row_keys:
self.add_key(k)
self.limit: int | None = limit
self.filter: RowFilter | None = row_filter
row_ranges = row_ranges or []
row_keys = row_keys or []
if not isinstance(row_ranges, list):
row_ranges = [row_ranges]
if not isinstance(row_keys, list):
row_keys = [row_keys]
self._row_set = RowSetPB(row_keys=row_keys, row_ranges=[r._pb for r in row_ranges])
self._pb = ReadRowsRequestPB(
rows=self._row_set,
filter=row_filter._to_pb() if row_filter else None,
rows_limit=limit,
)

@property
def row_keys(self) -> list[bytes]:
return self._pb.rows.row_keys

@property
def row_ranges(self) -> list[RowRange]:
return [RowRange._from_pb(r) for r in self._pb.rows.row_ranges]

@property
def limit(self) -> int | None:
return self._limit
return self._pb.rows_limit or None

@limit.setter
def limit(self, new_limit: int | None):
@@ -270,11 +259,11 @@ def limit(self, new_limit: int | None):
"""
if new_limit is not None and new_limit < 0:
raise ValueError("limit must be >= 0")
self._limit = new_limit
self._pb.rows_limit = new_limit or 0

@property
def filter(self) -> RowFilter | None:
return self._filter
return self._pb.filter

@filter.setter
def filter(self, row_filter: RowFilter | None):
@@ -286,7 +275,7 @@ def filter(self, row_filter: RowFilter | None):
Returns:
- a reference to this query for chaining
"""
self._filter = row_filter
self._pb.filter = row_filter._to_pb() if row_filter else None

def add_key(self, row_key: str | bytes):
"""
@@ -305,7 +294,7 @@ def add_key(self, row_key: str | bytes):
row_key = row_key.encode()
elif not isinstance(row_key, bytes):
raise ValueError("row_key must be string or bytes")
self.row_keys.add(row_key)
self._pb.rows.row_keys.append(row_key)

def add_range(
self,
@@ -317,7 +306,7 @@
Args:
- row_range: a range of row keys to add to this query
"""
self.row_ranges.add(row_range)
self._pb.rows.row_ranges.append(row_range._pb)

def shard(self, shard_keys: RowKeySamples) -> ShardedQuery:
"""
@@ -383,24 +372,24 @@ def _shard_range(
- a list of tuples, containing a segment index and a new sub-range.
"""
# 1. find the index of the segment the start key belongs to
if orig_range._start is None:
if orig_range.start_key is None:
# if range is open on the left, include first segment
start_segment = 0
else:
# use binary search to find the segment the start key belongs to
# bisect method determines how we break ties when the start key matches a split point
# if inclusive, bisect_left to the left segment, otherwise bisect_right
bisect = bisect_left if orig_range._start.is_inclusive else bisect_right
start_segment = bisect(split_points, orig_range._start.key)
bisect = bisect_left if orig_range.start_is_inclusive else bisect_right
start_segment = bisect(split_points, orig_range.start_key)

# 2. find the index of the segment the end key belongs to
if orig_range._end is None:
if orig_range.end_key is None:
# if range is open on the right, include final segment
end_segment = len(split_points)
else:
# use binary search to find the segment the end key belongs to.
end_segment = bisect_left(
split_points, orig_range._end.key, lo=start_segment
split_points, orig_range.end_key, lo=start_segment
)
# note: end_segment will always bisect_left, because split points represent inclusive ends
# whether the end_key includes the split point or not, the result is the same segment
@@ -415,18 +404,22 @@
# 3a. add new range for first segment this_range spans
# first range spans from start_key to the split_point representing the last key in the segment
last_key_in_first_segment = split_points[start_segment]
start_range = RowRange._from_points(
start=orig_range._start,
end=_RangePoint(last_key_in_first_segment, is_inclusive=True),
start_range = RowRange(
start_key=orig_range.start_key,
start_is_inclusive=orig_range.start_is_inclusive,
end_key=last_key_in_first_segment,
end_is_inclusive=True,
)
results.append((start_segment, start_range))
# 3b. add new range for last segment this_range spans
# we start the final range using the end key of the previous segment, with is_inclusive=False
previous_segment = end_segment - 1
last_key_before_segment = split_points[previous_segment]
end_range = RowRange._from_points(
start=_RangePoint(last_key_before_segment, is_inclusive=False),
end=orig_range._end,
end_range = RowRange(
start_key=last_key_before_segment,
start_is_inclusive=False,
end_key=orig_range.end_key,
end_is_inclusive=orig_range.end_is_inclusive,
)
results.append((end_segment, end_range))
# 3c. add new spanning range to all segments other than the first and last
@@ -474,16 +467,10 @@ def _to_pb(self, table) -> ReadRowsRequestPB:
Convert this query into a dictionary that can be used to construct a
ReadRowsRequest protobuf
"""
return ReadRowsRequestPB(
table_name=table.table_name,
app_profile_id=table.app_profile_id,
rows=RowSetPB(
row_keys=list(self.row_keys),
row_ranges=[RowRangePB(r._to_dict()) for r in self.row_ranges],
),
filter=self._filter._to_pb() if self._filter else None,
rows_limit=self.limit,
)
# self._pb.table_name = table.table_name
# if table.app_profile_id:
# self._pb.app_profile_id = table.app_profile_id
return ReadRowsRequestPB(self._pb, table_name=table.table_name, app_profile_id=table.app_profile_id)

def __eq__(self, other):
"""