From 1d5b468c6b707ac446667f96b5b1ca7a476e8d16 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Thu, 31 Jul 2025 09:37:25 -0400 Subject: [PATCH 01/35] remove --- .../concurrent_declarative_source.py | 6 --- .../availability_strategy/__init__.py | 6 +-- ...stract_file_based_availability_strategy.py | 28 +----------- .../stream/abstract_file_based_stream.py | 1 - .../file_based/stream/concurrent/adapters.py | 6 +-- .../sources/streams/availability_strategy.py | 1 + .../streams/concurrent/abstract_stream.py | 7 --- .../sources/streams/concurrent/adapters.py | 43 ------------------- .../streams/concurrent/default_stream.py | 9 ---- ...hread_based_concurrent_stream_scenarios.py | 10 ----- .../streams/concurrent/test_adapters.py | 32 -------------- .../streams/concurrent/test_default_stream.py | 13 ------ 12 files changed, 4 insertions(+), 158 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index 1d629f0c7..cc59b1554 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -52,9 +52,6 @@ from airbyte_cdk.sources.streams import Stream from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream from airbyte_cdk.sources.streams.concurrent.abstract_stream_facade import AbstractStreamFacade -from airbyte_cdk.sources.streams.concurrent.availability_strategy import ( - AlwaysAvailableAvailabilityStrategy, -) from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, FinalStateCursor from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream from airbyte_cdk.sources.streams.concurrent.helpers import get_primary_key_from_stream @@ -325,7 +322,6 @@ def _group_streams( partition_generator=partition_generator, name=declarative_stream.name, json_schema=declarative_stream.get_json_schema(), - availability_strategy=AlwaysAvailableAvailabilityStrategy(), primary_key=get_primary_key_from_stream(declarative_stream.primary_key), cursor_field=cursor.cursor_field.cursor_field_key if hasattr(cursor, "cursor_field") @@ -362,7 +358,6 @@ def _group_streams( partition_generator=partition_generator, name=declarative_stream.name, json_schema=declarative_stream.get_json_schema(), - availability_strategy=AlwaysAvailableAvailabilityStrategy(), primary_key=get_primary_key_from_stream(declarative_stream.primary_key), cursor_field=None, logger=self.logger, @@ -417,7 +412,6 @@ def _group_streams( partition_generator=partition_generator, name=declarative_stream.name, json_schema=declarative_stream.get_json_schema(), - availability_strategy=AlwaysAvailableAvailabilityStrategy(), primary_key=get_primary_key_from_stream(declarative_stream.primary_key), cursor_field=perpartition_cursor.cursor_field.cursor_field_key, logger=self.logger, diff --git a/airbyte_cdk/sources/file_based/availability_strategy/__init__.py b/airbyte_cdk/sources/file_based/availability_strategy/__init__.py index 8134a89e0..ee3c802df 100644 --- a/airbyte_cdk/sources/file_based/availability_strategy/__init__.py +++ b/airbyte_cdk/sources/file_based/availability_strategy/__init__.py @@ -1,11 +1,7 @@ -from .abstract_file_based_availability_strategy import ( - AbstractFileBasedAvailabilityStrategy, - AbstractFileBasedAvailabilityStrategyWrapper, -) +from .abstract_file_based_availability_strategy import AbstractFileBasedAvailabilityStrategy from .default_file_based_availability_strategy import DefaultFileBasedAvailabilityStrategy __all__ = [ "AbstractFileBasedAvailabilityStrategy", - "AbstractFileBasedAvailabilityStrategyWrapper", "DefaultFileBasedAvailabilityStrategy", ] diff --git a/airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py b/airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py index 12e1740b6..c7ae6ff43 100644 --- a/airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +++ b/airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py @@ -10,12 +10,6 @@ from airbyte_cdk.sources import Source from airbyte_cdk.sources.streams.availability_strategy import AvailabilityStrategy -from airbyte_cdk.sources.streams.concurrent.availability_strategy import ( - AbstractAvailabilityStrategy, - StreamAvailability, - StreamAvailable, - StreamUnavailable, -) from airbyte_cdk.sources.streams.core import Stream if TYPE_CHECKING: @@ -28,7 +22,7 @@ def check_availability( # type: ignore[override] # Signature doesn't match bas self, stream: Stream, logger: logging.Logger, - _: Optional[Source], + source: Optional[Source] = None, ) -> Tuple[bool, Optional[str]]: """ Perform a connection check for the stream. @@ -51,23 +45,3 @@ def check_availability_and_parsability( Returns (True, None) if successful, otherwise (False, ). """ ... - - -class AbstractFileBasedAvailabilityStrategyWrapper(AbstractAvailabilityStrategy): - def __init__(self, stream: AbstractFileBasedStream) -> None: - self.stream = stream - - def check_availability(self, logger: logging.Logger) -> StreamAvailability: - is_available, reason = self.stream.availability_strategy.check_availability( - self.stream, logger, None - ) - if is_available: - return StreamAvailable() - return StreamUnavailable(reason or "") - - def check_availability_and_parsability( - self, logger: logging.Logger - ) -> Tuple[bool, Optional[str]]: - return self.stream.availability_strategy.check_availability_and_parsability( - self.stream, logger, None - ) diff --git a/airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py b/airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py index ef258b34d..e3fb0179e 100644 --- a/airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +++ b/airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py @@ -179,7 +179,6 @@ def record_passes_validation_policy(self, record: Mapping[str, Any]) -> bool: ) @cached_property - @deprecated("Deprecated as of CDK version 3.7.0.") def availability_strategy(self) -> AbstractFileBasedAvailabilityStrategy: return self._availability_strategy diff --git a/airbyte_cdk/sources/file_based/stream/concurrent/adapters.py b/airbyte_cdk/sources/file_based/stream/concurrent/adapters.py index c36e5179d..67d0922f1 100644 --- a/airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +++ b/airbyte_cdk/sources/file_based/stream/concurrent/adapters.py @@ -19,10 +19,7 @@ ) from airbyte_cdk.sources import AbstractSource from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager -from airbyte_cdk.sources.file_based.availability_strategy import ( - AbstractFileBasedAvailabilityStrategy, - AbstractFileBasedAvailabilityStrategyWrapper, -) +from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser from airbyte_cdk.sources.file_based.remote_file import RemoteFile @@ -97,7 +94,6 @@ def create_from_stream( ), name=stream.name, json_schema=stream.get_json_schema(), - availability_strategy=AbstractFileBasedAvailabilityStrategyWrapper(stream), primary_key=pk, cursor_field=cursor_field, logger=logger, diff --git a/airbyte_cdk/sources/streams/availability_strategy.py b/airbyte_cdk/sources/streams/availability_strategy.py index 312ddae19..96a2c9bc9 100644 --- a/airbyte_cdk/sources/streams/availability_strategy.py +++ b/airbyte_cdk/sources/streams/availability_strategy.py @@ -14,6 +14,7 @@ from airbyte_cdk.sources import Source +# FIXME this class AvailabilityStrategy(ABC): """ Abstract base class for checking stream availability. diff --git a/airbyte_cdk/sources/streams/concurrent/abstract_stream.py b/airbyte_cdk/sources/streams/concurrent/abstract_stream.py index 26e6f09d4..33e7c4d10 100644 --- a/airbyte_cdk/sources/streams/concurrent/abstract_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/abstract_stream.py @@ -9,7 +9,6 @@ from airbyte_cdk.models import AirbyteStream from airbyte_cdk.sources.source import ExperimentalClassWarning -from airbyte_cdk.sources.streams.concurrent.availability_strategy import StreamAvailability from airbyte_cdk.sources.streams.concurrent.cursor import Cursor from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition @@ -64,12 +63,6 @@ def cursor_field(self) -> Optional[str]: :return: The name of the field used as a cursor. Nested cursor fields are not supported. """ - @abstractmethod - def check_availability(self) -> StreamAvailability: - """ - :return: The stream's availability - """ - @abstractmethod def get_json_schema(self) -> Mapping[str, Any]: """ diff --git a/airbyte_cdk/sources/streams/concurrent/adapters.py b/airbyte_cdk/sources/streams/concurrent/adapters.py index 7da594155..949f0545b 100644 --- a/airbyte_cdk/sources/streams/concurrent/adapters.py +++ b/airbyte_cdk/sources/streams/concurrent/adapters.py @@ -24,12 +24,7 @@ from airbyte_cdk.sources.message import MessageRepository from airbyte_cdk.sources.source import ExperimentalClassWarning from airbyte_cdk.sources.streams import Stream -from airbyte_cdk.sources.streams.availability_strategy import AvailabilityStrategy from airbyte_cdk.sources.streams.concurrent.abstract_stream_facade import AbstractStreamFacade -from airbyte_cdk.sources.streams.concurrent.availability_strategy import ( - AbstractAvailabilityStrategy, - AlwaysAvailableAvailabilityStrategy, -) from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, FinalStateCursor from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream from airbyte_cdk.sources.streams.concurrent.exceptions import ExceptionWithDisplayMessage @@ -101,7 +96,6 @@ def create_from_stream( name=stream.name, namespace=stream.namespace, json_schema=stream.get_json_schema(), - availability_strategy=AlwaysAvailableAvailabilityStrategy(), primary_key=pk, cursor_field=cursor_field, logger=logger, @@ -210,18 +204,6 @@ def get_json_schema(self) -> Mapping[str, Any]: def supports_incremental(self) -> bool: return self._legacy_stream.supports_incremental - def check_availability( - self, logger: logging.Logger, source: Optional["Source"] = None - ) -> Tuple[bool, Optional[str]]: - """ - Verifies the stream is available. Delegates to the underlying AbstractStream and ignores the parameters - :param logger: (ignored) - :param source: (ignored) - :return: - """ - availability = self._abstract_stream.check_availability() - return availability.is_available(), availability.message() - def as_airbyte_stream(self) -> AirbyteStream: return self._abstract_stream.as_airbyte_stream() @@ -370,28 +352,3 @@ def generate(self) -> Iterable[Partition]: self._cursor_field, self._state, ) - - -@deprecated( - "Availability strategy has been soft deprecated. Do not use. Class is subject to removal", - category=ExperimentalClassWarning, -) -class AvailabilityStrategyFacade(AvailabilityStrategy): - def __init__(self, abstract_availability_strategy: AbstractAvailabilityStrategy): - self._abstract_availability_strategy = abstract_availability_strategy - - def check_availability( - self, stream: Stream, logger: logging.Logger, source: Optional["Source"] = None - ) -> Tuple[bool, Optional[str]]: - """ - Checks stream availability. - - Important to note that the stream and source parameters are not used by the underlying AbstractAvailabilityStrategy. - - :param stream: (unused) - :param logger: logger object to use - :param source: (unused) - :return: A tuple of (boolean, str). If boolean is true, then the stream - """ - stream_availability = self._abstract_availability_strategy.check_availability(logger) - return stream_availability.is_available(), stream_availability.message() diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index 54600d635..70ddd7d16 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -8,10 +8,6 @@ from airbyte_cdk.models import AirbyteStream, SyncMode from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream -from airbyte_cdk.sources.streams.concurrent.availability_strategy import ( - AbstractAvailabilityStrategy, - StreamAvailability, -) from airbyte_cdk.sources.streams.concurrent.cursor import Cursor from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator @@ -23,7 +19,6 @@ def __init__( partition_generator: PartitionGenerator, name: str, json_schema: Mapping[str, Any], - availability_strategy: AbstractAvailabilityStrategy, primary_key: List[str], cursor_field: Optional[str], logger: Logger, @@ -34,7 +29,6 @@ def __init__( self._stream_partition_generator = partition_generator self._name = name self._json_schema = json_schema - self._availability_strategy = availability_strategy self._primary_key = primary_key self._cursor_field = cursor_field self._logger = logger @@ -53,9 +47,6 @@ def name(self) -> str: def namespace(self) -> Optional[str]: return self._namespace - def check_availability(self) -> StreamAvailability: - return self._availability_strategy.check_availability(self._logger) - @property def cursor_field(self) -> Optional[str]: return self._cursor_field diff --git a/unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py b/unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py index 185c5dceb..7db65b53d 100644 --- a/unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py +++ b/unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py @@ -5,9 +5,6 @@ import logging from airbyte_cdk.sources.message import InMemoryMessageRepository -from airbyte_cdk.sources.streams.concurrent.availability_strategy import ( - AlwaysAvailableAvailabilityStrategy, -) from airbyte_cdk.sources.streams.concurrent.cursor import FinalStateCursor from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream from airbyte_cdk.sources.types import Record @@ -48,7 +45,6 @@ "id": {"type": ["null", "string"]}, }, }, - availability_strategy=AlwaysAvailableAvailabilityStrategy(), primary_key=[], cursor_field=None, logger=logging.getLogger("test_logger"), @@ -84,7 +80,6 @@ "id": {"type": ["null", "string"]}, }, }, - availability_strategy=AlwaysAvailableAvailabilityStrategy(), primary_key=[], cursor_field=None, logger=logging.getLogger("test_logger"), @@ -120,7 +115,6 @@ "id": {"type": ["null", "string"]}, }, }, - availability_strategy=AlwaysAvailableAvailabilityStrategy(), primary_key=["id"], cursor_field=None, logger=logging.getLogger("test_logger"), @@ -171,7 +165,6 @@ "id": {"type": ["null", "string"]}, }, }, - availability_strategy=AlwaysAvailableAvailabilityStrategy(), primary_key=[], cursor_field=None, logger=logging.getLogger("test_logger"), @@ -222,7 +215,6 @@ "id": {"type": ["null", "string"]}, }, }, - availability_strategy=AlwaysAvailableAvailabilityStrategy(), primary_key=[], cursor_field=None, logger=logging.getLogger("test_logger"), @@ -255,7 +247,6 @@ "id": {"type": ["null", "string"]}, }, }, - availability_strategy=AlwaysAvailableAvailabilityStrategy(), primary_key=[], cursor_field=None, logger=logging.getLogger("test_logger"), @@ -397,7 +388,6 @@ "key": {"type": ["null", "string"]}, }, }, - availability_strategy=AlwaysAvailableAvailabilityStrategy(), primary_key=[], cursor_field=None, logger=logging.getLogger("test_logger"), diff --git a/unit_tests/sources/streams/concurrent/test_adapters.py b/unit_tests/sources/streams/concurrent/test_adapters.py index 66f48a9e0..82c5c91cb 100644 --- a/unit_tests/sources/streams/concurrent/test_adapters.py +++ b/unit_tests/sources/streams/concurrent/test_adapters.py @@ -18,7 +18,6 @@ from airbyte_cdk.models import Type as MessageType from airbyte_cdk.sources.message import InMemoryMessageRepository from airbyte_cdk.sources.streams.concurrent.adapters import ( - AvailabilityStrategyFacade, StreamFacade, StreamPartition, StreamPartitionGenerator, @@ -42,28 +41,6 @@ _ANY_CURSOR = Mock(spec=Cursor) -@pytest.mark.parametrize( - "stream_availability, expected_available, expected_message", - [ - pytest.param(StreamAvailable(), True, None, id="test_stream_is_available"), - pytest.param(STREAM_AVAILABLE, True, None, id="test_stream_is_available_using_singleton"), - pytest.param(StreamUnavailable("message"), False, "message", id="test_stream_is_available"), - ], -) -def test_availability_strategy_facade(stream_availability, expected_available, expected_message): - strategy = Mock() - strategy.check_availability.return_value = stream_availability - facade = AvailabilityStrategyFacade(strategy) - - logger = Mock() - available, message = facade.check_availability(Mock(), logger, Mock()) - - assert available == expected_available - assert message == expected_message - - strategy.check_availability.assert_called_once_with(logger) - - @pytest.mark.parametrize( "sync_mode", [ @@ -319,15 +296,6 @@ def test_given_cursor_is_not_noop_when_supports_incremental_then_return_true(sel Mock(spec=logging.Logger), ).supports_incremental - def test_check_availability_is_delegated_to_wrapped_stream(self): - availability = StreamAvailable() - self._abstract_stream.check_availability.return_value = availability - assert self._facade.check_availability(Mock(), Mock()) == ( - availability.is_available(), - availability.message(), - ) - self._abstract_stream.check_availability.assert_called_once_with() - def test_full_refresh(self): expected_stream_data = [{"data": 1}, {"data": 2}] records = [Record(data, "stream") for data in expected_stream_data] diff --git a/unit_tests/sources/streams/concurrent/test_default_stream.py b/unit_tests/sources/streams/concurrent/test_default_stream.py index 2c9afe4da..dc2624eee 100644 --- a/unit_tests/sources/streams/concurrent/test_default_stream.py +++ b/unit_tests/sources/streams/concurrent/test_default_stream.py @@ -16,7 +16,6 @@ def setUp(self): self._partition_generator = Mock() self._name = "name" self._json_schema = {} - self._availability_strategy = Mock() self._primary_key = [] self._cursor_field = None self._logger = Mock() @@ -26,7 +25,6 @@ def setUp(self): self._partition_generator, self._name, self._json_schema, - self._availability_strategy, self._primary_key, self._cursor_field, self._logger, @@ -41,12 +39,6 @@ def test_get_json_schema(self): json_schema = self._stream.get_json_schema() assert json_schema == self._json_schema - def test_check_availability(self): - self._availability_strategy.check_availability.return_value = STREAM_AVAILABLE - availability = self._stream.check_availability() - assert availability == STREAM_AVAILABLE - self._availability_strategy.check_availability.assert_called_once_with(self._logger) - def test_check_for_error_raises_an_exception_if_any_of_the_futures_are_not_done(self): futures = [Mock() for _ in range(3)] for f in futures: @@ -93,7 +85,6 @@ def test_as_airbyte_stream_with_primary_key(self): self._partition_generator, self._name, json_schema, - self._availability_strategy, ["composite_key_1", "composite_key_2"], self._cursor_field, self._logger, @@ -131,7 +122,6 @@ def test_as_airbyte_stream_with_composite_primary_key(self): self._partition_generator, self._name, json_schema, - self._availability_strategy, ["id_a", "id_b"], self._cursor_field, self._logger, @@ -169,7 +159,6 @@ def test_as_airbyte_stream_with_a_cursor(self): self._partition_generator, self._name, json_schema, - self._availability_strategy, self._primary_key, "date", self._logger, @@ -200,7 +189,6 @@ def test_as_airbyte_stream_with_namespace(self): self._partition_generator, self._name, self._json_schema, - self._availability_strategy, self._primary_key, self._cursor_field, self._logger, @@ -231,7 +219,6 @@ def test_as_airbyte_stream_with_file_transfer_support(self): self._partition_generator, self._name, self._json_schema, - self._availability_strategy, self._primary_key, self._cursor_field, self._logger, From 76ac6f7ae6dda9f28da2f43b6a4de8b085d33e8a Mon Sep 17 00:00:00 2001 From: octavia-squidington-iii Date: Thu, 31 Jul 2025 13:38:38 +0000 Subject: [PATCH 02/35] Auto-fix lint and format issues --- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/file_based/stream/concurrent/adapters.py b/airbyte_cdk/sources/file_based/stream/concurrent/adapters.py index 67d0922f1..fd8eef9b0 100644 --- a/airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +++ b/airbyte_cdk/sources/file_based/stream/concurrent/adapters.py @@ -19,7 +19,9 @@ ) from airbyte_cdk.sources import AbstractSource from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager -from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy +from airbyte_cdk.sources.file_based.availability_strategy import ( + AbstractFileBasedAvailabilityStrategy, +) from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser from airbyte_cdk.sources.file_based.remote_file import RemoteFile From 2d1e2f43396cabe792a9101af3841b5e7acc79ce Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Thu, 31 Jul 2025 12:08:24 -0400 Subject: [PATCH 03/35] remove unused file --- .../concurrent/availability_strategy.py | 94 ------------------- .../streams/concurrent/test_adapters.py | 5 - .../streams/concurrent/test_default_stream.py | 1 - 3 files changed, 100 deletions(-) delete mode 100644 airbyte_cdk/sources/streams/concurrent/availability_strategy.py diff --git a/airbyte_cdk/sources/streams/concurrent/availability_strategy.py b/airbyte_cdk/sources/streams/concurrent/availability_strategy.py deleted file mode 100644 index 118a7d0bb..000000000 --- a/airbyte_cdk/sources/streams/concurrent/availability_strategy.py +++ /dev/null @@ -1,94 +0,0 @@ -# -# Copyright (c) 2023 Airbyte, Inc., all rights reserved. -# - -import logging -from abc import ABC, abstractmethod -from typing import Optional - -from typing_extensions import deprecated - -from airbyte_cdk.sources.source import ExperimentalClassWarning - - -class StreamAvailability(ABC): - @abstractmethod - def is_available(self) -> bool: - """ - :return: True if the stream is available. False if the stream is not - """ - - @abstractmethod - def message(self) -> Optional[str]: - """ - :return: A message describing why the stream is not available. If the stream is available, this should return None. - """ - - -class StreamAvailable(StreamAvailability): - def is_available(self) -> bool: - return True - - def message(self) -> Optional[str]: - return None - - -class StreamUnavailable(StreamAvailability): - def __init__(self, message: str): - self._message = message - - def is_available(self) -> bool: - return False - - def message(self) -> Optional[str]: - return self._message - - -# Singleton instances of StreamAvailability to avoid the overhead of creating new dummy objects -STREAM_AVAILABLE = StreamAvailable() - - -@deprecated( - "This class is experimental. Use at your own risk.", - category=ExperimentalClassWarning, -) -class AbstractAvailabilityStrategy(ABC): - """ - AbstractAvailabilityStrategy is an experimental interface developed as part of the Concurrent CDK. - This interface is not yet stable and may change in the future. Use at your own risk. - - Why create a new interface instead of using the existing AvailabilityStrategy? - The existing AvailabilityStrategy is tightly coupled with Stream and Source, which yields to circular dependencies and makes it difficult to move away from the Stream interface to AbstractStream. - """ - - @abstractmethod - def check_availability(self, logger: logging.Logger) -> StreamAvailability: - """ - Checks stream availability. - - :param logger: logger object to use - :return: A StreamAvailability object describing the stream's availability - """ - - -@deprecated( - "This class is experimental. Use at your own risk.", - category=ExperimentalClassWarning, -) -class AlwaysAvailableAvailabilityStrategy(AbstractAvailabilityStrategy): - """ - An availability strategy that always indicates a stream is available. - - This strategy is used to avoid breaking changes and serves as a soft - deprecation of the availability strategy, allowing a smoother transition - without disrupting existing functionality. - """ - - def check_availability(self, logger: logging.Logger) -> StreamAvailability: - """ - Checks stream availability. - - :param logger: logger object to use - :return: A StreamAvailability object describing the stream's availability - """ - return StreamAvailable() diff --git a/unit_tests/sources/streams/concurrent/test_adapters.py b/unit_tests/sources/streams/concurrent/test_adapters.py index 82c5c91cb..68efbc941 100644 --- a/unit_tests/sources/streams/concurrent/test_adapters.py +++ b/unit_tests/sources/streams/concurrent/test_adapters.py @@ -22,11 +22,6 @@ StreamPartition, StreamPartitionGenerator, ) -from airbyte_cdk.sources.streams.concurrent.availability_strategy import ( - STREAM_AVAILABLE, - StreamAvailable, - StreamUnavailable, -) from airbyte_cdk.sources.streams.concurrent.cursor import Cursor from airbyte_cdk.sources.streams.concurrent.exceptions import ExceptionWithDisplayMessage from airbyte_cdk.sources.streams.core import Stream diff --git a/unit_tests/sources/streams/concurrent/test_default_stream.py b/unit_tests/sources/streams/concurrent/test_default_stream.py index dc2624eee..7cfc3ac05 100644 --- a/unit_tests/sources/streams/concurrent/test_default_stream.py +++ b/unit_tests/sources/streams/concurrent/test_default_stream.py @@ -6,7 +6,6 @@ from airbyte_cdk.models import AirbyteStream, SyncMode from airbyte_cdk.sources.message import InMemoryMessageRepository -from airbyte_cdk.sources.streams.concurrent.availability_strategy import STREAM_AVAILABLE from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, FinalStateCursor from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream From b4a5fecb7f8f5f7572076341cda7eec90ad3524c Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Thu, 31 Jul 2025 16:43:19 -0400 Subject: [PATCH 04/35] have declarative availability check support AbstractStream --- .../checks/check_dynamic_stream.py | 15 ++-- .../declarative/checks/check_stream.py | 35 ++++++---- .../streams/concurrent/abstract_stream.py | 7 ++ .../concurrent/availability_strategy.py | 38 +++++++++++ .../streams/concurrent/default_stream.py | 29 ++++++++ .../declarative/checks/test_check_stream.py | 7 +- .../streams/concurrent/test_default_stream.py | 68 ++++++++++++++++++- 7 files changed, 174 insertions(+), 25 deletions(-) create mode 100644 airbyte_cdk/sources/streams/concurrent/availability_strategy.py diff --git a/airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py b/airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py index 64d90de19..876750e4a 100644 --- a/airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +++ b/airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py @@ -3,12 +3,13 @@ # import logging -import traceback from dataclasses import InitVar, dataclass -from typing import Any, List, Mapping, Tuple +from typing import Any, List, Mapping, Tuple, Union -from airbyte_cdk import AbstractSource +from airbyte_cdk.sources.abstract_source import AbstractSource +from airbyte_cdk.sources.declarative.checks.check_stream import evaluate_availability from airbyte_cdk.sources.declarative.checks.connection_checker import ConnectionChecker +from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream from airbyte_cdk.sources.streams.http.availability_strategy import HttpAvailabilityStrategy @@ -34,20 +35,16 @@ def __post_init__(self, parameters: Mapping[str, Any]) -> None: def check_connection( self, source: AbstractSource, logger: logging.Logger, config: Mapping[str, Any] ) -> Tuple[bool, Any]: - streams = source.streams(config=config) + streams: List[Union[Stream, AbstractStream]] = source.streams(config=config) # type: ignore # this is a migration step and we expect the declarative CDK to migrate off of ConnectionChecker if len(streams) == 0: return False, f"No streams to connect to from source {source}" if not self.use_check_availability: return True, None - availability_strategy = HttpAvailabilityStrategy() - try: for stream in streams[: min(self.stream_count, len(streams))]: - stream_is_available, reason = availability_strategy.check_availability( - stream, logger - ) + stream_is_available, reason = evaluate_availability(stream, logger) if not stream_is_available: logger.warning(f"Stream {stream.name} is not available: {reason}") return False, reason diff --git a/airbyte_cdk/sources/declarative/checks/check_stream.py b/airbyte_cdk/sources/declarative/checks/check_stream.py index 1123349cb..db97098ef 100644 --- a/airbyte_cdk/sources/declarative/checks/check_stream.py +++ b/airbyte_cdk/sources/declarative/checks/check_stream.py @@ -5,13 +5,28 @@ import logging import traceback from dataclasses import InitVar, dataclass -from typing import Any, Dict, List, Mapping, Optional, Tuple +from typing import Any, Dict, List, Mapping, Optional, Tuple, Union -from airbyte_cdk import AbstractSource +from airbyte_cdk.sources.streams.core import Stream +from airbyte_cdk.sources.abstract_source import AbstractSource from airbyte_cdk.sources.declarative.checks.connection_checker import ConnectionChecker +from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream from airbyte_cdk.sources.streams.http.availability_strategy import HttpAvailabilityStrategy +def evaluate_availability(stream: Union[Stream, AbstractStream], logger: logging.Logger) -> Tuple[bool, Optional[str]]: + """ + As a transition period, we want to support both Stream and AbstractStream until we migrate everything to AbstractStream. + """ + if isinstance(stream, Stream): + return HttpAvailabilityStrategy().check_availability(stream, logger) + elif isinstance(stream, AbstractStream): + availability = stream.check_availability() + return availability.is_available, availability.reason + else: + raise ValueError(f"Unsupported stream type {type(stream)}") + + @dataclass(frozen=True) class DynamicStreamCheckConfig: """Defines the configuration for dynamic stream during connection checking. This class specifies @@ -51,7 +66,7 @@ def check_connection( ) -> Tuple[bool, Any]: """Checks the connection to the source and its streams.""" try: - streams = source.streams(config=config) + streams: List[Union[Stream, AbstractStream]] = source.streams(config=config) # type: ignore # this is a migration step and we expect the declarative CDK to migrate off of ConnectionChecker if not streams: return False, f"No streams to connect to from source {source}" except Exception as error: @@ -82,13 +97,12 @@ def check_connection( return True, None def _check_stream_availability( - self, stream_name_to_stream: Dict[str, Any], stream_name: str, logger: logging.Logger + self, stream_name_to_stream: Dict[str, Union[Stream, AbstractStream]], stream_name: str, logger: logging.Logger ) -> Tuple[bool, Any]: """Checks if streams are available.""" - availability_strategy = HttpAvailabilityStrategy() try: stream = stream_name_to_stream[stream_name] - stream_is_available, reason = availability_strategy.check_availability(stream, logger) + stream_is_available, reason = evaluate_availability(stream, logger) if not stream_is_available: message = f"Stream {stream_name} is not available: {reason}" logger.warning(message) @@ -98,7 +112,7 @@ def _check_stream_availability( return True, None def _check_dynamic_streams_availability( - self, source: AbstractSource, stream_name_to_stream: Dict[str, Any], logger: logging.Logger + self, source: AbstractSource, stream_name_to_stream: Dict[str, Union[Stream, AbstractStream]], logger: logging.Logger ) -> Tuple[bool, Any]: """Checks the availability of dynamic streams.""" dynamic_streams = source.resolved_manifest.get("dynamic_streams", []) # type: ignore[attr-defined] # The source's resolved_manifest manifest is checked before calling this method @@ -135,18 +149,15 @@ def _map_generated_streams( def _check_generated_streams_availability( self, generated_streams: List[Dict[str, Any]], - stream_name_to_stream: Dict[str, Any], + stream_name_to_stream: Dict[str, Union[Stream, AbstractStream]], logger: logging.Logger, max_count: int, ) -> Tuple[bool, Any]: """Checks availability of generated dynamic streams.""" - availability_strategy = HttpAvailabilityStrategy() for declarative_stream in generated_streams[: min(max_count, len(generated_streams))]: stream = stream_name_to_stream[declarative_stream["name"]] try: - stream_is_available, reason = availability_strategy.check_availability( - stream, logger - ) + stream_is_available, reason = evaluate_availability(stream, logger) if not stream_is_available: message = f"Dynamic Stream {stream.name} is not available: {reason}" logger.warning(message) diff --git a/airbyte_cdk/sources/streams/concurrent/abstract_stream.py b/airbyte_cdk/sources/streams/concurrent/abstract_stream.py index 33e7c4d10..53fa9450e 100644 --- a/airbyte_cdk/sources/streams/concurrent/abstract_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/abstract_stream.py @@ -9,6 +9,7 @@ from airbyte_cdk.models import AirbyteStream from airbyte_cdk.sources.source import ExperimentalClassWarning +from airbyte_cdk.sources.streams.concurrent.availability_strategy import StreamAvailability from airbyte_cdk.sources.streams.concurrent.cursor import Cursor from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition @@ -87,3 +88,9 @@ def cursor(self) -> Cursor: """ :return: The cursor associated with this stream. """ + + @abstractmethod + def check_availability(self) -> StreamAvailability: + """ + :return: If the stream is available and if not, why + """ diff --git a/airbyte_cdk/sources/streams/concurrent/availability_strategy.py b/airbyte_cdk/sources/streams/concurrent/availability_strategy.py new file mode 100644 index 000000000..5b5288bf3 --- /dev/null +++ b/airbyte_cdk/sources/streams/concurrent/availability_strategy.py @@ -0,0 +1,38 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from abc import ABC, abstractmethod +from typing import Optional + + +class StreamAvailability: + + @classmethod + def available(cls) -> "StreamAvailability": + return cls(True) + + @classmethod + def unavailable(cls, reason: str) -> "StreamAvailability": + return StreamAvailability(False, reason) + + def __init__(self, available: bool, reason: Optional[str] = None) -> None: + self._available = available + self._reason = reason + + if not available: + assert reason, "A reason needs to be provided if the stream is not available" + + @property + def is_available(self) -> bool: + """ + :return: True if the stream is available. False if the stream is not + """ + return self._available + + @property + def reason(self) -> Optional[str]: + """ + :return: A message describing why the stream is not available. If the stream is available, this should return None. + """ + return self._reason diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index 70ddd7d16..7fa72d522 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -8,12 +8,15 @@ from airbyte_cdk.models import AirbyteStream, SyncMode from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream +from airbyte_cdk.sources.streams.concurrent.availability_strategy import StreamAvailability from airbyte_cdk.sources.streams.concurrent.cursor import Cursor from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator +from airbyte_cdk.utils.traced_exception import AirbyteTracedException class DefaultStream(AbstractStream): + def __init__( self, partition_generator: PartitionGenerator, @@ -91,3 +94,29 @@ def log_stream_sync_configuration(self) -> None: @property def cursor(self) -> Cursor: return self._cursor + + def check_availability(self) -> StreamAvailability: + """ + Check stream availability by attempting to read the first record of the stream. + """ + try: + partition = next(iter(self.generate_partitions())) + except StopIteration: + # NOTE: The following comment was copied from legacy stuff and I don't know how relevant it is: + # If stream_slices has no `next()` item (Note - this is different from stream_slices returning [None]!) + # This can happen when a substream's `stream_slices` method does a `for record in parent_records: yield ` + # without accounting for the case in which the parent stream is empty. + return StreamAvailability.unavailable( + f"Cannot attempt to connect to stream {self.name} - no stream slices were found" + ) + except AirbyteTracedException as error: + return StreamAvailability.unavailable(error.message) + + try: + next(iter(partition.read())) + return StreamAvailability.available() + except StopIteration: + self._logger.info(f"Successfully connected to stream {self.name}, but got 0 records.") + return StreamAvailability.available() + except AirbyteTracedException as error: + return StreamAvailability.unavailable(error.message) diff --git a/unit_tests/sources/declarative/checks/test_check_stream.py b/unit_tests/sources/declarative/checks/test_check_stream.py index 3cbaf8fd8..49dc8ef9a 100644 --- a/unit_tests/sources/declarative/checks/test_check_stream.py +++ b/unit_tests/sources/declarative/checks/test_check_stream.py @@ -17,6 +17,7 @@ ConcurrentDeclarativeSource, ) from airbyte_cdk.sources.streams.http import HttpStream +from airbyte_cdk.sources.streams.core import Stream from airbyte_cdk.test.mock_http import HttpMocker, HttpRequest, HttpResponse logger = logging.getLogger("test") @@ -45,7 +46,7 @@ def test_check_stream_with_slices_as_list( test_name, record, streams_to_check, stream_slice, expectation, slices_as_list ): - stream = MagicMock() + stream = MagicMock(spec=Stream) stream.name = "s1" stream.availability_strategy = None if slices_as_list: @@ -77,7 +78,7 @@ def mock_read_records(responses, default_response=None, **kwargs): def test_check_empty_stream(): - stream = MagicMock() + stream = MagicMock(spec=Stream) stream.name = "s1" stream.read_records.return_value = iter([]) stream.stream_slices.return_value = iter([None]) @@ -91,7 +92,7 @@ def test_check_empty_stream(): def test_check_stream_with_no_stream_slices_aborts(): - stream = MagicMock() + stream = MagicMock(spec=Stream) stream.name = "s1" stream.stream_slices.return_value = iter([]) diff --git a/unit_tests/sources/streams/concurrent/test_default_stream.py b/unit_tests/sources/streams/concurrent/test_default_stream.py index 7cfc3ac05..6159ea1e6 100644 --- a/unit_tests/sources/streams/concurrent/test_default_stream.py +++ b/unit_tests/sources/streams/concurrent/test_default_stream.py @@ -4,15 +4,22 @@ import unittest from unittest.mock import Mock +import pytest + from airbyte_cdk.models import AirbyteStream, SyncMode from airbyte_cdk.sources.message import InMemoryMessageRepository from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, FinalStateCursor from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream +from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition +from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator +from airbyte_cdk.sources.types import Record +from airbyte_cdk.utils.traced_exception import AirbyteTracedException class ThreadBasedConcurrentStreamTest(unittest.TestCase): def setUp(self): - self._partition_generator = Mock() + self._partition_generator = Mock(spec=PartitionGenerator) + self._partition = Mock(spec=Partition) self._name = "name" self._json_schema = {} self._primary_key = [] @@ -243,3 +250,62 @@ def test_as_airbyte_stream_with_file_transfer_support(self): actual_airbyte_stream = stream.as_airbyte_stream() assert actual_airbyte_stream == expected_airbyte_stream + + def test_given_no_partitions_when_get_availability_then_unavailable(self) -> None: + self._partition_generator.generate.return_value = [] + + availability = self._stream.check_availability() + + assert availability.is_available == False + assert "no stream slices were found" in availability.reason + + def test_given_AirbyteTracedException_when_generating_partitions_when_get_availability_then_unavailable(self) -> None: + error_message = "error while generating partitions" + self._partition_generator.generate.side_effect = AirbyteTracedException(message=error_message) + + availability = self._stream.check_availability() + + assert availability.is_available == False + assert error_message in availability.reason + + def test_given_unknown_error_when_generating_partitions_when_get_availability_then_raise(self) -> None: + """ + I'm not sure why we handle AirbyteTracedException but not other exceptions but this is to keep feature compatibility with HttpAvailabilityStrategy + """ + self._partition_generator.generate.side_effect = ValueError() + with pytest.raises(ValueError): + self._stream.check_availability() + + def test_given_no_records_when_get_availability_then_available(self) -> None: + self._partition_generator.generate.return_value = [self._partition] + self._partition.read.return_value = [] + + availability = self._stream.check_availability() + + assert availability.is_available == True + + def test_given_records_when_get_availability_then_available(self) -> None: + self._partition_generator.generate.return_value = [self._partition] + self._partition.read.return_value = [Mock(spec=Record)] + + availability = self._stream.check_availability() + + assert availability.is_available == True + + def test_given_AirbyteTracedException_when_reading_records_when_get_availability_then_unavailable(self) -> None: + self._partition_generator.generate.return_value = [self._partition] + error_message = "error while reading records" + self._partition.read.side_effect = AirbyteTracedException(message=error_message) + + availability = self._stream.check_availability() + + assert availability.is_available == False + + def test_given_unknown_error_when_reading_record_when_get_availability_then_raise(self) -> None: + """ + I'm not sure why we handle AirbyteTracedException but not other exceptions but this is to keep feature compatibility with HttpAvailabilityStrategy + """ + self._partition_generator.generate.side_effect = ValueError() + self._partition.read.return_value = [] + with pytest.raises(ValueError): + self._stream.check_availability() From fc6c6b6128bb7ff8c1d5841c884ac25943f33028 Mon Sep 17 00:00:00 2001 From: octavia-squidington-iii Date: Thu, 31 Jul 2025 20:53:03 +0000 Subject: [PATCH 05/35] Auto-fix lint and format issues --- .../sources/declarative/checks/check_stream.py | 16 ++++++++++++---- .../streams/concurrent/availability_strategy.py | 1 - .../sources/streams/concurrent/default_stream.py | 1 - .../declarative/checks/test_check_stream.py | 2 +- .../streams/concurrent/test_default_stream.py | 16 ++++++++++++---- 5 files changed, 25 insertions(+), 11 deletions(-) diff --git a/airbyte_cdk/sources/declarative/checks/check_stream.py b/airbyte_cdk/sources/declarative/checks/check_stream.py index db97098ef..73940d382 100644 --- a/airbyte_cdk/sources/declarative/checks/check_stream.py +++ b/airbyte_cdk/sources/declarative/checks/check_stream.py @@ -7,14 +7,16 @@ from dataclasses import InitVar, dataclass from typing import Any, Dict, List, Mapping, Optional, Tuple, Union -from airbyte_cdk.sources.streams.core import Stream from airbyte_cdk.sources.abstract_source import AbstractSource from airbyte_cdk.sources.declarative.checks.connection_checker import ConnectionChecker from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream +from airbyte_cdk.sources.streams.core import Stream from airbyte_cdk.sources.streams.http.availability_strategy import HttpAvailabilityStrategy -def evaluate_availability(stream: Union[Stream, AbstractStream], logger: logging.Logger) -> Tuple[bool, Optional[str]]: +def evaluate_availability( + stream: Union[Stream, AbstractStream], logger: logging.Logger +) -> Tuple[bool, Optional[str]]: """ As a transition period, we want to support both Stream and AbstractStream until we migrate everything to AbstractStream. """ @@ -97,7 +99,10 @@ def check_connection( return True, None def _check_stream_availability( - self, stream_name_to_stream: Dict[str, Union[Stream, AbstractStream]], stream_name: str, logger: logging.Logger + self, + stream_name_to_stream: Dict[str, Union[Stream, AbstractStream]], + stream_name: str, + logger: logging.Logger, ) -> Tuple[bool, Any]: """Checks if streams are available.""" try: @@ -112,7 +117,10 @@ def _check_stream_availability( return True, None def _check_dynamic_streams_availability( - self, source: AbstractSource, stream_name_to_stream: Dict[str, Union[Stream, AbstractStream]], logger: logging.Logger + self, + source: AbstractSource, + stream_name_to_stream: Dict[str, Union[Stream, AbstractStream]], + logger: logging.Logger, ) -> Tuple[bool, Any]: """Checks the availability of dynamic streams.""" dynamic_streams = source.resolved_manifest.get("dynamic_streams", []) # type: ignore[attr-defined] # The source's resolved_manifest manifest is checked before calling this method diff --git a/airbyte_cdk/sources/streams/concurrent/availability_strategy.py b/airbyte_cdk/sources/streams/concurrent/availability_strategy.py index 5b5288bf3..3be77ff05 100644 --- a/airbyte_cdk/sources/streams/concurrent/availability_strategy.py +++ b/airbyte_cdk/sources/streams/concurrent/availability_strategy.py @@ -7,7 +7,6 @@ class StreamAvailability: - @classmethod def available(cls) -> "StreamAvailability": return cls(True) diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index 7fa72d522..d8814541f 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -16,7 +16,6 @@ class DefaultStream(AbstractStream): - def __init__( self, partition_generator: PartitionGenerator, diff --git a/unit_tests/sources/declarative/checks/test_check_stream.py b/unit_tests/sources/declarative/checks/test_check_stream.py index 49dc8ef9a..21f036440 100644 --- a/unit_tests/sources/declarative/checks/test_check_stream.py +++ b/unit_tests/sources/declarative/checks/test_check_stream.py @@ -16,8 +16,8 @@ from airbyte_cdk.sources.declarative.concurrent_declarative_source import ( ConcurrentDeclarativeSource, ) -from airbyte_cdk.sources.streams.http import HttpStream from airbyte_cdk.sources.streams.core import Stream +from airbyte_cdk.sources.streams.http import HttpStream from airbyte_cdk.test.mock_http import HttpMocker, HttpRequest, HttpResponse logger = logging.getLogger("test") diff --git a/unit_tests/sources/streams/concurrent/test_default_stream.py b/unit_tests/sources/streams/concurrent/test_default_stream.py index 6159ea1e6..98255bfe5 100644 --- a/unit_tests/sources/streams/concurrent/test_default_stream.py +++ b/unit_tests/sources/streams/concurrent/test_default_stream.py @@ -259,16 +259,22 @@ def test_given_no_partitions_when_get_availability_then_unavailable(self) -> Non assert availability.is_available == False assert "no stream slices were found" in availability.reason - def test_given_AirbyteTracedException_when_generating_partitions_when_get_availability_then_unavailable(self) -> None: + def test_given_AirbyteTracedException_when_generating_partitions_when_get_availability_then_unavailable( + self, + ) -> None: error_message = "error while generating partitions" - self._partition_generator.generate.side_effect = AirbyteTracedException(message=error_message) + self._partition_generator.generate.side_effect = AirbyteTracedException( + message=error_message + ) availability = self._stream.check_availability() assert availability.is_available == False assert error_message in availability.reason - def test_given_unknown_error_when_generating_partitions_when_get_availability_then_raise(self) -> None: + def test_given_unknown_error_when_generating_partitions_when_get_availability_then_raise( + self, + ) -> None: """ I'm not sure why we handle AirbyteTracedException but not other exceptions but this is to keep feature compatibility with HttpAvailabilityStrategy """ @@ -292,7 +298,9 @@ def test_given_records_when_get_availability_then_available(self) -> None: assert availability.is_available == True - def test_given_AirbyteTracedException_when_reading_records_when_get_availability_then_unavailable(self) -> None: + def test_given_AirbyteTracedException_when_reading_records_when_get_availability_then_unavailable( + self, + ) -> None: self._partition_generator.generate.return_value = [self._partition] error_message = "error while reading records" self._partition.read.side_effect = AirbyteTracedException(message=error_message) From 5fe2e02054594f689a392d2ce706453b00e58168 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Thu, 31 Jul 2025 17:03:30 -0400 Subject: [PATCH 06/35] mypy --- airbyte_cdk/sources/streams/concurrent/default_stream.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index d8814541f..86eaaf9c1 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -109,7 +109,7 @@ def check_availability(self) -> StreamAvailability: f"Cannot attempt to connect to stream {self.name} - no stream slices were found" ) except AirbyteTracedException as error: - return StreamAvailability.unavailable(error.message) + return StreamAvailability.unavailable(error.message or error.internal_message or "") try: next(iter(partition.read())) @@ -118,4 +118,4 @@ def check_availability(self) -> StreamAvailability: self._logger.info(f"Successfully connected to stream {self.name}, but got 0 records.") return StreamAvailability.available() except AirbyteTracedException as error: - return StreamAvailability.unavailable(error.message) + return StreamAvailability.unavailable(error.message or error.internal_message or "") From 1e8e9681672e05dd28e7f4fcecf53c8687b38337 Mon Sep 17 00:00:00 2001 From: octavia-squidington-iii Date: Thu, 31 Jul 2025 21:18:24 +0000 Subject: [PATCH 07/35] Auto-fix lint and format issues --- airbyte_cdk/sources/streams/concurrent/default_stream.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index 86eaaf9c1..10f04e6ba 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -109,7 +109,9 @@ def check_availability(self) -> StreamAvailability: f"Cannot attempt to connect to stream {self.name} - no stream slices were found" ) except AirbyteTracedException as error: - return StreamAvailability.unavailable(error.message or error.internal_message or "") + return StreamAvailability.unavailable( + error.message or error.internal_message or "" + ) try: next(iter(partition.read())) @@ -118,4 +120,6 @@ def check_availability(self) -> StreamAvailability: self._logger.info(f"Successfully connected to stream {self.name}, but got 0 records.") return StreamAvailability.available() except AirbyteTracedException as error: - return StreamAvailability.unavailable(error.message or error.internal_message or "") + return StreamAvailability.unavailable( + error.message or error.internal_message or "" + ) From 689e7929f33366c2123961e10dd1fee4207e2764 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Fri, 1 Aug 2025 14:22:58 -0400 Subject: [PATCH 08/35] Remove RFR stuff --- .../concurrent_declarative_source.py | 1 - .../parsers/model_to_component_factory.py | 65 +++----- .../test_model_to_component_factory.py | 148 +----------------- .../test_manifest_declarative_source.py | 32 ++-- 4 files changed, 28 insertions(+), 218 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index cc59b1554..8e49b9b2c 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -84,7 +84,6 @@ def __init__( # incremental streams running in full refresh. component_factory = component_factory or ModelToComponentFactory( emit_connector_builder_messages=emit_connector_builder_messages, - disable_resumable_full_refresh=True, connector_state_manager=self._connector_state_manager, max_concurrent_async_job_count=source_config.get("max_concurrent_async_job_count"), ) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 628bea575..7f953f14d 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -7,6 +7,7 @@ import datetime import importlib import inspect +import logging import re from functools import partial from typing import ( @@ -544,6 +545,8 @@ StreamSlicer, StreamSlicerTestReadDecorator, ) +from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import \ + StreamSlicerPartitionGenerator, DeclarativePartitionFactory from airbyte_cdk.sources.declarative.transformations import ( AddFields, RecordTransformation, @@ -604,7 +607,9 @@ WeekClampingStrategy, Weekday, ) -from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField +from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField, Cursor, FinalStateCursor +from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream +from airbyte_cdk.sources.streams.concurrent.helpers import get_primary_key_from_stream from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import ( CustomFormatConcurrentStreamStateConverter, DateTimeStreamStateConverter, @@ -634,7 +639,6 @@ def __init__( emit_connector_builder_messages: bool = False, disable_retries: bool = False, disable_cache: bool = False, - disable_resumable_full_refresh: bool = False, message_repository: Optional[MessageRepository] = None, connector_state_manager: Optional[ConnectorStateManager] = None, max_concurrent_async_job_count: Optional[int] = None, @@ -645,7 +649,6 @@ def __init__( self._emit_connector_builder_messages = emit_connector_builder_messages self._disable_retries = disable_retries self._disable_cache = disable_cache - self._disable_resumable_full_refresh = disable_resumable_full_refresh self._message_repository = message_repository or InMemoryMessageRepository( self._evaluate_log_level(emit_connector_builder_messages) ) @@ -2035,15 +2038,6 @@ def create_declarative_stream( file_uploader=file_uploader, incremental_sync=model.incremental_sync, ) - cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None - - if model.state_migrations: - state_transformations = [ - self._create_component_from_model(state_migration, config, declarative_stream=model) - for state_migration in model.state_migrations - ] - else: - state_transformations = [] schema_loader: Union[ CompositeSchemaLoader, @@ -2071,6 +2065,15 @@ def create_declarative_stream( options["name"] = model.name schema_loader = DefaultSchemaLoader(config=config, parameters=options) + cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None + + if model.state_migrations: + state_transformations = [ + self._create_component_from_model(state_migration, config, declarative_stream=model) + for state_migration in model.state_migrations + ] + else: + state_transformations = [] return DeclarativeStream( name=model.name or "", primary_key=primary_key, @@ -2185,28 +2188,6 @@ def _build_incremental_cursor( return self._create_component_from_model(model=model.incremental_sync, config=config) # type: ignore[no-any-return] # Will be created Cursor as stream_slicer_model is model.incremental_sync return None - def _build_resumable_cursor( - self, - model: Union[ - AsyncRetrieverModel, - CustomRetrieverModel, - SimpleRetrieverModel, - ], - stream_slicer: Optional[PartitionRouter], - ) -> Optional[StreamSlicer]: - if hasattr(model, "paginator") and model.paginator and not stream_slicer: - # For the regular Full-Refresh streams, we use the high level `ResumableFullRefreshCursor` - return ResumableFullRefreshCursor(parameters={}) - elif stream_slicer: - # For the Full-Refresh sub-streams, we use the nested `ChildPartitionResumableFullRefreshCursor` - return PerPartitionCursor( - cursor_factory=CursorFactory( - create_function=partial(ChildPartitionResumableFullRefreshCursor, {}) - ), - partition_router=stream_slicer, - ) - return None - def _merge_stream_slicers( self, model: DeclarativeStreamModel, config: Config ) -> Optional[StreamSlicer]: @@ -2243,11 +2224,7 @@ def _merge_stream_slicers( if model.incremental_sync: return self._build_incremental_cursor(model, stream_slicer, config) - return ( - stream_slicer - if self._disable_resumable_full_refresh - else self._build_resumable_cursor(retriever_model, stream_slicer) - ) + return stream_slicer def create_default_error_handler( self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any @@ -2529,9 +2506,6 @@ def create_schema_type_identifier( def create_dynamic_schema_loader( self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any ) -> DynamicSchemaLoader: - stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) - combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) - schema_transformations = [] if model.schema_transformations: for transformation_model in model.schema_transformations: @@ -2544,7 +2518,7 @@ def create_dynamic_schema_loader( config=config, name=name, primary_key=None, - stream_slicer=combined_slicers, + stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), transformations=[], use_cache=True, log_formatter=( @@ -3808,15 +3782,12 @@ def create_components_mapping_definition( def create_http_components_resolver( self, model: HttpComponentsResolverModel, config: Config, stream_name: Optional[str] = None ) -> Any: - stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) - combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) - retriever = self._create_component_from_model( model=model.retriever, config=config, name=f"{stream_name if stream_name else '__http_components_resolver'}", primary_key=None, - stream_slicer=stream_slicer if stream_slicer else combined_slicers, + stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), transformations=[], ) diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index 4ac0b11e7..17a36c3b0 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -1055,152 +1055,6 @@ def test_stream_with_incremental_and_async_retriever_with_partition_router(use_l assert stream_slices == expected_stream_slices -def test_resumable_full_refresh_stream(): - content = """ -decoder: - type: JsonDecoder -extractor: - type: DpathExtractor -selector: - type: RecordSelector - record_filter: - type: RecordFilter - condition: "{{ record['id'] > stream_state['id'] }}" -metadata_paginator: - type: DefaultPaginator - page_size_option: - type: RequestOption - inject_into: body_json - field_path: ["variables", "page_size"] - page_token_option: - type: RequestPath - pagination_strategy: - type: "CursorPagination" - cursor_value: "{{ response._metadata.next }}" - page_size: 10 -requester: - type: HttpRequester - url_base: "https://api.sendgrid.com/v3/" - http_method: "GET" - authenticator: - type: BearerAuthenticator - api_token: "{{ config['apikey'] }}" - request_parameters: - unit: "day" -retriever: - paginator: - type: NoPagination - decoder: - $ref: "#/decoder" -partial_stream: - type: DeclarativeStream - schema_loader: - type: JsonFileSchemaLoader - file_path: "./source_sendgrid/schemas/{{ parameters.name }}.json" -list_stream: - $ref: "#/partial_stream" - $parameters: - name: "lists" - extractor: - $ref: "#/extractor" - field_path: ["{{ parameters['name'] }}"] - name: "lists" - primary_key: "id" - retriever: - $ref: "#/retriever" - requester: - $ref: "#/requester" - path: "{{ next_page_token['next_page_url'] }}" - paginator: - $ref: "#/metadata_paginator" - record_selector: - $ref: "#/selector" - transformations: - - type: AddFields - fields: - - path: ["extra"] - value: "{{ response.to_add }}" -check: - type: CheckStream - stream_names: ["list_stream"] -spec: - type: Spec - documentation_url: https://airbyte.com/#yaml-from-manifest - connection_specification: - title: Test Spec - type: object - required: - - api_key - additionalProperties: false - properties: - api_key: - type: string - airbyte_secret: true - title: API Key - description: Test API Key - order: 0 - advanced_auth: - auth_flow_type: "oauth2.0" - """ - parsed_manifest = YamlDeclarativeSource._parse(content) - resolved_manifest = resolver.preprocess_manifest(parsed_manifest) - resolved_manifest["type"] = "DeclarativeSource" - manifest = transformer.propagate_types_and_parameters("", resolved_manifest, {}) - - stream_manifest = manifest["list_stream"] - assert stream_manifest["type"] == "DeclarativeStream" - stream = factory.create_component( - model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=input_config - ) - - assert isinstance(stream, DeclarativeStream) - assert stream.primary_key == "id" - assert stream.name == "lists" - assert stream._stream_cursor_field.string == "" - - assert isinstance(stream.retriever, SimpleRetriever) - assert stream.retriever.primary_key == stream.primary_key - assert stream.retriever.name == stream.name - - assert isinstance(stream.retriever.record_selector, RecordSelector) - - assert isinstance(stream.retriever.stream_slicer, ResumableFullRefreshCursor) - assert isinstance(stream.retriever.cursor, ResumableFullRefreshCursor) - - assert isinstance(stream.retriever.paginator, DefaultPaginator) - assert isinstance(stream.retriever.paginator.decoder, PaginationDecoderDecorator) - for string in stream.retriever.paginator.page_size_option.field_path: - assert isinstance(string, InterpolatedString) - assert len(stream.retriever.paginator.page_size_option.field_path) == 2 - assert stream.retriever.paginator.page_size_option.inject_into == RequestOptionType.body_json - assert isinstance(stream.retriever.paginator.page_token_option, RequestPath) - assert stream.retriever.paginator.url_base.string == "https://api.sendgrid.com/v3/" - assert stream.retriever.paginator.url_base.default == "https://api.sendgrid.com/v3/" - - assert isinstance(stream.retriever.paginator.pagination_strategy, CursorPaginationStrategy) - assert isinstance( - stream.retriever.paginator.pagination_strategy.decoder, PaginationDecoderDecorator - ) - assert ( - stream.retriever.paginator.pagination_strategy._cursor_value.string - == "{{ response._metadata.next }}" - ) - assert ( - stream.retriever.paginator.pagination_strategy._cursor_value.default - == "{{ response._metadata.next }}" - ) - assert stream.retriever.paginator.pagination_strategy.page_size == 10 - - checker = factory.create_component( - model_type=CheckStreamModel, component_definition=manifest["check"], config=input_config - ) - - assert isinstance(checker, CheckStream) - streams_to_check = checker.stream_names - assert len(streams_to_check) == 1 - assert list(streams_to_check)[0] == "list_stream" - - def test_incremental_data_feed(): content = """ selector: @@ -2592,7 +2446,7 @@ def test_default_schema_loader(self): "values": "{{config['repos']}}", "cursor_field": "a_key", }, - PerPartitionCursor, + ListPartitionRouter, id="test_create_simple_retriever_with_partition_router", ), pytest.param( diff --git a/unit_tests/sources/declarative/test_manifest_declarative_source.py b/unit_tests/sources/declarative/test_manifest_declarative_source.py index 6753e8e4e..8f72cc6a6 100644 --- a/unit_tests/sources/declarative/test_manifest_declarative_source.py +++ b/unit_tests/sources/declarative/test_manifest_declarative_source.py @@ -1818,8 +1818,8 @@ def _create_page(response_body): [ call({}, {}, None), call( - {"next_page_token": "next"}, - {"next_page_token": "next"}, + {}, + {}, {"next_page_token": "next"}, ), ], @@ -1907,16 +1907,9 @@ def _create_page(response_body): ), [{"ABC": 0, "partition": 0}, {"AED": 1, "partition": 0}, {"ABC": 2, "partition": 1}], [ - call({"states": []}, {"partition": "0"}, None), + call({}, {"partition": "0"}, None), call( - { - "states": [ - { - "partition": {"partition": "0"}, - "cursor": {"__ab_full_refresh_sync_complete": True}, - } - ] - }, + {}, {"partition": "1"}, None, ), @@ -2022,17 +2015,10 @@ def _create_page(response_body): {"ABC": 2, "partition": 1}, ], [ - call({"states": []}, {"partition": "0"}, None), - call({"states": []}, {"partition": "0"}, {"next_page_token": "next"}), + call({}, {"partition": "0"}, None), + call({}, {"partition": "0"}, {"next_page_token": "next"}), call( - { - "states": [ - { - "partition": {"partition": "0"}, - "cursor": {"__ab_full_refresh_sync_complete": True}, - } - ] - }, + {}, {"partition": "1"}, None, ), @@ -2201,12 +2187,12 @@ def test_only_parent_streams_use_cache(): # Parent stream created for substream assert ( - streams[1].retriever.stream_slicer._partition_router.parent_stream_configs[0].stream.name + streams[1].retriever.stream_slicer.parent_stream_configs[0].stream.name == "applications" ) assert ( streams[1] - .retriever.stream_slicer._partition_router.parent_stream_configs[0] + .retriever.stream_slicer.parent_stream_configs[0] .stream.retriever.requester.use_cache ) From 5399436280b5eef242cec592c7460f69beb63122 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 4 Aug 2025 10:41:32 -0400 Subject: [PATCH 09/35] have bland stream be instantiated as DefaultStream --- .../concurrent_declarative_source.py | 12 +- .../manifest_declarative_source.py | 10 +- .../parsers/model_to_component_factory.py | 45 +++++-- .../declarative_partition_generator.py | 25 +++- .../sources/streams/concurrent/adapters.py | 4 +- .../streams/concurrent/default_stream.py | 10 +- .../test_connector_builder_handler.py | 2 +- .../test_model_to_component_factory.py | 96 +++++++++------ .../test_config_components_resolver.py | 2 +- .../retrievers/test_simple_retriever.py | 114 ------------------ .../test_declarative_partition_generator.py | 8 +- .../test_manifest_declarative_source.py | 18 +-- .../streams/concurrent/test_default_stream.py | 21 ++++ 13 files changed, 174 insertions(+), 193 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index 8e49b9b2c..0ac6299f2 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -209,6 +209,10 @@ def _group_streams( # these legacy Python streams the way we do low-code streams to determine if they are concurrent compatible, # so we need to treat them as synchronous + if isinstance(declarative_stream, AbstractStream): + concurrent_streams.append(declarative_stream) + continue + supports_file_transfer = ( isinstance(declarative_stream, DeclarativeStream) and "file_uploader" in name_to_stream_mapping[declarative_stream.name] @@ -278,7 +282,7 @@ def _group_streams( partition_generator = StreamSlicerPartitionGenerator( partition_factory=DeclarativePartitionFactory( declarative_stream.name, - declarative_stream.get_json_schema(), + declarative_stream.schema_loader, retriever, self.message_repository, ), @@ -309,7 +313,7 @@ def _group_streams( partition_generator = StreamSlicerPartitionGenerator( partition_factory=DeclarativePartitionFactory( declarative_stream.name, - declarative_stream.get_json_schema(), + declarative_stream.schema_loader, retriever, self.message_repository, ), @@ -339,7 +343,7 @@ def _group_streams( partition_generator = StreamSlicerPartitionGenerator( DeclarativePartitionFactory( declarative_stream.name, - declarative_stream.get_json_schema(), + declarative_stream.schema_loader, declarative_stream.retriever, self.message_repository, ), @@ -399,7 +403,7 @@ def _group_streams( partition_generator = StreamSlicerPartitionGenerator( DeclarativePartitionFactory( declarative_stream.name, - declarative_stream.get_json_schema(), + declarative_stream.schema_loader, retriever, self.message_repository, ), diff --git a/airbyte_cdk/sources/declarative/manifest_declarative_source.py b/airbyte_cdk/sources/declarative/manifest_declarative_source.py index e962f3813..303d12ba4 100644 --- a/airbyte_cdk/sources/declarative/manifest_declarative_source.py +++ b/airbyte_cdk/sources/declarative/manifest_declarative_source.py @@ -8,7 +8,7 @@ from copy import deepcopy from importlib import metadata from types import ModuleType -from typing import Any, Dict, Iterator, List, Mapping, Optional, Set +from typing import Any, Dict, Iterator, List, Mapping, Optional, Set, Union import orjson import yaml @@ -66,6 +66,7 @@ from airbyte_cdk.sources.declarative.resolvers import COMPONENTS_RESOLVER_TYPE_MAPPING from airbyte_cdk.sources.declarative.spec.spec import Spec from airbyte_cdk.sources.message import MessageRepository +from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream from airbyte_cdk.sources.streams.core import Stream from airbyte_cdk.sources.types import Config, ConnectionDefinition from airbyte_cdk.sources.utils.slice_logger import ( @@ -297,7 +298,12 @@ def connection_checker(self) -> ConnectionChecker: f"Expected to generate a ConnectionChecker component, but received {check_stream.__class__}" ) - def streams(self, config: Mapping[str, Any]) -> List[Stream]: + def streams(self, config: Mapping[str, Any]) -> List[Union[Stream, AbstractStream]]: + """ + As a migration step, this method will return both legacy stream (Stream) and concurrent stream (AbstractStream). + Once the migration is done, we can probably have this method throw "not implemented" as we figure out how to + fully decouple this from the AbstractSource. + """ if self._spec_component: self._spec_component.validate_config(config) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 7f953f14d..32c14873a 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -598,6 +598,7 @@ Rate, UnlimitedCallRatePolicy, ) +from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream from airbyte_cdk.sources.streams.concurrent.clamping import ( ClampingEndProvider, ClampingStrategy, @@ -1920,8 +1921,8 @@ def create_datetime_based_cursor( ) def create_declarative_stream( - self, model: DeclarativeStreamModel, config: Config, **kwargs: Any - ) -> DeclarativeStream: + self, model: DeclarativeStreamModel, config: Config, is_parent=False, **kwargs: Any + ) -> Union[DeclarativeStream, AbstractStream]: # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. The custom create methods in @@ -2065,8 +2066,38 @@ def create_declarative_stream( options["name"] = model.name schema_loader = DefaultSchemaLoader(config=config, parameters=options) - cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None + if isinstance(combined_slicers, PartitionRouter) and not is_parent and not self._emit_connector_builder_messages: + # We are starting to migrate streams to instantiate directly the DefaultStream instead of instantiating the + # DeclarativeStream and assembling the DefaultStream from that. The plan is the following: + # * Streams without partition router nor cursors and streams with only partition router. This is the `isinstance(combined_slicers, PartitionRouter)` condition as the first kind with have a SinglePartitionRouter + # * Streams without partition router but with cursor + # * Streams with both partition router and cursor + # We specifically exclude parent streams here because SubstreamPartitionRouter has not been updated yet + # We specifically exclude Connector Builder stuff for now as Brian is working on this anyway + stream_name = model.name or "" + partition_generator = StreamSlicerPartitionGenerator( + DeclarativePartitionFactory( + stream_name, + schema_loader, + retriever, + self._message_repository, + ), + combined_slicers, + ) + FinalStateCursor(stream_name, None, self._message_repository) + return DefaultStream( + partition_generator=partition_generator, + name=stream_name, + json_schema=schema_loader.get_json_schema, + primary_key=get_primary_key_from_stream(primary_key), + cursor_field=None, + # FIXME we should have the cursor field has part of the interface of cursor + logger=logging.getLogger(f"airbyte.{stream_name}"), + # FIXME this is a breaking change compared to the old implementation, + cursor=FinalStateCursor(stream_name, None, self._message_repository), + ) + cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None if model.state_migrations: state_transformations = [ self._create_component_from_model(state_migration, config, declarative_stream=model) @@ -2094,7 +2125,7 @@ def _build_stream_slicer_from_partition_router( ], config: Config, stream_name: Optional[str] = None, - ) -> Optional[PartitionRouter]: + ) -> PartitionRouter: if ( hasattr(model, "partition_router") and isinstance(model, SimpleRetrieverModel | AsyncRetrieverModel) @@ -2115,7 +2146,7 @@ def _build_stream_slicer_from_partition_router( return self._create_component_from_model( # type: ignore[no-any-return] # Will be created PartitionRouter as stream_slicer_model is model.partition_router model=stream_slicer_model, config=config, stream_name=stream_name or "" ) - return None + return SinglePartitionRouter(parameters={}) def _build_incremental_cursor( self, @@ -2123,7 +2154,7 @@ def _build_incremental_cursor( stream_slicer: Optional[PartitionRouter], config: Config, ) -> Optional[StreamSlicer]: - if model.incremental_sync and stream_slicer: + if model.incremental_sync and (stream_slicer and not isinstance(stream_slicer, SinglePartitionRouter)): if model.retriever.type == "AsyncRetriever": stream_name = model.name or "" stream_namespace = None @@ -2871,7 +2902,7 @@ def create_parent_stream_config( self, model: ParentStreamConfigModel, config: Config, **kwargs: Any ) -> ParentStreamConfig: declarative_stream = self._create_component_from_model( - model.stream, config=config, **kwargs + model.stream, config=config, is_parent=True, **kwargs, ) request_option = ( self._create_component_from_model(model.request_option, config=config) diff --git a/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py b/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py index 94ee03a56..fe76e7ee2 100644 --- a/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +++ b/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py @@ -3,6 +3,7 @@ from typing import Any, Iterable, Mapping, Optional from airbyte_cdk.sources.declarative.retrievers import Retriever +from airbyte_cdk.sources.declarative.schema import SchemaLoader from airbyte_cdk.sources.message import MessageRepository from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator @@ -11,11 +12,23 @@ from airbyte_cdk.utils.slice_hasher import SliceHasher +class SchemaLoaderCachingDecorator(SchemaLoader): + + def __init__(self, schema_loader: SchemaLoader): + self._decorated = schema_loader + self._loaded_schema = None + + def get_json_schema(self) -> Mapping[str, Any]: + if self._loaded_schema is None: + self._loaded_schema = self._decorated.get_json_schema() + return self._loaded_schema + + class DeclarativePartitionFactory: def __init__( self, stream_name: str, - json_schema: Mapping[str, Any], + schema_loader: SchemaLoader, retriever: Retriever, message_repository: MessageRepository, ) -> None: @@ -25,14 +38,14 @@ def __init__( In order to avoid these problems, we will create one retriever per thread which should make the processing thread-safe. """ self._stream_name = stream_name - self._json_schema = json_schema + self._schema_loader = SchemaLoaderCachingDecorator(schema_loader) self._retriever = retriever self._message_repository = message_repository def create(self, stream_slice: StreamSlice) -> Partition: return DeclarativePartition( self._stream_name, - self._json_schema, + self._schema_loader, self._retriever, self._message_repository, stream_slice, @@ -43,20 +56,20 @@ class DeclarativePartition(Partition): def __init__( self, stream_name: str, - json_schema: Mapping[str, Any], + schema_loader: SchemaLoader, retriever: Retriever, message_repository: MessageRepository, stream_slice: StreamSlice, ): self._stream_name = stream_name - self._json_schema = json_schema + self._schema_loader = schema_loader self._retriever = retriever self._message_repository = message_repository self._stream_slice = stream_slice self._hash = SliceHasher.hash(self._stream_name, self._stream_slice) def read(self) -> Iterable[Record]: - for stream_data in self._retriever.read_records(self._json_schema, self._stream_slice): + for stream_data in self._retriever.read_records(self._schema_loader.get_json_schema(), self._stream_slice): if isinstance(stream_data, Mapping): record = ( stream_data diff --git a/airbyte_cdk/sources/streams/concurrent/adapters.py b/airbyte_cdk/sources/streams/concurrent/adapters.py index 949f0545b..6a4682605 100644 --- a/airbyte_cdk/sources/streams/concurrent/adapters.py +++ b/airbyte_cdk/sources/streams/concurrent/adapters.py @@ -5,8 +5,7 @@ import copy import json import logging -from functools import lru_cache -from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Tuple, Union +from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Union from typing_extensions import deprecated @@ -196,7 +195,6 @@ def cursor_field(self) -> Union[str, List[str]]: def cursor(self) -> Optional[Cursor]: # type: ignore[override] # StreamFaced expects to use only airbyte_cdk.sources.streams.concurrent.cursor.Cursor return self._cursor - @lru_cache(maxsize=None) def get_json_schema(self) -> Mapping[str, Any]: return self._abstract_stream.get_json_schema() diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index 86eaaf9c1..bceed08b2 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -2,9 +2,8 @@ # Copyright (c) 2023 Airbyte, Inc., all rights reserved. # -from functools import lru_cache from logging import Logger -from typing import Any, Iterable, List, Mapping, Optional +from typing import Any, Iterable, List, Mapping, Optional, Union, Callable from airbyte_cdk.models import AirbyteStream, SyncMode from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream @@ -20,7 +19,7 @@ def __init__( self, partition_generator: PartitionGenerator, name: str, - json_schema: Mapping[str, Any], + json_schema: Union[Mapping[str, Any], Callable[[], Mapping[str, Any]]], primary_key: List[str], cursor_field: Optional[str], logger: Logger, @@ -53,14 +52,15 @@ def namespace(self) -> Optional[str]: def cursor_field(self) -> Optional[str]: return self._cursor_field - @lru_cache(maxsize=None) def get_json_schema(self) -> Mapping[str, Any]: + if isinstance(self._json_schema, Callable): + return self._json_schema() return self._json_schema def as_airbyte_stream(self) -> AirbyteStream: stream = AirbyteStream( name=self.name, - json_schema=dict(self._json_schema), + json_schema=dict(self.get_json_schema()), supported_sync_modes=[SyncMode.full_refresh], is_resumable=False, is_file_based=self._supports_file_transfer, diff --git a/unit_tests/connector_builder/test_connector_builder_handler.py b/unit_tests/connector_builder/test_connector_builder_handler.py index 2587fb95a..98b42a737 100644 --- a/unit_tests/connector_builder/test_connector_builder_handler.py +++ b/unit_tests/connector_builder/test_connector_builder_handler.py @@ -780,7 +780,7 @@ def test_config_update() -> None: "client_secret": "a client secret", "refresh_token": "a refresh token", } - source = ManifestDeclarativeSource(source_config=manifest) + source = ManifestDeclarativeSource(source_config=manifest, emit_connector_builder_messages=True) refresh_request_response = { "access_token": "an updated access token", diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index 17a36c3b0..c7d2f8d7a 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -157,6 +157,7 @@ from airbyte_cdk.sources.declarative.schema.schema_loader import SchemaLoader from airbyte_cdk.sources.declarative.spec import Spec from airbyte_cdk.sources.declarative.stream_slicers import StreamSlicerTestReadDecorator +from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import SchemaLoaderCachingDecorator from airbyte_cdk.sources.declarative.transformations import AddFields, RemoveFields from airbyte_cdk.sources.declarative.transformations.add_fields import AddedFieldDefinition from airbyte_cdk.sources.declarative.yaml_declarative_source import YamlDeclarativeSource @@ -168,6 +169,7 @@ WeekClampingStrategy, ) from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField +from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import ( CustomFormatConcurrentStreamStateConverter, ) @@ -1757,38 +1759,39 @@ def test_config_with_defaults(): model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=input_config ) - assert isinstance(stream, DeclarativeStream) - assert stream.primary_key == "id" + assert isinstance(stream, DefaultStream) assert stream.name == "lists" - assert isinstance(stream.retriever, SimpleRetriever) - assert stream.retriever.name == stream.name - assert stream.retriever.primary_key == stream.primary_key + retriever = stream._stream_partition_generator._partition_factory._retriever + assert isinstance(retriever, SimpleRetriever) + assert retriever.name == stream.name + assert retriever.primary_key == "id" - assert isinstance(stream.schema_loader, JsonFileSchemaLoader) + schema_loader = get_schema_loader(stream) + assert isinstance(schema_loader, JsonFileSchemaLoader) assert ( - stream.schema_loader.file_path.string + schema_loader.file_path.string == "./source_sendgrid/schemas/{{ parameters.name }}.yaml" ) assert ( - stream.schema_loader.file_path.default + schema_loader.file_path.default == "./source_sendgrid/schemas/{{ parameters.name }}.yaml" ) - assert isinstance(stream.retriever.requester, HttpRequester) - assert stream.retriever.requester.http_method == HttpMethod.GET + assert isinstance(retriever.requester, HttpRequester) + assert retriever.requester.http_method == HttpMethod.GET - assert isinstance(stream.retriever.requester.authenticator, BearerAuthenticator) - assert stream.retriever.requester.authenticator.token_provider.get_token() == "verysecrettoken" + assert isinstance(retriever.requester.authenticator, BearerAuthenticator) + assert retriever.requester.authenticator.token_provider.get_token() == "verysecrettoken" - assert isinstance(stream.retriever.record_selector, RecordSelector) - assert isinstance(stream.retriever.record_selector.extractor, DpathExtractor) + assert isinstance(retriever.record_selector, RecordSelector) + assert isinstance(retriever.record_selector.extractor, DpathExtractor) assert [ - fp.eval(input_config) for fp in stream.retriever.record_selector.extractor._field_path + fp.eval(input_config) for fp in retriever.record_selector.extractor._field_path ] == ["result"] - assert isinstance(stream.retriever.paginator, DefaultPaginator) - assert stream.retriever.paginator.url_base.string == "https://api.sendgrid.com" - assert stream.retriever.paginator.pagination_strategy.get_page_size() == 10 + assert isinstance(retriever.paginator, DefaultPaginator) + assert retriever.paginator.url_base.string == "https://api.sendgrid.com" + assert retriever.paginator.pagination_strategy.get_page_size() == 10 def test_create_default_paginator(): @@ -2184,8 +2187,8 @@ def test_no_transformations(self): config=input_config, ) - assert isinstance(stream, DeclarativeStream) - assert [] == stream.retriever.record_selector.transformations + assert isinstance(stream, DefaultStream) + assert [] == get_retriever(stream).record_selector.transformations def test_remove_fields(self): content = f""" @@ -2212,11 +2215,11 @@ def test_remove_fields(self): config=input_config, ) - assert isinstance(stream, DeclarativeStream) + assert isinstance(stream, DefaultStream) expected = [ RemoveFields(field_pointers=[["path", "to", "field1"], ["path2"]], parameters={}) ] - assert stream.retriever.record_selector.transformations == expected + assert get_retriever(stream).record_selector.transformations == expected def test_add_fields_no_value_type(self): content = f""" @@ -2375,8 +2378,8 @@ def _test_add_fields(self, content, expected): config=input_config, ) - assert isinstance(stream, DeclarativeStream) - assert stream.retriever.record_selector.transformations == expected + assert isinstance(stream, DefaultStream) + assert get_retriever(stream).record_selector.transformations == expected def test_default_schema_loader(self): component_definition = { @@ -2415,7 +2418,7 @@ def test_default_schema_loader(self): component_definition=propagated_source_config, config=input_config, ) - schema_loader = stream.schema_loader + schema_loader = get_schema_loader(stream) assert ( schema_loader.default_loader._get_json_filepath().split("/")[-1] == f"{stream.name}.json" @@ -2423,7 +2426,7 @@ def test_default_schema_loader(self): @pytest.mark.parametrize( - "incremental, partition_router, expected_type", + "incremental, partition_router, expected_router_type, expected_stream_type", [ pytest.param( { @@ -2437,6 +2440,7 @@ def test_default_schema_loader(self): }, None, DatetimeBasedCursor, + DeclarativeStream, id="test_create_simple_retriever_with_incremental", ), pytest.param( @@ -2447,6 +2451,7 @@ def test_default_schema_loader(self): "cursor_field": "a_key", }, ListPartitionRouter, + DefaultStream, id="test_create_simple_retriever_with_partition_router", ), pytest.param( @@ -2465,6 +2470,7 @@ def test_default_schema_loader(self): "cursor_field": "a_key", }, PerPartitionWithGlobalCursor, + DeclarativeStream, id="test_create_simple_retriever_with_incremental_and_partition_router", ), pytest.param( @@ -2490,17 +2496,19 @@ def test_default_schema_loader(self): }, ], PerPartitionWithGlobalCursor, + DeclarativeStream, id="test_create_simple_retriever_with_partition_routers_multiple_components", ), pytest.param( None, None, SinglePartitionRouter, + DefaultStream, id="test_create_simple_retriever_with_no_incremental_or_partition_router", ), ], ) -def test_merge_incremental_and_partition_router(incremental, partition_router, expected_type): +def test_merge_incremental_and_partition_router(incremental, partition_router, expected_router_type, expected_stream_type): stream_model = { "type": "DeclarativeStream", "retriever": { @@ -2531,22 +2539,21 @@ def test_merge_incremental_and_partition_router(incremental, partition_router, e model_type=DeclarativeStreamModel, component_definition=stream_model, config=input_config ) - assert isinstance(stream, DeclarativeStream) - assert isinstance(stream.retriever, SimpleRetriever) - assert isinstance(stream.retriever.stream_slicer, expected_type) + assert isinstance(stream, expected_stream_type) + retriever = get_retriever(stream) + assert isinstance(retriever, SimpleRetriever) + stream_slicer = retriever.stream_slicer if expected_stream_type == DeclarativeStream else stream._stream_partition_generator._stream_slicer + assert isinstance(stream_slicer, expected_router_type) if incremental and partition_router: - assert isinstance(stream.retriever.stream_slicer, PerPartitionWithGlobalCursor) + assert isinstance(retriever.stream_slicer, PerPartitionWithGlobalCursor) if isinstance(partition_router, list) and len(partition_router) > 1: assert isinstance( - stream.retriever.stream_slicer._partition_router, CartesianProductStreamSlicer + retriever.stream_slicer._partition_router, CartesianProductStreamSlicer ) - assert len(stream.retriever.stream_slicer._partition_router.stream_slicers) == len( + assert len(retriever.stream_slicer._partition_router.stream_slicers) == len( partition_router ) - elif partition_router and isinstance(partition_router, list) and len(partition_router) > 1: - assert isinstance(stream.retriever.stream_slicer, PerPartitionWithGlobalCursor) - assert len(stream.retriever.stream_slicer.stream_slicerS) == len(partition_router) def test_simple_retriever_emit_log_messages(): @@ -2714,8 +2721,8 @@ def test_create_custom_retriever(): model_type=DeclarativeStreamModel, component_definition=stream_model, config=input_config ) - assert isinstance(stream, DeclarativeStream) - assert isinstance(stream.retriever, MyCustomRetriever) + assert isinstance(stream, DefaultStream) + assert isinstance(stream._stream_partition_generator._partition_factory._retriever, MyCustomRetriever) @freezegun.freeze_time("2021-01-01 00:00:00") @@ -4646,14 +4653,23 @@ def test_create_stream_with_multiple_schema_loaders(): "", resolved_manifest["stream_A"], {} ) - declarative_stream = factory.create_component( + stream = factory.create_component( model_type=DeclarativeStreamModel, component_definition=partition_router_manifest, config=input_config, ) - schema_loader = declarative_stream.schema_loader + schema_loader = get_schema_loader(stream) assert isinstance(schema_loader, CompositeSchemaLoader) assert len(schema_loader.schema_loaders) == 2 assert isinstance(schema_loader.schema_loaders[0], InlineSchemaLoader) assert isinstance(schema_loader.schema_loaders[1], InlineSchemaLoader) + + +def get_schema_loader(stream: DefaultStream): + assert isinstance(stream._stream_partition_generator._partition_factory._schema_loader, SchemaLoaderCachingDecorator) + return stream._stream_partition_generator._partition_factory._schema_loader._decorated + + +def get_retriever(stream: Union[DeclarativeStream, DefaultStream]): + return stream.retriever if isinstance(stream, DeclarativeStream) else stream._stream_partition_generator._partition_factory._retriever diff --git a/unit_tests/sources/declarative/resolvers/test_config_components_resolver.py b/unit_tests/sources/declarative/resolvers/test_config_components_resolver.py index 2f2cbca5b..7e9ae2150 100644 --- a/unit_tests/sources/declarative/resolvers/test_config_components_resolver.py +++ b/unit_tests/sources/declarative/resolvers/test_config_components_resolver.py @@ -383,5 +383,5 @@ def test_component_mapping_conditions(manifest, config, expected_conditional_par for stream in source.streams(config): if stream.name in expected_conditional_params: assert ( - stream.retriever.requester._parameters == expected_conditional_params[stream.name] + stream._stream_partition_generator._partition_factory._retriever.requester._parameters == expected_conditional_params[stream.name] ) diff --git a/unit_tests/sources/declarative/retrievers/test_simple_retriever.py b/unit_tests/sources/declarative/retrievers/test_simple_retriever.py index a1e390177..44f307a32 100644 --- a/unit_tests/sources/declarative/retrievers/test_simple_retriever.py +++ b/unit_tests/sources/declarative/retrievers/test_simple_retriever.py @@ -265,120 +265,6 @@ def test_simple_retriever_resumable_full_refresh_cursor_page_increment( assert retriever.state == {"__ab_full_refresh_sync_complete": True} -@pytest.mark.parametrize( - "initial_state, expected_reset_value, expected_next_page", - [ - pytest.param(None, None, 1, id="test_initial_sync_no_state"), - pytest.param( - { - "next_page_token": "https://for-all-mankind.nasa.com/api/v1/astronauts?next_page=tracy_stevens" - }, - "https://for-all-mankind.nasa.com/api/v1/astronauts?next_page=tracy_stevens", - "https://for-all-mankind.nasa.com/api/v1/astronauts?next_page=gordo_stevens", - id="test_reset_with_next_page_token", - ), - ], -) -def test_simple_retriever_resumable_full_refresh_cursor_reset_cursor_pagination( - initial_state, expected_reset_value, expected_next_page, requests_mock -): - expected_records = [ - Record(data={"name": "ed_baldwin"}, associated_slice=None, stream_name="users"), - Record(data={"name": "danielle_poole"}, associated_slice=None, stream_name="users"), - Record(data={"name": "tracy_stevens"}, associated_slice=None, stream_name="users"), - Record(data={"name": "deke_slayton"}, associated_slice=None, stream_name="users"), - Record(data={"name": "molly_cobb"}, associated_slice=None, stream_name="users"), - Record(data={"name": "gordo_stevens"}, associated_slice=None, stream_name="users"), - Record(data={"name": "margo_madison"}, associated_slice=None, stream_name="users"), - Record(data={"name": "ellen_waverly"}, associated_slice=None, stream_name="users"), - ] - - content = """ -name: users -type: DeclarativeStream -retriever: - type: SimpleRetriever - decoder: - type: JsonDecoder - paginator: - type: "DefaultPaginator" - page_token_option: - type: RequestPath - pagination_strategy: - type: "CursorPagination" - cursor_value: "{{ response.next_page }}" - requester: - path: /astronauts - type: HttpRequester - url_base: "https://for-all-mankind.nasa.com/api/v1" - http_method: GET - authenticator: - type: ApiKeyAuthenticator - api_token: "{{ config['api_key'] }}" - inject_into: - type: RequestOption - field_name: Api-Key - inject_into: header - request_headers: {} - request_body_json: {} - record_selector: - type: RecordSelector - extractor: - type: DpathExtractor - field_path: ["data"] - partition_router: [] -primary_key: [] - """ - - factory = ModelToComponentFactory() - stream_manifest = YamlDeclarativeSource._parse(content) - stream = factory.create_component( - model_type=DeclarativeStreamModel, component_definition=stream_manifest, config={} - ) - response_body = { - "data": [r.data for r in expected_records[:5]], - "next_page": "https://for-all-mankind.nasa.com/api/v1/astronauts?next_page=gordo_stevens", - } - requests_mock.get("https://for-all-mankind.nasa.com/api/v1/astronauts", json=response_body) - requests_mock.get( - "https://for-all-mankind.nasa.com/astronauts?next_page=tracy_stevens", json=response_body - ) - response_body_2 = { - "data": [r.data for r in expected_records[5:]], - } - requests_mock.get( - "https://for-all-mankind.nasa.com/api/v1/astronauts?next_page=gordo_stevens", - json=response_body_2, - ) - stream_slicer = ResumableFullRefreshCursor(parameters={}) - if initial_state: - stream_slicer.set_initial_state(initial_state) - stream.retriever.stream_slices = stream_slicer - stream.retriever.cursor = stream_slicer - stream_slice = list(stream_slicer.stream_slices())[0] - actual_records = [ - r for r in stream.retriever.read_records(records_schema={}, stream_slice=stream_slice) - ] - - assert len(actual_records) == 5 - assert actual_records == expected_records[:5] - assert stream.retriever.state == { - "next_page_token": "https://for-all-mankind.nasa.com/api/v1/astronauts?next_page=gordo_stevens" - } - requests_mock.get( - "https://for-all-mankind.nasa.com/astronauts?next_page=tracy_stevens", json=response_body - ) - requests_mock.get( - "https://for-all-mankind.nasa.com/astronauts?next_page=gordo_stevens", json=response_body_2 - ) - actual_records = [ - r for r in stream.retriever.read_records(records_schema={}, stream_slice=stream_slice) - ] - assert len(actual_records) == 3 - assert actual_records == expected_records[5:] - assert stream.retriever.state == {"__ab_full_refresh_sync_complete": True} - - def test_simple_retriever_resumable_full_refresh_cursor_reset_skip_completed_stream(): expected_records = [ Record(data={"id": "abc"}, associated_slice=None, stream_name="test_stream"), diff --git a/unit_tests/sources/declarative/stream_slicers/test_declarative_partition_generator.py b/unit_tests/sources/declarative/stream_slicers/test_declarative_partition_generator.py index 3ced03a69..ba7b5c478 100644 --- a/unit_tests/sources/declarative/stream_slicers/test_declarative_partition_generator.py +++ b/unit_tests/sources/declarative/stream_slicers/test_declarative_partition_generator.py @@ -6,6 +6,7 @@ from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, Type from airbyte_cdk.sources.declarative.retrievers import Retriever +from airbyte_cdk.sources.declarative.schema import InlineSchemaLoader from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( DeclarativePartitionFactory, ) @@ -15,6 +16,7 @@ _STREAM_NAME = "a_stream_name" _JSON_SCHEMA = {"type": "object", "properties": {}} +_SCHEMA_LOADER = InlineSchemaLoader(_JSON_SCHEMA, {}) _A_STREAM_SLICE = StreamSlice( partition={"partition_key": "partition_value"}, cursor_slice={"cursor_key": "cursor_value"} ) @@ -34,7 +36,7 @@ def test_given_multiple_slices_partition_generator_uses_the_same_retriever(self) message_repository = Mock(spec=MessageRepository) partition_factory = DeclarativePartitionFactory( _STREAM_NAME, - _JSON_SCHEMA, + _SCHEMA_LOADER, retriever, message_repository, ) @@ -49,7 +51,7 @@ def test_given_a_mapping_when_read_then_yield_record(self) -> None: message_repository = Mock(spec=MessageRepository) partition_factory = DeclarativePartitionFactory( _STREAM_NAME, - _JSON_SCHEMA, + _SCHEMA_LOADER, retriever, message_repository, ) @@ -67,7 +69,7 @@ def test_given_not_a_record_when_read_then_send_to_message_repository(self) -> N message_repository = Mock(spec=MessageRepository) partition_factory = DeclarativePartitionFactory( _STREAM_NAME, - _JSON_SCHEMA, + _SCHEMA_LOADER, retriever, message_repository, ) diff --git a/unit_tests/sources/declarative/test_manifest_declarative_source.py b/unit_tests/sources/declarative/test_manifest_declarative_source.py index 8f72cc6a6..51038095d 100644 --- a/unit_tests/sources/declarative/test_manifest_declarative_source.py +++ b/unit_tests/sources/declarative/test_manifest_declarative_source.py @@ -28,12 +28,14 @@ SyncMode, Type, ) +from airbyte_cdk.sources.declarative.concurrent_declarative_source import ConcurrentDeclarativeSource from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import ( ModelToComponentFactory, ) from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever +from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream logger = logging.getLogger("airbyte") @@ -280,8 +282,8 @@ def test_valid_manifest(self): streams = source.streams({}) assert len(streams) == 2 - assert isinstance(streams[0], DeclarativeStream) - assert isinstance(streams[1], DeclarativeStream) + assert isinstance(streams[0], DefaultStream) + assert isinstance(streams[1], DefaultStream) assert ( source.resolved_manifest["description"] == "This is a sample source connector that is very valid." @@ -1289,13 +1291,13 @@ def test_conditional_streams_manifest(self, is_sandbox, expected_stream_count): actual_streams = source.streams(config=config) assert len(actual_streams) == expected_stream_count - assert isinstance(actual_streams[0], DeclarativeStream) + assert isinstance(actual_streams[0], DefaultStream) assert actual_streams[0].name == "students" if is_sandbox: - assert isinstance(actual_streams[1], DeclarativeStream) + assert isinstance(actual_streams[1], DefaultStream) assert actual_streams[1].name == "classrooms" - assert isinstance(actual_streams[2], DeclarativeStream) + assert isinstance(actual_streams[2], DefaultStream) assert actual_streams[2].name == "clubs" assert ( @@ -2202,7 +2204,6 @@ def test_only_parent_streams_use_cache(): def _run_read(manifest: Mapping[str, Any], stream_name: str) -> List[AirbyteMessage]: - source = ManifestDeclarativeSource(source_config=manifest) catalog = ConfiguredAirbyteCatalog( streams=[ ConfiguredAirbyteStream( @@ -2214,7 +2215,10 @@ def _run_read(manifest: Mapping[str, Any], stream_name: str) -> List[AirbyteMess ) ] ) - return list(source.read(logger, {}, catalog, {})) + config = {} + state = {} + source = ConcurrentDeclarativeSource(catalog, config, state, manifest) + return list(source.read(logger, {}, catalog, state)) def test_declarative_component_schema_valid_ref_links(): diff --git a/unit_tests/sources/streams/concurrent/test_default_stream.py b/unit_tests/sources/streams/concurrent/test_default_stream.py index 98255bfe5..129dde27f 100644 --- a/unit_tests/sources/streams/concurrent/test_default_stream.py +++ b/unit_tests/sources/streams/concurrent/test_default_stream.py @@ -45,6 +45,27 @@ def test_get_json_schema(self): json_schema = self._stream.get_json_schema() assert json_schema == self._json_schema + def test_json_schema_is_callable(self): + expected = {"schema": "is callable"} + json_schema_callable = lambda: expected + stream = DefaultStream( + self._partition_generator, + self._name, + json_schema_callable, + self._primary_key, + self._cursor_field, + self._logger, + FinalStateCursor( + stream_name=self._name, + stream_namespace=None, + message_repository=self._message_repository, + ), + ) + + result = stream.get_json_schema() + + assert result == expected + def test_check_for_error_raises_an_exception_if_any_of_the_futures_are_not_done(self): futures = [Mock() for _ in range(3)] for f in futures: From dff25594f0b1310609bffc4646f35b775bd3d08a Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 4 Aug 2025 11:01:36 -0400 Subject: [PATCH 10/35] fix test --- .../test_manifest_declarative_source.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/unit_tests/sources/declarative/test_manifest_declarative_source.py b/unit_tests/sources/declarative/test_manifest_declarative_source.py index 51038095d..97f572510 100644 --- a/unit_tests/sources/declarative/test_manifest_declarative_source.py +++ b/unit_tests/sources/declarative/test_manifest_declarative_source.py @@ -36,6 +36,7 @@ ) from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream +from unit_tests.sources.declarative.parsers.test_model_to_component_factory import get_retriever logger = logging.getLogger("airbyte") @@ -2181,26 +2182,27 @@ def test_only_parent_streams_use_cache(): # Main stream with caching (parent for substream `applications_interviews`) assert streams[0].name == "applications" - assert streams[0].retriever.requester.use_cache + assert get_retriever(streams[0]).requester.use_cache # Substream assert streams[1].name == "applications_interviews" - assert not streams[1].retriever.requester.use_cache + + stream_1_retriever = get_retriever(streams[1]) + assert not stream_1_retriever.requester.use_cache # Parent stream created for substream assert ( - streams[1].retriever.stream_slicer.parent_stream_configs[0].stream.name - == "applications" + stream_1_retriever.stream_slicer.parent_stream_configs[0].stream.name + == "applications" ) assert ( - streams[1] - .retriever.stream_slicer.parent_stream_configs[0] + stream_1_retriever.stream_slicer.parent_stream_configs[0] .stream.retriever.requester.use_cache ) # Main stream without caching assert streams[2].name == "jobs" - assert not streams[2].retriever.requester.use_cache + assert not get_retriever(streams[2]).requester.use_cache def _run_read(manifest: Mapping[str, Any], stream_name: str) -> List[AirbyteMessage]: From 7dc2164e85f6d0513d891ef18f1873e94ceaadb5 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 4 Aug 2025 12:50:39 -0400 Subject: [PATCH 11/35] fix test, format, lint and a bit of mypy --- .../manifest_declarative_source.py | 2 +- .../parsers/model_to_component_factory.py | 28 +++++++++--- .../declarative_partition_generator.py | 8 +++- .../sources/streams/concurrent/adapters.py | 3 ++ .../streams/concurrent/default_stream.py | 10 +++-- .../decoders/test_decoders_memory_usage.py | 7 ++- .../test_model_to_component_factory.py | 45 ++++++++++++------- .../test_config_components_resolver.py | 3 +- .../test_manifest_declarative_source.py | 16 +++---- 9 files changed, 79 insertions(+), 43 deletions(-) diff --git a/airbyte_cdk/sources/declarative/manifest_declarative_source.py b/airbyte_cdk/sources/declarative/manifest_declarative_source.py index 303d12ba4..b1736f371 100644 --- a/airbyte_cdk/sources/declarative/manifest_declarative_source.py +++ b/airbyte_cdk/sources/declarative/manifest_declarative_source.py @@ -298,7 +298,7 @@ def connection_checker(self) -> ConnectionChecker: f"Expected to generate a ConnectionChecker component, but received {check_stream.__class__}" ) - def streams(self, config: Mapping[str, Any]) -> List[Union[Stream, AbstractStream]]: + def streams(self, config: Mapping[str, Any]) -> List[Union[Stream, AbstractStream]]: # type: ignore # we are migrating away from the AbstractSource and are expecting that this will only be called by ConcurrentDeclarativeSource or the Connector Builder """ As a migration step, this method will return both legacy stream (Stream) and concurrent stream (AbstractStream). Once the migration is done, we can probably have this method throw "not implemented" as we figure out how to diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 32c14873a..d53dcd79e 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -545,8 +545,10 @@ StreamSlicer, StreamSlicerTestReadDecorator, ) -from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import \ - StreamSlicerPartitionGenerator, DeclarativePartitionFactory +from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( + DeclarativePartitionFactory, + StreamSlicerPartitionGenerator, +) from airbyte_cdk.sources.declarative.transformations import ( AddFields, RecordTransformation, @@ -608,7 +610,12 @@ WeekClampingStrategy, Weekday, ) -from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField, Cursor, FinalStateCursor +from airbyte_cdk.sources.streams.concurrent.cursor import ( + ConcurrentCursor, + Cursor, + CursorField, + FinalStateCursor, +) from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream from airbyte_cdk.sources.streams.concurrent.helpers import get_primary_key_from_stream from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import ( @@ -2066,7 +2073,11 @@ def create_declarative_stream( options["name"] = model.name schema_loader = DefaultSchemaLoader(config=config, parameters=options) - if isinstance(combined_slicers, PartitionRouter) and not is_parent and not self._emit_connector_builder_messages: + if ( + isinstance(combined_slicers, PartitionRouter) + and not is_parent + and not self._emit_connector_builder_messages + ): # We are starting to migrate streams to instantiate directly the DefaultStream instead of instantiating the # DeclarativeStream and assembling the DefaultStream from that. The plan is the following: # * Streams without partition router nor cursors and streams with only partition router. This is the `isinstance(combined_slicers, PartitionRouter)` condition as the first kind with have a SinglePartitionRouter @@ -2154,7 +2165,9 @@ def _build_incremental_cursor( stream_slicer: Optional[PartitionRouter], config: Config, ) -> Optional[StreamSlicer]: - if model.incremental_sync and (stream_slicer and not isinstance(stream_slicer, SinglePartitionRouter)): + if model.incremental_sync and ( + stream_slicer and not isinstance(stream_slicer, SinglePartitionRouter) + ): if model.retriever.type == "AsyncRetriever": stream_name = model.name or "" stream_namespace = None @@ -2902,7 +2915,10 @@ def create_parent_stream_config( self, model: ParentStreamConfigModel, config: Config, **kwargs: Any ) -> ParentStreamConfig: declarative_stream = self._create_component_from_model( - model.stream, config=config, is_parent=True, **kwargs, + model.stream, + config=config, + is_parent=True, + **kwargs, ) request_option = ( self._create_component_from_model(model.request_option, config=config) diff --git a/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py b/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py index fe76e7ee2..985f2d104 100644 --- a/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +++ b/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py @@ -13,7 +13,6 @@ class SchemaLoaderCachingDecorator(SchemaLoader): - def __init__(self, schema_loader: SchemaLoader): self._decorated = schema_loader self._loaded_schema = None @@ -21,6 +20,9 @@ def __init__(self, schema_loader: SchemaLoader): def get_json_schema(self) -> Mapping[str, Any]: if self._loaded_schema is None: self._loaded_schema = self._decorated.get_json_schema() + + if self._loaded_schema is None: + raise ValueError("Could not load schema") return self._loaded_schema @@ -69,7 +71,9 @@ def __init__( self._hash = SliceHasher.hash(self._stream_name, self._stream_slice) def read(self) -> Iterable[Record]: - for stream_data in self._retriever.read_records(self._schema_loader.get_json_schema(), self._stream_slice): + for stream_data in self._retriever.read_records( + self._schema_loader.get_json_schema(), self._stream_slice + ): if isinstance(stream_data, Mapping): record = ( stream_data diff --git a/airbyte_cdk/sources/streams/concurrent/adapters.py b/airbyte_cdk/sources/streams/concurrent/adapters.py index 6a4682605..c1dea49de 100644 --- a/airbyte_cdk/sources/streams/concurrent/adapters.py +++ b/airbyte_cdk/sources/streams/concurrent/adapters.py @@ -5,6 +5,7 @@ import copy import json import logging +from functools import lru_cache from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Union from typing_extensions import deprecated @@ -195,6 +196,8 @@ def cursor_field(self) -> Union[str, List[str]]: def cursor(self) -> Optional[Cursor]: # type: ignore[override] # StreamFaced expects to use only airbyte_cdk.sources.streams.concurrent.cursor.Cursor return self._cursor + # FIXME the lru_cache seems to be mostly there because of typing issue + @lru_cache(maxsize=None) def get_json_schema(self) -> Mapping[str, Any]: return self._abstract_stream.get_json_schema() diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index bceed08b2..fbbd2b613 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -3,7 +3,7 @@ # from logging import Logger -from typing import Any, Iterable, List, Mapping, Optional, Union, Callable +from typing import Any, Callable, Iterable, List, Mapping, Optional, Union from airbyte_cdk.models import AirbyteStream, SyncMode from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream @@ -109,7 +109,9 @@ def check_availability(self) -> StreamAvailability: f"Cannot attempt to connect to stream {self.name} - no stream slices were found" ) except AirbyteTracedException as error: - return StreamAvailability.unavailable(error.message or error.internal_message or "") + return StreamAvailability.unavailable( + error.message or error.internal_message or "" + ) try: next(iter(partition.read())) @@ -118,4 +120,6 @@ def check_availability(self) -> StreamAvailability: self._logger.info(f"Successfully connected to stream {self.name}, but got 0 records.") return StreamAvailability.available() except AirbyteTracedException as error: - return StreamAvailability.unavailable(error.message or error.internal_message or "") + return StreamAvailability.unavailable( + error.message or error.internal_message or "" + ) diff --git a/unit_tests/sources/declarative/decoders/test_decoders_memory_usage.py b/unit_tests/sources/declarative/decoders/test_decoders_memory_usage.py index 6901c6382..2960c5802 100644 --- a/unit_tests/sources/declarative/decoders/test_decoders_memory_usage.py +++ b/unit_tests/sources/declarative/decoders/test_decoders_memory_usage.py @@ -93,9 +93,8 @@ def get_body(): requests_mock.get("https://for-all-mankind.nasa.com/api/v1/users/users3", body=get_body()) requests_mock.get("https://for-all-mankind.nasa.com/api/v1/users/users4", body=get_body()) - stream_slices = list(stream.stream_slices(sync_mode=SyncMode.full_refresh)) - for stream_slice in stream_slices: - for _ in stream.retriever.read_records(records_schema={}, stream_slice=stream_slice): + for partition in stream.generate_partitions(): + for _ in partition.read(): counter += 1 - assert counter == lines_in_response * len(stream_slices) + assert counter == lines_in_response * 4 # 4 partitions diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index c7d2f8d7a..a1c2da8eb 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -157,7 +157,9 @@ from airbyte_cdk.sources.declarative.schema.schema_loader import SchemaLoader from airbyte_cdk.sources.declarative.spec import Spec from airbyte_cdk.sources.declarative.stream_slicers import StreamSlicerTestReadDecorator -from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import SchemaLoaderCachingDecorator +from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( + SchemaLoaderCachingDecorator, +) from airbyte_cdk.sources.declarative.transformations import AddFields, RemoveFields from airbyte_cdk.sources.declarative.transformations.add_fields import AddedFieldDefinition from airbyte_cdk.sources.declarative.yaml_declarative_source import YamlDeclarativeSource @@ -1768,14 +1770,8 @@ def test_config_with_defaults(): schema_loader = get_schema_loader(stream) assert isinstance(schema_loader, JsonFileSchemaLoader) - assert ( - schema_loader.file_path.string - == "./source_sendgrid/schemas/{{ parameters.name }}.yaml" - ) - assert ( - schema_loader.file_path.default - == "./source_sendgrid/schemas/{{ parameters.name }}.yaml" - ) + assert schema_loader.file_path.string == "./source_sendgrid/schemas/{{ parameters.name }}.yaml" + assert schema_loader.file_path.default == "./source_sendgrid/schemas/{{ parameters.name }}.yaml" assert isinstance(retriever.requester, HttpRequester) assert retriever.requester.http_method == HttpMethod.GET @@ -1785,9 +1781,9 @@ def test_config_with_defaults(): assert isinstance(retriever.record_selector, RecordSelector) assert isinstance(retriever.record_selector.extractor, DpathExtractor) - assert [ - fp.eval(input_config) for fp in retriever.record_selector.extractor._field_path - ] == ["result"] + assert [fp.eval(input_config) for fp in retriever.record_selector.extractor._field_path] == [ + "result" + ] assert isinstance(retriever.paginator, DefaultPaginator) assert retriever.paginator.url_base.string == "https://api.sendgrid.com" @@ -2508,7 +2504,9 @@ def test_default_schema_loader(self): ), ], ) -def test_merge_incremental_and_partition_router(incremental, partition_router, expected_router_type, expected_stream_type): +def test_merge_incremental_and_partition_router( + incremental, partition_router, expected_router_type, expected_stream_type +): stream_model = { "type": "DeclarativeStream", "retriever": { @@ -2542,7 +2540,11 @@ def test_merge_incremental_and_partition_router(incremental, partition_router, e assert isinstance(stream, expected_stream_type) retriever = get_retriever(stream) assert isinstance(retriever, SimpleRetriever) - stream_slicer = retriever.stream_slicer if expected_stream_type == DeclarativeStream else stream._stream_partition_generator._stream_slicer + stream_slicer = ( + retriever.stream_slicer + if expected_stream_type == DeclarativeStream + else stream._stream_partition_generator._stream_slicer + ) assert isinstance(stream_slicer, expected_router_type) if incremental and partition_router: @@ -2722,7 +2724,9 @@ def test_create_custom_retriever(): ) assert isinstance(stream, DefaultStream) - assert isinstance(stream._stream_partition_generator._partition_factory._retriever, MyCustomRetriever) + assert isinstance( + stream._stream_partition_generator._partition_factory._retriever, MyCustomRetriever + ) @freezegun.freeze_time("2021-01-01 00:00:00") @@ -4667,9 +4671,16 @@ def test_create_stream_with_multiple_schema_loaders(): def get_schema_loader(stream: DefaultStream): - assert isinstance(stream._stream_partition_generator._partition_factory._schema_loader, SchemaLoaderCachingDecorator) + assert isinstance( + stream._stream_partition_generator._partition_factory._schema_loader, + SchemaLoaderCachingDecorator, + ) return stream._stream_partition_generator._partition_factory._schema_loader._decorated def get_retriever(stream: Union[DeclarativeStream, DefaultStream]): - return stream.retriever if isinstance(stream, DeclarativeStream) else stream._stream_partition_generator._partition_factory._retriever + return ( + stream.retriever + if isinstance(stream, DeclarativeStream) + else stream._stream_partition_generator._partition_factory._retriever + ) diff --git a/unit_tests/sources/declarative/resolvers/test_config_components_resolver.py b/unit_tests/sources/declarative/resolvers/test_config_components_resolver.py index 7e9ae2150..c9ca1ecd5 100644 --- a/unit_tests/sources/declarative/resolvers/test_config_components_resolver.py +++ b/unit_tests/sources/declarative/resolvers/test_config_components_resolver.py @@ -383,5 +383,6 @@ def test_component_mapping_conditions(manifest, config, expected_conditional_par for stream in source.streams(config): if stream.name in expected_conditional_params: assert ( - stream._stream_partition_generator._partition_factory._retriever.requester._parameters == expected_conditional_params[stream.name] + stream._stream_partition_generator._partition_factory._retriever.requester._parameters + == expected_conditional_params[stream.name] ) diff --git a/unit_tests/sources/declarative/test_manifest_declarative_source.py b/unit_tests/sources/declarative/test_manifest_declarative_source.py index 97f572510..24258f193 100644 --- a/unit_tests/sources/declarative/test_manifest_declarative_source.py +++ b/unit_tests/sources/declarative/test_manifest_declarative_source.py @@ -28,7 +28,9 @@ SyncMode, Type, ) -from airbyte_cdk.sources.declarative.concurrent_declarative_source import ConcurrentDeclarativeSource +from airbyte_cdk.sources.declarative.concurrent_declarative_source import ( + ConcurrentDeclarativeSource, +) from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import ( @@ -2191,14 +2193,10 @@ def test_only_parent_streams_use_cache(): assert not stream_1_retriever.requester.use_cache # Parent stream created for substream - assert ( - stream_1_retriever.stream_slicer.parent_stream_configs[0].stream.name - == "applications" - ) - assert ( - stream_1_retriever.stream_slicer.parent_stream_configs[0] - .stream.retriever.requester.use_cache - ) + assert stream_1_retriever.stream_slicer.parent_stream_configs[0].stream.name == "applications" + assert stream_1_retriever.stream_slicer.parent_stream_configs[ + 0 + ].stream.retriever.requester.use_cache # Main stream without caching assert streams[2].name == "jobs" From 0bfbdfe1ee016e456768a09826dac23fb5d3c441 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 4 Aug 2025 14:12:59 -0400 Subject: [PATCH 12/35] mypy --- .../declarative/concurrent_declarative_source.py | 13 ++++++------- .../parsers/model_to_component_factory.py | 2 +- .../declarative_partition_generator.py | 6 ++---- .../sources/streams/concurrent/default_stream.py | 4 +--- 4 files changed, 10 insertions(+), 15 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index 0ac6299f2..ba4ba1fe2 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -3,7 +3,7 @@ # import logging -from typing import Any, Generic, Iterator, List, Mapping, MutableMapping, Optional, Tuple +from typing import Any, Generic, Iterator, List, Mapping, MutableMapping, Optional, Tuple, Union from airbyte_cdk.models import ( AirbyteCatalog, @@ -28,7 +28,6 @@ PerPartitionWithGlobalCursor, ) from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource -from airbyte_cdk.sources.declarative.models import FileUploader from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( ConcurrencyLevel as ConcurrencyLevelModel, ) @@ -179,7 +178,7 @@ def discover(self, logger: logging.Logger, config: Mapping[str, Any]) -> Airbyte ] ) - def streams(self, config: Mapping[str, Any]) -> List[Stream]: + def streams(self, config: Mapping[str, Any]) -> List[Union[Stream, AbstractStream]]: # type: ignore # we are migrating away from the AbstractSource and are expecting that this will only be called by ConcurrentDeclarativeSource or the Connector Builder """ The `streams` method is used as part of the AbstractSource in the following cases: * ConcurrentDeclarativeSource.check -> ManifestDeclarativeSource.check -> AbstractSource.check -> DeclarativeSource.check_connection -> CheckStream.check_connection -> streams @@ -282,7 +281,7 @@ def _group_streams( partition_generator = StreamSlicerPartitionGenerator( partition_factory=DeclarativePartitionFactory( declarative_stream.name, - declarative_stream.schema_loader, + declarative_stream._schema_loader, # type: ignore # I know it's private property but the public one is optional and we will remove this code soonish retriever, self.message_repository, ), @@ -313,7 +312,7 @@ def _group_streams( partition_generator = StreamSlicerPartitionGenerator( partition_factory=DeclarativePartitionFactory( declarative_stream.name, - declarative_stream.schema_loader, + declarative_stream._schema_loader, # type: ignore # I know it's private property but the public one is optional and we will remove this code soonish retriever, self.message_repository, ), @@ -343,7 +342,7 @@ def _group_streams( partition_generator = StreamSlicerPartitionGenerator( DeclarativePartitionFactory( declarative_stream.name, - declarative_stream.schema_loader, + declarative_stream._schema_loader, # type: ignore # I know it's private property but the public one is optional and we will remove this code soonish declarative_stream.retriever, self.message_repository, ), @@ -403,7 +402,7 @@ def _group_streams( partition_generator = StreamSlicerPartitionGenerator( DeclarativePartitionFactory( declarative_stream.name, - declarative_stream.schema_loader, + declarative_stream._schema_loader, # type: ignore # I know it's private property but the public one is optional and we will remove this code soonish retriever, self.message_repository, ), diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index d53dcd79e..45b75b175 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -1928,7 +1928,7 @@ def create_datetime_based_cursor( ) def create_declarative_stream( - self, model: DeclarativeStreamModel, config: Config, is_parent=False, **kwargs: Any + self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any ) -> Union[DeclarativeStream, AbstractStream]: # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the diff --git a/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py b/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py index 985f2d104..c7e0a24cf 100644 --- a/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +++ b/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py @@ -15,15 +15,13 @@ class SchemaLoaderCachingDecorator(SchemaLoader): def __init__(self, schema_loader: SchemaLoader): self._decorated = schema_loader - self._loaded_schema = None + self._loaded_schema: Optional[Mapping[str, Any]] = None def get_json_schema(self) -> Mapping[str, Any]: if self._loaded_schema is None: self._loaded_schema = self._decorated.get_json_schema() - if self._loaded_schema is None: - raise ValueError("Could not load schema") - return self._loaded_schema + return self._loaded_schema # type: ignore # at that point, we assume the schema will be populated class DeclarativePartitionFactory: diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index fbbd2b613..ca227fd50 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -53,9 +53,7 @@ def cursor_field(self) -> Optional[str]: return self._cursor_field def get_json_schema(self) -> Mapping[str, Any]: - if isinstance(self._json_schema, Callable): - return self._json_schema() - return self._json_schema + return self._json_schema() if callable(self._json_schema) else self._json_schema def as_airbyte_stream(self) -> AirbyteStream: stream = AirbyteStream( From 0b454bb551958f28771f817413313fdc5e83b8c2 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 4 Aug 2025 14:13:33 -0400 Subject: [PATCH 13/35] format --- .../sources/declarative/concurrent_declarative_source.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index ba4ba1fe2..69582c12b 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -178,7 +178,7 @@ def discover(self, logger: logging.Logger, config: Mapping[str, Any]) -> Airbyte ] ) - def streams(self, config: Mapping[str, Any]) -> List[Union[Stream, AbstractStream]]: # type: ignore # we are migrating away from the AbstractSource and are expecting that this will only be called by ConcurrentDeclarativeSource or the Connector Builder + def streams(self, config: Mapping[str, Any]) -> List[Union[Stream, AbstractStream]]: # type: ignore # we are migrating away from the AbstractSource and are expecting that this will only be called by ConcurrentDeclarativeSource or the Connector Builder """ The `streams` method is used as part of the AbstractSource in the following cases: * ConcurrentDeclarativeSource.check -> ManifestDeclarativeSource.check -> AbstractSource.check -> DeclarativeSource.check_connection -> CheckStream.check_connection -> streams From 13c17f4437a56db2eb1105f3d68b9707d1b13db8 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 4 Aug 2025 15:13:35 -0400 Subject: [PATCH 14/35] remove unused line --- .../sources/declarative/parsers/model_to_component_factory.py | 1 - 1 file changed, 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 45b75b175..dcb84c8aa 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2095,7 +2095,6 @@ def create_declarative_stream( ), combined_slicers, ) - FinalStateCursor(stream_name, None, self._message_repository) return DefaultStream( partition_generator=partition_generator, name=stream_name, From fb75765d4dcd2893bd30c13ede714f732b2e884e Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 4 Aug 2025 16:37:58 -0400 Subject: [PATCH 15/35] fix test --- .../sources/declarative/parsers/model_to_component_factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 425ab8da8..35be7ce59 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2244,7 +2244,7 @@ def _build_concurrent_cursor( else: state_transformations = [] - if model.incremental_sync and stream_slicer: + if model.incremental_sync and stream_slicer and not isinstance(stream_slicer, SinglePartitionRouter): return self.create_concurrent_cursor_from_perpartition_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing state_manager=self._connector_state_manager, model_type=DatetimeBasedCursorModel, From c07839529219211417205df7b010aa9d39c2137f Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 4 Aug 2025 16:42:17 -0400 Subject: [PATCH 16/35] lint --- .../declarative/concurrent_declarative_source.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index aad88badb..be1d160b5 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -5,7 +5,18 @@ import logging from dataclasses import dataclass, field from queue import Queue -from typing import Any, ClassVar, Generic, Iterator, List, Mapping, MutableMapping, Optional, Tuple, Union +from typing import ( + Any, + ClassVar, + Generic, + Iterator, + List, + Mapping, + MutableMapping, + Optional, + Tuple, + Union, +) from airbyte_protocol_dataclasses.models import Level From decc557f4af3267cde542da2b070bc80c85f7808 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 4 Aug 2025 16:44:06 -0400 Subject: [PATCH 17/35] format --- .../declarative/parsers/model_to_component_factory.py | 8 ++++++-- .../connector_builder/test_connector_builder_handler.py | 6 +++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 35be7ce59..1cb3cdb7e 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2170,7 +2170,7 @@ def _build_incremental_cursor( ) if model.incremental_sync and ( - stream_slicer and not isinstance(stream_slicer, SinglePartitionRouter) + stream_slicer and not isinstance(stream_slicer, SinglePartitionRouter) ): if model.retriever.type == "AsyncRetriever": stream_name = model.name or "" @@ -2244,7 +2244,11 @@ def _build_concurrent_cursor( else: state_transformations = [] - if model.incremental_sync and stream_slicer and not isinstance(stream_slicer, SinglePartitionRouter): + if ( + model.incremental_sync + and stream_slicer + and not isinstance(stream_slicer, SinglePartitionRouter) + ): return self.create_concurrent_cursor_from_perpartition_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing state_manager=self._connector_state_manager, model_type=DatetimeBasedCursorModel, diff --git a/unit_tests/connector_builder/test_connector_builder_handler.py b/unit_tests/connector_builder/test_connector_builder_handler.py index 4a68645a3..c036c12d3 100644 --- a/unit_tests/connector_builder/test_connector_builder_handler.py +++ b/unit_tests/connector_builder/test_connector_builder_handler.py @@ -785,7 +785,11 @@ def test_config_update() -> None: "refresh_token": "a refresh token", } source = ConcurrentDeclarativeSource( - catalog=None, config=config, state=None, source_config=manifest, emit_connector_builder_messages=True + catalog=None, + config=config, + state=None, + source_config=manifest, + emit_connector_builder_messages=True, ) refresh_request_response = { From b8daf647915364453ba7c58c66fffb49092ad958 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 4 Aug 2025 19:34:06 -0400 Subject: [PATCH 18/35] code review --- airbyte_cdk/sources/streams/concurrent/availability_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/streams/concurrent/availability_strategy.py b/airbyte_cdk/sources/streams/concurrent/availability_strategy.py index 3be77ff05..1068e6a92 100644 --- a/airbyte_cdk/sources/streams/concurrent/availability_strategy.py +++ b/airbyte_cdk/sources/streams/concurrent/availability_strategy.py @@ -13,7 +13,7 @@ def available(cls) -> "StreamAvailability": @classmethod def unavailable(cls, reason: str) -> "StreamAvailability": - return StreamAvailability(False, reason) + return cls(False, reason) def __init__(self, available: bool, reason: Optional[str] = None) -> None: self._available = available From 2bc4b307dd56575ad1014ad9bb17ac22f41e8895 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Tue, 5 Aug 2025 08:11:48 -0400 Subject: [PATCH 19/35] code review --- unit_tests/sources/streams/concurrent/test_default_stream.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unit_tests/sources/streams/concurrent/test_default_stream.py b/unit_tests/sources/streams/concurrent/test_default_stream.py index 98255bfe5..12e2b34f4 100644 --- a/unit_tests/sources/streams/concurrent/test_default_stream.py +++ b/unit_tests/sources/streams/concurrent/test_default_stream.py @@ -313,7 +313,7 @@ def test_given_unknown_error_when_reading_record_when_get_availability_then_rais """ I'm not sure why we handle AirbyteTracedException but not other exceptions but this is to keep feature compatibility with HttpAvailabilityStrategy """ - self._partition_generator.generate.side_effect = ValueError() - self._partition.read.return_value = [] + self._partition_generator.generate.return_value = [self._partition] + self._partition.read.side_effect = ValueError() with pytest.raises(ValueError): self._stream.check_availability() From d9d09f02d4b640def561af742ee2aacc745e3c52 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Tue, 5 Aug 2025 09:48:09 -0400 Subject: [PATCH 20/35] incremental without partition router as DefaultStream --- .../parsers/model_to_component_factory.py | 62 +++++--- .../test_model_to_component_factory.py | 145 +++++++++--------- 2 files changed, 115 insertions(+), 92 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 67ec9f2ac..3a25ba5b0 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -1940,22 +1940,11 @@ def create_declarative_stream( combined_slicers = self._merge_stream_slicers(model=model, config=config) primary_key = model.primary_key.__root__ if model.primary_key else None - stop_condition_on_cursor = ( - model.incremental_sync - and hasattr(model.incremental_sync, "is_data_feed") - and model.incremental_sync.is_data_feed - ) - client_side_filtering_enabled = ( - model.incremental_sync - and hasattr(model.incremental_sync, "is_client_side_incremental") - and model.incremental_sync.is_client_side_incremental + + stream_slicer = self._build_stream_slicer_from_partition_router( + model.retriever, config, stream_name=model.name ) - concurrent_cursor = None - if stop_condition_on_cursor or client_side_filtering_enabled: - stream_slicer = self._build_stream_slicer_from_partition_router( - model.retriever, config, stream_name=model.name - ) - concurrent_cursor = self._build_concurrent_cursor(model, stream_slicer, config) + concurrent_cursor = self._build_concurrent_cursor(model, stream_slicer, config) if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): cursor_model = model.incremental_sync @@ -2030,9 +2019,9 @@ def create_declarative_stream( primary_key=primary_key, stream_slicer=combined_slicers, request_options_provider=request_options_provider, - stop_condition_cursor=concurrent_cursor, + stop_condition_cursor=concurrent_cursor if self._is_stop_condition_on_cursor(model) else None, client_side_incremental_sync={"cursor": concurrent_cursor} - if client_side_filtering_enabled + if self._is_client_side_filtering_enabled(model) else None, transformations=transformations, file_uploader=file_uploader, @@ -2066,17 +2055,30 @@ def create_declarative_stream( schema_loader = DefaultSchemaLoader(config=config, parameters=options) if ( - isinstance(combined_slicers, PartitionRouter) + (isinstance(combined_slicers, PartitionRouter) or isinstance(concurrent_cursor, ConcurrentCursor)) and not is_parent and not self._emit_connector_builder_messages ): # We are starting to migrate streams to instantiate directly the DefaultStream instead of instantiating the # DeclarativeStream and assembling the DefaultStream from that. The plan is the following: # * Streams without partition router nor cursors and streams with only partition router. This is the `isinstance(combined_slicers, PartitionRouter)` condition as the first kind with have a SinglePartitionRouter - # * Streams without partition router but with cursor + # * Streams without partition router but with cursor. This is the `isinstance(concurrent_cursor, ConcurrentCursor)` condition # * Streams with both partition router and cursor # We specifically exclude parent streams here because SubstreamPartitionRouter has not been updated yet # We specifically exclude Connector Builder stuff for now as Brian is working on this anyway + + stream_slicer = concurrent_cursor + if isinstance(retriever, AsyncRetriever): + # The AsyncRetriever only ever worked with a cursor from the concurrent package. Hence, the method + # `_build_incremental_cursor` which we would usually think would return only declarative stuff has a + # special clause and return a concurrent cursor. This stream slicer is passed to AsyncRetriever when + # built because the async retriever has a specific partition router which relies on this stream slicer. + # We can't re-use `concurrent_cursor` because it is a different instance than the one passed in + # AsyncJobPartitionRouter. + stream_slicer = retriever.stream_slicer + elif isinstance(combined_slicers, PartitionRouter): + stream_slicer = combined_slicers + stream_name = model.name or "" partition_generator = StreamSlicerPartitionGenerator( DeclarativePartitionFactory( @@ -2085,18 +2087,19 @@ def create_declarative_stream( retriever, self._message_repository, ), - combined_slicers, + stream_slicer, ) + cursor = concurrent_cursor if concurrent_cursor else FinalStateCursor(stream_name, None, self._message_repository) return DefaultStream( partition_generator=partition_generator, name=stream_name, json_schema=schema_loader.get_json_schema, primary_key=get_primary_key_from_stream(primary_key), - cursor_field=None, + cursor_field=cursor.cursor_field.cursor_field_key if hasattr(cursor, "cursor_field") else "", # FIXME we should have the cursor field has part of the interface of cursor, # FIXME we should have the cursor field has part of the interface of cursor logger=logging.getLogger(f"airbyte.{stream_name}"), # FIXME this is a breaking change compared to the old implementation, - cursor=FinalStateCursor(stream_name, None, self._message_repository), + cursor=cursor, ) cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None @@ -2118,6 +2121,21 @@ def create_declarative_stream( parameters=model.parameters or {}, ) + def _is_stop_condition_on_cursor(self, model): + return ( + model.incremental_sync + and hasattr(model.incremental_sync, "is_data_feed") + and model.incremental_sync.is_data_feed + ) + + def _is_client_side_filtering_enabled(self, model): + client_side_filtering_enabled = ( + model.incremental_sync + and hasattr(model.incremental_sync, "is_client_side_incremental") + and model.incremental_sync.is_client_side_incremental + ) + return client_side_filtering_enabled + def _build_stream_slicer_from_partition_router( self, model: Union[ diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index aa8d0d781..8d9b1f808 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -350,101 +350,102 @@ def test_full_config_stream(): model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=input_config ) - assert isinstance(stream, DeclarativeStream) - assert stream.primary_key == "id" + assert isinstance(stream, DefaultStream) assert stream.name == "lists" - assert stream._stream_cursor_field.string == "created" + assert stream.cursor_field == "created" - assert isinstance(stream.schema_loader, JsonFileSchemaLoader) - assert stream.schema_loader._get_json_filepath() == "./source_sendgrid/schemas/lists.json" + schema_loader = get_schema_loader(stream) + assert isinstance(schema_loader, JsonFileSchemaLoader) + assert schema_loader._get_json_filepath() == "./source_sendgrid/schemas/lists.json" - assert len(stream.retriever.record_selector.transformations) == 1 - add_fields = stream.retriever.record_selector.transformations[0] + retriever = get_retriever(stream) + assert len(retriever.record_selector.transformations) == 1 + add_fields = retriever.record_selector.transformations[0] assert isinstance(add_fields, AddFields) assert add_fields.fields[0].path == ["extra"] assert add_fields.fields[0].value.string == "{{ response.to_add }}" - assert isinstance(stream.retriever, SimpleRetriever) - assert stream.retriever.primary_key == stream.primary_key - assert stream.retriever.name == stream.name + assert isinstance(retriever, SimpleRetriever) + assert retriever.primary_key == "id" + assert retriever.name == stream.name - assert isinstance(stream.retriever.record_selector, RecordSelector) + assert isinstance(retriever.record_selector, RecordSelector) - assert isinstance(stream.retriever.record_selector.extractor, DpathExtractor) - assert isinstance(stream.retriever.record_selector.extractor.decoder, JsonDecoder) + assert isinstance(retriever.record_selector.extractor, DpathExtractor) + assert isinstance(retriever.record_selector.extractor.decoder, JsonDecoder) assert [ - fp.eval(input_config) for fp in stream.retriever.record_selector.extractor._field_path + fp.eval(input_config) for fp in retriever.record_selector.extractor._field_path ] == ["lists"] - assert isinstance(stream.retriever.record_selector.record_filter, RecordFilter) + assert isinstance(retriever.record_selector.record_filter, RecordFilter) assert ( - stream.retriever.record_selector.record_filter._filter_interpolator.condition - == "{{ record['id'] > stream_state['id'] }}" + retriever.record_selector.record_filter._filter_interpolator.condition + == "{{ record['id'] > stream_state['id'] }}" ) - assert isinstance(stream.retriever.paginator, DefaultPaginator) - assert isinstance(stream.retriever.paginator.decoder, PaginationDecoderDecorator) - assert stream.retriever.paginator.page_size_option.field_name.eval(input_config) == "page_size" + assert isinstance(retriever.paginator, DefaultPaginator) + assert isinstance(retriever.paginator.decoder, PaginationDecoderDecorator) + assert retriever.paginator.page_size_option.field_name.eval(input_config) == "page_size" assert ( - stream.retriever.paginator.page_size_option.inject_into - == RequestOptionType.request_parameter + retriever.paginator.page_size_option.inject_into + == RequestOptionType.request_parameter ) - assert isinstance(stream.retriever.paginator.page_token_option, RequestPath) - assert stream.retriever.paginator.url_base.string == "https://api.sendgrid.com/v3/" - assert stream.retriever.paginator.url_base.default == "https://api.sendgrid.com/v3/" + assert isinstance(retriever.paginator.page_token_option, RequestPath) + assert retriever.paginator.url_base.string == "https://api.sendgrid.com/v3/" + assert retriever.paginator.url_base.default == "https://api.sendgrid.com/v3/" - assert isinstance(stream.retriever.paginator.pagination_strategy, CursorPaginationStrategy) + assert isinstance(retriever.paginator.pagination_strategy, CursorPaginationStrategy) assert isinstance( - stream.retriever.paginator.pagination_strategy.decoder, PaginationDecoderDecorator + retriever.paginator.pagination_strategy.decoder, PaginationDecoderDecorator ) assert ( - stream.retriever.paginator.pagination_strategy._cursor_value.string - == "{{ response._metadata.next }}" + retriever.paginator.pagination_strategy._cursor_value.string + == "{{ response._metadata.next }}" ) assert ( - stream.retriever.paginator.pagination_strategy._cursor_value.default - == "{{ response._metadata.next }}" + retriever.paginator.pagination_strategy._cursor_value.default + == "{{ response._metadata.next }}" ) - assert stream.retriever.paginator.pagination_strategy.page_size == 10 + assert retriever.paginator.pagination_strategy.page_size == 10 - assert isinstance(stream.retriever.requester, HttpRequester) - assert stream.retriever.requester.http_method == HttpMethod.GET - assert stream.retriever.requester.name == stream.name - assert stream.retriever.requester._path.string == "{{ next_page_token['next_page_url'] }}" - assert stream.retriever.requester._path.default == "{{ next_page_token['next_page_url'] }}" + assert isinstance(retriever.requester, HttpRequester) + assert retriever.requester.http_method == HttpMethod.GET + assert retriever.requester.name == stream.name + assert retriever.requester._path.string == "{{ next_page_token['next_page_url'] }}" + assert retriever.requester._path.default == "{{ next_page_token['next_page_url'] }}" - assert isinstance(stream.retriever.request_option_provider, DatetimeBasedRequestOptionsProvider) + assert isinstance(retriever.request_option_provider, DatetimeBasedRequestOptionsProvider) assert ( - stream.retriever.request_option_provider.start_time_option.inject_into - == RequestOptionType.request_parameter + retriever.request_option_provider.start_time_option.inject_into + == RequestOptionType.request_parameter ) assert ( - stream.retriever.request_option_provider.start_time_option.field_name.eval( + retriever.request_option_provider.start_time_option.field_name.eval( config=input_config ) - == "after" + == "after" ) assert ( - stream.retriever.request_option_provider.end_time_option.inject_into - == RequestOptionType.request_parameter + retriever.request_option_provider.end_time_option.inject_into + == RequestOptionType.request_parameter ) assert ( - stream.retriever.request_option_provider.end_time_option.field_name.eval( + retriever.request_option_provider.end_time_option.field_name.eval( config=input_config ) - == "before" + == "before" ) - assert stream.retriever.request_option_provider._partition_field_start.string == "start_time" - assert stream.retriever.request_option_provider._partition_field_end.string == "end_time" + assert retriever.request_option_provider._partition_field_start.string == "start_time" + assert retriever.request_option_provider._partition_field_end.string == "end_time" - assert isinstance(stream.retriever.requester.authenticator, BearerAuthenticator) - assert stream.retriever.requester.authenticator.token_provider.get_token() == "verysecrettoken" + assert isinstance(retriever.requester.authenticator, BearerAuthenticator) + assert retriever.requester.authenticator.token_provider.get_token() == "verysecrettoken" assert isinstance( - stream.retriever.requester.request_options_provider, InterpolatedRequestOptionsProvider + retriever.requester.request_options_provider, InterpolatedRequestOptionsProvider ) assert ( - stream.retriever.requester.request_options_provider.request_parameters.get("unit") == "day" + retriever.requester.request_options_provider.request_parameters.get("unit") == "day" ) checker = factory.create_component( @@ -1117,7 +1118,7 @@ def test_incremental_data_feed(): ) assert isinstance( - stream.retriever.paginator.pagination_strategy, StopConditionPaginationStrategyDecorator + get_retriever(stream).paginator.pagination_strategy, StopConditionPaginationStrategyDecorator ) @@ -1198,11 +1199,12 @@ def test_client_side_incremental(): model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=input_config ) + retriever = get_retriever(stream) assert isinstance( - stream.retriever.record_selector.record_filter, ClientSideIncrementalRecordFilterDecorator + retriever.record_selector.record_filter, ClientSideIncrementalRecordFilterDecorator ) - assert stream.retriever.record_selector.transform_before_filtering == True + assert get_retriever(stream).record_selector.transform_before_filtering == True def test_client_side_incremental_with_partition_router(): @@ -2440,8 +2442,8 @@ def test_default_schema_loader(self): "cursor_granularity": "PT0.000001S", }, None, - DatetimeBasedCursor, - DeclarativeStream, + ConcurrentCursor, + DefaultStream, id="test_create_simple_retriever_with_incremental", ), pytest.param( @@ -4130,7 +4132,8 @@ def test_simple_retriever_with_query_properties(): model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=input_config ) - query_properties = stream.retriever.additional_query_properties + retriever = get_retriever(stream) + query_properties = retriever.additional_query_properties assert isinstance(query_properties, QueryProperties) assert query_properties.property_list == [ "first_name", @@ -4141,18 +4144,18 @@ def test_simple_retriever_with_query_properties(): ] assert query_properties.always_include_properties == ["id"] - property_chunking = stream.retriever.additional_query_properties.property_chunking + property_chunking = retriever.additional_query_properties.property_chunking assert isinstance(property_chunking, PropertyChunking) assert property_chunking.property_limit_type == PropertyLimitType.property_count assert property_chunking.property_limit == 3 merge_strategy = ( - stream.retriever.additional_query_properties.property_chunking.record_merge_strategy + retriever.additional_query_properties.property_chunking.record_merge_strategy ) assert isinstance(merge_strategy, GroupByKey) assert merge_strategy.key == ["id"] - request_options_provider = stream.retriever.requester.request_options_provider + request_options_provider = retriever.requester.request_options_provider assert isinstance(request_options_provider, InterpolatedRequestOptionsProvider) # For a better developer experience we allow QueryProperties to be defined on the requester.request_parameters, # but it actually is leveraged by the SimpleRetriever which is why it is not included in the RequestOptionsProvider @@ -4232,27 +4235,28 @@ def test_simple_retriever_with_request_parameters_properties_from_endpoint(): model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=input_config ) - query_properties = stream.retriever.additional_query_properties + retriever = get_retriever(stream) + query_properties = retriever.additional_query_properties assert isinstance(query_properties, QueryProperties) assert query_properties.always_include_properties is None - properties_from_endpoint = stream.retriever.additional_query_properties.property_list + properties_from_endpoint = retriever.additional_query_properties.property_list assert isinstance(properties_from_endpoint, PropertiesFromEndpoint) assert properties_from_endpoint.property_field_path == ["name"] properties_from_endpoint_retriever = ( - stream.retriever.additional_query_properties.property_list.retriever + retriever.additional_query_properties.property_list.retriever ) assert isinstance(properties_from_endpoint_retriever, SimpleRetriever) properties_from_endpoint_requester = ( - stream.retriever.additional_query_properties.property_list.retriever.requester + retriever.additional_query_properties.property_list.retriever.requester ) assert isinstance(properties_from_endpoint_requester, HttpRequester) assert properties_from_endpoint_requester.url_base == "https://api.hubapi.com" assert properties_from_endpoint_requester.path == "/properties/v2/dynamics/properties" - property_chunking = stream.retriever.additional_query_properties.property_chunking + property_chunking = retriever.additional_query_properties.property_chunking assert isinstance(property_chunking, PropertyChunking) assert property_chunking.property_limit_type == PropertyLimitType.property_count assert property_chunking.property_limit == 3 @@ -4320,22 +4324,23 @@ def test_simple_retriever_with_requester_properties_from_endpoint(): model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=input_config ) - query_properties = stream.retriever.additional_query_properties + retriever = get_retriever(stream) + query_properties = retriever.additional_query_properties assert isinstance(query_properties, QueryProperties) assert query_properties.always_include_properties is None assert query_properties.property_chunking is None - properties_from_endpoint = stream.retriever.additional_query_properties.property_list + properties_from_endpoint = retriever.additional_query_properties.property_list assert isinstance(properties_from_endpoint, PropertiesFromEndpoint) assert properties_from_endpoint.property_field_path == ["name"] properties_from_endpoint_retriever = ( - stream.retriever.additional_query_properties.property_list.retriever + retriever.additional_query_properties.property_list.retriever ) assert isinstance(properties_from_endpoint_retriever, SimpleRetriever) properties_from_endpoint_requester = ( - stream.retriever.additional_query_properties.property_list.retriever.requester + retriever.additional_query_properties.property_list.retriever.requester ) assert isinstance(properties_from_endpoint_requester, HttpRequester) assert properties_from_endpoint_requester.url_base == "https://api.hubapi.com" From 1af22644ad638f8408587f506289a0e7646e7089 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Tue, 5 Aug 2025 10:15:39 -0400 Subject: [PATCH 21/35] refactor regarding async stuff --- .../parsers/model_to_component_factory.py | 54 +++++++++-------- .../test_model_to_component_factory.py | 58 ++++++++----------- 2 files changed, 52 insertions(+), 60 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 3a25ba5b0..0703a2e5c 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -94,16 +94,13 @@ ClientSideIncrementalRecordFilterDecorator, ) from airbyte_cdk.sources.declarative.incremental import ( - ChildPartitionResumableFullRefreshCursor, ConcurrentCursorFactory, ConcurrentPerPartitionCursor, CursorFactory, DatetimeBasedCursor, DeclarativeCursor, GlobalSubstreamCursor, - PerPartitionCursor, PerPartitionWithGlobalCursor, - ResumableFullRefreshCursor, ) from airbyte_cdk.sources.declarative.interpolation import InterpolatedString from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping @@ -446,10 +443,6 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( ZipfileDecoder as ZipfileDecoderModel, ) -from airbyte_cdk.sources.declarative.parsers.custom_code_compiler import ( - COMPONENTS_MODULE_NAME, - SDM_COMPONENTS_MODULE_NAME, -) from airbyte_cdk.sources.declarative.partition_routers import ( CartesianProductStreamSlicer, GroupingPartitionRouter, @@ -508,7 +501,7 @@ RequestOptionsProvider, ) from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath -from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod, Requester +from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod from airbyte_cdk.sources.declarative.resolvers import ( ComponentMappingDefinition, ConfigComponentsResolver, @@ -1941,10 +1934,10 @@ def create_declarative_stream( primary_key = model.primary_key.__root__ if model.primary_key else None - stream_slicer = self._build_stream_slicer_from_partition_router( + partition_router = self._build_stream_slicer_from_partition_router( model.retriever, config, stream_name=model.name ) - concurrent_cursor = self._build_concurrent_cursor(model, stream_slicer, config) + concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config) if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): cursor_model = model.incremental_sync @@ -2019,7 +2012,9 @@ def create_declarative_stream( primary_key=primary_key, stream_slicer=combined_slicers, request_options_provider=request_options_provider, - stop_condition_cursor=concurrent_cursor if self._is_stop_condition_on_cursor(model) else None, + stop_condition_cursor=concurrent_cursor + if self._is_stop_condition_on_cursor(model) + else None, client_side_incremental_sync={"cursor": concurrent_cursor} if self._is_client_side_filtering_enabled(model) else None, @@ -2055,7 +2050,10 @@ def create_declarative_stream( schema_loader = DefaultSchemaLoader(config=config, parameters=options) if ( - (isinstance(combined_slicers, PartitionRouter) or isinstance(concurrent_cursor, ConcurrentCursor)) + ( + isinstance(combined_slicers, PartitionRouter) + or isinstance(concurrent_cursor, ConcurrentCursor) + ) and not is_parent and not self._emit_connector_builder_messages ): @@ -2067,7 +2065,9 @@ def create_declarative_stream( # We specifically exclude parent streams here because SubstreamPartitionRouter has not been updated yet # We specifically exclude Connector Builder stuff for now as Brian is working on this anyway + stream_name = model.name or "" stream_slicer = concurrent_cursor + cursor = FinalStateCursor(stream_name, None, self._message_repository) if isinstance(retriever, AsyncRetriever): # The AsyncRetriever only ever worked with a cursor from the concurrent package. Hence, the method # `_build_incremental_cursor` which we would usually think would return only declarative stuff has a @@ -2076,10 +2076,13 @@ def create_declarative_stream( # We can't re-use `concurrent_cursor` because it is a different instance than the one passed in # AsyncJobPartitionRouter. stream_slicer = retriever.stream_slicer + if isinstance(combined_slicers, Cursor): + cursor = combined_slicers elif isinstance(combined_slicers, PartitionRouter): stream_slicer = combined_slicers + else: + cursor = concurrent_cursor - stream_name = model.name or "" partition_generator = StreamSlicerPartitionGenerator( DeclarativePartitionFactory( stream_name, @@ -2089,16 +2092,17 @@ def create_declarative_stream( ), stream_slicer, ) - cursor = concurrent_cursor if concurrent_cursor else FinalStateCursor(stream_name, None, self._message_repository) + return DefaultStream( partition_generator=partition_generator, name=stream_name, json_schema=schema_loader.get_json_schema, primary_key=get_primary_key_from_stream(primary_key), - cursor_field=cursor.cursor_field.cursor_field_key if hasattr(cursor, "cursor_field") else "", # FIXME we should have the cursor field has part of the interface of cursor, - # FIXME we should have the cursor field has part of the interface of cursor + cursor_field=cursor.cursor_field.cursor_field_key + if hasattr(cursor, "cursor_field") + else "", # FIXME we should have the cursor field has part of the interface of cursor, logger=logging.getLogger(f"airbyte.{stream_name}"), - # FIXME this is a breaking change compared to the old implementation, + # FIXME this is a breaking change compared to the old implementation which used the source name instead cursor=cursor, ) @@ -2121,18 +2125,18 @@ def create_declarative_stream( parameters=model.parameters or {}, ) - def _is_stop_condition_on_cursor(self, model): + def _is_stop_condition_on_cursor(self, model: DeclarativeStreamModel) -> bool: return ( - model.incremental_sync - and hasattr(model.incremental_sync, "is_data_feed") - and model.incremental_sync.is_data_feed + model.incremental_sync + and hasattr(model.incremental_sync, "is_data_feed") + and model.incremental_sync.is_data_feed ) - def _is_client_side_filtering_enabled(self, model): + def _is_client_side_filtering_enabled(self, model: DeclarativeStreamModel) -> bool: client_side_filtering_enabled = ( - model.incremental_sync - and hasattr(model.incremental_sync, "is_client_side_incremental") - and model.incremental_sync.is_client_side_incremental + model.incremental_sync + and hasattr(model.incremental_sync, "is_client_side_incremental") + and model.incremental_sync.is_client_side_incremental ) return client_side_filtering_enabled diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index 8d9b1f808..b543354f7 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -373,38 +373,33 @@ def test_full_config_stream(): assert isinstance(retriever.record_selector.extractor, DpathExtractor) assert isinstance(retriever.record_selector.extractor.decoder, JsonDecoder) - assert [ - fp.eval(input_config) for fp in retriever.record_selector.extractor._field_path - ] == ["lists"] + assert [fp.eval(input_config) for fp in retriever.record_selector.extractor._field_path] == [ + "lists" + ] assert isinstance(retriever.record_selector.record_filter, RecordFilter) assert ( - retriever.record_selector.record_filter._filter_interpolator.condition - == "{{ record['id'] > stream_state['id'] }}" + retriever.record_selector.record_filter._filter_interpolator.condition + == "{{ record['id'] > stream_state['id'] }}" ) assert isinstance(retriever.paginator, DefaultPaginator) assert isinstance(retriever.paginator.decoder, PaginationDecoderDecorator) assert retriever.paginator.page_size_option.field_name.eval(input_config) == "page_size" - assert ( - retriever.paginator.page_size_option.inject_into - == RequestOptionType.request_parameter - ) + assert retriever.paginator.page_size_option.inject_into == RequestOptionType.request_parameter assert isinstance(retriever.paginator.page_token_option, RequestPath) assert retriever.paginator.url_base.string == "https://api.sendgrid.com/v3/" assert retriever.paginator.url_base.default == "https://api.sendgrid.com/v3/" assert isinstance(retriever.paginator.pagination_strategy, CursorPaginationStrategy) - assert isinstance( - retriever.paginator.pagination_strategy.decoder, PaginationDecoderDecorator - ) + assert isinstance(retriever.paginator.pagination_strategy.decoder, PaginationDecoderDecorator) assert ( - retriever.paginator.pagination_strategy._cursor_value.string - == "{{ response._metadata.next }}" + retriever.paginator.pagination_strategy._cursor_value.string + == "{{ response._metadata.next }}" ) assert ( - retriever.paginator.pagination_strategy._cursor_value.default - == "{{ response._metadata.next }}" + retriever.paginator.pagination_strategy._cursor_value.default + == "{{ response._metadata.next }}" ) assert retriever.paginator.pagination_strategy.page_size == 10 @@ -416,24 +411,20 @@ def test_full_config_stream(): assert isinstance(retriever.request_option_provider, DatetimeBasedRequestOptionsProvider) assert ( - retriever.request_option_provider.start_time_option.inject_into - == RequestOptionType.request_parameter + retriever.request_option_provider.start_time_option.inject_into + == RequestOptionType.request_parameter ) assert ( - retriever.request_option_provider.start_time_option.field_name.eval( - config=input_config - ) - == "after" + retriever.request_option_provider.start_time_option.field_name.eval(config=input_config) + == "after" ) assert ( - retriever.request_option_provider.end_time_option.inject_into - == RequestOptionType.request_parameter + retriever.request_option_provider.end_time_option.inject_into + == RequestOptionType.request_parameter ) assert ( - retriever.request_option_provider.end_time_option.field_name.eval( - config=input_config - ) - == "before" + retriever.request_option_provider.end_time_option.field_name.eval(config=input_config) + == "before" ) assert retriever.request_option_provider._partition_field_start.string == "start_time" assert retriever.request_option_provider._partition_field_end.string == "end_time" @@ -444,9 +435,7 @@ def test_full_config_stream(): assert isinstance( retriever.requester.request_options_provider, InterpolatedRequestOptionsProvider ) - assert ( - retriever.requester.request_options_provider.request_parameters.get("unit") == "day" - ) + assert retriever.requester.request_options_provider.request_parameters.get("unit") == "day" checker = factory.create_component( model_type=CheckStreamModel, component_definition=manifest["check"], config=input_config @@ -1118,7 +1107,8 @@ def test_incremental_data_feed(): ) assert isinstance( - get_retriever(stream).paginator.pagination_strategy, StopConditionPaginationStrategyDecorator + get_retriever(stream).paginator.pagination_strategy, + StopConditionPaginationStrategyDecorator, ) @@ -4149,9 +4139,7 @@ def test_simple_retriever_with_query_properties(): assert property_chunking.property_limit_type == PropertyLimitType.property_count assert property_chunking.property_limit == 3 - merge_strategy = ( - retriever.additional_query_properties.property_chunking.record_merge_strategy - ) + merge_strategy = retriever.additional_query_properties.property_chunking.record_merge_strategy assert isinstance(merge_strategy, GroupByKey) assert merge_strategy.key == ["id"] From 8c771bb032450bfda33ad694282c0d949df238bf Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Tue, 5 Aug 2025 10:21:25 -0400 Subject: [PATCH 22/35] partially fix mypy --- .../declarative/parsers/model_to_component_factory.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 0703a2e5c..c23c39cab 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2066,8 +2066,8 @@ def create_declarative_stream( # We specifically exclude Connector Builder stuff for now as Brian is working on this anyway stream_name = model.name or "" - stream_slicer = concurrent_cursor - cursor = FinalStateCursor(stream_name, None, self._message_repository) + stream_slicer: StreamSlicer = concurrent_cursor + cursor: Cursor = FinalStateCursor(stream_name, None, self._message_repository) if isinstance(retriever, AsyncRetriever): # The AsyncRetriever only ever worked with a cursor from the concurrent package. Hence, the method # `_build_incremental_cursor` which we would usually think would return only declarative stuff has a @@ -2126,19 +2126,18 @@ def create_declarative_stream( ) def _is_stop_condition_on_cursor(self, model: DeclarativeStreamModel) -> bool: - return ( + return bool( model.incremental_sync and hasattr(model.incremental_sync, "is_data_feed") and model.incremental_sync.is_data_feed ) def _is_client_side_filtering_enabled(self, model: DeclarativeStreamModel) -> bool: - client_side_filtering_enabled = ( + return bool( model.incremental_sync and hasattr(model.incremental_sync, "is_client_side_incremental") and model.incremental_sync.is_client_side_incremental ) - return client_side_filtering_enabled def _build_stream_slicer_from_partition_router( self, From fb40a6b1ee9a937349a500512f4226bdc26750b4 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Tue, 5 Aug 2025 10:24:19 -0400 Subject: [PATCH 23/35] fix mypy --- .../sources/declarative/parsers/model_to_component_factory.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index c23c39cab..1bb75f90c 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -610,6 +610,7 @@ ) from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream from airbyte_cdk.sources.streams.concurrent.helpers import get_primary_key_from_stream +from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import StreamSlicer as ConcurrentStreamSlicer from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import ( CustomFormatConcurrentStreamStateConverter, DateTimeStreamStateConverter, @@ -2066,7 +2067,7 @@ def create_declarative_stream( # We specifically exclude Connector Builder stuff for now as Brian is working on this anyway stream_name = model.name or "" - stream_slicer: StreamSlicer = concurrent_cursor + stream_slicer: ConcurrentStreamSlicer = concurrent_cursor cursor: Cursor = FinalStateCursor(stream_name, None, self._message_repository) if isinstance(retriever, AsyncRetriever): # The AsyncRetriever only ever worked with a cursor from the concurrent package. Hence, the method From 91752839ecb93294c20952159e952d460d0a5383 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Tue, 5 Aug 2025 10:25:45 -0400 Subject: [PATCH 24/35] format --- .../sources/declarative/parsers/model_to_component_factory.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 1bb75f90c..910324067 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -610,7 +610,9 @@ ) from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream from airbyte_cdk.sources.streams.concurrent.helpers import get_primary_key_from_stream -from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import StreamSlicer as ConcurrentStreamSlicer +from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import ( + StreamSlicer as ConcurrentStreamSlicer, +) from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import ( CustomFormatConcurrentStreamStateConverter, DateTimeStreamStateConverter, From 1d84a49637e9fd18fe3ce3e06198aab1c40bcf51 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Tue, 5 Aug 2025 10:35:42 -0400 Subject: [PATCH 25/35] mypy --- .../sources/declarative/parsers/model_to_component_factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 910324067..cc3f698fd 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2069,7 +2069,7 @@ def create_declarative_stream( # We specifically exclude Connector Builder stuff for now as Brian is working on this anyway stream_name = model.name or "" - stream_slicer: ConcurrentStreamSlicer = concurrent_cursor + stream_slicer: ConcurrentStreamSlicer = concurrent_cursor if concurrent_cursor else SinglePartitionRouter() cursor: Cursor = FinalStateCursor(stream_name, None, self._message_repository) if isinstance(retriever, AsyncRetriever): # The AsyncRetriever only ever worked with a cursor from the concurrent package. Hence, the method From 2cba5ffca735be862789e89dadbd2366268b2c0c Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Tue, 5 Aug 2025 10:45:52 -0400 Subject: [PATCH 26/35] fix --- .../sources/declarative/parsers/model_to_component_factory.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index cc3f698fd..3eeb7f8ad 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2069,7 +2069,9 @@ def create_declarative_stream( # We specifically exclude Connector Builder stuff for now as Brian is working on this anyway stream_name = model.name or "" - stream_slicer: ConcurrentStreamSlicer = concurrent_cursor if concurrent_cursor else SinglePartitionRouter() + stream_slicer: ConcurrentStreamSlicer = ( + concurrent_cursor if concurrent_cursor else SinglePartitionRouter(parameters={}) + ) cursor: Cursor = FinalStateCursor(stream_name, None, self._message_repository) if isinstance(retriever, AsyncRetriever): # The AsyncRetriever only ever worked with a cursor from the concurrent package. Hence, the method From 8181c833aca515fa8583c2aa9d5a560b9ad2e67e Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Tue, 5 Aug 2025 14:33:38 -0400 Subject: [PATCH 27/35] fix condition where we might override FinalStateCursor with null --- .../parsers/model_to_component_factory.py | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 3eeb7f8ad..5cf89814f 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -1929,19 +1929,8 @@ def create_datetime_based_cursor( def create_declarative_stream( self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any ) -> Union[DeclarativeStream, AbstractStream]: - # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field - # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the - # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. The custom create methods in - # the factory only support passing arguments to the component constructors, whereas this performs a merge of all slicers into one. - combined_slicers = self._merge_stream_slicers(model=model, config=config) - primary_key = model.primary_key.__root__ if model.primary_key else None - partition_router = self._build_stream_slicer_from_partition_router( - model.retriever, config, stream_name=model.name - ) - concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config) - if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): cursor_model = model.incremental_sync @@ -2008,6 +1997,15 @@ def create_declarative_stream( model=model.file_uploader, config=config ) + # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field + # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the + # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. The custom create methods in + # the factory only support passing arguments to the component constructors, whereas this performs a merge of all slicers into one. + combined_slicers = self._merge_stream_slicers(model=model, config=config) + partition_router = self._build_stream_slicer_from_partition_router( + model.retriever, config, stream_name=model.name + ) + concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config) retriever = self._create_component_from_model( model=model.retriever, config=config, @@ -2085,7 +2083,7 @@ def create_declarative_stream( cursor = combined_slicers elif isinstance(combined_slicers, PartitionRouter): stream_slicer = combined_slicers - else: + elif concurrent_cursor: cursor = concurrent_cursor partition_generator = StreamSlicerPartitionGenerator( From 10796293ea515cd3ca6f5ce17059cbfdc0fcddea Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Wed, 6 Aug 2025 09:52:24 -0400 Subject: [PATCH 28/35] supports_file_transfer --- .../sources/declarative/parsers/model_to_component_factory.py | 1 + 1 file changed, 1 insertion(+) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 67ec9f2ac..ec8f94478 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2097,6 +2097,7 @@ def create_declarative_stream( logger=logging.getLogger(f"airbyte.{stream_name}"), # FIXME this is a breaking change compared to the old implementation, cursor=FinalStateCursor(stream_name, None, self._message_repository), + supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), ) cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None From 7f643e4be73abf8b4d0dca61beab8a5a59e68efa Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Wed, 6 Aug 2025 10:27:46 -0400 Subject: [PATCH 29/35] format --- .../sources/declarative/parsers/model_to_component_factory.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index ec8f94478..f672e06cd 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2097,7 +2097,8 @@ def create_declarative_stream( logger=logging.getLogger(f"airbyte.{stream_name}"), # FIXME this is a breaking change compared to the old implementation, cursor=FinalStateCursor(stream_name, None, self._message_repository), - supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), + supports_file_transfer=hasattr(model, "file_uploader") + and bool(model.file_uploader), ) cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None From 11e3a35603cc2dbbf944d7d2842ab3f637a4473c Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 11 Aug 2025 11:03:26 -0400 Subject: [PATCH 30/35] format --- .../sources/declarative/concurrent_declarative_source.py | 2 +- .../stream_slicers/declarative_partition_generator.py | 4 +++- .../connector_builder/test_connector_builder_handler.py | 6 +++++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index b688ea23c..9a651514b 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -466,7 +466,7 @@ def _group_streams( partition_generator = StreamSlicerPartitionGenerator( DeclarativePartitionFactory( stream_name=declarative_stream.name, - schema_loader=declarative_stream._schema_loader, # type: ignore # We are accessing the private property but the public one is optional and we will remove this code soonish + schema_loader=declarative_stream._schema_loader, # type: ignore # We are accessing the private property but the public one is optional and we will remove this code soonish retriever=retriever, message_repository=self.message_repository, max_records_limit=self._limits.max_records if self._limits else None, diff --git a/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py b/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py index 8afb80813..4a511fe70 100644 --- a/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +++ b/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py @@ -86,7 +86,9 @@ def read(self) -> Iterable[Record]: global total_record_counter if total_record_counter >= self._max_records_limit: return - for stream_data in self._retriever.read_records(self._schema_loader.get_json_schema(), self._stream_slice): + for stream_data in self._retriever.read_records( + self._schema_loader.get_json_schema(), self._stream_slice + ): if self._max_records_limit: if total_record_counter >= self._max_records_limit: break diff --git a/unit_tests/connector_builder/test_connector_builder_handler.py b/unit_tests/connector_builder/test_connector_builder_handler.py index bc846d526..c036c12d3 100644 --- a/unit_tests/connector_builder/test_connector_builder_handler.py +++ b/unit_tests/connector_builder/test_connector_builder_handler.py @@ -785,7 +785,11 @@ def test_config_update() -> None: "refresh_token": "a refresh token", } source = ConcurrentDeclarativeSource( - catalog=None, config=config, state=None, source_config=manifest, emit_connector_builder_messages=True, + catalog=None, + config=config, + state=None, + source_config=manifest, + emit_connector_builder_messages=True, ) refresh_request_response = { From ebb4b288af8b7659140775461e10e52a98e41497 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 11 Aug 2025 11:22:51 -0400 Subject: [PATCH 31/35] more fixes for DefaultStream in Connector Builder --- .../connector_builder/test_reader/reader.py | 2 +- .../parsers/model_to_component_factory.py | 9 ++++++-- .../test_connector_builder_handler.py | 22 ++++++++++++++----- 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/airbyte_cdk/connector_builder/test_reader/reader.py b/airbyte_cdk/connector_builder/test_reader/reader.py index 5c16798a2..e3d43f825 100644 --- a/airbyte_cdk/connector_builder/test_reader/reader.py +++ b/airbyte_cdk/connector_builder/test_reader/reader.py @@ -120,7 +120,7 @@ def run_test_read( deprecation_warnings: List[LogMessage] = source.deprecation_warnings() schema_inferrer = SchemaInferrer( - self._pk_to_nested_and_composite_field(stream.primary_key) if stream else None, + self._pk_to_nested_and_composite_field(stream.primary_key if hasattr(stream, "primary_key") else stream._primary_key) if stream else None, self._cursor_field_to_nested_and_composite_field(stream.cursor_field) if stream else None, diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 58c8e654e..ebafcddb8 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2072,7 +2072,6 @@ def create_declarative_stream( if ( isinstance(combined_slicers, PartitionRouter) and not is_parent - and not self._emit_connector_builder_messages ): # We are starting to migrate streams to instantiate directly the DefaultStream instead of instantiating the # DeclarativeStream and assembling the DefaultStream from that. The plan is the following: @@ -2089,7 +2088,13 @@ def create_declarative_stream( retriever, self._message_repository, ), - combined_slicers, + stream_slicer=cast( + StreamSlicer, + StreamSlicerTestReadDecorator( + wrapped_slicer=combined_slicers, + maximum_number_of_slices=self._limit_slices_fetched or 5, + ), + ), ) return DefaultStream( partition_generator=partition_generator, diff --git a/unit_tests/connector_builder/test_connector_builder_handler.py b/unit_tests/connector_builder/test_connector_builder_handler.py index c036c12d3..4ebdd565a 100644 --- a/unit_tests/connector_builder/test_connector_builder_handler.py +++ b/unit_tests/connector_builder/test_connector_builder_handler.py @@ -7,7 +7,7 @@ import json import logging import os -from typing import List, Literal +from typing import List, Literal, Union from unittest import mock from unittest.mock import MagicMock, patch @@ -17,7 +17,6 @@ from airbyte_cdk import connector_builder from airbyte_cdk.connector_builder.connector_builder_handler import ( - TestLimits, create_source, get_limits, resolve_manifest, @@ -60,6 +59,7 @@ from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever from airbyte_cdk.sources.declarative.stream_slicers import StreamSlicerTestReadDecorator +from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream from airbyte_cdk.test.mock_http import HttpMocker, HttpRequest, HttpResponse from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets, update_secrets from unit_tests.connector_builder.utils import create_configured_catalog @@ -440,6 +440,14 @@ } +def get_retriever(stream: Union[DeclarativeStream, DefaultStream]): + return ( + stream.retriever + if isinstance(stream, DeclarativeStream) + else stream._stream_partition_generator._partition_factory._retriever + ) + + @pytest.fixture def valid_resolve_manifest_config_file(tmp_path): config_file = tmp_path / "config.json" @@ -1130,8 +1138,9 @@ def test_read_source(mock_http_stream): streams = source.streams(config) for s in streams: - assert isinstance(s.retriever, SimpleRetriever) - assert isinstance(s.retriever.stream_slicer, StreamSlicerTestReadDecorator) + retriever = get_retriever(s) + assert isinstance(retriever, SimpleRetriever) + assert isinstance(retriever.stream_slicer, StreamSlicerTestReadDecorator) @patch.object( @@ -1177,8 +1186,9 @@ def test_read_source_single_page_single_slice(mock_http_stream): streams = source.streams(config) for s in streams: - assert isinstance(s.retriever, SimpleRetriever) - assert isinstance(s.retriever.stream_slicer, StreamSlicerTestReadDecorator) + retriever = get_retriever(s) + assert isinstance(retriever, SimpleRetriever) + assert isinstance(retriever.stream_slicer, StreamSlicerTestReadDecorator) @pytest.mark.parametrize( From 6fef39b817818166748cf70f12df3fef41d40e1b Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 11 Aug 2025 11:27:27 -0400 Subject: [PATCH 32/35] mypy and format --- airbyte_cdk/connector_builder/test_reader/reader.py | 6 +++++- .../declarative/parsers/model_to_component_factory.py | 5 +---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/airbyte_cdk/connector_builder/test_reader/reader.py b/airbyte_cdk/connector_builder/test_reader/reader.py index e3d43f825..3ff920208 100644 --- a/airbyte_cdk/connector_builder/test_reader/reader.py +++ b/airbyte_cdk/connector_builder/test_reader/reader.py @@ -120,7 +120,11 @@ def run_test_read( deprecation_warnings: List[LogMessage] = source.deprecation_warnings() schema_inferrer = SchemaInferrer( - self._pk_to_nested_and_composite_field(stream.primary_key if hasattr(stream, "primary_key") else stream._primary_key) if stream else None, + self._pk_to_nested_and_composite_field( + stream.primary_key if hasattr(stream, "primary_key") else stream._primary_key + ) + if stream + else None, # type: ignore # We are accessing the private property here as the primary key is not exposed. We should either expose it or use `as_airbyte_stream` to retrieve it as this is the "official" way where it is exposed in the Airbyte protocol self._cursor_field_to_nested_and_composite_field(stream.cursor_field) if stream else None, diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index ebafcddb8..9dac39011 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2069,10 +2069,7 @@ def create_declarative_stream( options["name"] = model.name schema_loader = DefaultSchemaLoader(config=config, parameters=options) - if ( - isinstance(combined_slicers, PartitionRouter) - and not is_parent - ): + if isinstance(combined_slicers, PartitionRouter) and not is_parent: # We are starting to migrate streams to instantiate directly the DefaultStream instead of instantiating the # DeclarativeStream and assembling the DefaultStream from that. The plan is the following: # * Streams without partition router nor cursors and streams with only partition router. This is the `isinstance(combined_slicers, PartitionRouter)` condition as the first kind with have a SinglePartitionRouter From e31fed969a56b2bfdb81510f235feb8ceef4ffb7 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Mon, 11 Aug 2025 11:36:39 -0400 Subject: [PATCH 33/35] format broke mypy --- airbyte_cdk/connector_builder/test_reader/reader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/connector_builder/test_reader/reader.py b/airbyte_cdk/connector_builder/test_reader/reader.py index 3ff920208..e7399f3f6 100644 --- a/airbyte_cdk/connector_builder/test_reader/reader.py +++ b/airbyte_cdk/connector_builder/test_reader/reader.py @@ -121,10 +121,10 @@ def run_test_read( schema_inferrer = SchemaInferrer( self._pk_to_nested_and_composite_field( - stream.primary_key if hasattr(stream, "primary_key") else stream._primary_key + stream.primary_key if hasattr(stream, "primary_key") else stream._primary_key # type: ignore # We are accessing the private property here as the primary key is not exposed. We should either expose it or use `as_airbyte_stream` to retrieve it as this is the "official" way where it is exposed in the Airbyte protocol ) if stream - else None, # type: ignore # We are accessing the private property here as the primary key is not exposed. We should either expose it or use `as_airbyte_stream` to retrieve it as this is the "official" way where it is exposed in the Airbyte protocol + else None, self._cursor_field_to_nested_and_composite_field(stream.cursor_field) if stream else None, From 1be518b0478e6b729f9c46bc77ba0a9b2484e54e Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Wed, 20 Aug 2025 08:59:04 -0400 Subject: [PATCH 34/35] format --- .../declarative/parsers/model_to_component_factory.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 362937a3c..4742ebc70 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2051,9 +2051,13 @@ def create_declarative_stream( schema_loader = DefaultSchemaLoader(config=config, parameters=options) if ( - isinstance(combined_slicers, PartitionRouter) - or isinstance(concurrent_cursor, ConcurrentCursor) - ) and not self._emit_connector_builder_messages and not is_parent: + ( + isinstance(combined_slicers, PartitionRouter) + or isinstance(concurrent_cursor, ConcurrentCursor) + ) + and not self._emit_connector_builder_messages + and not is_parent + ): # We are starting to migrate streams to instantiate directly the DefaultStream instead of instantiating the # DeclarativeStream and assembling the DefaultStream from that. The plan is the following: # * Streams without partition router nor cursors and streams with only partition router. This is the `isinstance(combined_slicers, PartitionRouter)` condition as the first kind with have a SinglePartitionRouter From 59c1fd82d0a8027a807591a2b727d0a3268018e3 Mon Sep 17 00:00:00 2001 From: "maxime.c" Date: Wed, 20 Aug 2025 09:02:46 -0400 Subject: [PATCH 35/35] lint --- .../declarative_partition_generator.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py b/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py index 466608910..a7ce26143 100644 --- a/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +++ b/airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py @@ -12,18 +12,6 @@ from airbyte_cdk.utils.slice_hasher import SliceHasher -class SchemaLoaderCachingDecorator(SchemaLoader): - def __init__(self, schema_loader: SchemaLoader): - self._decorated = schema_loader - self._loaded_schema: Optional[Mapping[str, Any]] = None - - def get_json_schema(self) -> Mapping[str, Any]: - if self._loaded_schema is None: - self._loaded_schema = self._decorated.get_json_schema() - - return self._loaded_schema # type: ignore # at that point, we assume the schema will be populated - - class SchemaLoaderCachingDecorator(SchemaLoader): def __init__(self, schema_loader: SchemaLoader): self._decorated = schema_loader