From 09e855da48f43a7a67c7bc1f781f131780fac6ff Mon Sep 17 00:00:00 2001 From: maxi297 Date: Fri, 6 Dec 2024 12:41:22 -0500 Subject: [PATCH 1/3] Reduce occurrences of database table is locked errors --- airbyte_cdk/sources/streams/http/http_client.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/airbyte_cdk/sources/streams/http/http_client.py b/airbyte_cdk/sources/streams/http/http_client.py index 4f99bbeba..bf76188a4 100644 --- a/airbyte_cdk/sources/streams/http/http_client.py +++ b/airbyte_cdk/sources/streams/http/http_client.py @@ -138,12 +138,18 @@ def _request_session(self) -> requests.Session: cache_dir = os.getenv(ENV_REQUEST_CACHE_PATH) # Use in-memory cache if cache_dir is not set # This is a non-obvious interface, but it ensures we don't write sql files when running unit tests - if cache_dir: - sqlite_path = str(Path(cache_dir) / self.cache_filename) - else: - sqlite_path = "file::memory:?cache=shared" + # Use in-memory cache if cache_dir is not set + # This is a non-obvious interface, but it ensures we don't write sql files when running unit tests + sqlite_path = ( + str(Path(cache_dir) / self.cache_filename) + if cache_dir + else "file::memory:?cache=shared" + ) + backend = requests_cache.SQLiteCache( + sqlite_path, wal=True + ) # by using `PRAGMA journal_mode=WAL`, we avoid having `database table is locked` errors return CachedLimiterSession( - sqlite_path, backend="sqlite", api_budget=self._api_budget, match_headers=True + sqlite_path, backend=backend, api_budget=self._api_budget, match_headers=True ) else: return LimiterSession(api_budget=self._api_budget) From 8faf59e396b7112c0424fbc5a493a7ba16abf2fa Mon Sep 17 00:00:00 2001 From: maxi297 Date: Fri, 6 Dec 2024 12:59:31 -0500 Subject: [PATCH 2/3] Add fast_save --- airbyte_cdk/sources/streams/http/http_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/streams/http/http_client.py 
b/airbyte_cdk/sources/streams/http/http_client.py index bf76188a4..a3d2399db 100644 --- a/airbyte_cdk/sources/streams/http/http_client.py +++ b/airbyte_cdk/sources/streams/http/http_client.py @@ -146,8 +146,8 @@ def _request_session(self) -> requests.Session: else "file::memory:?cache=shared" ) backend = requests_cache.SQLiteCache( - sqlite_path, wal=True - ) # by using `PRAGMA journal_mode=WAL`, we avoid having `database table is locked` errors + sqlite_path, fast_save=True, wal=True + ) # By using `PRAGMA synchronous=OFF` and `PRAGMA journal_mode=WAL`, we avoid having `database table is locked` errors. Note that those were blindly added at the same time and one or the other might be sufficient to prevent the issues but we have seen good results with both. Feel free to revisit given more information. return CachedLimiterSession( sqlite_path, backend=backend, api_budget=self._api_budget, match_headers=True ) From 82e7d97604f861cfe8f89ace60bd9312fa815fd3 Mon Sep 17 00:00:00 2001 From: maxi297 Date: Fri, 6 Dec 2024 13:03:17 -0500 Subject: [PATCH 3/3] improve description --- airbyte_cdk/sources/streams/http/http_client.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/airbyte_cdk/sources/streams/http/http_client.py b/airbyte_cdk/sources/streams/http/http_client.py index a3d2399db..715484410 100644 --- a/airbyte_cdk/sources/streams/http/http_client.py +++ b/airbyte_cdk/sources/streams/http/http_client.py @@ -145,9 +145,13 @@ def _request_session(self) -> requests.Session: if cache_dir else "file::memory:?cache=shared" ) - backend = requests_cache.SQLiteCache( - sqlite_path, fast_save=True, wal=True - ) # By using `PRAGMA synchronous=OFF` and `PRAGMA journal_mode=WAL`, we avoid having `database table is locked` errors. Note that those were blindly added at the same time and one or the other might be sufficient to prevent the issues but we have seen good results with both. Feel free to revisit given more information. 
+ # By using `PRAGMA synchronous=OFF` and `PRAGMA journal_mode=WAL`, we reduce the possible occurrences of `database table is locked` errors. + # Note that those were blindly added at the same time and one or the other might be sufficient to prevent the issues but we have seen good results with both. Feel free to revisit given more information. + # There are strong signals that `fast_save` might create problems but if the sync crashes, we start back from the beginning in terms of sqlite anyway so the impact should be minimal. Signals are: + # * https://github.com/requests-cache/requests-cache/commit/7fa89ffda300331c37d8fad7f773348a3b5b0236#diff-f43db4a5edf931647c32dec28ea7557aae4cae8444af4b26c8ecbe88d8c925aaR238 + # * https://github.com/requests-cache/requests-cache/commit/7fa89ffda300331c37d8fad7f773348a3b5b0236#diff-2e7f95b7d7be270ff1a8118f817ea3e6663cdad273592e536a116c24e6d23c18R164-R168 + # * `If the application running SQLite crashes, the data will be safe, but the database [might become corrupted](https://www.sqlite.org/howtocorrupt.html#cfgerr) if the operating system crashes or the computer loses power before that data has been written to the disk surface.` in [this description](https://www.sqlite.org/pragma.html#pragma_synchronous). + backend = requests_cache.SQLiteCache(sqlite_path, fast_save=True, wal=True) return CachedLimiterSession( sqlite_path, backend=backend, api_budget=self._api_budget, match_headers=True )