From 96499c31a68f62e328a5ffee7a5fe211ba12f72f Mon Sep 17 00:00:00 2001 From: Li Jiajia Date: Wed, 15 Apr 2026 18:10:01 +0800 Subject: [PATCH 1/8] feat(rest): parse storage-credentials from LoadTableResponse --- src/iceberg/catalog/rest/json_serde.cc | 34 +++++++++++++++ src/iceberg/catalog/rest/types.cc | 3 +- src/iceberg/catalog/rest/types.h | 13 +++++- src/iceberg/test/rest_json_serde_test.cc | 53 ++++++++++++++++++++++-- 4 files changed, 98 insertions(+), 5 deletions(-) diff --git a/src/iceberg/catalog/rest/json_serde.cc b/src/iceberg/catalog/rest/json_serde.cc index 49dbcdd4c..27d32bf83 100644 --- a/src/iceberg/catalog/rest/json_serde.cc +++ b/src/iceberg/catalog/rest/json_serde.cc @@ -71,6 +71,8 @@ constexpr std::string_view kSource = "source"; constexpr std::string_view kDestination = "destination"; constexpr std::string_view kMetadata = "metadata"; constexpr std::string_view kConfig = "config"; +constexpr std::string_view kStorageCredentials = "storage-credentials"; +constexpr std::string_view kPrefix = "prefix"; constexpr std::string_view kIdentifiers = "identifiers"; constexpr std::string_view kOverrides = "overrides"; constexpr std::string_view kDefaults = "defaults"; @@ -695,6 +697,17 @@ Result ToJson(const LoadTableResult& result) { SetOptionalStringField(json, kMetadataLocation, result.metadata_location); ICEBERG_ASSIGN_OR_RAISE(json[kMetadata], ToJson(*result.metadata)); SetContainerField(json, kConfig, result.config); + if (!result.storage_credentials.empty()) { + nlohmann::json creds = nlohmann::json::array(); + for (const auto& cred : result.storage_credentials) { + nlohmann::json entry; + entry[kPrefix] = cred.prefix; + // config is required, so always write it (matches Java). + entry[kConfig] = cred.config; + creds.push_back(std::move(entry)); + } + json[kStorageCredentials] = std::move(creds); + } return json; } @@ -707,6 +720,27 @@ Result LoadTableResultFromJson(const nlohmann::json& json) { ICEBERG_ASSIGN_OR_RAISE(result.metadata, TableMetadataFromJson(metadata_json)); ICEBERG_ASSIGN_OR_RAISE(result.config, GetJsonValueOrDefault(json, kConfig)); + if (auto it = json.find(kStorageCredentials); it != json.end() && !it->is_null()) { + if (!it->is_array()) { + return JsonParseError("Cannot parse storage credentials from non-array: {}", + SafeDumpJson(*it)); + } + for (const auto& entry : *it) { + StorageCredential cred; + ICEBERG_ASSIGN_OR_RAISE(cred.prefix, GetJsonValue(entry, kPrefix)); + ICEBERG_ASSIGN_OR_RAISE(cred.config, + GetJsonValue(entry, kConfig)); + // prefix and config are required by the REST spec; non-empty matches the + // Java reference implementation (Credential.validate()). + if (cred.prefix.empty()) { + return JsonParseError("Invalid storage credential: prefix must be non-empty"); + } + if (cred.config.empty()) { + return JsonParseError("Invalid storage credential: config must be non-empty"); + } + result.storage_credentials.push_back(std::move(cred)); + } + } ICEBERG_RETURN_UNEXPECTED(result.Validate()); return result; } diff --git a/src/iceberg/catalog/rest/types.cc b/src/iceberg/catalog/rest/types.cc index 8d96bccb2..84fba9a7c 100644 --- a/src/iceberg/catalog/rest/types.cc +++ b/src/iceberg/catalog/rest/types.cc @@ -86,7 +86,8 @@ bool CreateTableRequest::operator==(const CreateTableRequest& other) const { } bool LoadTableResult::operator==(const LoadTableResult& other) const { - if (metadata_location != other.metadata_location || config != other.config) { + if (metadata_location != other.metadata_location || config != other.config || + storage_credentials != other.storage_credentials) { return false; } diff --git a/src/iceberg/catalog/rest/types.h b/src/iceberg/catalog/rest/types.h index 7849b366b..0403fc22c 100644 --- a/src/iceberg/catalog/rest/types.h +++ b/src/iceberg/catalog/rest/types.h @@ -180,12 +180,23 @@ struct ICEBERG_REST_EXPORT CreateTableRequest { /// \brief An opaque token that allows clients to make use of pagination for list APIs. using PageToken = std::string; +/// \brief A short-lived credential vended by a REST catalog for a storage +/// location ``prefix`` (clients pick the longest matching prefix); ``config`` +/// holds backend properties such as ``"s3.access-key-id"`` (Iceberg REST spec). +struct ICEBERG_REST_EXPORT StorageCredential { + std::string prefix; + std::unordered_map config; + + bool operator==(const StorageCredential& other) const = default; +}; + /// \brief Result body for table create/load/register APIs. struct ICEBERG_REST_EXPORT LoadTableResult { std::string metadata_location; std::shared_ptr metadata; // required std::unordered_map config; - // TODO(Li Feiyang): Add std::shared_ptr storage_credential; + /// \brief Vended storage credentials, one per URI prefix; empty if none. + std::vector storage_credentials; /// \brief Validates the LoadTableResult. Status Validate() const { diff --git a/src/iceberg/test/rest_json_serde_test.cc b/src/iceberg/test/rest_json_serde_test.cc index 50507dd3a..1dd3ed19b 100644 --- a/src/iceberg/test/rest_json_serde_test.cc +++ b/src/iceberg/test/rest_json_serde_test.cc @@ -82,6 +82,11 @@ static std::shared_ptr MakeSimpleTableMetadata() { }); } +std::string LoadTableJsonWithCredentials(std::string_view storage_credentials) { + return std::string(R"({"storage-credentials":)") + std::string(storage_credentials) + + R"(,"metadata":{"format-version":2,"table-uuid":"test","location":"s3://test","last-sequence-number":0,"last-column-id":1,"last-updated-ms":0,"schemas":[{"type":"struct","schema-id":1,"fields":[{"id":1,"name":"id","type":"int","required":true}]}],"current-schema-id":1,"partition-specs":[{"spec-id":0,"fields":[]}],"default-spec-id":0,"last-partition-id":0,"sort-orders":[{"order-id":0,"fields":[]}],"default-sort-order-id":0}})"; +} + // Test parameter structure for roundtrip tests template struct JsonRoundTripParam { @@ -1116,7 +1121,17 @@ INSTANTIATE_TEST_SUITE_P( .model = {.metadata_location = "s3://bucket/metadata/v1.json", .metadata = MakeSimpleTableMetadata(), .config = {{"warehouse", "s3://bucket/warehouse"}, - {"foo", "bar"}}}}), + {"foo", "bar"}}}}, + LoadTableResultParam{ + .test_name = "WithStorageCredentials", + .expected_json_str = + R"({"metadata":{"current-schema-id":1,"current-snapshot-id":null,"default-sort-order-id":0,"default-spec-id":0,"format-version":2,"last-column-id":1,"last-partition-id":0,"last-sequence-number":0,"last-updated-ms":0,"location":"s3://bucket/test","metadata-log":[],"partition-specs":[{"fields":[],"spec-id":0}],"partition-statistics":[],"properties":{},"refs":{},"schemas":[{"fields":[{"id":1,"name":"id","required":true,"type":"int"}],"schema-id":1,"type":"struct"}],"snapshot-log":[],"snapshots":[],"sort-orders":[{"fields":[],"order-id":0}],"statistics":[],"table-uuid":"test-uuid-1234"},"storage-credentials":[{"config":{"s3.access-key-id":"AKIAtest","s3.region":"us-east-1","s3.secret-access-key":"secret"},"prefix":"s3"}]})", + .model = + {.metadata = MakeSimpleTableMetadata(), + .storage_credentials = {{.prefix = "s3", + .config = {{"s3.access-key-id", "AKIAtest"}, + {"s3.secret-access-key", "secret"}, + {"s3.region", "us-east-1"}}}}}}), [](const ::testing::TestParamInfo& info) { return info.param.test_name; }); @@ -1145,7 +1160,18 @@ INSTANTIATE_TEST_SUITE_P( .json_str = R"({"metadata":{"format-version":2,"table-uuid":"test-uuid-1234","location":"s3://bucket/test","last-sequence-number":0,"last-updated-ms":0,"last-column-id":1,"schemas":[{"type":"struct","schema-id":1,"fields":[{"id":1,"name":"id","type":"int","required":true}]}],"current-schema-id":1,"partition-specs":[{"spec-id":0,"fields":[]}],"default-spec-id":0,"last-partition-id":0,"sort-orders":[{"order-id":0,"fields":[]}],"default-sort-order-id":0,"properties":{}},"config":{"warehouse":"s3://bucket/warehouse"}})", .expected_model = {.metadata = MakeSimpleTableMetadata(), - .config = {{"warehouse", "s3://bucket/warehouse"}}}}), + .config = {{"warehouse", "s3://bucket/warehouse"}}}}, + LoadTableResultDeserializeParam{ + .test_name = "WithStorageCredentials", + .json_str = + R"({"metadata":{"format-version":2,"table-uuid":"test-uuid-1234","location":"s3://bucket/test","last-sequence-number":0,"last-updated-ms":0,"last-column-id":1,"schemas":[{"type":"struct","schema-id":1,"fields":[{"id":1,"name":"id","type":"int","required":true}]}],"current-schema-id":1,"partition-specs":[{"spec-id":0,"fields":[]}],"default-spec-id":0,"last-partition-id":0,"sort-orders":[{"order-id":0,"fields":[]}],"default-sort-order-id":0,"properties":{}},"storage-credentials":[{"prefix":"s3","config":{"s3.access-key-id":"AKIAtest","s3.secret-access-key":"secret","s3.session-token":"token","s3.region":"us-east-1"}}]})", + .expected_model = + {.metadata = MakeSimpleTableMetadata(), + .storage_credentials = {{.prefix = "s3", + .config = {{"s3.access-key-id", "AKIAtest"}, + {"s3.secret-access-key", "secret"}, + {"s3.session-token", "token"}, + {"s3.region", "us-east-1"}}}}}}), [](const ::testing::TestParamInfo& info) { return info.param.test_name; }); @@ -1184,7 +1210,28 @@ INSTANTIATE_TEST_SUITE_P( LoadTableResultInvalidParam{ .test_name = "InvalidMetadataContent", .invalid_json_str = R"({"metadata":{"format-version":"invalid"}})", - .expected_error_message = "type must be number, but is string"}), + .expected_error_message = "type must be number, but is string"}, + LoadTableResultInvalidParam{ + .test_name = "StorageCredentialsNotArray", + .invalid_json_str = LoadTableJsonWithCredentials(R"("oops")"), + .expected_error_message = "Cannot parse storage credentials from non-array"}, + LoadTableResultInvalidParam{ + .test_name = "StorageCredentialMissingPrefix", + .invalid_json_str = LoadTableJsonWithCredentials(R"([{"config":{"k":"v"}}])"), + .expected_error_message = "Missing 'prefix'"}, + LoadTableResultInvalidParam{ + .test_name = "StorageCredentialMissingConfig", + .invalid_json_str = LoadTableJsonWithCredentials(R"([{"prefix":"s3"}])"), + .expected_error_message = "Missing 'config'"}, + LoadTableResultInvalidParam{.test_name = "StorageCredentialEmptyPrefix", + .invalid_json_str = LoadTableJsonWithCredentials( + R"([{"prefix":"","config":{"k":"v"}}])"), + .expected_error_message = "prefix must be non-empty"}, + LoadTableResultInvalidParam{ + .test_name = "StorageCredentialEmptyConfig", + .invalid_json_str = + LoadTableJsonWithCredentials(R"([{"prefix":"s3","config":{}}])"), + .expected_error_message = "config must be non-empty"}), [](const ::testing::TestParamInfo& info) { return info.param.test_name; }); From 9f3442f483b807dcda7f347483946bc9097ab03b Mon Sep 17 00:00:00 2001 From: Li Jiajia Date: Mon, 15 Jun 2026 18:30:33 +0800 Subject: [PATCH 2/8] feat(rest): bind a per-table FileIO from vended credentials --- src/iceberg/arrow/s3/arrow_s3_file_io.cc | 27 +++++++--- src/iceberg/arrow/s3/arrow_s3_internal.h | 44 ++++++++++++++++ src/iceberg/arrow/s3/s3_properties.h | 4 +- src/iceberg/catalog/rest/rest_catalog.cc | 33 ++++++++++-- src/iceberg/catalog/rest/rest_catalog.h | 3 +- src/iceberg/catalog/rest/rest_file_io.cc | 14 ++++++ src/iceberg/catalog/rest/rest_file_io.h | 7 +++ src/iceberg/catalog/rest/type_fwd.h | 1 + src/iceberg/test/arrow_s3_file_io_test.cc | 61 +++++++++++++++++++++++ src/iceberg/test/rest_file_io_test.cc | 32 ++++++++++++ 10 files changed, 214 insertions(+), 12 deletions(-) create mode 100644 src/iceberg/arrow/s3/arrow_s3_internal.h diff --git a/src/iceberg/arrow/s3/arrow_s3_file_io.cc b/src/iceberg/arrow/s3/arrow_s3_file_io.cc index cffd95840..36858f76d 100644 --- a/src/iceberg/arrow/s3/arrow_s3_file_io.cc +++ b/src/iceberg/arrow/s3/arrow_s3_file_io.cc @@ -30,6 +30,7 @@ #include "iceberg/arrow/arrow_io_internal.h" #include "iceberg/arrow/arrow_io_util.h" #include "iceberg/arrow/arrow_status_internal.h" +#include "iceberg/arrow/s3/arrow_s3_internal.h" #include "iceberg/arrow/s3/s3_properties.h" #include "iceberg/util/macros.h" #include "iceberg/util/string_util.h" @@ -74,6 +75,12 @@ Status EnsureS3Initialized() { return {}; } +#endif + +} // namespace + +#if ICEBERG_S3_ENABLED + /// \brief Configure S3Options from a properties map. /// /// \param properties The configuration properties map. @@ -100,15 +107,25 @@ Result<::arrow::fs::S3Options> ConfigureS3Options( } // Configure region - if (const auto* region = FindProperty(properties, S3Properties::kRegion); - region != nullptr) { + // Prefer the standard `client.region`; fall back to legacy `s3.region`. + const auto* region = FindProperty(properties, S3Properties::kClientRegion); + if (region == nullptr) { + region = FindProperty(properties, S3Properties::kRegion); + } + if (region != nullptr) { options.region = *region; } - // Configure endpoint (for MinIO, LocalStack, etc.) + // Configure endpoint (for MinIO, LocalStack, OSS, etc.) if (const auto* endpoint = FindProperty(properties, S3Properties::kEndpoint); endpoint != nullptr) { - options.endpoint_override = *endpoint; + // `s3.endpoint` may be a full URI; Arrow wants host[:port], so strip the scheme. + std::string_view ep = *endpoint; + if (const auto pos = ep.find("://"); pos != std::string_view::npos) { + options.scheme = std::string(ep.substr(0, pos)); + ep = ep.substr(pos + 3); + } + options.endpoint_override = std::string(ep); } else { // Fall back to AWS standard environment variables for endpoint override const char* s3_endpoint_env = std::getenv("AWS_ENDPOINT_URL_S3"); @@ -154,8 +171,6 @@ Result<::arrow::fs::S3Options> ConfigureS3Options( } #endif -} // namespace - Result> MakeS3FileIO( const std::unordered_map& properties) { #if ICEBERG_S3_ENABLED diff --git a/src/iceberg/arrow/s3/arrow_s3_internal.h b/src/iceberg/arrow/s3/arrow_s3_internal.h new file mode 100644 index 000000000..c37663a62 --- /dev/null +++ b/src/iceberg/arrow/s3/arrow_s3_internal.h @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include +#include + +#include "iceberg/iceberg_bundle_export.h" +#include "iceberg/result.h" + +#if ICEBERG_S3_ENABLED +# include +#endif + +namespace iceberg::arrow { + +#if ICEBERG_S3_ENABLED +/// \brief Build Arrow ``S3Options`` from an Iceberg properties map. +/// +/// Production code should use MakeS3FileIO(); this is exposed so the +/// property-to-option mapping (region resolution, endpoint scheme handling, +/// addressing style) can be unit tested without a live S3 endpoint. +ICEBERG_BUNDLE_EXPORT Result<::arrow::fs::S3Options> ConfigureS3Options( + const std::unordered_map& properties); +#endif + +} // namespace iceberg::arrow diff --git a/src/iceberg/arrow/s3/s3_properties.h b/src/iceberg/arrow/s3/s3_properties.h index 53657743d..61248948b 100644 --- a/src/iceberg/arrow/s3/s3_properties.h +++ b/src/iceberg/arrow/s3/s3_properties.h @@ -37,8 +37,10 @@ struct S3Properties { static constexpr std::string_view kSecretAccessKey = "s3.secret-access-key"; /// AWS session token (for temporary credentials) static constexpr std::string_view kSessionToken = "s3.session-token"; - /// AWS region + /// AWS region (legacy, non-standard key kept for compatibility) static constexpr std::string_view kRegion = "s3.region"; + /// AWS region, standard Iceberg client property (preferred over kRegion). + static constexpr std::string_view kClientRegion = "client.region"; /// Custom endpoint override (for MinIO, LocalStack, etc.) static constexpr std::string_view kEndpoint = "s3.endpoint"; /// Whether to use path-style access (needed for MinIO) diff --git a/src/iceberg/catalog/rest/rest_catalog.cc b/src/iceberg/catalog/rest/rest_catalog.cc index 6b479ee03..ad104320c 100644 --- a/src/iceberg/catalog/rest/rest_catalog.cc +++ b/src/iceberg/catalog/rest/rest_catalog.cc @@ -41,6 +41,7 @@ #include "iceberg/catalog/rest/rest_file_io.h" #include "iceberg/catalog/rest/rest_util.h" #include "iceberg/catalog/rest/types.h" +#include "iceberg/file_io_registry.h" #include "iceberg/json_serde_internal.h" #include "iceberg/partition_spec.h" #include "iceberg/result.h" @@ -519,7 +520,26 @@ Result> RestCatalog::TableAuthSession( Result> RestCatalog::TableFileIO( const SessionContext& /*context*/, - const std::unordered_map& table_config) const { + const std::unordered_map& table_config, + const std::vector& storage_credentials) const { + // Longest-prefix S3-family vended credential, matching the Java client's + // VendedCredentialsProvider. When present, build a per-table S3 FileIO from + // catalog + table config + the credential; its ResolvePath resolves oss:// and + // other S3-compatible schemes. + if (const StorageCredential* s3_cred = SelectS3StorageCredential(storage_credentials); + s3_cred != nullptr) { + auto properties = config_.configs(); + for (const auto& [key, value] : table_config) { + properties[key] = value; + } + for (const auto& [key, value] : s3_cred->config) { + properties[key] = value; + } + ICEBERG_ASSIGN_OR_RAISE( + auto io, + FileIORegistry::Load(std::string(FileIORegistry::kArrowS3FileIO), properties)); + return std::shared_ptr(std::move(io)); + } ICEBERG_RETURN_UNEXPECTED(ValidateNoFileIOConfig(table_config)); return file_io_; } @@ -756,7 +776,9 @@ Result> RestCatalog::StageCreateTable( CreateTableInternal(identifier, schema, spec, order, location, properties, /*stage_create=*/true, *contextual_session)); auto table_config = std::move(result.config); - ICEBERG_ASSIGN_OR_RAISE(auto table_io, TableFileIO(context, table_config)); + auto storage_credentials = std::move(result.storage_credentials); + ICEBERG_ASSIGN_OR_RAISE(auto table_io, + TableFileIO(context, table_config, storage_credentials)); ICEBERG_ASSIGN_OR_RAISE( auto table_session, TableAuthSession(identifier, table_config, std::move(contextual_session))); @@ -869,7 +891,9 @@ Result> RestCatalog::MakeTableFromLoadResult( const SessionContext& context, std::shared_ptr contextual_session) { auto table_config = std::move(result.config); - ICEBERG_ASSIGN_OR_RAISE(auto table_io, TableFileIO(context, table_config)); + auto storage_credentials = std::move(result.storage_credentials); + ICEBERG_ASSIGN_OR_RAISE(auto table_io, + TableFileIO(context, table_config, storage_credentials)); ICEBERG_ASSIGN_OR_RAISE( auto table_session, TableAuthSession(identifier, table_config, std::move(contextual_session))); @@ -888,7 +912,8 @@ Result> RestCatalog::MakeTableFromCommitResponse( // TODO(gangwu): If the REST commit response grows table config or // storage credentials, derive a replacement table session/FileIO from that // response. The current table commit response does not define config. - ICEBERG_ASSIGN_OR_RAISE(auto table_io, TableFileIO(context, table_config)); + ICEBERG_ASSIGN_OR_RAISE(auto table_io, + TableFileIO(context, table_config, /*storage_credentials=*/{})); auto table_catalog = std::make_shared( shared_from_this(), context, identifier, table_config, table_session); return Table::Make(identifier, std::move(response.metadata), diff --git a/src/iceberg/catalog/rest/rest_catalog.h b/src/iceberg/catalog/rest/rest_catalog.h index 76d2e54dc..312ad3edc 100644 --- a/src/iceberg/catalog/rest/rest_catalog.h +++ b/src/iceberg/catalog/rest/rest_catalog.h @@ -79,7 +79,8 @@ class ICEBERG_REST_EXPORT RestCatalog final Result> TableFileIO( const SessionContext& context, - const std::unordered_map& table_config) const; + const std::unordered_map& table_config, + const std::vector& storage_credentials) const; Result> ListNamespaces(const Namespace& ns, auth::AuthSession& session) const; diff --git a/src/iceberg/catalog/rest/rest_file_io.cc b/src/iceberg/catalog/rest/rest_file_io.cc index 5fadca1ac..a12561feb 100644 --- a/src/iceberg/catalog/rest/rest_file_io.cc +++ b/src/iceberg/catalog/rest/rest_file_io.cc @@ -20,7 +20,9 @@ #include "iceberg/catalog/rest/rest_file_io.h" #include +#include +#include "iceberg/catalog/rest/types.h" #include "iceberg/file_io_registry.h" #include "iceberg/util/macros.h" @@ -92,4 +94,16 @@ Result> MakeCatalogFileIO(const RestCatalogProperties& c return FileIORegistry::Load(io_impl, config.configs()); } +const StorageCredential* SelectS3StorageCredential( + const std::vector& credentials) { + const StorageCredential* best = nullptr; + for (const auto& cred : credentials) { + if (cred.prefix.starts_with("s3") && + (best == nullptr || cred.prefix.size() > best->prefix.size())) { + best = &cred; + } + } + return best; +} + } // namespace iceberg::rest diff --git a/src/iceberg/catalog/rest/rest_file_io.h b/src/iceberg/catalog/rest/rest_file_io.h index 68482521a..5f0fb9fc2 100644 --- a/src/iceberg/catalog/rest/rest_file_io.h +++ b/src/iceberg/catalog/rest/rest_file_io.h @@ -22,9 +22,11 @@ #include #include #include +#include #include "iceberg/catalog/rest/catalog_properties.h" #include "iceberg/catalog/rest/iceberg_rest_export.h" +#include "iceberg/catalog/rest/types.h" #include "iceberg/file_io.h" #include "iceberg/file_io_registry.h" #include "iceberg/result.h" @@ -44,4 +46,9 @@ ICEBERG_REST_EXPORT std::string_view BuiltinFileIOName(BuiltinFileIOKind kind); ICEBERG_REST_EXPORT Result> MakeCatalogFileIO( const RestCatalogProperties& config); +/// \brief Returns the longest-prefix S3-family vended credential (prefix +/// starting with "s3"), or nullptr if none. +ICEBERG_REST_EXPORT const StorageCredential* SelectS3StorageCredential( + const std::vector& credentials); + } // namespace iceberg::rest diff --git a/src/iceberg/catalog/rest/type_fwd.h b/src/iceberg/catalog/rest/type_fwd.h index ee684b245..783f22750 100644 --- a/src/iceberg/catalog/rest/type_fwd.h +++ b/src/iceberg/catalog/rest/type_fwd.h @@ -28,6 +28,7 @@ struct ErrorResponse; struct CommitTableResponse; struct LoadTableResult; struct OAuthTokenResponse; +struct StorageCredential; class Endpoint; class ErrorHandler; diff --git a/src/iceberg/test/arrow_s3_file_io_test.cc b/src/iceberg/test/arrow_s3_file_io_test.cc index b1caff1e8..e90325e0b 100644 --- a/src/iceberg/test/arrow_s3_file_io_test.cc +++ b/src/iceberg/test/arrow_s3_file_io_test.cc @@ -27,6 +27,7 @@ #include #include "iceberg/arrow/arrow_io_util.h" +#include "iceberg/arrow/s3/arrow_s3_internal.h" #include "iceberg/arrow/s3/s3_properties.h" #include "iceberg/test/matchers.h" @@ -76,6 +77,11 @@ namespace { class ArrowS3FileIOTest : public ::testing::Test { protected: + static void SetUpTestSuite() { + auto io = MakeS3FileIO({}); + ASSERT_THAT(io, IsOk()); + } + static void TearDownTestSuite() { auto status = FinalizeS3(); if (!status.has_value()) { @@ -181,4 +187,59 @@ TEST_F(ArrowS3FileIOTest, MakeS3FileIOWithTimeouts) { ASSERT_THAT(io_res, IsOk()); } +#if ICEBERG_S3_ENABLED +TEST_F(ArrowS3FileIOTest, ConfigureS3OptionsPrefersClientRegionOverS3Region) { + auto result = + ConfigureS3Options({{std::string(S3Properties::kClientRegion), "cn-hangzhou"}, + {std::string(S3Properties::kRegion), "us-east-1"}}); + ASSERT_THAT(result, IsOk()); + EXPECT_EQ(result->region, "cn-hangzhou"); +} + +TEST_F(ArrowS3FileIOTest, ConfigureS3OptionsFallsBackToS3Region) { + auto result = ConfigureS3Options({{std::string(S3Properties::kRegion), "us-east-1"}}); + ASSERT_THAT(result, IsOk()); + EXPECT_EQ(result->region, "us-east-1"); +} + +TEST_F(ArrowS3FileIOTest, ConfigureS3OptionsStripsHttpsEndpointScheme) { + auto result = ConfigureS3Options({{std::string(S3Properties::kEndpoint), + "https://oss-cn-hangzhou.aliyuncs.com:443"}}); + ASSERT_THAT(result, IsOk()); + EXPECT_EQ(result->endpoint_override, "oss-cn-hangzhou.aliyuncs.com:443"); + EXPECT_EQ(result->scheme, "https"); +} + +TEST_F(ArrowS3FileIOTest, ConfigureS3OptionsStripsHttpEndpointScheme) { + auto result = ConfigureS3Options( + {{std::string(S3Properties::kEndpoint), "http://localhost:9000"}}); + ASSERT_THAT(result, IsOk()); + EXPECT_EQ(result->endpoint_override, "localhost:9000"); + EXPECT_EQ(result->scheme, "http"); +} + +TEST_F(ArrowS3FileIOTest, ConfigureS3OptionsKeepsSchemelessEndpoint) { + auto result = + ConfigureS3Options({{std::string(S3Properties::kEndpoint), "localhost:9000"}}); + ASSERT_THAT(result, IsOk()); + EXPECT_EQ(result->endpoint_override, "localhost:9000"); +} + +TEST_F(ArrowS3FileIOTest, + ConfigureS3OptionsPathStyleAccessFalseEnablesVirtualAddressing) { + auto result = + ConfigureS3Options({{std::string(S3Properties::kPathStyleAccess), "false"}}); + ASSERT_THAT(result, IsOk()); + EXPECT_TRUE(result->force_virtual_addressing); +} + +TEST_F(ArrowS3FileIOTest, + ConfigureS3OptionsPathStyleAccessTrueDisablesVirtualAddressing) { + auto result = + ConfigureS3Options({{std::string(S3Properties::kPathStyleAccess), "true"}}); + ASSERT_THAT(result, IsOk()); + EXPECT_FALSE(result->force_virtual_addressing); +} +#endif + } // namespace iceberg::arrow diff --git a/src/iceberg/test/rest_file_io_test.cc b/src/iceberg/test/rest_file_io_test.cc index b1193d9f8..50d33e383 100644 --- a/src/iceberg/test/rest_file_io_test.cc +++ b/src/iceberg/test/rest_file_io_test.cc @@ -19,9 +19,12 @@ #include "iceberg/catalog/rest/rest_file_io.h" +#include + #include #include +#include "iceberg/catalog/rest/types.h" #include "iceberg/file_io_registry.h" #include "iceberg/test/matchers.h" @@ -147,4 +150,33 @@ TEST(RestFileIOTest, MakeCatalogFileIOSkipsCheckWhenWarehouseAbsent) { ASSERT_THAT(result, IsOk()); } +TEST(RestFileIOTest, SelectS3StorageCredentialPicksLongestS3Prefix) { + std::vector credentials = { + {.prefix = "s3", .config = {{"s3.access-key-id", "a"}}}, + {.prefix = "s3://bucket/data", .config = {{"s3.access-key-id", "b"}}}, + {.prefix = "s3://bucket", .config = {{"s3.access-key-id", "c"}}}, + }; + const auto* cred = SelectS3StorageCredential(credentials); + ASSERT_NE(cred, nullptr); + EXPECT_EQ(cred->prefix, "s3://bucket/data"); +} + +TEST(RestFileIOTest, SelectS3StorageCredentialIgnoresNonS3Prefixes) { + std::vector credentials = { + {.prefix = "gs://bucket", .config = {{"k", "v"}}}, + {.prefix = "s3", .config = {{"s3.access-key-id", "a"}}}, + }; + const auto* cred = SelectS3StorageCredential(credentials); + ASSERT_NE(cred, nullptr); + EXPECT_EQ(cred->prefix, "s3"); +} + +TEST(RestFileIOTest, SelectS3StorageCredentialReturnsNullWhenNoneMatch) { + std::vector credentials = { + {.prefix = "gs://bucket", .config = {{"k", "v"}}}, + }; + EXPECT_EQ(SelectS3StorageCredential(credentials), nullptr); + EXPECT_EQ(SelectS3StorageCredential({}), nullptr); +} + } // namespace iceberg::rest From 8f6546cc78191c48c82f3a449d3abef95b2e8654 Mon Sep 17 00:00:00 2001 From: Li Jiajia Date: Tue, 16 Jun 2026 17:58:40 +0800 Subject: [PATCH 3/8] address review comments --- src/iceberg/arrow/s3/arrow_s3_file_io.cc | 3 ++- src/iceberg/catalog/rest/json_serde.cc | 4 ++-- src/iceberg/catalog/rest/rest_catalog.cc | 18 ++++-------------- src/iceberg/catalog/rest/rest_file_io.cc | 15 +++++++++++++++ src/iceberg/catalog/rest/rest_file_io.h | 8 ++++++++ src/iceberg/test/arrow_s3_file_io_test.cc | 2 ++ src/iceberg/test/rest_file_io_test.cc | 21 +++++++++++++++++++++ 7 files changed, 54 insertions(+), 17 deletions(-) diff --git a/src/iceberg/arrow/s3/arrow_s3_file_io.cc b/src/iceberg/arrow/s3/arrow_s3_file_io.cc index 36858f76d..5392f65fa 100644 --- a/src/iceberg/arrow/s3/arrow_s3_file_io.cc +++ b/src/iceberg/arrow/s3/arrow_s3_file_io.cc @@ -145,7 +145,8 @@ Result<::arrow::fs::S3Options> ConfigureS3Options( options.force_virtual_addressing = !*path_style_access; } - // Configure SSL + // Configure SSL. Explicit `s3.ssl.enabled` overrides any scheme derived from + // the endpoint above. ICEBERG_ASSIGN_OR_RAISE(const auto ssl_enabled, ParseOptionalBool(properties, S3Properties::kSslEnabled)); if (ssl_enabled.has_value() && !*ssl_enabled) { diff --git a/src/iceberg/catalog/rest/json_serde.cc b/src/iceberg/catalog/rest/json_serde.cc index 27d32bf83..1b2b4bd2e 100644 --- a/src/iceberg/catalog/rest/json_serde.cc +++ b/src/iceberg/catalog/rest/json_serde.cc @@ -722,8 +722,8 @@ Result LoadTableResultFromJson(const nlohmann::json& json) { GetJsonValueOrDefault(json, kConfig)); if (auto it = json.find(kStorageCredentials); it != json.end() && !it->is_null()) { if (!it->is_array()) { - return JsonParseError("Cannot parse storage credentials from non-array: {}", - SafeDumpJson(*it)); + // Don't echo the value — it may carry credential material. + return JsonParseError("Cannot parse storage credentials from non-array"); } for (const auto& entry : *it) { StorageCredential cred; diff --git a/src/iceberg/catalog/rest/rest_catalog.cc b/src/iceberg/catalog/rest/rest_catalog.cc index ad104320c..e9a4f2420 100644 --- a/src/iceberg/catalog/rest/rest_catalog.cc +++ b/src/iceberg/catalog/rest/rest_catalog.cc @@ -41,7 +41,6 @@ #include "iceberg/catalog/rest/rest_file_io.h" #include "iceberg/catalog/rest/rest_util.h" #include "iceberg/catalog/rest/types.h" -#include "iceberg/file_io_registry.h" #include "iceberg/json_serde_internal.h" #include "iceberg/partition_spec.h" #include "iceberg/result.h" @@ -522,22 +521,13 @@ Result> RestCatalog::TableFileIO( const SessionContext& /*context*/, const std::unordered_map& table_config, const std::vector& storage_credentials) const { - // Longest-prefix S3-family vended credential, matching the Java client's - // VendedCredentialsProvider. When present, build a per-table S3 FileIO from - // catalog + table config + the credential; its ResolvePath resolves oss:// and - // other S3-compatible schemes. + // Longest-prefix "s3" vended credential. Java's VendedCredentialsProvider + // resolves per path against the table location; we bind one at load time + // (fine for the common one-credential-per-table case). if (const StorageCredential* s3_cred = SelectS3StorageCredential(storage_credentials); s3_cred != nullptr) { - auto properties = config_.configs(); - for (const auto& [key, value] : table_config) { - properties[key] = value; - } - for (const auto& [key, value] : s3_cred->config) { - properties[key] = value; - } ICEBERG_ASSIGN_OR_RAISE( - auto io, - FileIORegistry::Load(std::string(FileIORegistry::kArrowS3FileIO), properties)); + auto io, MakeS3FileIOFromCredential(config_.configs(), table_config, *s3_cred)); return std::shared_ptr(std::move(io)); } ICEBERG_RETURN_UNEXPECTED(ValidateNoFileIOConfig(table_config)); diff --git a/src/iceberg/catalog/rest/rest_file_io.cc b/src/iceberg/catalog/rest/rest_file_io.cc index a12561feb..a1002c296 100644 --- a/src/iceberg/catalog/rest/rest_file_io.cc +++ b/src/iceberg/catalog/rest/rest_file_io.cc @@ -20,6 +20,7 @@ #include "iceberg/catalog/rest/rest_file_io.h" #include +#include #include #include "iceberg/catalog/rest/types.h" @@ -106,4 +107,18 @@ const StorageCredential* SelectS3StorageCredential( return best; } +Result> MakeS3FileIOFromCredential( + const std::unordered_map& catalog_config, + const std::unordered_map& table_config, + const StorageCredential& s3_cred) { + auto properties = catalog_config; + for (const auto& [key, value] : table_config) { + properties[key] = value; + } + for (const auto& [key, value] : s3_cred.config) { + properties[key] = value; + } + return FileIORegistry::Load(std::string(FileIORegistry::kArrowS3FileIO), properties); +} + } // namespace iceberg::rest diff --git a/src/iceberg/catalog/rest/rest_file_io.h b/src/iceberg/catalog/rest/rest_file_io.h index 5f0fb9fc2..e4f52766e 100644 --- a/src/iceberg/catalog/rest/rest_file_io.h +++ b/src/iceberg/catalog/rest/rest_file_io.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "iceberg/catalog/rest/catalog_properties.h" @@ -51,4 +52,11 @@ ICEBERG_REST_EXPORT Result> MakeCatalogFileIO( ICEBERG_REST_EXPORT const StorageCredential* SelectS3StorageCredential( const std::vector& credentials); +/// \brief Builds an `arrow-fs-s3` FileIO, merging catalog, table, then +/// credential config (later wins). +ICEBERG_REST_EXPORT Result> MakeS3FileIOFromCredential( + const std::unordered_map& catalog_config, + const std::unordered_map& table_config, + const StorageCredential& s3_cred); + } // namespace iceberg::rest diff --git a/src/iceberg/test/arrow_s3_file_io_test.cc b/src/iceberg/test/arrow_s3_file_io_test.cc index e90325e0b..b4e64d505 100644 --- a/src/iceberg/test/arrow_s3_file_io_test.cc +++ b/src/iceberg/test/arrow_s3_file_io_test.cc @@ -77,10 +77,12 @@ namespace { class ArrowS3FileIOTest : public ::testing::Test { protected: +#if ICEBERG_S3_ENABLED static void SetUpTestSuite() { auto io = MakeS3FileIO({}); ASSERT_THAT(io, IsOk()); } +#endif static void TearDownTestSuite() { auto status = FinalizeS3(); diff --git a/src/iceberg/test/rest_file_io_test.cc b/src/iceberg/test/rest_file_io_test.cc index 50d33e383..ac70a934f 100644 --- a/src/iceberg/test/rest_file_io_test.cc +++ b/src/iceberg/test/rest_file_io_test.cc @@ -19,6 +19,8 @@ #include "iceberg/catalog/rest/rest_file_io.h" +#include +#include #include #include @@ -179,4 +181,23 @@ TEST(RestFileIOTest, SelectS3StorageCredentialReturnsNullWhenNoneMatch) { EXPECT_EQ(SelectS3StorageCredential({}), nullptr); } +TEST(RestFileIOTest, MakeS3FileIOFromCredentialMergesConfigWithPrecedence) { + auto captured = std::make_shared>(); + FileIORegistry::Register( + std::string(FileIORegistry::kArrowS3FileIO), + [captured](const std::unordered_map& properties) + -> Result> { + *captured = properties; + return std::make_unique(); + }); + auto result = MakeS3FileIOFromCredential( + {{"a", "catalog"}, {"shared", "catalog"}}, {{"b", "table"}, {"shared", "table"}}, + StorageCredential{.prefix = "s3", .config = {{"c", "cred"}, {"shared", "cred"}}}); + ASSERT_THAT(result, IsOk()); + EXPECT_EQ((*captured)["a"], "catalog"); + EXPECT_EQ((*captured)["b"], "table"); + EXPECT_EQ((*captured)["c"], "cred"); + EXPECT_EQ((*captured)["shared"], "cred"); +} + } // namespace iceberg::rest From 02b74d1445e98b11d594bff8867254fe85a7f4c9 Mon Sep 17 00:00:00 2001 From: Li Jiajia Date: Tue, 23 Jun 2026 00:34:10 -0400 Subject: [PATCH 4/8] fix(rest): preserve vended credentials on commit and match by location --- src/iceberg/catalog/rest/rest_catalog.cc | 69 ++++++++++++++++-------- src/iceberg/catalog/rest/rest_catalog.h | 5 +- src/iceberg/catalog/rest/rest_file_io.cc | 35 ++++++++++-- src/iceberg/catalog/rest/rest_file_io.h | 12 ++++- src/iceberg/test/rest_file_io_test.cc | 63 ++++++++++++++++++++-- 5 files changed, 150 insertions(+), 34 deletions(-) diff --git a/src/iceberg/catalog/rest/rest_catalog.cc b/src/iceberg/catalog/rest/rest_catalog.cc index e9a4f2420..36950c49d 100644 --- a/src/iceberg/catalog/rest/rest_catalog.cc +++ b/src/iceberg/catalog/rest/rest_catalog.cc @@ -20,6 +20,7 @@ #include "iceberg/catalog/rest/rest_catalog.h" #include +#include #include #include #include @@ -51,6 +52,7 @@ #include "iceberg/table_requirements.h" #include "iceberg/table_update.h" #include "iceberg/transaction.h" +#include "iceberg/util/formatter_internal.h" #include "iceberg/util/macros.h" namespace iceberg::rest { @@ -285,12 +287,14 @@ class RestCatalog::TableScopedCatalog final TableScopedCatalog(std::shared_ptr root, SessionContext context, TableIdentifier identifier, std::unordered_map table_config, - std::shared_ptr table_session) + std::shared_ptr table_session, + std::shared_ptr table_io) : root_(std::move(root)), context_(std::move(context)), identifier_(std::move(identifier)), table_config_(std::move(table_config)), - table_session_(std::move(table_session)) {} + table_session_(std::move(table_session)), + table_io_(std::move(table_io)) {} std::string_view name() const override { return root_->name(); } @@ -348,7 +352,7 @@ class RestCatalog::TableScopedCatalog final auto response, root_->UpdateTableInternal(identifier, requirements, updates, *table_session_)); return root_->MakeTableFromCommitResponse(identifier, std::move(response), context_, - table_config_, table_session_); + table_config_, table_session_, table_io_); } Result> StageCreateTable( @@ -395,6 +399,7 @@ class RestCatalog::TableScopedCatalog final TableIdentifier identifier_; std::unordered_map table_config_; std::shared_ptr table_session_; + std::shared_ptr table_io_; }; RestCatalog::~RestCatalog() { @@ -520,16 +525,31 @@ Result> RestCatalog::TableAuthSession( Result> RestCatalog::TableFileIO( const SessionContext& /*context*/, const std::unordered_map& table_config, - const std::vector& storage_credentials) const { - // Longest-prefix "s3" vended credential. Java's VendedCredentialsProvider - // resolves per path against the table location; we bind one at load time - // (fine for the common one-credential-per-table case). - if (const StorageCredential* s3_cred = SelectS3StorageCredential(storage_credentials); + const std::vector& storage_credentials, + std::string_view storage_location) const { + // Bind one S3 FileIO from the vended credential whose prefix best matches the + // table location (the common one-credential-per-table case). + // TODO: support per-path credential routing, STS refresh via credentials.uri, + // and non-S3 (GCS/ADLS) credentials, like Java's S3FileIO. + if (const StorageCredential* s3_cred = + SelectS3StorageCredential(storage_credentials, storage_location); s3_cred != nullptr) { ICEBERG_ASSIGN_OR_RAISE( auto io, MakeS3FileIOFromCredential(config_.configs(), table_config, *s3_cred)); return std::shared_ptr(std::move(io)); } + + // Only non-S3 (e.g. GCS/ADLS) credentials were vended and we can't honor them; + // fail fast. (S3 credentials that simply don't match fall through below.) + if (HasOnlyNonS3StorageCredentials(storage_credentials)) { + auto prefixes = + storage_credentials | std::views::transform(&StorageCredential::prefix); + return NotSupported( + "Vended storage credentials {} are unsupported (only S3-family " + "credentials are supported)", + FormatRange(prefixes, ", ", "[", "]")); + } + ICEBERG_RETURN_UNEXPECTED(ValidateNoFileIOConfig(table_config)); return file_io_; } @@ -750,8 +770,10 @@ Result> RestCatalog::UpdateTable( ICEBERG_ASSIGN_OR_RAISE( auto table_session, TableAuthSession(identifier, table_config, std::move(contextual_session))); + // No table was loaded here, so there is no per-table FileIO to reuse; use the + // catalog FileIO (table_config carries no credentials). return MakeTableFromCommitResponse(identifier, std::move(response), context, - table_config, std::move(table_session)); + table_config, std::move(table_session), file_io_); } Result> RestCatalog::StageCreateTable( @@ -767,13 +789,15 @@ Result> RestCatalog::StageCreateTable( /*stage_create=*/true, *contextual_session)); auto table_config = std::move(result.config); auto storage_credentials = std::move(result.storage_credentials); - ICEBERG_ASSIGN_OR_RAISE(auto table_io, - TableFileIO(context, table_config, storage_credentials)); + ICEBERG_ASSIGN_OR_RAISE( + auto table_io, + TableFileIO(context, table_config, storage_credentials, result.metadata->location)); ICEBERG_ASSIGN_OR_RAISE( auto table_session, TableAuthSession(identifier, table_config, std::move(contextual_session))); auto table_catalog = std::make_shared( - shared_from_this(), context, identifier, table_config, std::move(table_session)); + shared_from_this(), context, identifier, table_config, std::move(table_session), + table_io); ICEBERG_ASSIGN_OR_RAISE( auto staged_table, StagedTable::Make(identifier, std::move(result.metadata), @@ -882,13 +906,14 @@ Result> RestCatalog::MakeTableFromLoadResult( std::shared_ptr contextual_session) { auto table_config = std::move(result.config); auto storage_credentials = std::move(result.storage_credentials); - ICEBERG_ASSIGN_OR_RAISE(auto table_io, - TableFileIO(context, table_config, storage_credentials)); + ICEBERG_ASSIGN_OR_RAISE( + auto table_io, + TableFileIO(context, table_config, storage_credentials, result.metadata->location)); ICEBERG_ASSIGN_OR_RAISE( auto table_session, TableAuthSession(identifier, table_config, std::move(contextual_session))); auto table_catalog = std::make_shared( - shared_from_this(), context, identifier, table_config, table_session); + shared_from_this(), context, identifier, table_config, table_session, table_io); return Table::Make(identifier, std::move(result.metadata), std::move(result.metadata_location), std::move(table_io), std::move(table_catalog)); @@ -898,14 +923,14 @@ Result> RestCatalog::MakeTableFromCommitResponse( const TableIdentifier& identifier, CommitTableResponse response, const SessionContext& context, const std::unordered_map& table_config, - std::shared_ptr table_session) { - // TODO(gangwu): If the REST commit response grows table config or - // storage credentials, derive a replacement table session/FileIO from that - // response. The current table commit response does not define config. - ICEBERG_ASSIGN_OR_RAISE(auto table_io, - TableFileIO(context, table_config, /*storage_credentials=*/{})); + std::shared_ptr table_session, std::shared_ptr table_io) { + // Reuse the FileIO bound at load: CommitTableResponse carries no config or + // storage credentials, so rebuilding it would drop the vended credentials + // (mirrors Java RESTSessionCatalog#tableFileIO). + // TODO(gangwu): rebuild the FileIO if the commit response ever grows config + // or storage credentials. auto table_catalog = std::make_shared( - shared_from_this(), context, identifier, table_config, table_session); + shared_from_this(), context, identifier, table_config, table_session, table_io); return Table::Make(identifier, std::move(response.metadata), std::move(response.metadata_location), std::move(table_io), std::move(table_catalog)); diff --git a/src/iceberg/catalog/rest/rest_catalog.h b/src/iceberg/catalog/rest/rest_catalog.h index 312ad3edc..8d7694759 100644 --- a/src/iceberg/catalog/rest/rest_catalog.h +++ b/src/iceberg/catalog/rest/rest_catalog.h @@ -80,7 +80,8 @@ class ICEBERG_REST_EXPORT RestCatalog final Result> TableFileIO( const SessionContext& context, const std::unordered_map& table_config, - const std::vector& storage_credentials) const; + const std::vector& storage_credentials, + std::string_view storage_location) const; Result> ListNamespaces(const Namespace& ns, auth::AuthSession& session) const; @@ -170,7 +171,7 @@ class ICEBERG_REST_EXPORT RestCatalog final const TableIdentifier& identifier, CommitTableResponse response, const SessionContext& context, const std::unordered_map& table_config, - std::shared_ptr table_session); + std::shared_ptr table_session, std::shared_ptr table_io); RestCatalogProperties config_; std::shared_ptr file_io_; diff --git a/src/iceberg/catalog/rest/rest_file_io.cc b/src/iceberg/catalog/rest/rest_file_io.cc index a1002c296..775eb3699 100644 --- a/src/iceberg/catalog/rest/rest_file_io.cc +++ b/src/iceberg/catalog/rest/rest_file_io.cc @@ -19,6 +19,7 @@ #include "iceberg/catalog/rest/rest_file_io.h" +#include #include #include #include @@ -36,6 +37,18 @@ bool IsBuiltinImpl(std::string_view io_impl) { io_impl == FileIORegistry::kArrowS3FileIO; } +// Rewrites S3-compatible schemes (s3a/s3n/oss) to the canonical s3:// so a vended +// credential matches a table location using an equivalent scheme (e.g. DLF vends +// an `s3` credential for oss:// locations). +std::string CanonicalizeS3Scheme(std::string_view location) { + for (std::string_view scheme : {"s3a://", "s3n://", "oss://"}) { + if (location.starts_with(scheme)) { + return std::string("s3://").append(location.substr(scheme.size())); + } + } + return std::string(location); +} + } // namespace Result DetectBuiltinFileIO(std::string_view location) { @@ -96,17 +109,33 @@ Result> MakeCatalogFileIO(const RestCatalogProperties& c } const StorageCredential* SelectS3StorageCredential( - const std::vector& credentials) { + const std::vector& credentials, + std::string_view storage_location) { + const std::string location = CanonicalizeS3Scheme(storage_location); const StorageCredential* best = nullptr; + size_t best_size = 0; for (const auto& cred : credentials) { - if (cred.prefix.starts_with("s3") && - (best == nullptr || cred.prefix.size() > best->prefix.size())) { + if (!cred.prefix.starts_with("s3")) { + continue; + } + // Keep the longest S3 prefix that matches this location; matching globally + // would bind a credential to paths it does not cover. + const std::string prefix = CanonicalizeS3Scheme(cred.prefix); + if (location.starts_with(prefix) && (best == nullptr || prefix.size() > best_size)) { best = &cred; + best_size = prefix.size(); } } return best; } +bool HasOnlyNonS3StorageCredentials(const std::vector& credentials) { + return !credentials.empty() && + std::ranges::none_of(credentials, [](const StorageCredential& cred) { + return cred.prefix.starts_with("s3"); + }); +} + Result> MakeS3FileIOFromCredential( const std::unordered_map& catalog_config, const std::unordered_map& table_config, diff --git a/src/iceberg/catalog/rest/rest_file_io.h b/src/iceberg/catalog/rest/rest_file_io.h index e4f52766e..f8375ed7e 100644 --- a/src/iceberg/catalog/rest/rest_file_io.h +++ b/src/iceberg/catalog/rest/rest_file_io.h @@ -47,9 +47,17 @@ ICEBERG_REST_EXPORT std::string_view BuiltinFileIOName(BuiltinFileIOKind kind); ICEBERG_REST_EXPORT Result> MakeCatalogFileIO( const RestCatalogProperties& config); -/// \brief Returns the longest-prefix S3-family vended credential (prefix -/// starting with "s3"), or nullptr if none. +/// \brief Returns the S3-family vended credential (prefix starting with "s3") +/// whose prefix is the longest match for `storage_location`, or nullptr if none +/// applies (Iceberg REST spec / Java `S3FileIO` longest-prefix matching). The +/// s3/s3a/s3n schemes are treated as equivalent when matching. ICEBERG_REST_EXPORT const StorageCredential* SelectS3StorageCredential( + const std::vector& credentials, std::string_view storage_location); + +/// \brief True if `credentials` is non-empty but contains no S3-family +/// credential (prefix starting with "s3") — i.e. only unsupported schemes such +/// as GCS/ADLS were vended, which an S3/local FileIO cannot honor. +ICEBERG_REST_EXPORT bool HasOnlyNonS3StorageCredentials( const std::vector& credentials); /// \brief Builds an `arrow-fs-s3` FileIO, merging catalog, table, then diff --git a/src/iceberg/test/rest_file_io_test.cc b/src/iceberg/test/rest_file_io_test.cc index ac70a934f..6f85f1910 100644 --- a/src/iceberg/test/rest_file_io_test.cc +++ b/src/iceberg/test/rest_file_io_test.cc @@ -152,23 +152,56 @@ TEST(RestFileIOTest, MakeCatalogFileIOSkipsCheckWhenWarehouseAbsent) { ASSERT_THAT(result, IsOk()); } -TEST(RestFileIOTest, SelectS3StorageCredentialPicksLongestS3Prefix) { +TEST(RestFileIOTest, SelectS3StorageCredentialPicksLongestMatchingS3Prefix) { std::vector credentials = { {.prefix = "s3", .config = {{"s3.access-key-id", "a"}}}, {.prefix = "s3://bucket/data", .config = {{"s3.access-key-id", "b"}}}, {.prefix = "s3://bucket", .config = {{"s3.access-key-id", "c"}}}, }; - const auto* cred = SelectS3StorageCredential(credentials); + const auto* cred = SelectS3StorageCredential(credentials, "s3://bucket/data/db/t"); ASSERT_NE(cred, nullptr); EXPECT_EQ(cred->prefix, "s3://bucket/data"); } +TEST(RestFileIOTest, SelectS3StorageCredentialMatchesAgainstStorageLocation) { + std::vector credentials = { + // Globally longest prefix, but for a path that does not cover this table. + {.prefix = "s3://other/very/long/prefix", .config = {{"s3.access-key-id", "x"}}}, + {.prefix = "s3://bucket", .config = {{"s3.access-key-id", "y"}}}, + }; + const auto* cred = SelectS3StorageCredential(credentials, "s3://bucket/db/t/data"); + ASSERT_NE(cred, nullptr); + // The longest *matching* prefix wins, not the globally longest one. + EXPECT_EQ(cred->prefix, "s3://bucket"); +} + +TEST(RestFileIOTest, SelectS3StorageCredentialMatchesEquivalentS3Schemes) { + std::vector s3_cred = { + {.prefix = "s3://bucket", .config = {{"s3.access-key-id", "a"}}}, + }; + // s3a:// and s3n:// locations match an s3:// vended prefix (same backend). + EXPECT_NE(SelectS3StorageCredential(s3_cred, "s3a://bucket/db/t"), nullptr); + EXPECT_NE(SelectS3StorageCredential(s3_cred, "s3n://bucket/db/t"), nullptr); + + // ...and the inverse: an s3a:// prefix matches an s3:// location. + std::vector s3a_cred = { + {.prefix = "s3a://bucket", .config = {{"s3.access-key-id", "b"}}}, + }; + const auto* cred = SelectS3StorageCredential(s3a_cred, "s3://bucket/db/t"); + ASSERT_NE(cred, nullptr); + EXPECT_EQ(cred->prefix, "s3a://bucket"); + + // OSS is S3-compatible: an `s3` credential matches an oss:// location (Alibaba + // DLF vends an `s3` credential for tables whose location uses the oss:// scheme). + EXPECT_NE(SelectS3StorageCredential(s3_cred, "oss://bucket/db/t"), nullptr); +} + TEST(RestFileIOTest, SelectS3StorageCredentialIgnoresNonS3Prefixes) { std::vector credentials = { {.prefix = "gs://bucket", .config = {{"k", "v"}}}, {.prefix = "s3", .config = {{"s3.access-key-id", "a"}}}, }; - const auto* cred = SelectS3StorageCredential(credentials); + const auto* cred = SelectS3StorageCredential(credentials, "s3://bucket/data"); ASSERT_NE(cred, nullptr); EXPECT_EQ(cred->prefix, "s3"); } @@ -177,8 +210,28 @@ TEST(RestFileIOTest, SelectS3StorageCredentialReturnsNullWhenNoneMatch) { std::vector credentials = { {.prefix = "gs://bucket", .config = {{"k", "v"}}}, }; - EXPECT_EQ(SelectS3StorageCredential(credentials), nullptr); - EXPECT_EQ(SelectS3StorageCredential({}), nullptr); + // Non-S3 prefix, S3 location. + EXPECT_EQ(SelectS3StorageCredential(credentials, "s3://bucket"), nullptr); + // S3 credential whose prefix does not cover the location. + EXPECT_EQ(SelectS3StorageCredential( + {{.prefix = "s3://other", .config = {{"s3.access-key-id", "a"}}}}, + "s3://bucket/data"), + nullptr); + // No credentials at all. + EXPECT_EQ(SelectS3StorageCredential({}, "s3://bucket"), nullptr); +} + +TEST(RestFileIOTest, HasOnlyNonS3StorageCredentials) { + // Only GCS/ADLS prefixes -> unsupported, fail fast. + EXPECT_TRUE(HasOnlyNonS3StorageCredentials( + {{.prefix = "gs://bucket", .config = {{"k", "v"}}}, + {.prefix = "abfs://c@a.dfs.core.windows.net", .config = {{"k", "v"}}}})); + // At least one S3 credential present -> not unsupported (may fall back). + EXPECT_FALSE(HasOnlyNonS3StorageCredentials( + {{.prefix = "gs://bucket", .config = {{"k", "v"}}}, + {.prefix = "s3://bucket", .config = {{"s3.access-key-id", "a"}}}})); + // No credentials at all -> not "only non-S3". + EXPECT_FALSE(HasOnlyNonS3StorageCredentials({})); } TEST(RestFileIOTest, MakeS3FileIOFromCredentialMergesConfigWithPrecedence) { From 579cb4e77aafe8e52d754c2243c0afe07d7dada5 Mon Sep 17 00:00:00 2001 From: Li Jiajia Date: Fri, 26 Jun 2026 07:28:57 -0400 Subject: [PATCH 5/8] fix(rest): validate storage credentials in LoadTableResult::Validate --- src/iceberg/catalog/rest/types.h | 15 +++++++++++++++ src/iceberg/test/rest_json_serde_test.cc | 14 ++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/src/iceberg/catalog/rest/types.h b/src/iceberg/catalog/rest/types.h index 0403fc22c..3bf218a6f 100644 --- a/src/iceberg/catalog/rest/types.h +++ b/src/iceberg/catalog/rest/types.h @@ -187,6 +187,18 @@ struct ICEBERG_REST_EXPORT StorageCredential { std::string prefix; std::unordered_map config; + /// \brief Validates the StorageCredential. The REST spec requires both a + /// prefix and config; non-empty matches Java `Credential.validate()`. + Status Validate() const { + if (prefix.empty()) { + return ValidationFailed("Invalid storage credential: prefix must be non-empty"); + } + if (config.empty()) { + return ValidationFailed("Invalid storage credential: config must be non-empty"); + } + return {}; + } + bool operator==(const StorageCredential& other) const = default; }; @@ -203,6 +215,9 @@ struct ICEBERG_REST_EXPORT LoadTableResult { if (!metadata) { return ValidationFailed("Invalid metadata: null"); } + for (const auto& credential : storage_credentials) { + ICEBERG_RETURN_UNEXPECTED(credential.Validate()); + } return {}; } diff --git a/src/iceberg/test/rest_json_serde_test.cc b/src/iceberg/test/rest_json_serde_test.cc index 1dd3ed19b..10f209aa2 100644 --- a/src/iceberg/test/rest_json_serde_test.cc +++ b/src/iceberg/test/rest_json_serde_test.cc @@ -2612,4 +2612,18 @@ TEST(FetchPlanningResultResponseRoundtripTest, FailedWithError) { EXPECT_EQ(*result, *result2); } +TEST(StorageCredentialValidateTest, RequiresPrefixAndConfig) { + EXPECT_THAT( + (StorageCredential{.prefix = "s3", .config = {{"s3.region", "us"}}}.Validate()), + IsOk()); + + auto empty_prefix = StorageCredential{.prefix = "", .config = {{"s3.region", "us"}}}; + EXPECT_THAT(empty_prefix.Validate(), IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(empty_prefix.Validate(), HasErrorMessage("prefix must be non-empty")); + + auto empty_config = StorageCredential{.prefix = "s3", .config = {}}; + EXPECT_THAT(empty_config.Validate(), IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(empty_config.Validate(), HasErrorMessage("config must be non-empty")); +} + } // namespace iceberg::rest From 567603f434d8c7fb0558285022d1011cd2f1624a Mon Sep 17 00:00:00 2001 From: Li Jiajia Date: Fri, 26 Jun 2026 07:44:05 -0400 Subject: [PATCH 6/8] fix(s3): strip endpoint scheme from env vars and honor ssl.enabled=true --- src/iceberg/arrow/s3/arrow_s3_file_io.cc | 46 +++++++++++------------ src/iceberg/test/arrow_s3_file_io_test.cc | 14 +++++++ 2 files changed, 37 insertions(+), 23 deletions(-) diff --git a/src/iceberg/arrow/s3/arrow_s3_file_io.cc b/src/iceberg/arrow/s3/arrow_s3_file_io.cc index 5392f65fa..3d83cdf66 100644 --- a/src/iceberg/arrow/s3/arrow_s3_file_io.cc +++ b/src/iceberg/arrow/s3/arrow_s3_file_io.cc @@ -75,6 +75,17 @@ Status EnsureS3Initialized() { return {}; } +// Splits any URI scheme off `endpoint` into `options.scheme`, returning the bare +// host[:port] that Arrow's `endpoint_override` expects. +std::string SplitEndpointScheme(std::string_view endpoint, + ::arrow::fs::S3Options& options) { + if (const auto pos = endpoint.find("://"); pos != std::string_view::npos) { + options.scheme = std::string(endpoint.substr(0, pos)); + endpoint = endpoint.substr(pos + 3); + } + return std::string(endpoint); +} + #endif } // namespace @@ -116,27 +127,17 @@ Result<::arrow::fs::S3Options> ConfigureS3Options( options.region = *region; } - // Configure endpoint (for MinIO, LocalStack, OSS, etc.) + // Configure endpoint (for MinIO, LocalStack, OSS, etc.) from `s3.endpoint` or + // the AWS standard env vars. if (const auto* endpoint = FindProperty(properties, S3Properties::kEndpoint); endpoint != nullptr) { - // `s3.endpoint` may be a full URI; Arrow wants host[:port], so strip the scheme. - std::string_view ep = *endpoint; - if (const auto pos = ep.find("://"); pos != std::string_view::npos) { - options.scheme = std::string(ep.substr(0, pos)); - ep = ep.substr(pos + 3); - } - options.endpoint_override = std::string(ep); - } else { - // Fall back to AWS standard environment variables for endpoint override - const char* s3_endpoint_env = std::getenv("AWS_ENDPOINT_URL_S3"); - if (s3_endpoint_env != nullptr) { - options.endpoint_override = s3_endpoint_env; - } else { - const char* endpoint_env = std::getenv("AWS_ENDPOINT_URL"); - if (endpoint_env != nullptr) { - options.endpoint_override = endpoint_env; - } - } + options.endpoint_override = SplitEndpointScheme(*endpoint, options); + } else if (const char* s3_endpoint_env = std::getenv("AWS_ENDPOINT_URL_S3"); + s3_endpoint_env != nullptr) { + options.endpoint_override = SplitEndpointScheme(s3_endpoint_env, options); + } else if (const char* endpoint_env = std::getenv("AWS_ENDPOINT_URL"); + endpoint_env != nullptr) { + options.endpoint_override = SplitEndpointScheme(endpoint_env, options); } ICEBERG_ASSIGN_OR_RAISE(const auto path_style_access, @@ -145,12 +146,11 @@ Result<::arrow::fs::S3Options> ConfigureS3Options( options.force_virtual_addressing = !*path_style_access; } - // Configure SSL. Explicit `s3.ssl.enabled` overrides any scheme derived from - // the endpoint above. + // Explicit `s3.ssl.enabled` overrides any endpoint-derived scheme. ICEBERG_ASSIGN_OR_RAISE(const auto ssl_enabled, ParseOptionalBool(properties, S3Properties::kSslEnabled)); - if (ssl_enabled.has_value() && !*ssl_enabled) { - options.scheme = "http"; + if (ssl_enabled.has_value()) { + options.scheme = *ssl_enabled ? "https" : "http"; } // Configure timeouts diff --git a/src/iceberg/test/arrow_s3_file_io_test.cc b/src/iceberg/test/arrow_s3_file_io_test.cc index b4e64d505..ef8291050 100644 --- a/src/iceberg/test/arrow_s3_file_io_test.cc +++ b/src/iceberg/test/arrow_s3_file_io_test.cc @@ -227,6 +227,20 @@ TEST_F(ArrowS3FileIOTest, ConfigureS3OptionsKeepsSchemelessEndpoint) { EXPECT_EQ(result->endpoint_override, "localhost:9000"); } +TEST_F(ArrowS3FileIOTest, ConfigureS3OptionsSslEnabledOverridesEndpointScheme) { + auto https = + ConfigureS3Options({{std::string(S3Properties::kEndpoint), "http://localhost:9000"}, + {std::string(S3Properties::kSslEnabled), "true"}}); + ASSERT_THAT(https, IsOk()); + EXPECT_EQ(https->scheme, "https"); + + auto http = ConfigureS3Options( + {{std::string(S3Properties::kEndpoint), "https://localhost:9000"}, + {std::string(S3Properties::kSslEnabled), "false"}}); + ASSERT_THAT(http, IsOk()); + EXPECT_EQ(http->scheme, "http"); +} + TEST_F(ArrowS3FileIOTest, ConfigureS3OptionsPathStyleAccessFalseEnablesVirtualAddressing) { auto result = From e9a2878c67cb9e8219e911208ef34af15163ec07 Mon Sep 17 00:00:00 2001 From: Li Jiajia Date: Fri, 26 Jun 2026 10:37:00 -0400 Subject: [PATCH 7/8] feat(rest): route each object path to its vended S3 credential (per-path FileIO) --- src/iceberg/arrow/arrow_io.cc | 112 +++++++++++++++++------ src/iceberg/arrow/arrow_io_internal.h | 24 ++++- src/iceberg/arrow/s3/arrow_s3_file_io.cc | 15 ++- src/iceberg/arrow/s3/arrow_s3_internal.h | 6 ++ src/iceberg/catalog/rest/rest_catalog.cc | 48 ++++------ src/iceberg/catalog/rest/rest_catalog.h | 3 +- src/iceberg/catalog/rest/rest_file_io.cc | 74 +++++++-------- src/iceberg/catalog/rest/rest_file_io.h | 21 ++--- src/iceberg/file_io.h | 17 ++++ src/iceberg/test/rest_file_io_test.cc | 108 +++++----------------- src/iceberg/util/location_util.h | 22 +++++ 11 files changed, 250 insertions(+), 200 deletions(-) diff --git a/src/iceberg/arrow/arrow_io.cc b/src/iceberg/arrow/arrow_io.cc index 4c795badf..d5cd2d7e3 100644 --- a/src/iceberg/arrow/arrow_io.cc +++ b/src/iceberg/arrow/arrow_io.cc @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -35,6 +36,8 @@ #include "iceberg/arrow/arrow_io_internal.h" #include "iceberg/arrow/arrow_io_util.h" #include "iceberg/arrow/arrow_status_internal.h" +#include "iceberg/arrow/s3/arrow_s3_internal.h" +#include "iceberg/util/location_util.h" #include "iceberg/util/macros.h" namespace iceberg::arrow { @@ -483,13 +486,53 @@ class ArrowOutputFile : public OutputFile { } // namespace -Result ArrowFileSystemFileIO::ResolvePath(const std::string& file_location) { +const std::shared_ptr<::arrow::fs::FileSystem>& ArrowFileSystemFileIO::FileSystemForPath( + std::string_view location) const { + if (fs_by_prefix_.empty()) { + return arrow_fs_; + } + // Longest matching prefix wins; fall back to the default file system. + const std::string canonical = LocationUtil::CanonicalizeS3Scheme(location); + const std::shared_ptr<::arrow::fs::FileSystem>* best = &arrow_fs_; + size_t best_len = 0; + for (const auto& [prefix, fs] : fs_by_prefix_) { + if (prefix.size() > best_len && LocationUtil::PathHasPrefix(canonical, prefix)) { + best = &fs; + best_len = prefix.size(); + } + } + return *best; +} + +Status ArrowFileSystemFileIO::SetStorageCredentials( + const std::vector< + std::pair>>& + properties_by_prefix) { +#if ICEBERG_S3_ENABLED + std::vector>> + fs_by_prefix; + fs_by_prefix.reserve(properties_by_prefix.size()); + for (const auto& [prefix, properties] : properties_by_prefix) { + ICEBERG_ASSIGN_OR_RAISE(auto fs, BuildArrowS3FileSystem(properties)); + fs_by_prefix.emplace_back(prefix, std::move(fs)); + } + fs_by_prefix_ = std::move(fs_by_prefix); + return {}; +#else + (void)properties_by_prefix; + return NotSupported("S3 storage credentials require Arrow S3 support"); +#endif +} + +Result ArrowFileSystemFileIO::ResolvePath( + const std::shared_ptr<::arrow::fs::FileSystem>& fs, + const std::string& file_location) { const auto pos = file_location.find("://"); if (pos == std::string::npos) { return file_location; } - auto path = arrow_fs_->PathFromUri(file_location); + auto path = fs->PathFromUri(file_location); if (path.ok()) { return std::move(path).ValueOrDie(); } @@ -512,14 +555,14 @@ Result> OpenArrowInputStream( ICEBERG_PRECHECK(io != nullptr, "FileIO cannot be null"); if (auto arrow_io = std::dynamic_pointer_cast(io)) { - ICEBERG_ASSIGN_OR_RAISE(auto resolved_path, arrow_io->ResolvePath(path)); + const auto& fs = arrow_io->FileSystemForPath(path); + ICEBERG_ASSIGN_OR_RAISE(auto resolved_path, arrow_io->ResolvePath(fs, path)); ::arrow::fs::FileInfo file_info(resolved_path, ::arrow::fs::FileType::File); if (length.has_value()) { ICEBERG_ASSIGN_OR_RAISE(auto size, ToInt64Length(*length)); file_info.set_size(size); } - ICEBERG_ARROW_ASSIGN_OR_RETURN(auto input, - arrow_io->arrow_fs_->OpenInputFile(file_info)); + ICEBERG_ARROW_ASSIGN_OR_RETURN(auto input, fs->OpenInputFile(file_info)); return input; } @@ -543,16 +586,15 @@ Result> OpenArrowOutputStream( ICEBERG_PRECHECK(io != nullptr, "FileIO cannot be null"); if (auto arrow_io = std::dynamic_pointer_cast(io)) { - ICEBERG_ASSIGN_OR_RAISE(auto resolved_path, arrow_io->ResolvePath(path)); + const auto& fs = arrow_io->FileSystemForPath(path); + ICEBERG_ASSIGN_OR_RAISE(auto resolved_path, arrow_io->ResolvePath(fs, path)); if (!overwrite) { - ICEBERG_ARROW_ASSIGN_OR_RETURN(auto info, - arrow_io->arrow_fs_->GetFileInfo(resolved_path)); + ICEBERG_ARROW_ASSIGN_OR_RETURN(auto info, fs->GetFileInfo(resolved_path)); if (info.type() != ::arrow::fs::FileType::NotFound) { return AlreadyExists("File already exists: {}", path); } } - ICEBERG_ARROW_ASSIGN_OR_RETURN(auto output, - arrow_io->arrow_fs_->OpenOutputStream(resolved_path)); + ICEBERG_ARROW_ASSIGN_OR_RETURN(auto output, fs->OpenOutputStream(resolved_path)); return output; } @@ -568,42 +610,60 @@ Result> OpenArrowOutputStream( Result> ArrowFileSystemFileIO::NewInputFile( std::string file_location) { - ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(file_location)); - return std::make_unique(arrow_fs_, std::move(file_location), - std::move(path), std::nullopt); + const auto& fs = FileSystemForPath(file_location); + ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(fs, file_location)); + return std::make_unique(fs, std::move(file_location), std::move(path), + std::nullopt); } Result> ArrowFileSystemFileIO::NewInputFile( std::string file_location, size_t length) { ICEBERG_ASSIGN_OR_RAISE(auto size, ToInt64Length(length)); - ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(file_location)); - return std::make_unique(arrow_fs_, std::move(file_location), - std::move(path), size); + const auto& fs = FileSystemForPath(file_location); + ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(fs, file_location)); + return std::make_unique(fs, std::move(file_location), std::move(path), + size); } Result> ArrowFileSystemFileIO::NewOutputFile( std::string file_location) { - ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(file_location)); - return std::make_unique(arrow_fs_, std::move(file_location), - std::move(path)); + const auto& fs = FileSystemForPath(file_location); + ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(fs, file_location)); + return std::make_unique(fs, std::move(file_location), std::move(path)); } /// \brief Delete a file at the given location. Status ArrowFileSystemFileIO::DeleteFile(const std::string& file_location) { - ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(file_location)); - ICEBERG_ARROW_RETURN_NOT_OK(arrow_fs_->DeleteFile(path)); + const auto& fs = FileSystemForPath(file_location); + ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(fs, file_location)); + ICEBERG_ARROW_RETURN_NOT_OK(fs->DeleteFile(path)); return {}; } Status ArrowFileSystemFileIO::DeleteFiles( const std::vector& file_locations) { - std::vector paths; - paths.reserve(file_locations.size()); + if (fs_by_prefix_.empty()) { + // No per-prefix routing: one batched delete on the default file system. + std::vector paths; + paths.reserve(file_locations.size()); + for (const auto& file_location : file_locations) { + ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(arrow_fs_, file_location)); + paths.push_back(std::move(path)); + } + ICEBERG_ARROW_RETURN_NOT_OK(arrow_fs_->DeleteFiles(paths)); + return {}; + } + + // Paths may route to different file systems: group by fs, then batch per fs. + std::unordered_map<::arrow::fs::FileSystem*, std::vector> paths_by_fs; for (const auto& file_location : file_locations) { - ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(file_location)); - paths.push_back(std::move(path)); + const auto& fs = FileSystemForPath(file_location); + ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(fs, file_location)); + paths_by_fs[fs.get()].push_back(std::move(path)); + } + for (auto& [fs, paths] : paths_by_fs) { + ICEBERG_ARROW_RETURN_NOT_OK(fs->DeleteFiles(paths)); } - ICEBERG_ARROW_RETURN_NOT_OK(arrow_fs_->DeleteFiles(paths)); return {}; } diff --git a/src/iceberg/arrow/arrow_io_internal.h b/src/iceberg/arrow/arrow_io_internal.h index a6b85b6c9..0e3607cb2 100644 --- a/src/iceberg/arrow/arrow_io_internal.h +++ b/src/iceberg/arrow/arrow_io_internal.h @@ -23,6 +23,9 @@ #include #include #include +#include +#include +#include #include #include @@ -52,7 +55,8 @@ OpenArrowOutputStream(const std::shared_ptr& io, const std::string& path bool overwrite = true); /// \brief A concrete implementation of FileIO for Arrow file system. -class ICEBERG_BUNDLE_EXPORT ArrowFileSystemFileIO : public FileIO { +class ICEBERG_BUNDLE_EXPORT ArrowFileSystemFileIO : public FileIO, + public SupportsStorageCredentials { public: explicit ArrowFileSystemFileIO(std::shared_ptr<::arrow::fs::FileSystem> arrow_fs) : arrow_fs_(std::move(arrow_fs)) {} @@ -81,6 +85,12 @@ class ICEBERG_BUNDLE_EXPORT ArrowFileSystemFileIO : public FileIO { /// \brief Delete files at the given locations. Status DeleteFiles(const std::vector& file_locations) override; + /// \brief Build one S3 file system per credential prefix for per-path routing. + Status SetStorageCredentials( + const std::vector< + std::pair>>& + properties_by_prefix) override; + /// \brief Get the Arrow file system. const std::shared_ptr<::arrow::fs::FileSystem>& fs() const { return arrow_fs_; } @@ -92,10 +102,18 @@ class ICEBERG_BUNDLE_EXPORT ArrowFileSystemFileIO : public FileIO { friend Result> OpenArrowOutputStream( const std::shared_ptr& io, const std::string& path, bool overwrite); - /// \brief Resolve a file location to a filesystem path. - Result ResolvePath(const std::string& file_location); + /// \brief Pick the file system for `location` (longest matching prefix, else + /// the default). + const std::shared_ptr<::arrow::fs::FileSystem>& FileSystemForPath( + std::string_view location) const; + + /// \brief Resolve a file location to a filesystem path using `fs`. + Result ResolvePath(const std::shared_ptr<::arrow::fs::FileSystem>& fs, + const std::string& file_location); std::shared_ptr<::arrow::fs::FileSystem> arrow_fs_; + std::vector>> + fs_by_prefix_; }; } // namespace iceberg::arrow diff --git a/src/iceberg/arrow/s3/arrow_s3_file_io.cc b/src/iceberg/arrow/s3/arrow_s3_file_io.cc index 3d83cdf66..f65f023ca 100644 --- a/src/iceberg/arrow/s3/arrow_s3_file_io.cc +++ b/src/iceberg/arrow/s3/arrow_s3_file_io.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include #if ICEBERG_S3_ENABLED @@ -172,15 +173,21 @@ Result<::arrow::fs::S3Options> ConfigureS3Options( } #endif -Result> MakeS3FileIO( - const std::unordered_map& properties) { #if ICEBERG_S3_ENABLED +Result> BuildArrowS3FileSystem( + const std::unordered_map& properties) { ICEBERG_RETURN_UNEXPECTED(EnsureS3Initialized()); - - // Configure S3 options from properties (uses default credentials if empty) ICEBERG_ASSIGN_OR_RAISE(auto options, ConfigureS3Options(properties)); ICEBERG_ARROW_ASSIGN_OR_RETURN(auto fs, ::arrow::fs::S3FileSystem::Make(options)); + return std::shared_ptr<::arrow::fs::FileSystem>(std::move(fs)); +} +#endif +Result> MakeS3FileIO( + const std::unordered_map& properties) { +#if ICEBERG_S3_ENABLED + // Uses default credentials if properties are empty. + ICEBERG_ASSIGN_OR_RAISE(auto fs, BuildArrowS3FileSystem(properties)); return std::make_unique(std::move(fs)); #else return NotSupported("Arrow S3 support is not enabled"); diff --git a/src/iceberg/arrow/s3/arrow_s3_internal.h b/src/iceberg/arrow/s3/arrow_s3_internal.h index c37663a62..347c879e7 100644 --- a/src/iceberg/arrow/s3/arrow_s3_internal.h +++ b/src/iceberg/arrow/s3/arrow_s3_internal.h @@ -19,6 +19,7 @@ #pragma once +#include #include #include @@ -39,6 +40,11 @@ namespace iceberg::arrow { /// addressing style) can be unit tested without a live S3 endpoint. ICEBERG_BUNDLE_EXPORT Result<::arrow::fs::S3Options> ConfigureS3Options( const std::unordered_map& properties); + +/// \brief Build an Arrow S3 file system from a properties map (initializes S3 if +/// needed). Exposed so the credential-aware FileIO can build one fs per prefix. +ICEBERG_BUNDLE_EXPORT Result> +BuildArrowS3FileSystem(const std::unordered_map& properties); #endif } // namespace iceberg::arrow diff --git a/src/iceberg/catalog/rest/rest_catalog.cc b/src/iceberg/catalog/rest/rest_catalog.cc index 36950c49d..ca992d1ca 100644 --- a/src/iceberg/catalog/rest/rest_catalog.cc +++ b/src/iceberg/catalog/rest/rest_catalog.cc @@ -525,31 +525,25 @@ Result> RestCatalog::TableAuthSession( Result> RestCatalog::TableFileIO( const SessionContext& /*context*/, const std::unordered_map& table_config, - const std::vector& storage_credentials, - std::string_view storage_location) const { - // Bind one S3 FileIO from the vended credential whose prefix best matches the - // table location (the common one-credential-per-table case). - // TODO: support per-path credential routing, STS refresh via credentials.uri, - // and non-S3 (GCS/ADLS) credentials, like Java's S3FileIO. - if (const StorageCredential* s3_cred = - SelectS3StorageCredential(storage_credentials, storage_location); - s3_cred != nullptr) { - ICEBERG_ASSIGN_OR_RAISE( - auto io, MakeS3FileIOFromCredential(config_.configs(), table_config, *s3_cred)); + const std::vector& storage_credentials) const { + if (!storage_credentials.empty()) { + // Only non-S3 (GCS/ADLS) credentials vended, which we can't honor; fail fast. + if (HasOnlyNonS3StorageCredentials(storage_credentials)) { + auto prefixes = + storage_credentials | std::views::transform(&StorageCredential::prefix); + return NotSupported( + "Vended storage credentials {} are unsupported (only S3-family " + "credentials are supported)", + FormatRange(prefixes, ", ", "[", "]")); + } + // Build a FileIO that routes each path to its vended credential. TODO: STS + // refresh via credentials.uri is not yet supported. + ICEBERG_ASSIGN_OR_RAISE(auto io, + MakeS3FileIOFromCredentials(config_.configs(), table_config, + storage_credentials)); return std::shared_ptr(std::move(io)); } - // Only non-S3 (e.g. GCS/ADLS) credentials were vended and we can't honor them; - // fail fast. (S3 credentials that simply don't match fall through below.) - if (HasOnlyNonS3StorageCredentials(storage_credentials)) { - auto prefixes = - storage_credentials | std::views::transform(&StorageCredential::prefix); - return NotSupported( - "Vended storage credentials {} are unsupported (only S3-family " - "credentials are supported)", - FormatRange(prefixes, ", ", "[", "]")); - } - ICEBERG_RETURN_UNEXPECTED(ValidateNoFileIOConfig(table_config)); return file_io_; } @@ -789,9 +783,8 @@ Result> RestCatalog::StageCreateTable( /*stage_create=*/true, *contextual_session)); auto table_config = std::move(result.config); auto storage_credentials = std::move(result.storage_credentials); - ICEBERG_ASSIGN_OR_RAISE( - auto table_io, - TableFileIO(context, table_config, storage_credentials, result.metadata->location)); + ICEBERG_ASSIGN_OR_RAISE(auto table_io, + TableFileIO(context, table_config, storage_credentials)); ICEBERG_ASSIGN_OR_RAISE( auto table_session, TableAuthSession(identifier, table_config, std::move(contextual_session))); @@ -906,9 +899,8 @@ Result> RestCatalog::MakeTableFromLoadResult( std::shared_ptr contextual_session) { auto table_config = std::move(result.config); auto storage_credentials = std::move(result.storage_credentials); - ICEBERG_ASSIGN_OR_RAISE( - auto table_io, - TableFileIO(context, table_config, storage_credentials, result.metadata->location)); + ICEBERG_ASSIGN_OR_RAISE(auto table_io, + TableFileIO(context, table_config, storage_credentials)); ICEBERG_ASSIGN_OR_RAISE( auto table_session, TableAuthSession(identifier, table_config, std::move(contextual_session))); diff --git a/src/iceberg/catalog/rest/rest_catalog.h b/src/iceberg/catalog/rest/rest_catalog.h index 8d7694759..7b5613a0c 100644 --- a/src/iceberg/catalog/rest/rest_catalog.h +++ b/src/iceberg/catalog/rest/rest_catalog.h @@ -80,8 +80,7 @@ class ICEBERG_REST_EXPORT RestCatalog final Result> TableFileIO( const SessionContext& context, const std::unordered_map& table_config, - const std::vector& storage_credentials, - std::string_view storage_location) const; + const std::vector& storage_credentials) const; Result> ListNamespaces(const Namespace& ns, auth::AuthSession& session) const; diff --git a/src/iceberg/catalog/rest/rest_file_io.cc b/src/iceberg/catalog/rest/rest_file_io.cc index 775eb3699..e0f47613a 100644 --- a/src/iceberg/catalog/rest/rest_file_io.cc +++ b/src/iceberg/catalog/rest/rest_file_io.cc @@ -22,10 +22,13 @@ #include #include #include +#include #include #include "iceberg/catalog/rest/types.h" +#include "iceberg/file_io.h" #include "iceberg/file_io_registry.h" +#include "iceberg/util/location_util.h" #include "iceberg/util/macros.h" namespace iceberg::rest { @@ -37,18 +40,6 @@ bool IsBuiltinImpl(std::string_view io_impl) { io_impl == FileIORegistry::kArrowS3FileIO; } -// Rewrites S3-compatible schemes (s3a/s3n/oss) to the canonical s3:// so a vended -// credential matches a table location using an equivalent scheme (e.g. DLF vends -// an `s3` credential for oss:// locations). -std::string CanonicalizeS3Scheme(std::string_view location) { - for (std::string_view scheme : {"s3a://", "s3n://", "oss://"}) { - if (location.starts_with(scheme)) { - return std::string("s3://").append(location.substr(scheme.size())); - } - } - return std::string(location); -} - } // namespace Result DetectBuiltinFileIO(std::string_view location) { @@ -108,27 +99,6 @@ Result> MakeCatalogFileIO(const RestCatalogProperties& c return FileIORegistry::Load(io_impl, config.configs()); } -const StorageCredential* SelectS3StorageCredential( - const std::vector& credentials, - std::string_view storage_location) { - const std::string location = CanonicalizeS3Scheme(storage_location); - const StorageCredential* best = nullptr; - size_t best_size = 0; - for (const auto& cred : credentials) { - if (!cred.prefix.starts_with("s3")) { - continue; - } - // Keep the longest S3 prefix that matches this location; matching globally - // would bind a credential to paths it does not cover. - const std::string prefix = CanonicalizeS3Scheme(cred.prefix); - if (location.starts_with(prefix) && (best == nullptr || prefix.size() > best_size)) { - best = &cred; - best_size = prefix.size(); - } - } - return best; -} - bool HasOnlyNonS3StorageCredentials(const std::vector& credentials) { return !credentials.empty() && std::ranges::none_of(credentials, [](const StorageCredential& cred) { @@ -136,18 +106,42 @@ bool HasOnlyNonS3StorageCredentials(const std::vector& creden }); } -Result> MakeS3FileIOFromCredential( +Result> MakeS3FileIOFromCredentials( const std::unordered_map& catalog_config, const std::unordered_map& table_config, - const StorageCredential& s3_cred) { - auto properties = catalog_config; + const std::vector& storage_credentials) { + auto default_properties = catalog_config; for (const auto& [key, value] : table_config) { - properties[key] = value; + default_properties[key] = value; } - for (const auto& [key, value] : s3_cred.config) { - properties[key] = value; + + // Default S3 FileIO (for paths matching no credential prefix), built via the + // registry to keep this layer decoupled from the Arrow/S3 implementation. + ICEBERG_ASSIGN_OR_RAISE( + auto io, FileIORegistry::Load(std::string(FileIORegistry::kArrowS3FileIO), + default_properties)); + + // One property set per S3-family credential, keyed by canonicalized prefix; + // the credential's config overrides the default properties. + std::vector>> + properties_by_prefix; + for (const auto& cred : storage_credentials) { + if (!cred.prefix.starts_with("s3")) { + continue; + } + auto properties = default_properties; + for (const auto& [key, value] : cred.config) { + properties[key] = value; + } + properties_by_prefix.emplace_back(LocationUtil::CanonicalizeS3Scheme(cred.prefix), + std::move(properties)); + } + + // Hand the per-prefix properties to the FileIO for per-path credential routing. + if (auto* credentialed = dynamic_cast(io.get())) { + ICEBERG_RETURN_UNEXPECTED(credentialed->SetStorageCredentials(properties_by_prefix)); } - return FileIORegistry::Load(std::string(FileIORegistry::kArrowS3FileIO), properties); + return io; } } // namespace iceberg::rest diff --git a/src/iceberg/catalog/rest/rest_file_io.h b/src/iceberg/catalog/rest/rest_file_io.h index f8375ed7e..d5052143f 100644 --- a/src/iceberg/catalog/rest/rest_file_io.h +++ b/src/iceberg/catalog/rest/rest_file_io.h @@ -47,24 +47,17 @@ ICEBERG_REST_EXPORT std::string_view BuiltinFileIOName(BuiltinFileIOKind kind); ICEBERG_REST_EXPORT Result> MakeCatalogFileIO( const RestCatalogProperties& config); -/// \brief Returns the S3-family vended credential (prefix starting with "s3") -/// whose prefix is the longest match for `storage_location`, or nullptr if none -/// applies (Iceberg REST spec / Java `S3FileIO` longest-prefix matching). The -/// s3/s3a/s3n schemes are treated as equivalent when matching. -ICEBERG_REST_EXPORT const StorageCredential* SelectS3StorageCredential( - const std::vector& credentials, std::string_view storage_location); - -/// \brief True if `credentials` is non-empty but contains no S3-family -/// credential (prefix starting with "s3") — i.e. only unsupported schemes such -/// as GCS/ADLS were vended, which an S3/local FileIO cannot honor. +/// \brief True if `credentials` is non-empty but has no S3-family credential +/// (prefix starting with "s3") — only unsupported schemes (GCS/ADLS) were vended. ICEBERG_REST_EXPORT bool HasOnlyNonS3StorageCredentials( const std::vector& credentials); -/// \brief Builds an `arrow-fs-s3` FileIO, merging catalog, table, then -/// credential config (later wins). -ICEBERG_REST_EXPORT Result> MakeS3FileIOFromCredential( +/// \brief Build an S3 FileIO that routes each object path to a per-prefix file +/// system, one per S3-family vended credential (config merged catalog < table < +/// credential). Non-S3 credentials are ignored. +ICEBERG_REST_EXPORT Result> MakeS3FileIOFromCredentials( const std::unordered_map& catalog_config, const std::unordered_map& table_config, - const StorageCredential& s3_cred); + const std::vector& storage_credentials); } // namespace iceberg::rest diff --git a/src/iceberg/file_io.h b/src/iceberg/file_io.h index ba6f0129a..5f9e071e0 100644 --- a/src/iceberg/file_io.h +++ b/src/iceberg/file_io.h @@ -26,6 +26,8 @@ #include #include #include +#include +#include #include #include "iceberg/iceberg_export.h" @@ -173,4 +175,19 @@ class ICEBERG_EXPORT FileIO { virtual Status DeleteFiles(const std::vector& file_locations); }; +/// \brief Mix-in for FileIO implementations that route object paths to +/// per-prefix file systems built from vended storage credentials, letting the +/// catalog stay decoupled from the concrete (Arrow/S3) implementation. +class ICEBERG_EXPORT SupportsStorageCredentials { + public: + virtual ~SupportsStorageCredentials() = default; + + /// \brief Install per-prefix file systems. `properties_by_prefix` maps a + /// canonicalized prefix to its fully merged properties. + virtual Status SetStorageCredentials( + const std::vector< + std::pair>>& + properties_by_prefix) = 0; +}; + } // namespace iceberg diff --git a/src/iceberg/test/rest_file_io_test.cc b/src/iceberg/test/rest_file_io_test.cc index 6f85f1910..d54943fe4 100644 --- a/src/iceberg/test/rest_file_io_test.cc +++ b/src/iceberg/test/rest_file_io_test.cc @@ -29,6 +29,7 @@ #include "iceberg/catalog/rest/types.h" #include "iceberg/file_io_registry.h" #include "iceberg/test/matchers.h" +#include "iceberg/util/location_util.h" namespace iceberg::rest { @@ -152,73 +153,33 @@ TEST(RestFileIOTest, MakeCatalogFileIOSkipsCheckWhenWarehouseAbsent) { ASSERT_THAT(result, IsOk()); } -TEST(RestFileIOTest, SelectS3StorageCredentialPicksLongestMatchingS3Prefix) { - std::vector credentials = { - {.prefix = "s3", .config = {{"s3.access-key-id", "a"}}}, - {.prefix = "s3://bucket/data", .config = {{"s3.access-key-id", "b"}}}, - {.prefix = "s3://bucket", .config = {{"s3.access-key-id", "c"}}}, - }; - const auto* cred = SelectS3StorageCredential(credentials, "s3://bucket/data/db/t"); - ASSERT_NE(cred, nullptr); - EXPECT_EQ(cred->prefix, "s3://bucket/data"); +TEST(RestFileIOTest, CanonicalizeS3SchemeTreatsS3CompatibleSchemesAsS3) { + // s3a/s3n/oss canonicalize to s3:// so a vended `s3` credential prefix-matches + // them uniformly (DLF vends `s3` for oss:// locations). + EXPECT_EQ(LocationUtil::CanonicalizeS3Scheme("oss://bucket/db/t"), "s3://bucket/db/t"); + EXPECT_EQ(LocationUtil::CanonicalizeS3Scheme("s3a://bucket/x"), "s3://bucket/x"); + EXPECT_EQ(LocationUtil::CanonicalizeS3Scheme("s3n://bucket/x"), "s3://bucket/x"); + // Already-canonical and non-S3 / scheme-less locations are unchanged. + EXPECT_EQ(LocationUtil::CanonicalizeS3Scheme("s3://bucket/x"), "s3://bucket/x"); + EXPECT_EQ(LocationUtil::CanonicalizeS3Scheme("gs://bucket"), "gs://bucket"); + EXPECT_EQ(LocationUtil::CanonicalizeS3Scheme("/local/path"), "/local/path"); } -TEST(RestFileIOTest, SelectS3StorageCredentialMatchesAgainstStorageLocation) { - std::vector credentials = { - // Globally longest prefix, but for a path that does not cover this table. - {.prefix = "s3://other/very/long/prefix", .config = {{"s3.access-key-id", "x"}}}, - {.prefix = "s3://bucket", .config = {{"s3.access-key-id", "y"}}}, - }; - const auto* cred = SelectS3StorageCredential(credentials, "s3://bucket/db/t/data"); - ASSERT_NE(cred, nullptr); - // The longest *matching* prefix wins, not the globally longest one. - EXPECT_EQ(cred->prefix, "s3://bucket"); -} - -TEST(RestFileIOTest, SelectS3StorageCredentialMatchesEquivalentS3Schemes) { - std::vector s3_cred = { - {.prefix = "s3://bucket", .config = {{"s3.access-key-id", "a"}}}, - }; - // s3a:// and s3n:// locations match an s3:// vended prefix (same backend). - EXPECT_NE(SelectS3StorageCredential(s3_cred, "s3a://bucket/db/t"), nullptr); - EXPECT_NE(SelectS3StorageCredential(s3_cred, "s3n://bucket/db/t"), nullptr); - - // ...and the inverse: an s3a:// prefix matches an s3:// location. - std::vector s3a_cred = { - {.prefix = "s3a://bucket", .config = {{"s3.access-key-id", "b"}}}, - }; - const auto* cred = SelectS3StorageCredential(s3a_cred, "s3://bucket/db/t"); - ASSERT_NE(cred, nullptr); - EXPECT_EQ(cred->prefix, "s3a://bucket"); - - // OSS is S3-compatible: an `s3` credential matches an oss:// location (Alibaba - // DLF vends an `s3` credential for tables whose location uses the oss:// scheme). - EXPECT_NE(SelectS3StorageCredential(s3_cred, "oss://bucket/db/t"), nullptr); -} - -TEST(RestFileIOTest, SelectS3StorageCredentialIgnoresNonS3Prefixes) { - std::vector credentials = { - {.prefix = "gs://bucket", .config = {{"k", "v"}}}, - {.prefix = "s3", .config = {{"s3.access-key-id", "a"}}}, - }; - const auto* cred = SelectS3StorageCredential(credentials, "s3://bucket/data"); - ASSERT_NE(cred, nullptr); - EXPECT_EQ(cred->prefix, "s3"); -} +TEST(RestFileIOTest, PathHasPrefixMatchesAtPathBoundary) { + // Must match only at a path boundary, so a `s3://bucket/db/t1` credential does + // not capture a sibling table under `s3://bucket/db/t1x/...`. + EXPECT_TRUE(LocationUtil::PathHasPrefix("s3://bucket/db/t1/data/f.parquet", + "s3://bucket/db/t1")); + EXPECT_TRUE(LocationUtil::PathHasPrefix("s3://bucket/db/t1", "s3://bucket/db/t1")); + EXPECT_FALSE(LocationUtil::PathHasPrefix("s3://bucket/db/t1x/data/f.parquet", + "s3://bucket/db/t1")); + EXPECT_FALSE(LocationUtil::PathHasPrefix("s3://bucket-other/x", "s3://bucket")); -TEST(RestFileIOTest, SelectS3StorageCredentialReturnsNullWhenNoneMatch) { - std::vector credentials = { - {.prefix = "gs://bucket", .config = {{"k", "v"}}}, - }; - // Non-S3 prefix, S3 location. - EXPECT_EQ(SelectS3StorageCredential(credentials, "s3://bucket"), nullptr); - // S3 credential whose prefix does not cover the location. - EXPECT_EQ(SelectS3StorageCredential( - {{.prefix = "s3://other", .config = {{"s3.access-key-id", "a"}}}}, - "s3://bucket/data"), - nullptr); - // No credentials at all. - EXPECT_EQ(SelectS3StorageCredential({}, "s3://bucket"), nullptr); + // A bare-scheme credential (DLF vends `s3`) matches any authority/path. + EXPECT_TRUE(LocationUtil::PathHasPrefix("s3://bucket/db/t/f", "s3")); + EXPECT_TRUE(LocationUtil::PathHasPrefix( + LocationUtil::CanonicalizeS3Scheme("oss://bucket/db/t/f"), "s3")); + EXPECT_FALSE(LocationUtil::PathHasPrefix("gs://bucket/x", "s3")); } TEST(RestFileIOTest, HasOnlyNonS3StorageCredentials) { @@ -234,23 +195,4 @@ TEST(RestFileIOTest, HasOnlyNonS3StorageCredentials) { EXPECT_FALSE(HasOnlyNonS3StorageCredentials({})); } -TEST(RestFileIOTest, MakeS3FileIOFromCredentialMergesConfigWithPrecedence) { - auto captured = std::make_shared>(); - FileIORegistry::Register( - std::string(FileIORegistry::kArrowS3FileIO), - [captured](const std::unordered_map& properties) - -> Result> { - *captured = properties; - return std::make_unique(); - }); - auto result = MakeS3FileIOFromCredential( - {{"a", "catalog"}, {"shared", "catalog"}}, {{"b", "table"}, {"shared", "table"}}, - StorageCredential{.prefix = "s3", .config = {{"c", "cred"}, {"shared", "cred"}}}); - ASSERT_THAT(result, IsOk()); - EXPECT_EQ((*captured)["a"], "catalog"); - EXPECT_EQ((*captured)["b"], "table"); - EXPECT_EQ((*captured)["c"], "cred"); - EXPECT_EQ((*captured)["shared"], "cred"); -} - } // namespace iceberg::rest diff --git a/src/iceberg/util/location_util.h b/src/iceberg/util/location_util.h index eb78dece3..cbeac1574 100644 --- a/src/iceberg/util/location_util.h +++ b/src/iceberg/util/location_util.h @@ -19,6 +19,7 @@ #pragma once +#include #include #include "iceberg/iceberg_export.h" @@ -37,6 +38,27 @@ class ICEBERG_EXPORT LocationUtil { } return path; } + + /// \brief Rewrites S3-compatible schemes (s3a://, s3n://, oss://) to s3:// so + /// locations and credential prefixes can be prefix-matched uniformly. + static std::string CanonicalizeS3Scheme(std::string_view location) { + for (std::string_view scheme : {"s3a://", "s3n://", "oss://"}) { + if (location.starts_with(scheme)) { + return std::string("s3://").append(location.substr(scheme.size())); + } + } + return std::string(location); + } + + /// \brief True if `prefix` matches `path` at a path boundary (equal, next char + /// '/', or a bare scheme), so `s3://bucket` does not match `s3://bucket-x`. + static bool PathHasPrefix(std::string_view path, std::string_view prefix) { + if (!path.starts_with(prefix)) { + return false; + } + return path.size() == prefix.size() || path[prefix.size()] == '/' || + prefix.find("://") == std::string_view::npos; + } }; } // namespace iceberg From 7a7440dfdd8662910c2008560ee38af3ae642d17 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sat, 27 Jun 2026 23:17:10 +0800 Subject: [PATCH 8/8] polish design and refine impl --- src/iceberg/arrow/arrow_io.cc | 112 +++-------- src/iceberg/arrow/arrow_io_internal.h | 24 +-- src/iceberg/arrow/s3/arrow_s3_file_io.cc | 168 ++++++++++++++--- src/iceberg/arrow/s3/arrow_s3_internal.h | 50 ----- src/iceberg/arrow/s3/s3_properties.h | 4 +- src/iceberg/catalog/rest/json_serde.cc | 36 ++-- src/iceberg/catalog/rest/rest_catalog.cc | 65 +------ src/iceberg/catalog/rest/rest_catalog.h | 1 + src/iceberg/catalog/rest/rest_file_io.cc | 68 +++---- src/iceberg/catalog/rest/rest_file_io.h | 11 +- src/iceberg/catalog/rest/type_fwd.h | 1 - src/iceberg/catalog/rest/types.h | 27 +-- src/iceberg/file_io.h | 20 +- src/iceberg/meson.build | 1 + src/iceberg/storage_credential.h | 48 +++++ src/iceberg/test/arrow_s3_file_io_test.cc | 214 +++++++++++++--------- src/iceberg/test/rest_file_io_test.cc | 123 +++++++++---- src/iceberg/test/rest_json_serde_test.cc | 49 ++--- src/iceberg/type_fwd.h | 1 + src/iceberg/util/location_util.h | 22 --- 20 files changed, 525 insertions(+), 520 deletions(-) delete mode 100644 src/iceberg/arrow/s3/arrow_s3_internal.h create mode 100644 src/iceberg/storage_credential.h diff --git a/src/iceberg/arrow/arrow_io.cc b/src/iceberg/arrow/arrow_io.cc index d5cd2d7e3..4c795badf 100644 --- a/src/iceberg/arrow/arrow_io.cc +++ b/src/iceberg/arrow/arrow_io.cc @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -36,8 +35,6 @@ #include "iceberg/arrow/arrow_io_internal.h" #include "iceberg/arrow/arrow_io_util.h" #include "iceberg/arrow/arrow_status_internal.h" -#include "iceberg/arrow/s3/arrow_s3_internal.h" -#include "iceberg/util/location_util.h" #include "iceberg/util/macros.h" namespace iceberg::arrow { @@ -486,53 +483,13 @@ class ArrowOutputFile : public OutputFile { } // namespace -const std::shared_ptr<::arrow::fs::FileSystem>& ArrowFileSystemFileIO::FileSystemForPath( - std::string_view location) const { - if (fs_by_prefix_.empty()) { - return arrow_fs_; - } - // Longest matching prefix wins; fall back to the default file system. - const std::string canonical = LocationUtil::CanonicalizeS3Scheme(location); - const std::shared_ptr<::arrow::fs::FileSystem>* best = &arrow_fs_; - size_t best_len = 0; - for (const auto& [prefix, fs] : fs_by_prefix_) { - if (prefix.size() > best_len && LocationUtil::PathHasPrefix(canonical, prefix)) { - best = &fs; - best_len = prefix.size(); - } - } - return *best; -} - -Status ArrowFileSystemFileIO::SetStorageCredentials( - const std::vector< - std::pair>>& - properties_by_prefix) { -#if ICEBERG_S3_ENABLED - std::vector>> - fs_by_prefix; - fs_by_prefix.reserve(properties_by_prefix.size()); - for (const auto& [prefix, properties] : properties_by_prefix) { - ICEBERG_ASSIGN_OR_RAISE(auto fs, BuildArrowS3FileSystem(properties)); - fs_by_prefix.emplace_back(prefix, std::move(fs)); - } - fs_by_prefix_ = std::move(fs_by_prefix); - return {}; -#else - (void)properties_by_prefix; - return NotSupported("S3 storage credentials require Arrow S3 support"); -#endif -} - -Result ArrowFileSystemFileIO::ResolvePath( - const std::shared_ptr<::arrow::fs::FileSystem>& fs, - const std::string& file_location) { +Result ArrowFileSystemFileIO::ResolvePath(const std::string& file_location) { const auto pos = file_location.find("://"); if (pos == std::string::npos) { return file_location; } - auto path = fs->PathFromUri(file_location); + auto path = arrow_fs_->PathFromUri(file_location); if (path.ok()) { return std::move(path).ValueOrDie(); } @@ -555,14 +512,14 @@ Result> OpenArrowInputStream( ICEBERG_PRECHECK(io != nullptr, "FileIO cannot be null"); if (auto arrow_io = std::dynamic_pointer_cast(io)) { - const auto& fs = arrow_io->FileSystemForPath(path); - ICEBERG_ASSIGN_OR_RAISE(auto resolved_path, arrow_io->ResolvePath(fs, path)); + ICEBERG_ASSIGN_OR_RAISE(auto resolved_path, arrow_io->ResolvePath(path)); ::arrow::fs::FileInfo file_info(resolved_path, ::arrow::fs::FileType::File); if (length.has_value()) { ICEBERG_ASSIGN_OR_RAISE(auto size, ToInt64Length(*length)); file_info.set_size(size); } - ICEBERG_ARROW_ASSIGN_OR_RETURN(auto input, fs->OpenInputFile(file_info)); + ICEBERG_ARROW_ASSIGN_OR_RETURN(auto input, + arrow_io->arrow_fs_->OpenInputFile(file_info)); return input; } @@ -586,15 +543,16 @@ Result> OpenArrowOutputStream( ICEBERG_PRECHECK(io != nullptr, "FileIO cannot be null"); if (auto arrow_io = std::dynamic_pointer_cast(io)) { - const auto& fs = arrow_io->FileSystemForPath(path); - ICEBERG_ASSIGN_OR_RAISE(auto resolved_path, arrow_io->ResolvePath(fs, path)); + ICEBERG_ASSIGN_OR_RAISE(auto resolved_path, arrow_io->ResolvePath(path)); if (!overwrite) { - ICEBERG_ARROW_ASSIGN_OR_RETURN(auto info, fs->GetFileInfo(resolved_path)); + ICEBERG_ARROW_ASSIGN_OR_RETURN(auto info, + arrow_io->arrow_fs_->GetFileInfo(resolved_path)); if (info.type() != ::arrow::fs::FileType::NotFound) { return AlreadyExists("File already exists: {}", path); } } - ICEBERG_ARROW_ASSIGN_OR_RETURN(auto output, fs->OpenOutputStream(resolved_path)); + ICEBERG_ARROW_ASSIGN_OR_RETURN(auto output, + arrow_io->arrow_fs_->OpenOutputStream(resolved_path)); return output; } @@ -610,60 +568,42 @@ Result> OpenArrowOutputStream( Result> ArrowFileSystemFileIO::NewInputFile( std::string file_location) { - const auto& fs = FileSystemForPath(file_location); - ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(fs, file_location)); - return std::make_unique(fs, std::move(file_location), std::move(path), - std::nullopt); + ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(file_location)); + return std::make_unique(arrow_fs_, std::move(file_location), + std::move(path), std::nullopt); } Result> ArrowFileSystemFileIO::NewInputFile( std::string file_location, size_t length) { ICEBERG_ASSIGN_OR_RAISE(auto size, ToInt64Length(length)); - const auto& fs = FileSystemForPath(file_location); - ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(fs, file_location)); - return std::make_unique(fs, std::move(file_location), std::move(path), - size); + ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(file_location)); + return std::make_unique(arrow_fs_, std::move(file_location), + std::move(path), size); } Result> ArrowFileSystemFileIO::NewOutputFile( std::string file_location) { - const auto& fs = FileSystemForPath(file_location); - ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(fs, file_location)); - return std::make_unique(fs, std::move(file_location), std::move(path)); + ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(file_location)); + return std::make_unique(arrow_fs_, std::move(file_location), + std::move(path)); } /// \brief Delete a file at the given location. Status ArrowFileSystemFileIO::DeleteFile(const std::string& file_location) { - const auto& fs = FileSystemForPath(file_location); - ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(fs, file_location)); - ICEBERG_ARROW_RETURN_NOT_OK(fs->DeleteFile(path)); + ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(file_location)); + ICEBERG_ARROW_RETURN_NOT_OK(arrow_fs_->DeleteFile(path)); return {}; } Status ArrowFileSystemFileIO::DeleteFiles( const std::vector& file_locations) { - if (fs_by_prefix_.empty()) { - // No per-prefix routing: one batched delete on the default file system. - std::vector paths; - paths.reserve(file_locations.size()); - for (const auto& file_location : file_locations) { - ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(arrow_fs_, file_location)); - paths.push_back(std::move(path)); - } - ICEBERG_ARROW_RETURN_NOT_OK(arrow_fs_->DeleteFiles(paths)); - return {}; - } - - // Paths may route to different file systems: group by fs, then batch per fs. - std::unordered_map<::arrow::fs::FileSystem*, std::vector> paths_by_fs; + std::vector paths; + paths.reserve(file_locations.size()); for (const auto& file_location : file_locations) { - const auto& fs = FileSystemForPath(file_location); - ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(fs, file_location)); - paths_by_fs[fs.get()].push_back(std::move(path)); - } - for (auto& [fs, paths] : paths_by_fs) { - ICEBERG_ARROW_RETURN_NOT_OK(fs->DeleteFiles(paths)); + ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(file_location)); + paths.push_back(std::move(path)); } + ICEBERG_ARROW_RETURN_NOT_OK(arrow_fs_->DeleteFiles(paths)); return {}; } diff --git a/src/iceberg/arrow/arrow_io_internal.h b/src/iceberg/arrow/arrow_io_internal.h index 0e3607cb2..a6b85b6c9 100644 --- a/src/iceberg/arrow/arrow_io_internal.h +++ b/src/iceberg/arrow/arrow_io_internal.h @@ -23,9 +23,6 @@ #include #include #include -#include -#include -#include #include #include @@ -55,8 +52,7 @@ OpenArrowOutputStream(const std::shared_ptr& io, const std::string& path bool overwrite = true); /// \brief A concrete implementation of FileIO for Arrow file system. -class ICEBERG_BUNDLE_EXPORT ArrowFileSystemFileIO : public FileIO, - public SupportsStorageCredentials { +class ICEBERG_BUNDLE_EXPORT ArrowFileSystemFileIO : public FileIO { public: explicit ArrowFileSystemFileIO(std::shared_ptr<::arrow::fs::FileSystem> arrow_fs) : arrow_fs_(std::move(arrow_fs)) {} @@ -85,12 +81,6 @@ class ICEBERG_BUNDLE_EXPORT ArrowFileSystemFileIO : public FileIO, /// \brief Delete files at the given locations. Status DeleteFiles(const std::vector& file_locations) override; - /// \brief Build one S3 file system per credential prefix for per-path routing. - Status SetStorageCredentials( - const std::vector< - std::pair>>& - properties_by_prefix) override; - /// \brief Get the Arrow file system. const std::shared_ptr<::arrow::fs::FileSystem>& fs() const { return arrow_fs_; } @@ -102,18 +92,10 @@ class ICEBERG_BUNDLE_EXPORT ArrowFileSystemFileIO : public FileIO, friend Result> OpenArrowOutputStream( const std::shared_ptr& io, const std::string& path, bool overwrite); - /// \brief Pick the file system for `location` (longest matching prefix, else - /// the default). - const std::shared_ptr<::arrow::fs::FileSystem>& FileSystemForPath( - std::string_view location) const; - - /// \brief Resolve a file location to a filesystem path using `fs`. - Result ResolvePath(const std::shared_ptr<::arrow::fs::FileSystem>& fs, - const std::string& file_location); + /// \brief Resolve a file location to a filesystem path. + Result ResolvePath(const std::string& file_location); std::shared_ptr<::arrow::fs::FileSystem> arrow_fs_; - std::vector>> - fs_by_prefix_; }; } // namespace iceberg::arrow diff --git a/src/iceberg/arrow/s3/arrow_s3_file_io.cc b/src/iceberg/arrow/s3/arrow_s3_file_io.cc index f65f023ca..a7e98620e 100644 --- a/src/iceberg/arrow/s3/arrow_s3_file_io.cc +++ b/src/iceberg/arrow/s3/arrow_s3_file_io.cc @@ -18,10 +18,13 @@ */ #include +#include #include #include #include +#include #include +#include #include #if ICEBERG_S3_ENABLED @@ -31,16 +34,16 @@ #include "iceberg/arrow/arrow_io_internal.h" #include "iceberg/arrow/arrow_io_util.h" #include "iceberg/arrow/arrow_status_internal.h" -#include "iceberg/arrow/s3/arrow_s3_internal.h" #include "iceberg/arrow/s3/s3_properties.h" #include "iceberg/util/macros.h" #include "iceberg/util/string_util.h" namespace iceberg::arrow { +#if ICEBERG_S3_ENABLED + namespace { -#if ICEBERG_S3_ENABLED const std::string* FindProperty( const std::unordered_map& properties, std::string_view key) { @@ -87,12 +90,13 @@ std::string SplitEndpointScheme(std::string_view endpoint, return std::string(endpoint); } -#endif +bool IsS3FileIOCredentialPrefix(std::string_view prefix) { + return prefix == "s3" || prefix.starts_with("s3://") || prefix.starts_with("s3a://") || + prefix.starts_with("s3n://"); +} } // namespace -#if ICEBERG_S3_ENABLED - /// \brief Configure S3Options from a properties map. /// /// \param properties The configuration properties map. @@ -119,17 +123,12 @@ Result<::arrow::fs::S3Options> ConfigureS3Options( } // Configure region - // Prefer the standard `client.region`; fall back to legacy `s3.region`. - const auto* region = FindProperty(properties, S3Properties::kClientRegion); - if (region == nullptr) { - region = FindProperty(properties, S3Properties::kRegion); - } - if (region != nullptr) { + if (const auto* region = FindProperty(properties, S3Properties::kClientRegion); + region != nullptr) { options.region = *region; } - // Configure endpoint (for MinIO, LocalStack, OSS, etc.) from `s3.endpoint` or - // the AWS standard env vars. + // Configure endpoint (for MinIO, LocalStack, etc.) if (const auto* endpoint = FindProperty(properties, S3Properties::kEndpoint); endpoint != nullptr) { options.endpoint_override = SplitEndpointScheme(*endpoint, options); @@ -171,9 +170,9 @@ Result<::arrow::fs::S3Options> ConfigureS3Options( return options; } -#endif -#if ICEBERG_S3_ENABLED +namespace { + Result> BuildArrowS3FileSystem( const std::unordered_map& properties) { ICEBERG_RETURN_UNEXPECTED(EnsureS3Initialized()); @@ -181,27 +180,150 @@ Result> BuildArrowS3FileSystem( ICEBERG_ARROW_ASSIGN_OR_RETURN(auto fs, ::arrow::fs::S3FileSystem::Make(options)); return std::shared_ptr<::arrow::fs::FileSystem>(std::move(fs)); } -#endif + +std::string CanonicalizeS3Scheme(std::string_view location) { + for (std::string_view scheme : {"s3a://", "s3n://", "oss://"}) { + if (location.starts_with(scheme)) { + return std::string("s3://").append(location.substr(scheme.size())); + } + } + return std::string(location); +} + +class ArrowS3FileIO final : public FileIO, public SupportsStorageCredentials { + public: + ArrowS3FileIO(std::shared_ptr<::arrow::fs::FileSystem> arrow_fs, + std::unordered_map default_properties) + : default_file_io_(std::move(arrow_fs)), + default_properties_(std::move(default_properties)) {} + + Result> NewInputFile(std::string file_location) override; + + Result> NewInputFile(std::string file_location, + size_t length) override; + + Result> NewOutputFile(std::string file_location) override; + + Status DeleteFile(const std::string& file_location) override; + + Status DeleteFiles(const std::vector& file_locations) override; + + Status SetStorageCredentials( + const std::vector& storage_credentials) override; + + const std::vector& credentials() const override { + return storage_credentials_; + } + + SupportsStorageCredentials* AsSupportsStorageCredentials() override { return this; } + + private: + ArrowFileSystemFileIO& FileIOForPath(std::string_view location); + + ArrowFileSystemFileIO default_file_io_; + std::unordered_map default_properties_; + std::vector storage_credentials_; + std::vector>> + file_io_by_prefix_; +}; + +Status ArrowS3FileIO::SetStorageCredentials( + const std::vector& storage_credentials) { + std::vector>> + file_io_by_prefix; + file_io_by_prefix.reserve(storage_credentials.size()); + // TODO(gangwu): Refresh vended credentials via credentials.uri before tokens expire. + for (const auto& credential : storage_credentials) { + ICEBERG_RETURN_UNEXPECTED(credential.Validate()); + if (!IsS3FileIOCredentialPrefix(credential.prefix)) { + return NotSupported( + "Storage credential prefix '{}' is unsupported by Arrow S3 FileIO", + credential.prefix); + } + auto properties = default_properties_; + for (const auto& [key, value] : credential.config) { + properties[key] = value; + } + ICEBERG_ASSIGN_OR_RAISE(auto fs, BuildArrowS3FileSystem(properties)); + file_io_by_prefix.emplace_back( + CanonicalizeS3Scheme(credential.prefix), + std::make_unique(std::move(fs))); + } + file_io_by_prefix_ = std::move(file_io_by_prefix); + storage_credentials_ = storage_credentials; + return {}; +} + +ArrowFileSystemFileIO& ArrowS3FileIO::FileIOForPath(std::string_view location) { + if (file_io_by_prefix_.empty()) { + return default_file_io_; + } + const std::string canonical = CanonicalizeS3Scheme(location); + ArrowFileSystemFileIO* best = &default_file_io_; + size_t best_len = 0; + for (const auto& [prefix, file_io] : file_io_by_prefix_) { + if (prefix.size() > best_len && canonical.starts_with(prefix)) { + best = file_io.get(); + best_len = prefix.size(); + } + } + return *best; +} + +Result> ArrowS3FileIO::NewInputFile( + std::string file_location) { + return FileIOForPath(file_location).NewInputFile(std::move(file_location)); +} + +Result> ArrowS3FileIO::NewInputFile(std::string file_location, + size_t length) { + return FileIOForPath(file_location).NewInputFile(std::move(file_location), length); +} + +Result> ArrowS3FileIO::NewOutputFile( + std::string file_location) { + return FileIOForPath(file_location).NewOutputFile(std::move(file_location)); +} + +Status ArrowS3FileIO::DeleteFile(const std::string& file_location) { + return FileIOForPath(file_location).DeleteFile(file_location); +} + +Status ArrowS3FileIO::DeleteFiles(const std::vector& file_locations) { + std::unordered_map> locations_by_io; + for (const auto& file_location : file_locations) { + locations_by_io[&FileIOForPath(file_location)].push_back(file_location); + } + for (auto& [file_io, locations] : locations_by_io) { + ICEBERG_RETURN_UNEXPECTED(file_io->DeleteFiles(locations)); + } + return {}; +} + +} // namespace Result> MakeS3FileIO( const std::unordered_map& properties) { -#if ICEBERG_S3_ENABLED // Uses default credentials if properties are empty. ICEBERG_ASSIGN_OR_RAISE(auto fs, BuildArrowS3FileSystem(properties)); - return std::make_unique(std::move(fs)); -#else - return NotSupported("Arrow S3 support is not enabled"); -#endif + return std::make_unique(std::move(fs), properties); } Status FinalizeS3() { -#if ICEBERG_S3_ENABLED auto status = ::arrow::fs::FinalizeS3(); ICEBERG_ARROW_RETURN_NOT_OK(status); return {}; +} + #else + +Result> MakeS3FileIO( + [[maybe_unused]] const std::unordered_map& properties) { return NotSupported("Arrow S3 support is not enabled"); -#endif } +Status FinalizeS3() { return NotSupported("Arrow S3 support is not enabled"); } + +#endif + } // namespace iceberg::arrow diff --git a/src/iceberg/arrow/s3/arrow_s3_internal.h b/src/iceberg/arrow/s3/arrow_s3_internal.h deleted file mode 100644 index 347c879e7..000000000 --- a/src/iceberg/arrow/s3/arrow_s3_internal.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#pragma once - -#include -#include -#include - -#include "iceberg/iceberg_bundle_export.h" -#include "iceberg/result.h" - -#if ICEBERG_S3_ENABLED -# include -#endif - -namespace iceberg::arrow { - -#if ICEBERG_S3_ENABLED -/// \brief Build Arrow ``S3Options`` from an Iceberg properties map. -/// -/// Production code should use MakeS3FileIO(); this is exposed so the -/// property-to-option mapping (region resolution, endpoint scheme handling, -/// addressing style) can be unit tested without a live S3 endpoint. -ICEBERG_BUNDLE_EXPORT Result<::arrow::fs::S3Options> ConfigureS3Options( - const std::unordered_map& properties); - -/// \brief Build an Arrow S3 file system from a properties map (initializes S3 if -/// needed). Exposed so the credential-aware FileIO can build one fs per prefix. -ICEBERG_BUNDLE_EXPORT Result> -BuildArrowS3FileSystem(const std::unordered_map& properties); -#endif - -} // namespace iceberg::arrow diff --git a/src/iceberg/arrow/s3/s3_properties.h b/src/iceberg/arrow/s3/s3_properties.h index 61248948b..7b76968a6 100644 --- a/src/iceberg/arrow/s3/s3_properties.h +++ b/src/iceberg/arrow/s3/s3_properties.h @@ -37,9 +37,7 @@ struct S3Properties { static constexpr std::string_view kSecretAccessKey = "s3.secret-access-key"; /// AWS session token (for temporary credentials) static constexpr std::string_view kSessionToken = "s3.session-token"; - /// AWS region (legacy, non-standard key kept for compatibility) - static constexpr std::string_view kRegion = "s3.region"; - /// AWS region, standard Iceberg client property (preferred over kRegion). + /// AWS region, standard Iceberg client property. static constexpr std::string_view kClientRegion = "client.region"; /// Custom endpoint override (for MinIO, LocalStack, etc.) static constexpr std::string_view kEndpoint = "s3.endpoint"; diff --git a/src/iceberg/catalog/rest/json_serde.cc b/src/iceberg/catalog/rest/json_serde.cc index 1b2b4bd2e..4dcfad399 100644 --- a/src/iceberg/catalog/rest/json_serde.cc +++ b/src/iceberg/catalog/rest/json_serde.cc @@ -135,6 +135,23 @@ constexpr std::string_view kResidualFilter = "residual-filter"; constexpr std::string_view kMapKeys = "keys"; constexpr std::string_view kMapValues = "values"; +Result StorageCredentialToJson(const StorageCredential& credential) { + ICEBERG_RETURN_UNEXPECTED(credential.Validate()); + nlohmann::json json; + json[kPrefix] = credential.prefix; + json[kConfig] = credential.config; + return json; +} + +Result StorageCredentialFromJson(const nlohmann::json& json) { + StorageCredential credential; + ICEBERG_ASSIGN_OR_RAISE(credential.prefix, GetJsonValue(json, kPrefix)); + ICEBERG_ASSIGN_OR_RAISE(credential.config, + GetJsonValue(json, kConfig)); + ICEBERG_RETURN_UNEXPECTED(credential.Validate()); + return credential; +} + template Result> KeyValueMapFromJson(const nlohmann::json& json, std::string_view key) { @@ -700,10 +717,7 @@ Result ToJson(const LoadTableResult& result) { if (!result.storage_credentials.empty()) { nlohmann::json creds = nlohmann::json::array(); for (const auto& cred : result.storage_credentials) { - nlohmann::json entry; - entry[kPrefix] = cred.prefix; - // config is required, so always write it (matches Java). - entry[kConfig] = cred.config; + ICEBERG_ASSIGN_OR_RAISE(auto entry, StorageCredentialToJson(cred)); creds.push_back(std::move(entry)); } json[kStorageCredentials] = std::move(creds); @@ -722,22 +736,10 @@ Result LoadTableResultFromJson(const nlohmann::json& json) { GetJsonValueOrDefault(json, kConfig)); if (auto it = json.find(kStorageCredentials); it != json.end() && !it->is_null()) { if (!it->is_array()) { - // Don't echo the value — it may carry credential material. return JsonParseError("Cannot parse storage credentials from non-array"); } for (const auto& entry : *it) { - StorageCredential cred; - ICEBERG_ASSIGN_OR_RAISE(cred.prefix, GetJsonValue(entry, kPrefix)); - ICEBERG_ASSIGN_OR_RAISE(cred.config, - GetJsonValue(entry, kConfig)); - // prefix and config are required by the REST spec; non-empty matches the - // Java reference implementation (Credential.validate()). - if (cred.prefix.empty()) { - return JsonParseError("Invalid storage credential: prefix must be non-empty"); - } - if (cred.config.empty()) { - return JsonParseError("Invalid storage credential: config must be non-empty"); - } + ICEBERG_ASSIGN_OR_RAISE(auto cred, StorageCredentialFromJson(entry)); result.storage_credentials.push_back(std::move(cred)); } } diff --git a/src/iceberg/catalog/rest/rest_catalog.cc b/src/iceberg/catalog/rest/rest_catalog.cc index ca992d1ca..27de0befa 100644 --- a/src/iceberg/catalog/rest/rest_catalog.cc +++ b/src/iceberg/catalog/rest/rest_catalog.cc @@ -20,7 +20,6 @@ #include "iceberg/catalog/rest/rest_catalog.h" #include -#include #include #include #include @@ -30,7 +29,6 @@ #include #include "iceberg/catalog/rest/auth/auth_managers.h" -#include "iceberg/catalog/rest/auth/auth_properties.h" #include "iceberg/catalog/rest/auth/auth_session.h" #include "iceberg/catalog/rest/catalog_properties.h" #include "iceberg/catalog/rest/constant.h" @@ -124,41 +122,6 @@ Result CaptureNoSuchNamespace(const auto& status) { return CaptureNoSuchObject(status, ErrorKind::kNoSuchNamespace); } -Status ValidateNoFileIOConfig( - const std::unordered_map& table_config) { - static const std::unordered_set kTableConfigHandledByAuthKeys = { - auth::AuthProperties::kAuthType, - auth::AuthProperties::kBasicUsername, - auth::AuthProperties::kBasicPassword, - auth::AuthProperties::kSigV4Enabled, - auth::AuthProperties::kSigV4DelegateAuthType, - auth::AuthProperties::kSigV4SigningRegion, - auth::AuthProperties::kSigV4SigningName, - auth::AuthProperties::kSigV4AccessKeyId, - auth::AuthProperties::kSigV4SecretAccessKey, - auth::AuthProperties::kSigV4SessionToken, - auth::AuthProperties::kToken.key(), - auth::AuthProperties::kCredential.key(), - auth::AuthProperties::kScope.key(), - auth::AuthProperties::kOAuth2ServerUri.key(), - auth::AuthProperties::kKeepRefreshed.key(), - auth::AuthProperties::kExchangeEnabled.key(), - auth::AuthProperties::kAudience.key(), - auth::AuthProperties::kResource.key(), - }; - - for (const auto& [key, _] : table_config) { - if (!kTableConfigHandledByAuthKeys.contains(key)) { - // Fail closed because unknown table config may be FileIO/storage config. - // TODO(gangwu): Build table-specific FileIO when REST storage - // credentials and table-level storage config are supported. - return NotImplemented( - "Table-specific FileIO is not implemented for table config key '{}'", key); - } - } - return {}; -} - Status CheckBoundTable(const TableIdentifier& requested, const TableIdentifier& bound) { if (requested == bound) { return {}; @@ -526,25 +489,10 @@ Result> RestCatalog::TableFileIO( const SessionContext& /*context*/, const std::unordered_map& table_config, const std::vector& storage_credentials) const { - if (!storage_credentials.empty()) { - // Only non-S3 (GCS/ADLS) credentials vended, which we can't honor; fail fast. - if (HasOnlyNonS3StorageCredentials(storage_credentials)) { - auto prefixes = - storage_credentials | std::views::transform(&StorageCredential::prefix); - return NotSupported( - "Vended storage credentials {} are unsupported (only S3-family " - "credentials are supported)", - FormatRange(prefixes, ", ", "[", "]")); - } - // Build a FileIO that routes each path to its vended credential. TODO: STS - // refresh via credentials.uri is not yet supported. - ICEBERG_ASSIGN_OR_RAISE(auto io, - MakeS3FileIOFromCredentials(config_.configs(), table_config, - storage_credentials)); - return std::shared_ptr(std::move(io)); + if (!table_config.empty() || !storage_credentials.empty()) { + return MakeTableFileIO(config_.configs(), table_config, storage_credentials); } - ICEBERG_RETURN_UNEXPECTED(ValidateNoFileIOConfig(table_config)); return file_io_; } @@ -764,8 +712,7 @@ Result> RestCatalog::UpdateTable( ICEBERG_ASSIGN_OR_RAISE( auto table_session, TableAuthSession(identifier, table_config, std::move(contextual_session))); - // No table was loaded here, so there is no per-table FileIO to reuse; use the - // catalog FileIO (table_config carries no credentials). + // Top-level updates have no loaded table FileIO to reuse. return MakeTableFromCommitResponse(identifier, std::move(response), context, table_config, std::move(table_session), file_io_); } @@ -916,11 +863,7 @@ Result> RestCatalog::MakeTableFromCommitResponse( const SessionContext& context, const std::unordered_map& table_config, std::shared_ptr table_session, std::shared_ptr table_io) { - // Reuse the FileIO bound at load: CommitTableResponse carries no config or - // storage credentials, so rebuilding it would drop the vended credentials - // (mirrors Java RESTSessionCatalog#tableFileIO). - // TODO(gangwu): rebuild the FileIO if the commit response ever grows config - // or storage credentials. + // Reuse the bound FileIO because commit responses carry no config or credentials. auto table_catalog = std::make_shared( shared_from_this(), context, identifier, table_config, table_session, table_io); return Table::Make(identifier, std::move(response.metadata), diff --git a/src/iceberg/catalog/rest/rest_catalog.h b/src/iceberg/catalog/rest/rest_catalog.h index 7b5613a0c..97cf42151 100644 --- a/src/iceberg/catalog/rest/rest_catalog.h +++ b/src/iceberg/catalog/rest/rest_catalog.h @@ -31,6 +31,7 @@ #include "iceberg/catalog/session_catalog.h" #include "iceberg/catalog/session_context.h" #include "iceberg/result.h" +#include "iceberg/storage_credential.h" /// \file iceberg/catalog/rest/rest_catalog.h /// RestCatalog implementation for Iceberg REST API. diff --git a/src/iceberg/catalog/rest/rest_file_io.cc b/src/iceberg/catalog/rest/rest_file_io.cc index e0f47613a..fe8a2b155 100644 --- a/src/iceberg/catalog/rest/rest_file_io.cc +++ b/src/iceberg/catalog/rest/rest_file_io.cc @@ -19,7 +19,6 @@ #include "iceberg/catalog/rest/rest_file_io.h" -#include #include #include #include @@ -28,7 +27,6 @@ #include "iceberg/catalog/rest/types.h" #include "iceberg/file_io.h" #include "iceberg/file_io_registry.h" -#include "iceberg/util/location_util.h" #include "iceberg/util/macros.h" namespace iceberg::rest { @@ -40,6 +38,16 @@ bool IsBuiltinImpl(std::string_view io_impl) { io_impl == FileIORegistry::kArrowS3FileIO; } +std::unordered_map MergeFileIOProperties( + const std::unordered_map& catalog_config, + const std::unordered_map& table_config) { + auto properties = catalog_config; + for (const auto& [key, value] : table_config) { + properties[key] = value; + } + return properties; +} + } // namespace Result DetectBuiltinFileIO(std::string_view location) { @@ -99,47 +107,31 @@ Result> MakeCatalogFileIO(const RestCatalogProperties& c return FileIORegistry::Load(io_impl, config.configs()); } -bool HasOnlyNonS3StorageCredentials(const std::vector& credentials) { - return !credentials.empty() && - std::ranges::none_of(credentials, [](const StorageCredential& cred) { - return cred.prefix.starts_with("s3"); - }); -} - -Result> MakeS3FileIOFromCredentials( +Result> MakeTableFileIO( const std::unordered_map& catalog_config, const std::unordered_map& table_config, const std::vector& storage_credentials) { - auto default_properties = catalog_config; - for (const auto& [key, value] : table_config) { - default_properties[key] = value; - } - - // Default S3 FileIO (for paths matching no credential prefix), built via the - // registry to keep this layer decoupled from the Arrow/S3 implementation. - ICEBERG_ASSIGN_OR_RAISE( - auto io, FileIORegistry::Load(std::string(FileIORegistry::kArrowS3FileIO), - default_properties)); - - // One property set per S3-family credential, keyed by canonicalized prefix; - // the credential's config overrides the default properties. - std::vector>> - properties_by_prefix; - for (const auto& cred : storage_credentials) { - if (!cred.prefix.starts_with("s3")) { - continue; - } - auto properties = default_properties; - for (const auto& [key, value] : cred.config) { - properties[key] = value; + const auto default_properties = MergeFileIOProperties(catalog_config, table_config); + const auto properties = RestCatalogProperties::FromMap(default_properties); + auto io_impl = properties.Get(RestCatalogProperties::kIOImpl); + if (io_impl.empty()) { + const auto warehouse = properties.Get(RestCatalogProperties::kWarehouse); + if (warehouse.empty()) { + return InvalidArgument(R"("{}" or "{}" property is required to create FileIO)", + RestCatalogProperties::kIOImpl.key(), + RestCatalogProperties::kWarehouse.key()); } - properties_by_prefix.emplace_back(LocationUtil::CanonicalizeS3Scheme(cred.prefix), - std::move(properties)); + ICEBERG_ASSIGN_OR_RAISE(const auto detected_kind, DetectBuiltinFileIO(warehouse)); + io_impl = std::string(BuiltinFileIOName(detected_kind)); } - - // Hand the per-prefix properties to the FileIO for per-path credential routing. - if (auto* credentialed = dynamic_cast(io.get())) { - ICEBERG_RETURN_UNEXPECTED(credentialed->SetStorageCredentials(properties_by_prefix)); + ICEBERG_ASSIGN_OR_RAISE(auto io, FileIORegistry::Load(io_impl, default_properties)); + + if (storage_credentials.empty()) { + return io; + } else if (auto* credentialed = io->AsSupportsStorageCredentials()) { + ICEBERG_RETURN_UNEXPECTED(credentialed->SetStorageCredentials(storage_credentials)); + } else { + return NotSupported("Configured FileIO does not support vended storage credentials"); } return io; } diff --git a/src/iceberg/catalog/rest/rest_file_io.h b/src/iceberg/catalog/rest/rest_file_io.h index d5052143f..9e5f7a0a9 100644 --- a/src/iceberg/catalog/rest/rest_file_io.h +++ b/src/iceberg/catalog/rest/rest_file_io.h @@ -47,15 +47,8 @@ ICEBERG_REST_EXPORT std::string_view BuiltinFileIOName(BuiltinFileIOKind kind); ICEBERG_REST_EXPORT Result> MakeCatalogFileIO( const RestCatalogProperties& config); -/// \brief True if `credentials` is non-empty but has no S3-family credential -/// (prefix starting with "s3") — only unsupported schemes (GCS/ADLS) were vended. -ICEBERG_REST_EXPORT bool HasOnlyNonS3StorageCredentials( - const std::vector& credentials); - -/// \brief Build an S3 FileIO that routes each object path to a per-prefix file -/// system, one per S3-family vended credential (config merged catalog < table < -/// credential). Non-S3 credentials are ignored. -ICEBERG_REST_EXPORT Result> MakeS3FileIOFromCredentials( +/// \brief Build the configured table FileIO and apply storage credentials if present. +ICEBERG_REST_EXPORT Result> MakeTableFileIO( const std::unordered_map& catalog_config, const std::unordered_map& table_config, const std::vector& storage_credentials); diff --git a/src/iceberg/catalog/rest/type_fwd.h b/src/iceberg/catalog/rest/type_fwd.h index 783f22750..ee684b245 100644 --- a/src/iceberg/catalog/rest/type_fwd.h +++ b/src/iceberg/catalog/rest/type_fwd.h @@ -28,7 +28,6 @@ struct ErrorResponse; struct CommitTableResponse; struct LoadTableResult; struct OAuthTokenResponse; -struct StorageCredential; class Endpoint; class ErrorHandler; diff --git a/src/iceberg/catalog/rest/types.h b/src/iceberg/catalog/rest/types.h index 3bf218a6f..20a59fa59 100644 --- a/src/iceberg/catalog/rest/types.h +++ b/src/iceberg/catalog/rest/types.h @@ -30,6 +30,7 @@ #include "iceberg/catalog/rest/endpoint.h" #include "iceberg/catalog/rest/iceberg_rest_export.h" #include "iceberg/result.h" +#include "iceberg/storage_credential.h" #include "iceberg/table_identifier.h" #include "iceberg/type_fwd.h" #include "iceberg/util/macros.h" @@ -180,28 +181,6 @@ struct ICEBERG_REST_EXPORT CreateTableRequest { /// \brief An opaque token that allows clients to make use of pagination for list APIs. using PageToken = std::string; -/// \brief A short-lived credential vended by a REST catalog for a storage -/// location ``prefix`` (clients pick the longest matching prefix); ``config`` -/// holds backend properties such as ``"s3.access-key-id"`` (Iceberg REST spec). -struct ICEBERG_REST_EXPORT StorageCredential { - std::string prefix; - std::unordered_map config; - - /// \brief Validates the StorageCredential. The REST spec requires both a - /// prefix and config; non-empty matches Java `Credential.validate()`. - Status Validate() const { - if (prefix.empty()) { - return ValidationFailed("Invalid storage credential: prefix must be non-empty"); - } - if (config.empty()) { - return ValidationFailed("Invalid storage credential: config must be non-empty"); - } - return {}; - } - - bool operator==(const StorageCredential& other) const = default; -}; - /// \brief Result body for table create/load/register APIs. struct ICEBERG_REST_EXPORT LoadTableResult { std::string metadata_location; @@ -358,7 +337,7 @@ struct ICEBERG_REST_EXPORT PlanTableScanResponse { PlanStatus plan_status = PlanStatus::kCompleted; std::string plan_id; std::optional error; - // TODO(sandeepg): Add credentials. + // TODO(sandeepg): Add storage credentials and bind scan FileIO to them. Status Validate() const; @@ -373,7 +352,7 @@ struct ICEBERG_REST_EXPORT FetchPlanningResultResponse { std::vector> delete_files; PlanStatus plan_status = PlanStatus::kCompleted; std::optional error; - // TODO(sandeepg): Add credentials. + // TODO(sandeepg): Add storage credentials and bind scan FileIO to them. Status Validate() const; diff --git a/src/iceberg/file_io.h b/src/iceberg/file_io.h index 5f9e071e0..7d7b31113 100644 --- a/src/iceberg/file_io.h +++ b/src/iceberg/file_io.h @@ -26,15 +26,16 @@ #include #include #include -#include -#include #include #include "iceberg/iceberg_export.h" #include "iceberg/result.h" +#include "iceberg/storage_credential.h" namespace iceberg { +class SupportsStorageCredentials; + /// \brief Seekable byte stream for reading file contents. class ICEBERG_EXPORT SeekableInputStream { public: @@ -173,21 +174,24 @@ class ICEBERG_EXPORT FileIO { /// \param file_locations The locations of the files to delete. /// \return void if all deletes succeed, or an error code if any delete fails. virtual Status DeleteFiles(const std::vector& file_locations); + + /// \brief Return storage-credential support when implemented by this FileIO. + virtual SupportsStorageCredentials* AsSupportsStorageCredentials() { return nullptr; } }; /// \brief Mix-in for FileIO implementations that route object paths to /// per-prefix file systems built from vended storage credentials, letting the -/// catalog stay decoupled from the concrete (Arrow/S3) implementation. +/// catalog stay decoupled from concrete storage implementations. class ICEBERG_EXPORT SupportsStorageCredentials { public: virtual ~SupportsStorageCredentials() = default; - /// \brief Install per-prefix file systems. `properties_by_prefix` maps a - /// canonicalized prefix to its fully merged properties. + /// \brief Install vended storage credentials. virtual Status SetStorageCredentials( - const std::vector< - std::pair>>& - properties_by_prefix) = 0; + const std::vector& storage_credentials) = 0; + + /// \brief Return currently installed storage credentials. + virtual const std::vector& credentials() const = 0; }; } // namespace iceberg diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build index 71a4498f1..1fa15fa12 100644 --- a/src/iceberg/meson.build +++ b/src/iceberg/meson.build @@ -283,6 +283,7 @@ install_headers( 'sort_field.h', 'sort_order.h', 'statistics_file.h', + 'storage_credential.h', 'table.h', 'table_identifier.h', 'table_metadata.h', diff --git a/src/iceberg/storage_credential.h b/src/iceberg/storage_credential.h new file mode 100644 index 000000000..604e8ddce --- /dev/null +++ b/src/iceberg/storage_credential.h @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include +#include + +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" + +namespace iceberg { + +/// \brief A storage credential vended for a storage location prefix. +struct ICEBERG_EXPORT StorageCredential { + std::string prefix; + std::unordered_map config; + + Status Validate() const { + if (prefix.empty()) { + return ValidationFailed("Invalid storage credential: prefix must be non-empty"); + } + if (config.empty()) { + return ValidationFailed("Invalid storage credential: config must be non-empty"); + } + return {}; + } + + bool operator==(const StorageCredential& other) const = default; +}; + +} // namespace iceberg diff --git a/src/iceberg/test/arrow_s3_file_io_test.cc b/src/iceberg/test/arrow_s3_file_io_test.cc index ef8291050..701a8f0d9 100644 --- a/src/iceberg/test/arrow_s3_file_io_test.cc +++ b/src/iceberg/test/arrow_s3_file_io_test.cc @@ -21,15 +21,25 @@ #include #include #include +#include #include +#include +#include #include #include +#if ICEBERG_S3_ENABLED +# include +#endif + #include "iceberg/arrow/arrow_io_util.h" -#include "iceberg/arrow/s3/arrow_s3_internal.h" #include "iceberg/arrow/s3/s3_properties.h" +#include "iceberg/file_io.h" +#include "iceberg/result.h" +#include "iceberg/storage_credential.h" #include "iceberg/test/matchers.h" +#include "iceberg/util/macros.h" namespace { @@ -63,16 +73,27 @@ std::unordered_map PropertiesFromEnv() { properties[std::string(iceberg::arrow::S3Properties::kEndpoint)] = *endpoint; } if (const auto region = GetEnvIfSet("AWS_REGION")) { - properties[std::string(iceberg::arrow::S3Properties::kRegion)] = *region; + properties[std::string(iceberg::arrow::S3Properties::kClientRegion)] = *region; } return properties; } +std::unordered_map BadS3Credentials() { + return { + {std::string(iceberg::arrow::S3Properties::kAccessKeyId), "bad-access-key"}, + {std::string(iceberg::arrow::S3Properties::kSecretAccessKey), "bad-secret-key"}}; +} + } // namespace namespace iceberg::arrow { +#if ICEBERG_S3_ENABLED +Result<::arrow::fs::S3Options> ConfigureS3Options( + const std::unordered_map& properties); +#endif + namespace { class ArrowS3FileIOTest : public ::testing::Test { @@ -97,27 +118,56 @@ class ArrowS3FileIOTest : public ::testing::Test { return MakeObjectUri(*base_uri_, object_name); } - void RequireIntegrationEnv() const { - if (!base_uri_.has_value()) { - GTEST_SKIP() << "Set ICEBERG_TEST_S3_URI to enable S3 IO test"; - } - } + const std::string& BaseUri() const { return *base_uri_; } + + bool HasIntegrationEnv() const { return base_uri_.has_value(); } private: std::optional base_uri_; }; +Status CheckReadWrite(FileIO& io, const std::string& object_uri, + std::string_view content) { + ICEBERG_RETURN_UNEXPECTED(io.WriteFile(object_uri, content)); + ICEBERG_ASSIGN_OR_RAISE(auto read, io.ReadFile(object_uri, std::nullopt)); + EXPECT_EQ(read, std::string(content)); + return io.DeleteFile(object_uri); +} + } // namespace -TEST_F(ArrowS3FileIOTest, CreateWithDefaultProperties) { +TEST_F(ArrowS3FileIOTest, Create) { auto result = MakeS3FileIO({}); ASSERT_THAT(result, IsOk()); EXPECT_NE(result.value(), nullptr); } -TEST_F(ArrowS3FileIOTest, RequiresS3SupportAtBuildTime) { - auto result = MakeS3FileIO(); +TEST_F(ArrowS3FileIOTest, StoresCredentials) { + auto result = MakeS3FileIO({}); + ASSERT_THAT(result, IsOk()); + auto* credentialed = result.value()->AsSupportsStorageCredentials(); + ASSERT_NE(credentialed, nullptr); + + std::vector credentials = { + {.prefix = "s3://bucket/table", + .config = {{std::string(S3Properties::kAccessKeyId), "access-key"}, + {std::string(S3Properties::kSecretAccessKey), "secret"}}}}; + EXPECT_THAT(credentialed->SetStorageCredentials(credentials), IsOk()); + EXPECT_EQ(credentialed->credentials(), credentials); +} + +TEST_F(ArrowS3FileIOTest, RejectsCredentialPrefix) { + auto result = MakeS3FileIO({}); ASSERT_THAT(result, IsOk()); + auto* credentialed = result.value()->AsSupportsStorageCredentials(); + ASSERT_NE(credentialed, nullptr); + + auto status = credentialed->SetStorageCredentials( + {{.prefix = "gs://bucket/table", + .config = {{std::string(S3Properties::kAccessKeyId), "access-key"}, + {std::string(S3Properties::kSecretAccessKey), "secret"}}}}); + EXPECT_THAT(status, IsError(ErrorKind::kNotSupported)); + EXPECT_THAT(status, HasErrorMessage("unsupported by Arrow S3 FileIO")); } TEST_F(ArrowS3FileIOTest, RejectsIncompleteStaticCredentials) { @@ -134,100 +184,89 @@ TEST_F(ArrowS3FileIOTest, RejectsInvalidBooleanProperties) { EXPECT_THAT(result, IsError(ErrorKind::kInvalidArgument)); } -TEST_F(ArrowS3FileIOTest, ReadWriteFile) { - RequireIntegrationEnv(); +TEST_F(ArrowS3FileIOTest, ReadWrite) { + if (!HasIntegrationEnv()) { + GTEST_SKIP() << "Set ICEBERG_TEST_S3_URI to enable S3 IO test"; + } auto io_res = MakeS3FileIO(); ASSERT_THAT(io_res, IsOk()); auto io = std::move(io_res).value(); auto object_uri = ObjectUri("iceberg_s3_io_test.txt"); - auto write_res = io->WriteFile(object_uri, "hello s3"); - ASSERT_THAT(write_res, IsOk()); - - auto read_res = io->ReadFile(object_uri, std::nullopt); - ASSERT_THAT(read_res, IsOk()); - EXPECT_THAT(read_res, HasValue(::testing::Eq("hello s3"))); - - auto del_res = io->DeleteFile(object_uri); - EXPECT_THAT(del_res, IsOk()); + EXPECT_THAT(CheckReadWrite(*io, object_uri, "hello s3"), IsOk()); } -TEST_F(ArrowS3FileIOTest, MakeS3FileIOWithProperties) { - RequireIntegrationEnv(); +TEST_F(ArrowS3FileIOTest, ReadWriteWithProperties) { + if (!HasIntegrationEnv()) { + GTEST_SKIP() << "Set ICEBERG_TEST_S3_URI to enable S3 IO test"; + } auto io_res = MakeS3FileIO(PropertiesFromEnv()); ASSERT_THAT(io_res, IsOk()); auto io = std::move(io_res).value(); auto object_uri = ObjectUri("iceberg_s3_io_props_test.txt"); - auto write_res = io->WriteFile(object_uri, "hello s3 with properties"); - ASSERT_THAT(write_res, IsOk()); - - auto read_res = io->ReadFile(object_uri, std::nullopt); - ASSERT_THAT(read_res, IsOk()); - EXPECT_THAT(read_res, HasValue(::testing::Eq("hello s3 with properties"))); - - auto del_res = io->DeleteFile(object_uri); - EXPECT_THAT(del_res, IsOk()); + EXPECT_THAT(CheckReadWrite(*io, object_uri, "hello s3 with properties"), IsOk()); } -TEST_F(ArrowS3FileIOTest, MakeS3FileIOWithSslDisabled) { - RequireIntegrationEnv(); - std::unordered_map properties; - properties[std::string(S3Properties::kSslEnabled)] = "false"; - - auto io_res = MakeS3FileIO(properties); - ASSERT_THAT(io_res, IsOk()); -} +TEST_F(ArrowS3FileIOTest, LongestCredentialPrefix) { + if (!HasIntegrationEnv()) { + GTEST_SKIP() << "Set ICEBERG_TEST_S3_URI to enable S3 IO test"; + } -TEST_F(ArrowS3FileIOTest, MakeS3FileIOWithTimeouts) { - RequireIntegrationEnv(); - std::unordered_map properties; - properties[std::string(S3Properties::kConnectTimeoutMs)] = "5000"; - properties[std::string(S3Properties::kSocketTimeoutMs)] = "10000"; + auto properties = PropertiesFromEnv(); + if (properties.empty()) { + GTEST_SKIP() << "Set S3 properties to enable credential routing test"; + } auto io_res = MakeS3FileIO(properties); ASSERT_THAT(io_res, IsOk()); + auto io = std::move(io_res).value(); + auto* credentialed = io->AsSupportsStorageCredentials(); + ASSERT_NE(credentialed, nullptr); + + constexpr std::string_view object_name = "iceberg_s3_io_prefix_test.txt"; + auto object_uri = ObjectUri(object_name); + const auto partial_prefix = + object_uri.substr(0, object_uri.size() - object_name.size() + 3); + + auto bad_properties = BadS3Credentials(); + EXPECT_THAT(credentialed->SetStorageCredentials( + {{.prefix = BaseUri(), .config = std::move(bad_properties)}, + {.prefix = partial_prefix, .config = properties}}), + IsOk()); + EXPECT_THAT(CheckReadWrite(*io, object_uri, "hello s3 with vended credentials"), + IsOk()); } #if ICEBERG_S3_ENABLED -TEST_F(ArrowS3FileIOTest, ConfigureS3OptionsPrefersClientRegionOverS3Region) { +TEST_F(ArrowS3FileIOTest, ClientRegion) { auto result = - ConfigureS3Options({{std::string(S3Properties::kClientRegion), "cn-hangzhou"}, - {std::string(S3Properties::kRegion), "us-east-1"}}); - ASSERT_THAT(result, IsOk()); - EXPECT_EQ(result->region, "cn-hangzhou"); -} - -TEST_F(ArrowS3FileIOTest, ConfigureS3OptionsFallsBackToS3Region) { - auto result = ConfigureS3Options({{std::string(S3Properties::kRegion), "us-east-1"}}); + ConfigureS3Options({{std::string(S3Properties::kClientRegion), "us-east-1"}}); ASSERT_THAT(result, IsOk()); EXPECT_EQ(result->region, "us-east-1"); } -TEST_F(ArrowS3FileIOTest, ConfigureS3OptionsStripsHttpsEndpointScheme) { - auto result = ConfigureS3Options({{std::string(S3Properties::kEndpoint), - "https://oss-cn-hangzhou.aliyuncs.com:443"}}); - ASSERT_THAT(result, IsOk()); - EXPECT_EQ(result->endpoint_override, "oss-cn-hangzhou.aliyuncs.com:443"); - EXPECT_EQ(result->scheme, "https"); -} - -TEST_F(ArrowS3FileIOTest, ConfigureS3OptionsStripsHttpEndpointScheme) { - auto result = ConfigureS3Options( - {{std::string(S3Properties::kEndpoint), "http://localhost:9000"}}); - ASSERT_THAT(result, IsOk()); - EXPECT_EQ(result->endpoint_override, "localhost:9000"); - EXPECT_EQ(result->scheme, "http"); -} - -TEST_F(ArrowS3FileIOTest, ConfigureS3OptionsKeepsSchemelessEndpoint) { - auto result = - ConfigureS3Options({{std::string(S3Properties::kEndpoint), "localhost:9000"}}); - ASSERT_THAT(result, IsOk()); - EXPECT_EQ(result->endpoint_override, "localhost:9000"); +TEST_F(ArrowS3FileIOTest, EndpointScheme) { + struct Case { + std::string_view endpoint; + std::string_view endpoint_override; + std::string_view scheme; + }; + const std::vector cases = {{"https://oss-cn-hangzhou.aliyuncs.com:443", + "oss-cn-hangzhou.aliyuncs.com:443", "https"}, + {"http://localhost:9000", "localhost:9000", "http"}, + {"localhost:9000", "localhost:9000", "https"}}; + + for (const auto& test_case : cases) { + auto result = ConfigureS3Options( + {{std::string(S3Properties::kEndpoint), std::string(test_case.endpoint)}}); + ASSERT_THAT(result, IsOk()) << test_case.endpoint; + EXPECT_EQ(result->endpoint_override, test_case.endpoint_override); + EXPECT_EQ(result->scheme, test_case.scheme); + } } -TEST_F(ArrowS3FileIOTest, ConfigureS3OptionsSslEnabledOverridesEndpointScheme) { +TEST_F(ArrowS3FileIOTest, SslEnabled) { auto https = ConfigureS3Options({{std::string(S3Properties::kEndpoint), "http://localhost:9000"}, {std::string(S3Properties::kSslEnabled), "true"}}); @@ -241,20 +280,25 @@ TEST_F(ArrowS3FileIOTest, ConfigureS3OptionsSslEnabledOverridesEndpointScheme) { EXPECT_EQ(http->scheme, "http"); } -TEST_F(ArrowS3FileIOTest, - ConfigureS3OptionsPathStyleAccessFalseEnablesVirtualAddressing) { - auto result = +TEST_F(ArrowS3FileIOTest, PathStyleAccess) { + auto virtual_addressing = ConfigureS3Options({{std::string(S3Properties::kPathStyleAccess), "false"}}); - ASSERT_THAT(result, IsOk()); - EXPECT_TRUE(result->force_virtual_addressing); + ASSERT_THAT(virtual_addressing, IsOk()); + EXPECT_TRUE(virtual_addressing->force_virtual_addressing); + + auto path_style = + ConfigureS3Options({{std::string(S3Properties::kPathStyleAccess), "true"}}); + ASSERT_THAT(path_style, IsOk()); + EXPECT_FALSE(path_style->force_virtual_addressing); } -TEST_F(ArrowS3FileIOTest, - ConfigureS3OptionsPathStyleAccessTrueDisablesVirtualAddressing) { +TEST_F(ArrowS3FileIOTest, Timeouts) { auto result = - ConfigureS3Options({{std::string(S3Properties::kPathStyleAccess), "true"}}); + ConfigureS3Options({{std::string(S3Properties::kConnectTimeoutMs), "5000"}, + {std::string(S3Properties::kSocketTimeoutMs), "10000"}}); ASSERT_THAT(result, IsOk()); - EXPECT_FALSE(result->force_virtual_addressing); + EXPECT_EQ(result->connect_timeout, 5); + EXPECT_EQ(result->request_timeout, 10); } #endif diff --git a/src/iceberg/test/rest_file_io_test.cc b/src/iceberg/test/rest_file_io_test.cc index d54943fe4..7f41b0b46 100644 --- a/src/iceberg/test/rest_file_io_test.cc +++ b/src/iceberg/test/rest_file_io_test.cc @@ -29,7 +29,6 @@ #include "iceberg/catalog/rest/types.h" #include "iceberg/file_io_registry.h" #include "iceberg/test/matchers.h" -#include "iceberg/util/location_util.h" namespace iceberg::rest { @@ -50,6 +49,24 @@ class MockFileIO : public FileIO { Status DeleteFile(const std::string& /*file_location*/) override { return {}; } }; +std::vector captured_storage_credentials; +std::unordered_map captured_file_io_properties; + +class MockCredentialedFileIO : public MockFileIO, public SupportsStorageCredentials { + public: + Status SetStorageCredentials( + const std::vector& credentials) override { + captured_storage_credentials = credentials; + return {}; + } + + const std::vector& credentials() const override { + return captured_storage_credentials; + } + + SupportsStorageCredentials* AsSupportsStorageCredentials() override { return this; } +}; + } // namespace TEST(RestFileIOTest, DetectBuiltinKindFromScheme) { @@ -153,46 +170,78 @@ TEST(RestFileIOTest, MakeCatalogFileIOSkipsCheckWhenWarehouseAbsent) { ASSERT_THAT(result, IsOk()); } -TEST(RestFileIOTest, CanonicalizeS3SchemeTreatsS3CompatibleSchemesAsS3) { - // s3a/s3n/oss canonicalize to s3:// so a vended `s3` credential prefix-matches - // them uniformly (DLF vends `s3` for oss:// locations). - EXPECT_EQ(LocationUtil::CanonicalizeS3Scheme("oss://bucket/db/t"), "s3://bucket/db/t"); - EXPECT_EQ(LocationUtil::CanonicalizeS3Scheme("s3a://bucket/x"), "s3://bucket/x"); - EXPECT_EQ(LocationUtil::CanonicalizeS3Scheme("s3n://bucket/x"), "s3://bucket/x"); - // Already-canonical and non-S3 / scheme-less locations are unchanged. - EXPECT_EQ(LocationUtil::CanonicalizeS3Scheme("s3://bucket/x"), "s3://bucket/x"); - EXPECT_EQ(LocationUtil::CanonicalizeS3Scheme("gs://bucket"), "gs://bucket"); - EXPECT_EQ(LocationUtil::CanonicalizeS3Scheme("/local/path"), "/local/path"); +TEST(RestFileIOTest, TableFileIOMergesConfigAndCredentials) { + const std::string custom_impl = "com.mycompany.CredentialedFileIO"; + captured_file_io_properties.clear(); + captured_storage_credentials.clear(); + FileIORegistry::Register( + custom_impl, + [](const std::unordered_map& properties) + -> Result> { + captured_file_io_properties = properties; + return std::make_unique(); + }); + + auto result = MakeTableFileIO( + {{"warehouse", "s3://catalog/warehouse"}, + {"catalog-only", "catalog"}, + {"shared", "catalog"}}, + {{"io-impl", custom_impl}, {"table-only", "table"}, {"shared", "table"}}, + {{.prefix = "s3://bucket/table", + .config = {{"shared", "credential"}, {"credential-only", "value"}}}}); + ASSERT_THAT(result, IsOk()); + auto* credentialed = result.value()->AsSupportsStorageCredentials(); + ASSERT_NE(credentialed, nullptr); + + EXPECT_THAT( + captured_file_io_properties, + ::testing::UnorderedElementsAre( + ::testing::Pair("warehouse", "s3://catalog/warehouse"), + ::testing::Pair("catalog-only", "catalog"), + ::testing::Pair("io-impl", custom_impl), ::testing::Pair("table-only", "table"), + ::testing::Pair("shared", "table"))); + ASSERT_EQ(captured_storage_credentials.size(), 1); + EXPECT_EQ(captured_storage_credentials[0].prefix, "s3://bucket/table"); + EXPECT_THAT(captured_storage_credentials[0].config, + ::testing::UnorderedElementsAre(::testing::Pair("credential-only", "value"), + ::testing::Pair("shared", "credential"))); + EXPECT_EQ(credentialed->credentials(), captured_storage_credentials); } -TEST(RestFileIOTest, PathHasPrefixMatchesAtPathBoundary) { - // Must match only at a path boundary, so a `s3://bucket/db/t1` credential does - // not capture a sibling table under `s3://bucket/db/t1x/...`. - EXPECT_TRUE(LocationUtil::PathHasPrefix("s3://bucket/db/t1/data/f.parquet", - "s3://bucket/db/t1")); - EXPECT_TRUE(LocationUtil::PathHasPrefix("s3://bucket/db/t1", "s3://bucket/db/t1")); - EXPECT_FALSE(LocationUtil::PathHasPrefix("s3://bucket/db/t1x/data/f.parquet", - "s3://bucket/db/t1")); - EXPECT_FALSE(LocationUtil::PathHasPrefix("s3://bucket-other/x", "s3://bucket")); - - // A bare-scheme credential (DLF vends `s3`) matches any authority/path. - EXPECT_TRUE(LocationUtil::PathHasPrefix("s3://bucket/db/t/f", "s3")); - EXPECT_TRUE(LocationUtil::PathHasPrefix( - LocationUtil::CanonicalizeS3Scheme("oss://bucket/db/t/f"), "s3")); - EXPECT_FALSE(LocationUtil::PathHasPrefix("gs://bucket/x", "s3")); +TEST(RestFileIOTest, TableImplOverridesWarehouseScheme) { + captured_file_io_properties.clear(); + FileIORegistry::Register( + std::string(FileIORegistry::kArrowS3FileIO), + [](const std::unordered_map& properties) + -> Result> { + captured_file_io_properties = properties; + return std::make_unique(); + }); + + auto result = + MakeTableFileIO({{"warehouse", "/tmp/catalog-warehouse"}}, + {{"io-impl", std::string(FileIORegistry::kArrowS3FileIO)}}, + /*storage_credentials=*/{}); + ASSERT_THAT(result, IsOk()); + EXPECT_THAT( + captured_file_io_properties, + ::testing::UnorderedElementsAre( + ::testing::Pair("warehouse", "/tmp/catalog-warehouse"), + ::testing::Pair("io-impl", std::string(FileIORegistry::kArrowS3FileIO)))); } -TEST(RestFileIOTest, HasOnlyNonS3StorageCredentials) { - // Only GCS/ADLS prefixes -> unsupported, fail fast. - EXPECT_TRUE(HasOnlyNonS3StorageCredentials( - {{.prefix = "gs://bucket", .config = {{"k", "v"}}}, - {.prefix = "abfs://c@a.dfs.core.windows.net", .config = {{"k", "v"}}}})); - // At least one S3 credential present -> not unsupported (may fall back). - EXPECT_FALSE(HasOnlyNonS3StorageCredentials( - {{.prefix = "gs://bucket", .config = {{"k", "v"}}}, - {.prefix = "s3://bucket", .config = {{"s3.access-key-id", "a"}}}})); - // No credentials at all -> not "only non-S3". - EXPECT_FALSE(HasOnlyNonS3StorageCredentials({})); +TEST(RestFileIOTest, TableFileIORejectsCredentials) { + const std::string custom_impl = "com.mycompany.PlainFileIO"; + FileIORegistry::Register( + custom_impl, + [](const std::unordered_map& /*properties*/) + -> Result> { return std::make_unique(); }); + + auto result = MakeTableFileIO( + {{"warehouse", "s3://catalog/warehouse"}}, {{"io-impl", custom_impl}}, + {{.prefix = "s3://bucket/table", .config = {{"k", "v"}}}}); + EXPECT_THAT(result, IsError(ErrorKind::kNotSupported)); + EXPECT_THAT(result, HasErrorMessage("does not support vended storage credentials")); } } // namespace iceberg::rest diff --git a/src/iceberg/test/rest_json_serde_test.cc b/src/iceberg/test/rest_json_serde_test.cc index 10f209aa2..dd9326cb2 100644 --- a/src/iceberg/test/rest_json_serde_test.cc +++ b/src/iceberg/test/rest_json_serde_test.cc @@ -127,6 +127,7 @@ template struct JsonInvalidParam { std::string test_name; std::string invalid_json_str; + ErrorKind expected_error_kind = ErrorKind::kJsonParseError; std::string expected_error_message; }; @@ -140,7 +141,7 @@ class JsonInvalidTest : public ::testing::TestWithParam> const auto& param = Base::GetParam(); auto result = FromJson(nlohmann::json::parse(param.invalid_json_str)); - ASSERT_THAT(result, IsError(ErrorKind::kJsonParseError)); + ASSERT_THAT(result, IsError(param.expected_error_kind)); ASSERT_THAT(result, HasErrorMessage(param.expected_error_message)) << result.error().message; } @@ -1123,15 +1124,16 @@ INSTANTIATE_TEST_SUITE_P( .config = {{"warehouse", "s3://bucket/warehouse"}, {"foo", "bar"}}}}, LoadTableResultParam{ - .test_name = "WithStorageCredentials", + .test_name = "WithCredentials", .expected_json_str = - R"({"metadata":{"current-schema-id":1,"current-snapshot-id":null,"default-sort-order-id":0,"default-spec-id":0,"format-version":2,"last-column-id":1,"last-partition-id":0,"last-sequence-number":0,"last-updated-ms":0,"location":"s3://bucket/test","metadata-log":[],"partition-specs":[{"fields":[],"spec-id":0}],"partition-statistics":[],"properties":{},"refs":{},"schemas":[{"fields":[{"id":1,"name":"id","required":true,"type":"int"}],"schema-id":1,"type":"struct"}],"snapshot-log":[],"snapshots":[],"sort-orders":[{"fields":[],"order-id":0}],"statistics":[],"table-uuid":"test-uuid-1234"},"storage-credentials":[{"config":{"s3.access-key-id":"AKIAtest","s3.region":"us-east-1","s3.secret-access-key":"secret"},"prefix":"s3"}]})", + R"({"metadata":{"current-schema-id":1,"current-snapshot-id":null,"default-sort-order-id":0,"default-spec-id":0,"format-version":2,"last-column-id":1,"last-partition-id":0,"last-sequence-number":0,"last-updated-ms":0,"location":"s3://bucket/test","metadata-log":[],"partition-specs":[{"fields":[],"spec-id":0}],"partition-statistics":[],"properties":{},"refs":{},"schemas":[{"fields":[{"id":1,"name":"id","required":true,"type":"int"}],"schema-id":1,"type":"struct"}],"snapshot-log":[],"snapshots":[],"sort-orders":[{"fields":[],"order-id":0}],"statistics":[],"table-uuid":"test-uuid-1234"},"storage-credentials":[{"config":{"client.region":"us-east-1","s3.access-key-id":"AKIAtest","s3.secret-access-key":"secret","s3.session-token":"token"},"prefix":"s3"}]})", .model = {.metadata = MakeSimpleTableMetadata(), .storage_credentials = {{.prefix = "s3", .config = {{"s3.access-key-id", "AKIAtest"}, {"s3.secret-access-key", "secret"}, - {"s3.region", "us-east-1"}}}}}}), + {"s3.session-token", "token"}, + {"client.region", "us-east-1"}}}}}}), [](const ::testing::TestParamInfo& info) { return info.param.test_name; }); @@ -1160,18 +1162,7 @@ INSTANTIATE_TEST_SUITE_P( .json_str = R"({"metadata":{"format-version":2,"table-uuid":"test-uuid-1234","location":"s3://bucket/test","last-sequence-number":0,"last-updated-ms":0,"last-column-id":1,"schemas":[{"type":"struct","schema-id":1,"fields":[{"id":1,"name":"id","type":"int","required":true}]}],"current-schema-id":1,"partition-specs":[{"spec-id":0,"fields":[]}],"default-spec-id":0,"last-partition-id":0,"sort-orders":[{"order-id":0,"fields":[]}],"default-sort-order-id":0,"properties":{}},"config":{"warehouse":"s3://bucket/warehouse"}})", .expected_model = {.metadata = MakeSimpleTableMetadata(), - .config = {{"warehouse", "s3://bucket/warehouse"}}}}, - LoadTableResultDeserializeParam{ - .test_name = "WithStorageCredentials", - .json_str = - R"({"metadata":{"format-version":2,"table-uuid":"test-uuid-1234","location":"s3://bucket/test","last-sequence-number":0,"last-updated-ms":0,"last-column-id":1,"schemas":[{"type":"struct","schema-id":1,"fields":[{"id":1,"name":"id","type":"int","required":true}]}],"current-schema-id":1,"partition-specs":[{"spec-id":0,"fields":[]}],"default-spec-id":0,"last-partition-id":0,"sort-orders":[{"order-id":0,"fields":[]}],"default-sort-order-id":0,"properties":{}},"storage-credentials":[{"prefix":"s3","config":{"s3.access-key-id":"AKIAtest","s3.secret-access-key":"secret","s3.session-token":"token","s3.region":"us-east-1"}}]})", - .expected_model = - {.metadata = MakeSimpleTableMetadata(), - .storage_credentials = {{.prefix = "s3", - .config = {{"s3.access-key-id", "AKIAtest"}, - {"s3.secret-access-key", "secret"}, - {"s3.session-token", "token"}, - {"s3.region", "us-east-1"}}}}}}), + .config = {{"warehouse", "s3://bucket/warehouse"}}}}), [](const ::testing::TestParamInfo& info) { return info.param.test_name; }); @@ -1212,25 +1203,27 @@ INSTANTIATE_TEST_SUITE_P( .invalid_json_str = R"({"metadata":{"format-version":"invalid"}})", .expected_error_message = "type must be number, but is string"}, LoadTableResultInvalidParam{ - .test_name = "StorageCredentialsNotArray", + .test_name = "CredentialsNotArray", .invalid_json_str = LoadTableJsonWithCredentials(R"("oops")"), .expected_error_message = "Cannot parse storage credentials from non-array"}, LoadTableResultInvalidParam{ - .test_name = "StorageCredentialMissingPrefix", + .test_name = "CredentialMissingPrefix", .invalid_json_str = LoadTableJsonWithCredentials(R"([{"config":{"k":"v"}}])"), .expected_error_message = "Missing 'prefix'"}, LoadTableResultInvalidParam{ - .test_name = "StorageCredentialMissingConfig", + .test_name = "CredentialMissingConfig", .invalid_json_str = LoadTableJsonWithCredentials(R"([{"prefix":"s3"}])"), .expected_error_message = "Missing 'config'"}, - LoadTableResultInvalidParam{.test_name = "StorageCredentialEmptyPrefix", + LoadTableResultInvalidParam{.test_name = "CredentialEmptyPrefix", .invalid_json_str = LoadTableJsonWithCredentials( R"([{"prefix":"","config":{"k":"v"}}])"), + .expected_error_kind = ErrorKind::kValidationFailed, .expected_error_message = "prefix must be non-empty"}, LoadTableResultInvalidParam{ - .test_name = "StorageCredentialEmptyConfig", + .test_name = "CredentialEmptyConfig", .invalid_json_str = LoadTableJsonWithCredentials(R"([{"prefix":"s3","config":{}}])"), + .expected_error_kind = ErrorKind::kValidationFailed, .expected_error_message = "config must be non-empty"}), [](const ::testing::TestParamInfo& info) { return info.param.test_name; @@ -2612,18 +2605,4 @@ TEST(FetchPlanningResultResponseRoundtripTest, FailedWithError) { EXPECT_EQ(*result, *result2); } -TEST(StorageCredentialValidateTest, RequiresPrefixAndConfig) { - EXPECT_THAT( - (StorageCredential{.prefix = "s3", .config = {{"s3.region", "us"}}}.Validate()), - IsOk()); - - auto empty_prefix = StorageCredential{.prefix = "", .config = {{"s3.region", "us"}}}; - EXPECT_THAT(empty_prefix.Validate(), IsError(ErrorKind::kValidationFailed)); - EXPECT_THAT(empty_prefix.Validate(), HasErrorMessage("prefix must be non-empty")); - - auto empty_config = StorageCredential{.prefix = "s3", .config = {}}; - EXPECT_THAT(empty_config.Validate(), IsError(ErrorKind::kValidationFailed)); - EXPECT_THAT(empty_config.Validate(), HasErrorMessage("config must be non-empty")); -} - } // namespace iceberg::rest diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h index 784b3e03b..870badba0 100644 --- a/src/iceberg/type_fwd.h +++ b/src/iceberg/type_fwd.h @@ -201,6 +201,7 @@ class PartitionSummary; /// \brief File I/O. struct ReaderOptions; struct WriterOptions; +struct StorageCredential; class FileIO; class Reader; class Writer; diff --git a/src/iceberg/util/location_util.h b/src/iceberg/util/location_util.h index cbeac1574..eb78dece3 100644 --- a/src/iceberg/util/location_util.h +++ b/src/iceberg/util/location_util.h @@ -19,7 +19,6 @@ #pragma once -#include #include #include "iceberg/iceberg_export.h" @@ -38,27 +37,6 @@ class ICEBERG_EXPORT LocationUtil { } return path; } - - /// \brief Rewrites S3-compatible schemes (s3a://, s3n://, oss://) to s3:// so - /// locations and credential prefixes can be prefix-matched uniformly. - static std::string CanonicalizeS3Scheme(std::string_view location) { - for (std::string_view scheme : {"s3a://", "s3n://", "oss://"}) { - if (location.starts_with(scheme)) { - return std::string("s3://").append(location.substr(scheme.size())); - } - } - return std::string(location); - } - - /// \brief True if `prefix` matches `path` at a path boundary (equal, next char - /// '/', or a bare scheme), so `s3://bucket` does not match `s3://bucket-x`. - static bool PathHasPrefix(std::string_view path, std::string_view prefix) { - if (!path.starts_with(prefix)) { - return false; - } - return path.size() == prefix.size() || path[prefix.size()] == '/' || - prefix.find("://") == std::string_view::npos; - } }; } // namespace iceberg