-
Notifications
You must be signed in to change notification settings - Fork 470
feat(encryption) [7/N] Read encrypted manifest list #2453
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
d0cfbb7
f98f69c
f8122d9
1aee090
2feaeea
9252805
73b0aa2
44f58f6
f1f6496
6062a1a
43bccf5
06c643e
f3d4a9b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,6 +26,7 @@ use serde::{Deserialize, Serialize}; | |
| use typed_builder::TypedBuilder; | ||
|
|
||
| use super::table_metadata::SnapshotLog; | ||
| use crate::encryption::{EncryptedInputFile, EncryptionManager}; | ||
| use crate::error::{Result, timestamp_ms_to_utc}; | ||
| use crate::io::FileIO; | ||
| use crate::spec::{ManifestList, SchemaId, SchemaRef, TableMetadata}; | ||
|
|
@@ -195,12 +196,26 @@ impl Snapshot { | |
| } | ||
|
|
||
| /// Load manifest list. | ||
| pub async fn load_manifest_list( | ||
| pub(crate) async fn load_manifest_list( | ||
| &self, | ||
| file_io: &FileIO, | ||
| table_metadata: &TableMetadata, | ||
| encryption_manager: Option<&EncryptionManager>, | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. public api change here.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We had some discussion before, and I think we should add sth like
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The api should be as following: impl Table {
pub fn manifest_list_loader(&self, s: &Snapshot) -> Result<ManifestListLoader> {
...
}
}With this approach, we could hide all table related constructs like |
||
| ) -> Result<ManifestList> { | ||
| let manifest_list_content = file_io.new_input(&self.manifest_list)?.read().await?; | ||
| let manifest_list_content = match (&self.encryption_key_id, encryption_manager) { | ||
| (Some(_), None) => { | ||
| return Err(Error::new( | ||
| ErrorKind::PreconditionFailed, | ||
| "Snapshot has encryption_key_id but no EncryptionManager configured on Table", | ||
| )); | ||
| } | ||
| (Some(key_id), Some(em)) => { | ||
| let key_metadata = em.decrypt_manifest_list_key_metadata(key_id).await?; | ||
| let input = file_io.new_input(&self.manifest_list)?; | ||
| EncryptedInputFile::new(input, key_metadata).read().await? | ||
| } | ||
| (None, _) => file_io.new_input(&self.manifest_list)?.read().await?, | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's non-obvious that there could be an intentionally unused
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Double checked and java does the same https://github.com/apache/iceberg/blob/main/api/src/main/java/org/apache/iceberg/encryption/EncryptingFileIO.java#L120-L125 |
||
| }; | ||
| ManifestList::parse_with_version( | ||
| &manifest_list_content, | ||
| // TODO: You don't really need the version since you could just project any Avro in | ||
|
|
@@ -521,13 +536,82 @@ impl SnapshotRetention { | |
| #[cfg(test)] | ||
| mod tests { | ||
| use std::collections::HashMap; | ||
| use std::sync::Arc; | ||
|
|
||
| use bytes::Bytes; | ||
| use chrono::{TimeZone, Utc}; | ||
|
|
||
| use crate::encryption::kms::{KeyManagementClient, MemoryKeyManagementClient}; | ||
| use crate::encryption::{EncryptionManager, StandardKeyMetadata}; | ||
| use crate::io::FileIO; | ||
| use crate::spec::TableMetadata; | ||
| use crate::spec::manifest_list::{ManifestList, ManifestListWriter}; | ||
| use crate::spec::snapshot::_serde::SnapshotV1; | ||
| use crate::spec::snapshot::{Operation, Snapshot, Summary}; | ||
|
|
||
| const ENCRYPTION_TEST_V3_METADATA: &str = r#"{ | ||
| "format-version": 3, | ||
| "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", | ||
| "location": "memory:///table", | ||
| "last-sequence-number": 0, | ||
| "last-updated-ms": 1602638573590, | ||
| "last-column-id": 1, | ||
| "current-schema-id": 0, | ||
| "schemas": [{"type": "struct", "schema-id": 0, "fields": [ | ||
| {"id": 1, "name": "x", "required": true, "type": "long"} | ||
| ]}], | ||
| "default-spec-id": 0, | ||
| "partition-specs": [{"spec-id": 0, "fields": []}], | ||
| "last-partition-id": 1000, | ||
| "default-sort-order-id": 0, | ||
| "sort-orders": [{"order-id": 0, "fields": []}], | ||
| "properties": {}, | ||
| "snapshots": [], | ||
| "snapshot-log": [], | ||
| "metadata-log": [], | ||
| "refs": {}, | ||
| "next-row-id": 0 | ||
| }"#; | ||
|
|
||
| fn encryption_test_metadata() -> TableMetadata { | ||
| serde_json::from_str(ENCRYPTION_TEST_V3_METADATA).unwrap() | ||
| } | ||
|
|
||
| fn encryption_test_kms() -> Arc<dyn KeyManagementClient> { | ||
| let kms = MemoryKeyManagementClient::new(); | ||
| kms.add_master_key("master-1").unwrap(); | ||
| Arc::new(kms) | ||
| } | ||
|
|
||
| fn encryption_test_manager() -> EncryptionManager { | ||
| EncryptionManager::builder() | ||
| .kms_client(encryption_test_kms()) | ||
| .table_key_id("master-1") | ||
| .build() | ||
| } | ||
|
|
||
| async fn write_v3_manifest_list_bytes(io: &FileIO, path: &str) -> Bytes { | ||
| let output = io.new_output(path).unwrap(); | ||
| let mut writer = ManifestListWriter::v3(output, 1, None, 0, Some(0)); | ||
| writer.add_manifests(std::iter::empty()).unwrap(); | ||
| writer.close().await.unwrap(); | ||
| io.new_input(path).unwrap().read().await.unwrap() | ||
| } | ||
|
|
||
| fn snapshot_pointing_at(manifest_list_path: &str, key_id: Option<String>) -> Snapshot { | ||
| Snapshot::builder() | ||
| .with_snapshot_id(1) | ||
| .with_sequence_number(0) | ||
| .with_timestamp_ms(0) | ||
| .with_manifest_list(manifest_list_path.to_string()) | ||
| .with_summary(Summary { | ||
| operation: Operation::Append, | ||
| additional_properties: HashMap::new(), | ||
| }) | ||
| .with_encryption_key_id(key_id) | ||
| .build() | ||
| } | ||
|
|
||
| #[test] | ||
| fn schema() { | ||
| let record = r#" | ||
|
|
@@ -729,4 +813,52 @@ mod tests { | |
| assert_eq!(v2_snapshot.parent_snapshot_id(), None); | ||
| assert_eq!(v2_snapshot.schema_id(), None); | ||
| } | ||
|
|
||
| #[tokio::test] | ||
| async fn load_manifest_list_errors_when_encrypted_but_no_manager_configured() { | ||
| let io = FileIO::new_with_memory(); | ||
| let snapshot = snapshot_pointing_at( | ||
| "memory:///table/metadata/manifest-list-enc.avro", | ||
| Some("k1".to_string()), | ||
| ); | ||
| let metadata = encryption_test_metadata(); | ||
|
|
||
| let err = snapshot | ||
| .load_manifest_list(&io, &metadata, None) | ||
| .await | ||
| .unwrap_err(); | ||
| assert_eq!(err.kind(), crate::ErrorKind::PreconditionFailed); | ||
| } | ||
|
|
||
| #[tokio::test] | ||
| async fn load_manifest_list_decrypts_roundtrip() { | ||
| let io = FileIO::new_with_memory(); | ||
| let plain_path = "memory:///table/metadata/manifest-list-plain.avro"; | ||
| let encrypted_path = "memory:///table/metadata/manifest-list-enc.avro"; | ||
|
|
||
| // Build raw manifest list bytes via the standard writer. | ||
| let raw_bytes = write_v3_manifest_list_bytes(&io, plain_path).await; | ||
|
|
||
| // Encrypt those bytes to a second path and capture the file's key metadata. | ||
| let mgr = encryption_test_manager(); | ||
| let encrypted_output = mgr.encrypt(io.new_output(encrypted_path).unwrap()); | ||
| let std_key_metadata: StandardKeyMetadata = encrypted_output.key_metadata().clone(); | ||
| encrypted_output.write(raw_bytes).await.unwrap(); | ||
|
|
||
| // Wrap the file's key metadata with a KEK and record the resulting wrapped | ||
| // entry's id on the snapshot. | ||
| let key_id = mgr | ||
| .encrypt_manifest_list_key_metadata(&std_key_metadata) | ||
| .await | ||
| .unwrap(); | ||
|
|
||
| let snapshot = snapshot_pointing_at(encrypted_path, Some(key_id)); | ||
| let metadata = encryption_test_metadata(); | ||
|
|
||
| let manifest_list: ManifestList = snapshot | ||
| .load_manifest_list(&io, &metadata, Some(&mgr)) | ||
| .await | ||
| .unwrap(); | ||
| assert_eq!(manifest_list.entries().len(), 0); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -128,6 +128,11 @@ pub struct TableProperties { | |
| pub cdc_max_chunk_size: usize, | ||
| /// Content-defined chunking normalization level (gearhash bit adjustment). | ||
| pub cdc_norm_level: i32, | ||
| /// The master key id used to encrypt this table's manifest list and data | ||
| /// files. `None` if `encryption.key-id` is not set. | ||
| pub encryption_key_id: Option<String>, | ||
| /// The encryption data encryption key length in bytes. | ||
| pub encryption_data_key_length: usize, | ||
| } | ||
|
|
||
| impl TableProperties { | ||
|
|
@@ -253,6 +258,15 @@ impl TableProperties { | |
| "write.parquet.content-defined-chunking.norm-level"; | ||
| /// Default matches `parquet::file::properties::DEFAULT_CDC_NORM_LEVEL`. | ||
| pub const PROPERTY_PARQUET_CDC_NORM_LEVEL_DEFAULT: i32 = 0; | ||
|
|
||
| /// Property key for the master key id used to encrypt the table's manifest | ||
| /// list and data files as defined in https://iceberg.apache.org/docs/nightly/encryption/. | ||
| pub const PROPERTY_ENCRYPTION_KEY_ID: &str = "encryption.key-id"; | ||
|
|
||
| /// Property key for the encryption data encryption key (DEK) length in bytes. | ||
| pub const PROPERTY_ENCRYPTION_DATA_KEY_LENGTH: &str = "encryption.data-key-length"; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for adding it. I think I can get rid of the additional argument for data key length In introduced in #2466 |
||
| /// Default value for the encryption DEK length (16 bytes = AES-128). | ||
| pub const PROPERTY_ENCRYPTION_DATA_KEY_LENGTH_DEFAULT: usize = 16; | ||
| } | ||
|
|
||
| impl TryFrom<&HashMap<String, String>> for TableProperties { | ||
|
|
@@ -322,6 +336,14 @@ impl TryFrom<&HashMap<String, String>> for TableProperties { | |
| TableProperties::PROPERTY_PARQUET_CDC_NORM_LEVEL, | ||
| TableProperties::PROPERTY_PARQUET_CDC_NORM_LEVEL_DEFAULT, | ||
| )?, | ||
| encryption_key_id: props | ||
| .get(TableProperties::PROPERTY_ENCRYPTION_KEY_ID) | ||
| .cloned(), | ||
| encryption_data_key_length: parse_property( | ||
| props, | ||
| TableProperties::PROPERTY_ENCRYPTION_DATA_KEY_LENGTH, | ||
| TableProperties::PROPERTY_ENCRYPTION_DATA_KEY_LENGTH_DEFAULT, | ||
| )?, | ||
| }) | ||
| } | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
public api change here