@@ -18,7 +18,7 @@
 import uuid
 from datetime import date, datetime, timedelta, timezone
 from decimal import Decimal
-from typing import Any, List
+from typing import Any, Callable, List, Optional
 
 import pytest
 from pyspark.sql import SparkSession
@@ -78,7 +78,7 @@
 
 
 @pytest.mark.parametrize(
-    "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification",
+    "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification, make_compatible_name",
     [
         # # Identity Transform
         (
@@ -99,6 +99,7 @@
             VALUES
             (false, 'Boolean field set to false');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=2, field_id=1001, transform=IdentityTransform(), name="string_field")],
@@ -118,6 +119,7 @@
             VALUES
             ('sample_string', 'Another string value')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="int_field")],
@@ -137,6 +139,7 @@
             VALUES
             (42, 'Associated string value for int 42')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=5, field_id=1001, transform=IdentityTransform(), name="long_field")],
@@ -156,6 +159,7 @@
             VALUES
             (1234567890123456789, 'Associated string value for long 1234567890123456789')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=6, field_id=1001, transform=IdentityTransform(), name="float_field")],
@@ -179,6 +183,7 @@
             # VALUES
             # (3.14, 'Associated string value for float 3.14')
             # """
+            None,
         ),
         (
             [PartitionField(source_id=7, field_id=1001, transform=IdentityTransform(), name="double_field")],
@@ -202,6 +207,7 @@
             # VALUES
             # (6.282, 'Associated string value for double 6.282')
             # """
+            None,
         ),
         (
             [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -221,6 +227,7 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -240,6 +247,7 @@
             VALUES
             (CAST('2023-01-01 12:00:01' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -264,6 +272,7 @@
             # VALUES
             # (CAST('2023-01-01 12:00:00' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
             # """
+            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=IdentityTransform(), name="timestamptz_field")],
@@ -288,6 +297,7 @@
             # VALUES
             # (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Associated string value for timestamp 2023-01-01 12:00:01.000999+03:00')
             # """
+            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=IdentityTransform(), name="date_field")],
@@ -307,6 +317,7 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Associated string value for date 2023-01-01')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=14, field_id=1001, transform=IdentityTransform(), name="uuid_field")],
@@ -326,6 +337,7 @@
             VALUES
             ('f47ac10b-58cc-4372-a567-0e02b2c3d479', 'Associated string value for UUID f47ac10b-58cc-4372-a567-0e02b2c3d479')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=11, field_id=1001, transform=IdentityTransform(), name="binary_field")],
@@ -345,6 +357,7 @@
             VALUES
             (CAST('example' AS BINARY), 'Associated string value for binary `example`')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=13, field_id=1001, transform=IdentityTransform(), name="decimal_field")],
@@ -364,6 +377,7 @@
             VALUES
             (123.45, 'Associated string value for decimal 123.45')
             """,
+            None,
         ),
         # # Year Month Day Hour Transform
         # Month Transform
@@ -385,6 +399,7 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP_NTZ), 'Event at 2023-01-01 11:55:59.999999');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=MonthTransform(), name="timestamptz_field_month")],
@@ -404,6 +419,7 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=MonthTransform(), name="date_field_month")],
@@ -423,6 +439,7 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
+            None,
         ),
         # Year Transform
         (
@@ -443,6 +460,7 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event at 2023-01-01 11:55:59.999999');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=YearTransform(), name="timestamptz_field_year")],
@@ -462,6 +480,7 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=YearTransform(), name="date_field_year")],
@@ -481,6 +500,7 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
+            None,
         ),
         # # Day Transform
         (
@@ -501,6 +521,7 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=DayTransform(), name="timestamptz_field_day")],
@@ -520,6 +541,7 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=DayTransform(), name="date_field_day")],
@@ -539,6 +561,7 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
+            None,
         ),
         # Hour Transform
         (
@@ -559,6 +582,7 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event within the 11th hour of 2023-01-01');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=HourTransform(), name="timestamptz_field_hour")],
@@ -578,6 +602,7 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
+            None,
         ),
         # Truncate Transform
         (
@@ -598,6 +623,7 @@
             VALUES
             (12345, 'Sample data for int');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=5, field_id=1001, transform=TruncateTransform(2), name="bigint_field_trunc")],
@@ -617,6 +643,7 @@
             VALUES
             (4294967297, 'Sample data for long');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=2, field_id=1001, transform=TruncateTransform(3), name="string_field_trunc")],
@@ -636,6 +663,7 @@
             VALUES
             ('abcdefg', 'Another sample for string');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=13, field_id=1001, transform=TruncateTransform(width=5), name="decimal_field_trunc")],
@@ -655,6 +683,7 @@
             VALUES
             (678.90, 'Associated string value for decimal 678.90')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=11, field_id=1001, transform=TruncateTransform(10), name="binary_field_trunc")],
@@ -674,6 +703,7 @@
             VALUES
             (binary('HELLOICEBERG'), 'Sample data for binary');
             """,
+            None,
         ),
         # Bucket Transform
         (
@@ -694,6 +724,7 @@
             VALUES
             (10, 'Integer with value 10');
             """,
+            None,
         ),
         # Test that multiple field combinations generate the Partition record and hive partition path correctly
         (
@@ -722,30 +753,27 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), CAST('2023-01-01' AS DATE), 'some data');
             """,
+            None,
         ),
         # Test that special characters are URL-encoded
         (
-            [PartitionField(source_id=15, field_id=1001, transform=IdentityTransform(), name="special#string#field")],
+            [PartitionField(source_id=15, field_id=1001, transform=IdentityTransform(), name="special#string+field")],
             ["special string"],
-            Record(**{"special#string#field": "special string"}),  # type: ignore
-            "special%23string%23field=special+string",
-            # Spark currently writes differently to PyIceberg w.r.t special column name sanitization so justification
-            # (comparing expected value with Spark behavior) would fail: PyIceberg produces
-            # Record[special_x23string_x23field='special string'], not Record[special#string#field='special string'].
-            # None,
-            # None,
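+            # The Record keeps the raw field name; the hive partition path percent-encodes it
+            # ('#' -> '%23', '+' -> '%2B') and encodes the space in the value as '+'.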
+            Record(**{"special#string+field": "special string"}),  # type: ignore
+            "special%23string%2Bfield=special+string",
             f"""CREATE TABLE {identifier} (
-                `special#string#field` string
+                `special#string+field` string
             )
             USING iceberg
             PARTITIONED BY (
-                identity(`special#string#field`)
+                identity(`special#string+field`)
             )
             """,
             f"""INSERT INTO {identifier}
             VALUES
             ('special string')
             """,
+            lambda name: name.replace("#", "_x23").replace("+", "_x2B"),
         ),
     ],
 )
@@ -759,6 +787,7 @@ def test_partition_key(
     expected_hive_partition_path_slice: str,
     spark_create_table_sql_for_justification: str,
     spark_data_insert_sql_for_justification: str,
+    make_compatible_name: Optional[Callable[[str], str]],
 ) -> None:
     partition_field_values = [PartitionFieldValue(field, value) for field, value in zip(partition_fields, partition_values)]
     spec = PartitionSpec(*partition_fields)
@@ -793,5 +822,12 @@ def test_partition_key(
     spark_path_for_justification = (
         snapshot.manifests(iceberg_table.io)[0].fetch_manifest_entry(iceberg_table.io)[0].data_file.file_path
     )
-    assert spark_partition_for_justification == expected_partition_record
+    # Special characters in the partition field name are sanitized when written to the data
+    # file's partition field; use `make_compatible_name` to match that sanitization behavior.
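+    # e.g. "special#string+field" is stored as "special_x23string_x2Bfield"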
+    sanitized_record = (
+        Record(**{make_compatible_name(k): v for k, v in vars(expected_partition_record).items()})
+        if make_compatible_name
+        else expected_partition_record
+    )
+    assert spark_partition_for_justification == sanitized_record
     assert expected_hive_partition_path_slice in spark_path_for_justification