Skip to content

Commit 61cdd08

Browse files
author
Sreesh Maheshwar
committed
Add make_compatible_name suggestion so test passes
1 parent 1bb379b commit 61cdd08

File tree

1 file changed

+49
-13
lines changed

1 file changed

+49
-13
lines changed

tests/integration/test_partitioning_key.py

Lines changed: 49 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
import uuid
1919
from datetime import date, datetime, timedelta, timezone
2020
from decimal import Decimal
21-
from typing import Any, List
21+
from typing import Any, Callable, List, Optional
2222

2323
import pytest
2424
from pyspark.sql import SparkSession
@@ -78,7 +78,7 @@
7878

7979

8080
@pytest.mark.parametrize(
81-
"partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification",
81+
"partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification, make_compatible_name",
8282
[
8383
# # Identity Transform
8484
(
@@ -99,6 +99,7 @@
9999
VALUES
100100
(false, 'Boolean field set to false');
101101
""",
102+
None,
102103
),
103104
(
104105
[PartitionField(source_id=2, field_id=1001, transform=IdentityTransform(), name="string_field")],
@@ -118,6 +119,7 @@
118119
VALUES
119120
('sample_string', 'Another string value')
120121
""",
122+
None,
121123
),
122124
(
123125
[PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="int_field")],
@@ -137,6 +139,7 @@
137139
VALUES
138140
(42, 'Associated string value for int 42')
139141
""",
142+
None,
140143
),
141144
(
142145
[PartitionField(source_id=5, field_id=1001, transform=IdentityTransform(), name="long_field")],
@@ -156,6 +159,7 @@
156159
VALUES
157160
(1234567890123456789, 'Associated string value for long 1234567890123456789')
158161
""",
162+
None,
159163
),
160164
(
161165
[PartitionField(source_id=6, field_id=1001, transform=IdentityTransform(), name="float_field")],
@@ -179,6 +183,7 @@
179183
# VALUES
180184
# (3.14, 'Associated string value for float 3.14')
181185
# """
186+
None,
182187
),
183188
(
184189
[PartitionField(source_id=7, field_id=1001, transform=IdentityTransform(), name="double_field")],
@@ -202,6 +207,7 @@
202207
# VALUES
203208
# (6.282, 'Associated string value for double 6.282')
204209
# """
210+
None,
205211
),
206212
(
207213
[PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -221,6 +227,7 @@
221227
VALUES
222228
(CAST('2023-01-01 12:00:01.000999' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
223229
""",
230+
None,
224231
),
225232
(
226233
[PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -240,6 +247,7 @@
240247
VALUES
241248
(CAST('2023-01-01 12:00:01' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
242249
""",
250+
None,
243251
),
244252
(
245253
[PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -264,6 +272,7 @@
264272
# VALUES
265273
# (CAST('2023-01-01 12:00:00' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
266274
# """
275+
None,
267276
),
268277
(
269278
[PartitionField(source_id=9, field_id=1001, transform=IdentityTransform(), name="timestamptz_field")],
@@ -288,6 +297,7 @@
288297
# VALUES
289298
# (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Associated string value for timestamp 2023-01-01 12:00:01.000999+03:00')
290299
# """
300+
None,
291301
),
292302
(
293303
[PartitionField(source_id=10, field_id=1001, transform=IdentityTransform(), name="date_field")],
@@ -307,6 +317,7 @@
307317
VALUES
308318
(CAST('2023-01-01' AS DATE), 'Associated string value for date 2023-01-01')
309319
""",
320+
None,
310321
),
311322
(
312323
[PartitionField(source_id=14, field_id=1001, transform=IdentityTransform(), name="uuid_field")],
@@ -326,6 +337,7 @@
326337
VALUES
327338
('f47ac10b-58cc-4372-a567-0e02b2c3d479', 'Associated string value for UUID f47ac10b-58cc-4372-a567-0e02b2c3d479')
328339
""",
340+
None,
329341
),
330342
(
331343
[PartitionField(source_id=11, field_id=1001, transform=IdentityTransform(), name="binary_field")],
@@ -345,6 +357,7 @@
345357
VALUES
346358
(CAST('example' AS BINARY), 'Associated string value for binary `example`')
347359
""",
360+
None,
348361
),
349362
(
350363
[PartitionField(source_id=13, field_id=1001, transform=IdentityTransform(), name="decimal_field")],
@@ -364,6 +377,7 @@
364377
VALUES
365378
(123.45, 'Associated string value for decimal 123.45')
366379
""",
380+
None,
367381
),
368382
# # Year Month Day Hour Transform
369383
# Month Transform
@@ -385,6 +399,7 @@
385399
VALUES
386400
(CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP_NTZ), 'Event at 2023-01-01 11:55:59.999999');
387401
""",
402+
None,
388403
),
389404
(
390405
[PartitionField(source_id=9, field_id=1001, transform=MonthTransform(), name="timestamptz_field_month")],
@@ -404,6 +419,7 @@
404419
VALUES
405420
(CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
406421
""",
422+
None,
407423
),
408424
(
409425
[PartitionField(source_id=10, field_id=1001, transform=MonthTransform(), name="date_field_month")],
@@ -423,6 +439,7 @@
423439
VALUES
424440
(CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
425441
""",
442+
None,
426443
),
427444
# Year Transform
428445
(
@@ -443,6 +460,7 @@
443460
VALUES
444461
(CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event at 2023-01-01 11:55:59.999999');
445462
""",
463+
None,
446464
),
447465
(
448466
[PartitionField(source_id=9, field_id=1001, transform=YearTransform(), name="timestamptz_field_year")],
@@ -462,6 +480,7 @@
462480
VALUES
463481
(CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
464482
""",
483+
None,
465484
),
466485
(
467486
[PartitionField(source_id=10, field_id=1001, transform=YearTransform(), name="date_field_year")],
@@ -481,6 +500,7 @@
481500
VALUES
482501
(CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
483502
""",
503+
None,
484504
),
485505
# # Day Transform
486506
(
@@ -501,6 +521,7 @@
501521
VALUES
502522
(CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
503523
""",
524+
None,
504525
),
505526
(
506527
[PartitionField(source_id=9, field_id=1001, transform=DayTransform(), name="timestamptz_field_day")],
@@ -520,6 +541,7 @@
520541
VALUES
521542
(CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
522543
""",
544+
None,
523545
),
524546
(
525547
[PartitionField(source_id=10, field_id=1001, transform=DayTransform(), name="date_field_day")],
@@ -539,6 +561,7 @@
539561
VALUES
540562
(CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
541563
""",
564+
None,
542565
),
543566
# Hour Transform
544567
(
@@ -559,6 +582,7 @@
559582
VALUES
560583
(CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event within the 11th hour of 2023-01-01');
561584
""",
585+
None,
562586
),
563587
(
564588
[PartitionField(source_id=9, field_id=1001, transform=HourTransform(), name="timestamptz_field_hour")],
@@ -578,6 +602,7 @@
578602
VALUES
579603
(CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
580604
""",
605+
None,
581606
),
582607
# Truncate Transform
583608
(
@@ -598,6 +623,7 @@
598623
VALUES
599624
(12345, 'Sample data for int');
600625
""",
626+
None,
601627
),
602628
(
603629
[PartitionField(source_id=5, field_id=1001, transform=TruncateTransform(2), name="bigint_field_trunc")],
@@ -617,6 +643,7 @@
617643
VALUES
618644
(4294967297, 'Sample data for long');
619645
""",
646+
None,
620647
),
621648
(
622649
[PartitionField(source_id=2, field_id=1001, transform=TruncateTransform(3), name="string_field_trunc")],
@@ -636,6 +663,7 @@
636663
VALUES
637664
('abcdefg', 'Another sample for string');
638665
""",
666+
None,
639667
),
640668
(
641669
[PartitionField(source_id=13, field_id=1001, transform=TruncateTransform(width=5), name="decimal_field_trunc")],
@@ -655,6 +683,7 @@
655683
VALUES
656684
(678.90, 'Associated string value for decimal 678.90')
657685
""",
686+
None,
658687
),
659688
(
660689
[PartitionField(source_id=11, field_id=1001, transform=TruncateTransform(10), name="binary_field_trunc")],
@@ -674,6 +703,7 @@
674703
VALUES
675704
(binary('HELLOICEBERG'), 'Sample data for binary');
676705
""",
706+
None,
677707
),
678708
# Bucket Transform
679709
(
@@ -694,6 +724,7 @@
694724
VALUES
695725
(10, 'Integer with value 10');
696726
""",
727+
None,
697728
),
698729
# Test multiple field combinations could generate the Partition record and hive partition path correctly
699730
(
@@ -722,30 +753,27 @@
722753
VALUES
723754
(CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), CAST('2023-01-01' AS DATE), 'some data');
724755
""",
756+
None,
725757
),
726758
# Test that special characters are URL-encoded
727759
(
728-
[PartitionField(source_id=15, field_id=1001, transform=IdentityTransform(), name="special#string#field")],
760+
[PartitionField(source_id=15, field_id=1001, transform=IdentityTransform(), name="special#string+field")],
729761
["special string"],
730-
Record(**{"special#string#field": "special string"}), # type: ignore
731-
"special%23string%23field=special+string",
732-
# Spark currently writes differently to PyIceberg w.r.t special column name sanitization so justification
733-
# (comparing expected value with Spark behavior) would fail: PyIceberg produces
734-
# Record[special_x23string_x23field='special string'], not Record[special#string#field='special string'].
735-
# None,
736-
# None,
762+
Record(**{"special#string+field": "special string"}), # type: ignore
763+
"special%23string%2Bfield=special+string",
737764
f"""CREATE TABLE {identifier} (
738-
`special#string#field` string
765+
`special#string+field` string
739766
)
740767
USING iceberg
741768
PARTITIONED BY (
742-
identity(`special#string#field`)
769+
identity(`special#string+field`)
743770
)
744771
""",
745772
f"""INSERT INTO {identifier}
746773
VALUES
747774
('special string')
748775
""",
776+
lambda name: name.replace("#", "_x23").replace("+", "_x2B"),
749777
),
750778
],
751779
)
@@ -759,6 +787,7 @@ def test_partition_key(
759787
expected_hive_partition_path_slice: str,
760788
spark_create_table_sql_for_justification: str,
761789
spark_data_insert_sql_for_justification: str,
790+
make_compatible_name: Optional[Callable[[str], str]],
762791
) -> None:
763792
partition_field_values = [PartitionFieldValue(field, value) for field, value in zip(partition_fields, partition_values)]
764793
spec = PartitionSpec(*partition_fields)
@@ -793,5 +822,12 @@ def test_partition_key(
793822
spark_path_for_justification = (
794823
snapshot.manifests(iceberg_table.io)[0].fetch_manifest_entry(iceberg_table.io)[0].data_file.file_path
795824
)
796-
assert spark_partition_for_justification == expected_partition_record
825+
# Special characters in partition value are sanitized when written to the data file's partition field
826+
# Use `make_compatible_name` to match the sanitize behavior
827+
sanitized_record = (
828+
Record(**{make_compatible_name(k): v for k, v in vars(expected_partition_record).items()})
829+
if make_compatible_name
830+
else expected_partition_record
831+
)
832+
assert spark_partition_for_justification == sanitized_record
797833
assert expected_hive_partition_path_slice in spark_path_for_justification

0 commit comments

Comments (0)