From 01fe65d01fd356f9b8bf87846afdd2bef882b967 Mon Sep 17 00:00:00 2001 From: yangjie01 Date: Wed, 24 Jun 2026 11:49:44 +0800 Subject: [PATCH] [fix](fs) Make PathUtils.equalsIgnoreSchemeIfOneIsS3 path comparison consistent equalsIgnoreSchemeIfOneIsS3 used inconsistent logic across its two branches: same-scheme did a full-string case-INsensitive compare with the trailing slash significant, while cross-scheme (one is s3) compared normalized authority+path case-sensitively with the trailing slash stripped. The result for one URI therefore depended on the other URI's scheme, and same-scheme comparisons wrongly ignored case (S3 keys are case-sensitive). Unify both branches into one rule: when schemes are equal (case-insensitively) or one side is s3, compare the authority and path only, with trailing slashes insignificant and the comparison case-sensitive on the raw (percent-encoded) components; otherwise not equal. This matches the original normalize() intent and the sole caller HMSTransaction (avoid renames when the location is identical). Malformed inputs for object storage (opaque URIs, scheme-with-null-authority triple-slash forms, authority-with-null-scheme network-path references, parse failures) fall back to exact string comparison; percent-encoded slashes stay distinct. Hardened via multi-persona review; adds extensive PathUtilsTest coverage. --- .../doris/foundation/util/PathUtils.java | 138 +++++++-- .../doris/foundation/util/PathUtilsTest.java | 276 +++++++++++++++++- 2 files changed, 386 insertions(+), 28 deletions(-) diff --git a/fe/fe-foundation/src/main/java/org/apache/doris/foundation/util/PathUtils.java b/fe/fe-foundation/src/main/java/org/apache/doris/foundation/util/PathUtils.java index 954d4c5bddd37b..6f9923d562791e 100644 --- a/fe/fe-foundation/src/main/java/org/apache/doris/foundation/util/PathUtils.java +++ b/fe/fe-foundation/src/main/java/org/apache/doris/foundation/util/PathUtils.java @@ -20,19 +20,38 @@ import java.net.URI; import java.net.URISyntaxException; import java.util.Objects; +import java.util.regex.Pattern; public class PathUtils { + private static final Pattern TRAILING_SLASHES = Pattern.compile("/+$"); + /** * Compares two URI strings for equality with special handling for the "s3" scheme. *

* The comparison rules are: - * - If both URIs have the same scheme, compare the full URI strings (case-insensitive). - * - If the schemes differ, but one of them is "s3", then compare only the authority and path parts, - * ignoring the scheme. - * - Otherwise, consider the URIs as not equal. + * - If the schemes are equal (case-insensitively, per RFC 3986 section 3.1; including both null + * for bare paths), or they differ but one of them is "s3", compare the authority + * (bucket/host), path, query string, and fragment, ignoring the scheme. Any run of trailing + * slashes on the path is insignificant (a location compares equal with or without trailing + * path slashes, e.g. "s3://bucket/data", "s3://bucket/data/" and "s3://bucket/data//" are all + * equal); the authority is compared as-is because java.net.URI always places the "/" delimiter + * in the path, never in the authority. (Exception: a bare schemeless filesystem root "/" and + * the empty path "" are treated as distinct locations even though both normalize to the empty + * string, so equalsIgnoreSchemeIfOneIsS3("/", "") returns false.) The authority and path are + * compared byte-for-byte and case-sensitively over their raw (percent-encoded) forms, because + * object-storage keys are case-sensitive; the scheme itself is matched case-insensitively + * (RFC 3986 section 3.1). Query strings and fragments are compared by value (case-sensitively); + * a URI carrying a query string or fragment differs from one without (null vs non-null). + * - The following inputs are malformed for object-storage purposes and fall back to exact + * (case-sensitive) string comparison: opaque URIs (e.g. "s3:bucket/key" with no "//"); a + * non-opaque URI whose scheme is non-null but whose authority is absent (the triple-slash + * form, e.g. "s3:///path" or "file:///path"); a network-path reference whose authority is + * present but scheme is absent (e.g. "//bucket/path"); and URIs that fail to parse. + * - Otherwise (different schemes, neither is "s3"), consider the URIs as not equal. *

* This is useful in scenarios where "s3" URIs should be treated as equivalent to other schemes - * if the host and path match, ignoring scheme differences. + * if the host and path match, ignoring scheme differences. The trailing-slash and case handling is + * applied consistently regardless of whether the two schemes match. * * @param p1 the first URI string to compare * @param p2 the second URI string to compare @@ -43,40 +62,104 @@ public static boolean equalsIgnoreSchemeIfOneIsS3(String p1, String p2) { return p1 == null && p2 == null; } + // Fast path: identical raw strings are equal under every normalization rule below, so we + // can skip the two URI parses (and their Matcher/substring allocations) on the common + // unchanged-location case. This also makes the identity property hold for inputs that would + // otherwise fall back to exact string comparison (opaque, triple-slash, network-path). + if (p1.equals(p2)) { + return true; + } + try { URI uri1 = new URI(p1); URI uri2 = new URI(p2); - String scheme1 = uri1.getScheme(); - String scheme2 = uri2.getScheme(); + // Opaque URIs (e.g. "s3:bucket/key" with no "//") have null authority and path, so they + // would all normalize to "" and compare equal regardless of content. Such URIs are + // malformed for object-storage purposes; fall back to exact string comparison. + if (uri1.isOpaque() || uri2.isOpaque()) { + return p1.equals(p2); + } - // If schemes are equal, compare the full URI strings ignoring case - if (scheme1 != null && scheme1.equalsIgnoreCase(scheme2)) { - return p1.equalsIgnoreCase(p2); + // Two classes of inputs are malformed for object-storage purposes and fall back to + // exact string comparison so they cannot spuriously match via the structural path: + // 1. A URI with a scheme but a null authority (e.g. the triple-slash form + // "s3:///path" or "file:///path"): its absent authority would normalize to "" and + // could spuriously match another null-authority URI across schemes. + // 2. A network-path reference: a URI with an authority but a null scheme + // (e.g. "//bucket/path"). java.net.URI parses this as scheme=null, + // authority="bucket". It is not a valid object-storage location, so without this + // guard it would fall through to the structural comparison and spuriously match a + // fully-qualified s3 URI ("s3://bucket/path"). + // Schemeless bare paths (null scheme AND null authority, e.g. "/path") are intentionally + // left to the structural comparison below so identical bare paths still compare equal. + if ((uri1.getScheme() != null && uri1.getRawAuthority() == null) + || (uri2.getScheme() != null && uri2.getRawAuthority() == null) + || (uri1.getScheme() == null && uri1.getRawAuthority() != null) + || (uri2.getScheme() == null && uri2.getRawAuthority() != null)) { + return p1.equals(p2); } - // If schemes differ but one is "s3", compare only authority and path ignoring scheme - if ("s3".equalsIgnoreCase(scheme1) || "s3".equalsIgnoreCase(scheme2)) { - String auth1 = normalize(uri1.getAuthority()); - String auth2 = normalize(uri2.getAuthority()); - String path1 = normalize(uri1.getPath()); - String path2 = normalize(uri2.getPath()); + // Distinguish the schemeless filesystem root "/" from the empty/current path "". With no + // scheme and no authority, both normalize() to "" (the trailing-slash strip collapses + // "/" to ""), which would make them spuriously equal even though they are distinct + // locations. This must NOT fire when an authority is present (e.g. "s3://bucket/" vs + // "s3://bucket"), where the trailing slash on the bucket root IS insignificant. The + // exact-string fallback returns false for "/" vs "" (the fast path above already handled + // "/" == "/" and "" == ""). + if (uri1.getScheme() == null && uri1.getRawAuthority() == null) { + String rawPath1 = uri1.getRawPath(); + String rawPath2 = uri2.getRawPath(); + if (("/".equals(rawPath1) && "".equals(rawPath2)) + || ("".equals(rawPath1) && "/".equals(rawPath2))) { + return p1.equals(p2); + } + } + + String scheme1 = uri1.getScheme(); + String scheme2 = uri2.getScheme(); + + // Null-safe, case-insensitive scheme equality (RFC 3986 section 3.1). Two null schemes + // (e.g. bare paths) are treated as the same scheme so identical schemeless locations + // compare equal. The explicit scheme2 != null guard makes the null handling obvious + // without relying on String.equalsIgnoreCase(null)'s documented behavior. + boolean sameScheme = scheme1 == null + ? scheme2 == null + : (scheme2 != null && scheme1.equalsIgnoreCase(scheme2)); + boolean oneIsS3 = "s3".equalsIgnoreCase(scheme1) || "s3".equalsIgnoreCase(scheme2); - return Objects.equals(auth1, auth2) && Objects.equals(path1, path2); + // Different schemes and neither is "s3": treat as different locations. + if (!sameScheme && !oneIsS3) { + return false; } - // Otherwise, URIs are not equal - return false; + // Same scheme, or cross-scheme where one side is "s3" (object stores are unified under + // the s3 scheme on the BE): the scheme is irrelevant -- compare only the authority + // (bucket/host) and the path. The raw (still percent-encoded) components are used so the + // comparison is byte-for-byte and case-sensitive (e.g. "a%2Fb" differs from "a/b"), since + // object-storage keys are case-sensitive. Both are normalized so a directory location + // compares equal with or without a trailing slash. Query strings and fragments are also + // compared so two locations that differ only there are not treated as identical. + // The authority is compared as-is (null -> ""); java.net.URI never places trailing + // slashes in the authority (the "/" delimiter always belongs to the path), so the + // trailing-slash normalize() is meaningful only for the path. + return Objects.equals(Objects.toString(uri1.getRawAuthority(), ""), + Objects.toString(uri2.getRawAuthority(), "")) + && Objects.equals(normalize(uri1.getRawPath()), normalize(uri2.getRawPath())) + && Objects.equals(uri1.getRawQuery(), uri2.getRawQuery()) + && Objects.equals(uri1.getRawFragment(), uri2.getRawFragment()); } catch (URISyntaxException e) { - // If URI parsing fails, fallback to simple case-insensitive string comparison - return p1.equalsIgnoreCase(p2); + // If URI parsing fails, fall back to exact string comparison. + return p1.equals(p2); } } /** - * Normalizes a URI component by converting null to empty string and - * removing trailing slashes. + * Normalizes a URI component by converting null to the empty string and removing any run of + * trailing slashes. Stripping the whole run (not just one slash) means a directory location + * compares equal regardless of how many trailing slashes it carries, e.g. + * "/data/" -> "/data", "/data//" -> "/data", "/" and "//" -> "". * * @param s the string to normalize * @return normalized string without trailing slashes @@ -85,8 +168,11 @@ private static String normalize(String s) { if (s == null) { return ""; } - // Remove trailing slashes for consistent comparison - String trimmed = s.replaceAll("/+$", ""); - return trimmed.isEmpty() ? "" : trimmed; + // Skip the Matcher allocation when there is no trailing slash (the overwhelmingly common + // case for real object-storage paths), letting the JIT avoid the regex engine entirely. + if (s.isEmpty() || s.charAt(s.length() - 1) != '/') { + return s; + } + return TRAILING_SLASHES.matcher(s).replaceAll(""); } } diff --git a/fe/fe-foundation/src/test/java/org/apache/doris/foundation/util/PathUtilsTest.java b/fe/fe-foundation/src/test/java/org/apache/doris/foundation/util/PathUtilsTest.java index c239b47b95a2cd..5e803952d03498 100644 --- a/fe/fe-foundation/src/test/java/org/apache/doris/foundation/util/PathUtilsTest.java +++ b/fe/fe-foundation/src/test/java/org/apache/doris/foundation/util/PathUtilsTest.java @@ -59,17 +59,278 @@ public void testEqualsIgnoreSchemeDifferentHost() { @Test public void testEqualsIgnoreSchemeTrailingSlash() { - Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + // Trailing slashes are insignificant: a location with or without a trailing slash + // refers to the same place. This now holds consistently for same-scheme comparisons too. + Assertions.assertTrue(PathUtils.equalsIgnoreSchemeIfOneIsS3( "oss://bucket/data/", "oss://bucket/data" )); - Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + Assertions.assertTrue(PathUtils.equalsIgnoreSchemeIfOneIsS3( "hdfs://namenode/user/hadoop/", "hdfs://namenode/user/hadoop" )); } + @Test + public void testTrailingSlashConsistentAcrossSchemes() { + // The result of a trailing-slash-only difference must NOT depend on the other URI's scheme. + // same scheme: + Assertions.assertTrue(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://bucket/path/", "s3://bucket/path")); + // cross scheme (one is s3): + Assertions.assertTrue(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://bucket/path/", "cos://bucket/path")); + } + + @Test + public void testEqualsIgnoreSchemePathCaseSensitive() { + // Object-storage keys are case-sensitive, so path case must matter -- consistently + // for both same-scheme and cross-scheme comparisons. + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://bucket/Path", "s3://bucket/path")); + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://bucket/Path", "cos://bucket/path")); + } + + @Test + public void testEqualsIgnoreSchemeAuthorityCaseSensitive() { + // The authority (bucket name) is also compared case-sensitively. + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://BUCKET/path", "s3://bucket/path")); + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://BUCKET/path", "cos://bucket/path")); + } + + @Test + public void testEqualsIgnoreSchemeBucketRootTrailingSlash() { + // Bucket root with and without a trailing slash refer to the same location. + Assertions.assertTrue(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://bucket/", "s3://bucket")); + // Cross-scheme bucket root: "s3://bucket/" (path="/") vs "cos://bucket" (path=""). + // Both normalize to "" so they should compare equal. This exercises the oneIsS3 branch + // with a root slash being normalized to match the empty path -- the production case where + // the HMSTransaction caller compares bucket roots across schemes. + Assertions.assertTrue(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://bucket/", "cos://bucket")); + Assertions.assertTrue(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "cos://bucket/", "s3://bucket")); + } + + @Test + public void testEqualsIgnoreSchemeMultipleTrailingSlashes() { + // The TRAILING_SLASHES pattern (/+$) strips any number of trailing slashes, not just one. + // Guards against the pattern being narrowed to a single-slash strip. + Assertions.assertTrue(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://bucket/path//", "s3://bucket/path")); + Assertions.assertTrue(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://bucket/path///", "s3://bucket/path/")); + Assertions.assertTrue(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://bucket///", "s3://bucket")); + } + + @Test + public void testEqualsIgnoreSchemeDifferentSchemesNeitherS3TrailingSlashOnly() { + // Two non-s3 schemes whose authority+path match modulo a trailing slash must still be + // unequal: the !sameScheme && !oneIsS3 branch short-circuits to false before normalize() + // is ever consulted, independent of path content. + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "oss://bucket/path/", "obs://bucket/path")); + } + + @Test + public void testEqualsIgnoreSchemeOpaqueUri() { + // Opaque URIs ("s3:..." with no "//") must NOT all collapse to equal: they have null + // authority and path, so without an opaque guard they would always compare equal. They fall + // back to exact string comparison. + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3:arbitraryA", "s3:arbitraryB")); + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3:bucket1/key1", "s3:bucket2/key2")); + // Identical opaque URIs are equal via the exact-string fallback. + Assertions.assertTrue(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3:bucket/key", "s3:bucket/key")); + // Mixed opaque vs. hierarchical: the isOpaque() guard fires when EITHER side is opaque, so + // a malformed s3-without-"//" string compared against a valid s3:// URI falls back to exact + // string comparison and is NOT equal. Asserted in both argument orders because the guard is + // an OR over both arguments (uri1.isOpaque() true / uri2.isOpaque() false, and vice versa). + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3:bucket/key", "s3://bucket/key")); + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://bucket/key", "s3:bucket/key")); + // A valid hierarchical URI compared against itself still takes the fast path (equal). + Assertions.assertTrue(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://bucket/key", "s3://bucket/key")); + } + + @Test + public void testEqualsIgnoreSchemeEncodedSlashDistinct() { + // In S3 the key "a%2Fb" (literal slash in the key) and "a/b" (path separator) are different + // objects. Using the raw path keeps them distinct. + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://bucket/a%2Fb", "s3://bucket/a/b")); + } + + @Test + public void testEqualsIgnoreSchemeEncodedTrailingSlashNotStripped() { + // A percent-encoded trailing slash (%2F) is a literal character in the S3 key and + // must NOT be treated as an insignificant trailing slash. normalize() checks + // the last raw char == '/' and leaves '%2F' (last char 'F') untouched. + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://bucket/path%2F", "s3://bucket/path")); + // Cross-scheme variant exercises the same code path. + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://bucket/path%2F", "cos://bucket/path")); + } + + @Test + public void testEqualsIgnoreSchemeQueryStringDistinguishes() { + // Query strings are compared, so locations differing only in their query are not equal. + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "oss://bucket/path?v=1", "oss://bucket/path?v=2")); + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://bucket/path?v=1", "s3://bucket/path")); + // The query check also applies in the cross-scheme (oneIsS3) branch. + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://bucket/path?v=1", "cos://bucket/path?v=2")); + Assertions.assertTrue(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://bucket/path?v=1", "cos://bucket/path?v=1")); + } + + @Test + public void testEqualsIgnoreSchemeSchemelessIdentical() { + // Two identical schemeless (bare-path) locations parse with a null scheme and must still + // compare equal (identity property). + Assertions.assertTrue(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "/user/warehouse/tbl", "/user/warehouse/tbl")); + Assertions.assertTrue(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "/user/warehouse/tbl/", "/user/warehouse/tbl")); + // Two different bare paths must NOT be equal: proves the schemeless structural comparison + // actually discriminates rather than returning true for all schemeless inputs. + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "/user/warehouse/a", "/user/warehouse/b")); + } + + @Test + public void testEqualsIgnoreSchemeNullInputs() { + // Both null -> equal; exactly one null -> not equal. + Assertions.assertTrue(PathUtils.equalsIgnoreSchemeIfOneIsS3(null, null)); + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3(null, "s3://bucket/path")); + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3("s3://bucket/path", null)); + } + + @Test + public void testEqualsIgnoreSchemeSchemeCaseInsensitive() { + // Schemes are compared case-insensitively (RFC 3986 section 3.1), so two same-scheme URIs + // differing only in scheme case still compare equal via the sameScheme branch. + Assertions.assertTrue(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "OSS://bucket/path", "oss://bucket/path")); + // An uppercase "S3" scheme is recognized as s3 (oneIsS3 uses equalsIgnoreCase), so it + // triggers the cross-scheme structural comparison against a non-s3 scheme. + Assertions.assertTrue(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "S3://bucket/path", "cos://bucket/path")); + // Path case still matters in that cross-scheme arm. + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "S3://bucket/Path", "cos://bucket/path")); + } + + @Test + public void testEqualsIgnoreSchemeFragmentDistinguishes() { + // Fragments are compared, so locations differing only in their fragment are not equal. + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://bucket/path#sectionA", "s3://bucket/path#sectionB")); + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://bucket/path#section", "s3://bucket/path")); + // The fragment check also applies in the cross-scheme (oneIsS3) branch, mirroring the + // query-string coverage: fragment and query are compared identically in the code. + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://bucket/path#sectionA", "cos://bucket/path#sectionB")); + Assertions.assertTrue(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://bucket/path#section", "cos://bucket/path#section")); + } + + @Test + public void testEqualsIgnoreSchemeNullAuthorityNotCrossSchemeEqual() { + // A URI with a scheme but absent authority (triple-slash form) is malformed for object + // storage and must NOT compare equal to a different-scheme URI just because the paths match. + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "file:///path/to/file", "s3:///path/to/file")); + // A schemeless bare path must not equal an s3 triple-slash URI with the same path segment. + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "/path/data", "s3:///path/data")); + // But an identical triple-slash URI is still equal via the exact-string fallback. + Assertions.assertTrue(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3:///path/data", "s3:///path/data")); + // Same-scheme triple-slash differing only by a trailing slash: the malformed guard + // (scheme present, authority absent) fires -> exact string compare -> false. This is the + // load-bearing case for the guard: without it both URIs parse with authority=null and paths + // "/path/" vs "/path", normalize() strips the trailing slash and they would spuriously + // compare equal (a bug). Covered for both file:// and s3:// since each takes the guard. + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "file:///path/", "file:///path")); + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3:///path/", "s3:///path")); + // Same-scheme triple-slash with different path content (not just a trailing slash) confirms + // the guard is not incidentally bypassed for triple-slash inputs. + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "file:///path/a", "file:///path/b")); + } + + @Test + public void testEqualsIgnoreSchemeNetworkPathReferenceNotCrossSchemeEqual() { + // A network-path reference ("//bucket/path") parses with scheme=null, authority="bucket". + // It is not a valid object-storage location and must NOT spuriously match a fully-qualified + // s3 URI via the structural comparison. + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "//bucket/path", "s3://bucket/path")); + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://bucket/path", "//bucket/path")); + // But identical network-path references remain equal via the exact-string fallback. + Assertions.assertTrue(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "//bucket/path", "//bucket/path")); + } + + @Test + public void testEqualsIgnoreSchemeRootVsEmptyDistinct() { + // The filesystem root "/" and the empty/current path "" are distinct locations and must NOT + // compare equal even though both normalize() to "". Each is still equal to itself. + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3("/", "")); + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3("", "/")); + Assertions.assertTrue(PathUtils.equalsIgnoreSchemeIfOneIsS3("/", "/")); + Assertions.assertTrue(PathUtils.equalsIgnoreSchemeIfOneIsS3("", "")); + } + + @Test + public void testEqualsIgnoreSchemeS3aAndS3n() { + // s3 vs s3a / s3n: schemes differ but one side is "s3", so the cross-scheme structural + // comparison applies and matching authority+path compare equal. This pins the exact + // contract that HiveTableSinkTest depends on (it passes s3a:// and s3n:// locations to + // PathUtils.equalsIgnoreSchemeIfOneIsS3 expecting them to match an s3:// location). + Assertions.assertTrue(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://bucket/path", "s3a://bucket/path")); + Assertions.assertTrue(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://bucket/path", "s3n://bucket/path")); + // s3a is NOT s3, so a cross-scheme comparison between two non-s3 schemes (s3a vs cos) + // must NOT match -- the !sameScheme && !oneIsS3 branch returns false. Guards against a + // future change that widens the s3-family predicate or drops the exact "s3" match. + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3a://bucket/path", "cos://bucket/path")); + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3n://bucket/path", "cos://bucket/path")); + } + + @Test + public void testEqualsIgnoreSchemePortSignificant() { + // Port is part of the raw authority; differing ports mean different servers. + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://bucket:80/path", "s3://bucket/path")); + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "hdfs://namenode:8020/data", "hdfs://namenode/data")); + // Same port on both sides of a cross-scheme comparison still matches. + Assertions.assertTrue(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://bucket:443/path", "cos://bucket:443/path")); + } + @Test public void testEqualsIgnoreSchemeInvalidUri() { Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( @@ -81,5 +342,16 @@ public void testEqualsIgnoreSchemeInvalidUri() { "s3://bucket/data file.txt", "cos://bucket/other file.txt" )); + + // The fallback for unparseable URIs is exact (case-sensitive) string comparison: + // identical strings are equal, case-differing ones are not. + Assertions.assertTrue(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://bucket/data file.txt", + "s3://bucket/data file.txt" + )); + Assertions.assertFalse(PathUtils.equalsIgnoreSchemeIfOneIsS3( + "s3://bucket/Data file.txt", + "s3://bucket/data file.txt" + )); } }